# Scraping Craigslist

In [None]:
# If needed:
!pip install requests beautifulsoup4 pandas lxml --quiet


In [None]:
import time, re, math, sys
from typing import List, Dict, Optional
import requests
from bs4 import BeautifulSoup
import pandas as pd

# -------- CONFIG --------
# Regional Craigslist host; change the subdomain to pick another region,
# e.g. "https://hartford.craigslist.org".
BASE_SITE = "https://newhaven.craigslist.org"

# Category path under the host: "cta" is the cars+trucks category.
SEARCH_PATH = "/search/cta"

# Search filters sent as query-string parameters (adjust as you like).
QUERY_PARAMS = dict(
    hasPic=1,            # require pictures
    min_auto_year=2012,  # min year
    srchType="T",        # search titles only
)

# Paging and politeness knobs.
RESULTS_PER_PAGE = 120    # CL shows up to 120 per page
MAX_PAGES = 3             # demo: ~360 rows max
REQUEST_DELAY_SECS = 1.0  # be polite
USER_AGENT = "UConn-OPIM-Student-Scraper/1.0 (educational use; contact instructor)"

# Regex helpers
# A standalone four-digit number starting with 19 or 20 (a model year).
YEAR_RE = re.compile(r"\b(19|20)\d{2}\b")
# A standalone integer, optionally with comma thousands separators.
MILE_RE = re.compile(r"\b(\d{1,3}(?:,\d{3})*)\b")

def build_url(start: int = 0) -> str:
    """Return the search URL for the results page that begins at offset ``start``.

    The URL is ``BASE_SITE + SEARCH_PATH`` followed by ``QUERY_PARAMS`` as a
    query string. When ``start`` is positive it is added as the ``s``
    parameter; for the first page (``start <= 0``) ``s`` is omitted.

    Args:
        start: Zero-based offset of the first result on the requested page.

    Returns:
        The fully-qualified search URL.
    """
    # Local import keeps this function self-contained.
    from urllib.parse import urlencode

    params = dict(QUERY_PARAMS)  # copy so the module-level config is never mutated
    if start > 0:
        params["s"] = start
    # urlencode percent-escapes keys and values, so filters containing spaces,
    # '&' or '=' (e.g. a free-text query) cannot corrupt the query string.
    return f"{BASE_SITE}{SEARCH_PATH}?{urlencode(params)}"

def fetch_html(url: str) -> Optional[str]:
    """Download ``url`` and return the response body, or ``None`` on failure.

    The request is sent with the configured ``USER_AGENT`` header and a
    20-second timeout. Failures are reported on stderr rather than raised, so
    the caller can treat ``None`` as "stop paging".

    Args:
        url: Absolute URL to fetch.

    Returns:
        The response text when the server answers with HTTP 200; ``None`` when
        the request fails at the network level or returns any other status.
    """
    headers = {"User-Agent": USER_AGENT}
    try:
        resp = requests.get(url, headers=headers, timeout=20)
    except requests.RequestException as exc:
        # Timeouts, DNS failures, refused connections, etc. Honour the
        # Optional[str] contract instead of crashing the whole scrape.
        print(f"[warn] request failed for {url}: {exc}", file=sys.stderr)
        return None
    if resp.status_code != 200:
        print(f"[warn] {resp.status_code} for {url}", file=sys.stderr)
        return None
    return resp.text

def parse_listing(li) -> Dict:
    """
    Turn one search-result element into a flat dict.

    Handles both old ('result-row') and newer static classes (craigslist keeps
    changing markup). We extract: title, price, link, hood (neighborhood),
    year (from title), odometer (if present).

    Args:
        li: A BeautifulSoup element for a single search result.

    Returns:
        A dict with keys ``title``, ``price``, ``year_in_title``, ``hood``,
        ``url`` and ``odometer``. Any field that cannot be found is ``None``.
    """
    def _digits_to_int(el):
        # Return the digits of el's text as an int, or None when el is missing
        # or its text contains no digits (e.g. "free", "call").
        if el is None:
            return None
        digits = re.sub(r"[^\d]", "", el.get_text())
        return int(digits) if digits else None

    # Title + link. The last fallback accepts any anchor with an href, so
    # results whose link does not contain '/cto/' still yield a title and URL.
    link_el = (
        li.select_one("a.result-title")
        or li.select_one("a.posting-title")
        or li.select_one("a[href*='/cto/']")
        or li.select_one("a[href]")
    )
    title = None
    link = None
    if link_el is not None:
        # In some markup the anchor wraps the title together with the price
        # and location, so the anchor's full text reads like
        # "Chevy Spark 2013$3,500". Prefer a nested ".title" element when one
        # exists and fall back to the anchor's own text otherwise.
        inner_title = link_el.select_one(".title")
        title = (inner_title or link_el).get_text(strip=True)
        link = link_el.get("href")
        if link and link.startswith("/"):
            link = BASE_SITE + link  # make site-relative links absolute

    # Price: try the span-based selectors first, then any element with the
    # "price" class (presumably a div in the static markup - TODO confirm).
    price_el = (
        li.select_one("span.result-price")
        or li.select_one("span.price")
        or li.select_one(".price")
    )
    price = _digits_to_int(price_el)

    # Neighborhood (hood): old markup wraps it in parentheses; ".location" is
    # the fallback for markup without a result-hood span.
    hood_el = li.select_one("span.result-hood") or li.select_one(".location")
    hood = hood_el.get_text(strip=True).strip("()") if hood_el else None

    # Year (from title)
    year = None
    if title:
        m = YEAR_RE.search(title)
        if m:
            year = int(m.group(0))

    # Odometer: sometimes appears in snippet text, sometimes on the detail
    # page; for the demo, grab it from the snippet if present.
    odo = None
    for sel in ("span.odometer", "span[class*=odometer]"):
        odo = _digits_to_int(li.select_one(sel))
        if odo is not None:
            break

    return {
        "title": title,
        "price": price,
        "year_in_title": year,
        "hood": hood,
        "url": link,
        "odometer": odo,
    }

def parse_search_page(html: str) -> List[Dict]:
    """Parse a search-results page into a list of listing dicts.

    Several result-item selectors are tried in order and the first one that
    matches anything wins. Each matched element is passed to
    ``parse_listing``; an element that raises is reported on stderr and
    skipped, so one bad row does not lose the page.

    Args:
        html: Raw HTML of a search-results page.

    Returns:
        One dict per successfully parsed listing (possibly empty).
    """
    soup = BeautifulSoup(html, "lxml")

    # Try multiple patterns for result items, most specific first.
    candidates = []
    for selector in (
        "li.result-row",
        "li.cl-static-search-result",
        "li[class*=result]",
    ):
        candidates = soup.select(selector)
        if candidates:
            break

    parsed: List[Dict] = []
    for node in candidates:
        try:
            parsed.append(parse_listing(node))
        except Exception as err:
            print(f"[warn] parse listing failed: {err}", file=sys.stderr)
    return parsed

def _split_make_model(t):
    """Guess ``(make, model)`` from a listing title (very naive, improves in lab).

    The year is removed if present; the first remaining token is taken as the
    make and the next two as the model. Returns ``(None, None)`` when the
    title is not a string or has no tokens left.
    """
    if not isinstance(t, str):
        return (None, None)
    # str.split() drops empty tokens, so a title that is blank (or only a
    # year) yields no parts instead of a single empty-string "make".
    parts = YEAR_RE.sub("", t).split()
    if not parts:
        return (None, None)
    make = parts[0].title()
    model = " ".join(parts[1:3]).title() if len(parts) > 1 else None
    return (make, model)


def scrape(max_pages=MAX_PAGES) -> pd.DataFrame:
    """Scrape up to ``max_pages`` search pages into a de-duplicated DataFrame.

    Pages are fetched in order, pausing ``REQUEST_DELAY_SECS`` between
    requests. Paging stops early when a fetch fails or a page yields no rows.
    Rows are de-duplicated on ``url`` and two derived columns, ``make_guess``
    and ``model_guess``, are added from the title.

    Args:
        max_pages: Maximum number of result pages to request.

    Returns:
        One row per unique listing URL. If nothing was scraped, an empty
        DataFrame is returned.
    """
    all_rows = []
    for page in range(max_pages):
        if page > 0:
            # Be polite between requests; no need to wait after the last one.
            time.sleep(REQUEST_DELAY_SECS)
        start = page * RESULTS_PER_PAGE
        url = build_url(start=start)
        print(f"[info] fetching page {page+1} → {url}")
        html = fetch_html(url)
        if not html:
            break
        rows = parse_search_page(html)
        if not rows:
            print("[info] no rows found; stopping.")
            break
        all_rows.extend(rows)

    df = pd.DataFrame(all_rows)
    if df.empty:
        # With no rows there is no "url" column, and drop_duplicates(subset=...)
        # would raise KeyError; hand back the empty frame instead.
        return df

    df = df.drop_duplicates(subset=["url"]).reset_index(drop=True)

    # basic cleaning: derive rough make/model from title
    guesses = [_split_make_model(t) for t in df["title"]]
    df["make_guess"] = [make for make, _ in guesses]
    df["model_guess"] = [model for _, model in guesses]
    return df

if __name__ == "__main__":
    # Script entry point: scrape, preview the first rows, and save a CSV.
    import os

    df = scrape()
    print(df.head(10))
    # "/content" is the Colab working area. When that directory does not exist
    # (e.g. when this runs as a plain script in CI), write to the current
    # working directory instead of failing on a missing folder.
    out_dir = "/content" if os.path.isdir("/content") else os.getcwd()
    out = os.path.join(out_dir, "craigslist_used_cars.csv")
    df.to_csv(out, index=False)
    print(f"[done] saved {len(df)} rows to {out}")


[info] fetching page 1 → https://newhaven.craigslist.org/search/cta?hasPic=1&min_auto_year=2012&srchType=T
[info] fetching page 2 → https://newhaven.craigslist.org/search/cta?hasPic=1&min_auto_year=2012&srchType=T&s=120
[info] fetching page 3 → https://newhaven.craigslist.org/search/cta?hasPic=1&min_auto_year=2012&srchType=T&s=240
                                               title price  year_in_title  \
0                             Chevy Spark 2013$3,500  None         2013.0   
1  2009 Mazda 6. Runs and Drives Great.$1,800Wall...  None         2009.0   
2               Honda HRV 2018 clean$13,800New Haven  None         2018.0   
3                       Jetta VW 2017$4,700New Haven  None         2017.0   
4       2019 Nissan Versa 116 k miles$5,400New Haven  None         2019.0   
5                                               None  None            NaN   
6       2012 Ford Taurus SEL AWD Mint$3,900watertown  None         2012.0   
7  2015 Porsche Panamera 4 - Full Bumper-to-Bumpe..

In [None]:
import pandas as pd
# Reload the CSV written by the scraper cell and inspect its shape and head.
csv_path = "/content/craigslist_used_cars.csv"
df = pd.read_csv(csv_path)
print(df.shape)
df.head()


(70, 7)


Unnamed: 0,title,price,year_in_title,hood,url,make_guess,model_guess
0,"Chevy Spark 2013$3,500",,2013.0,,https://newhaven.craigslist.org/cto/d/westport...,Chevy,"Spark $3,500"
1,"2009 Mazda 6. Runs and Drives Great.$1,800Wall...",,2009.0,,https://newhaven.craigslist.org/cto/d/wallingf...,Mazda,6. Runs
2,"Honda HRV 2018 clean$13,800New Haven",,2018.0,,https://newhaven.craigslist.org/cto/d/new-have...,Honda,"Hrv Clean$13,800New"
3,"Jetta VW 2017$4,700New Haven",,2017.0,,https://newhaven.craigslist.org/cto/d/new-have...,Jetta,"Vw $4,700New"
4,"2019 Nissan Versa 116 k miles$5,400New Haven",,2019.0,,https://newhaven.craigslist.org/cto/d/new-have...,Nissan,Versa 116


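The scrape wasn't perfect: the titles run together with the price and location, and the `price` and `hood` columns came back empty. We can use an AGENT to clean this all up!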

# Run a nightly job on GitHub Actions

In [None]:
# Nightly workflow: run the scraper script and publish its CSV as an artifact.
name: Scrape Craigslist Cars
on:
  schedule:
    - cron: "0 6 * * *"   # 06:00 UTC daily
  workflow_dispatch:       # also allow manual runs from the Actions tab

jobs:
  scrape:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: "3.11"
      - run: pip install requests beautifulsoup4 pandas lxml
      # Assumes scrape_craigslist.py sits at the repository root and writes
      # craigslist_used_cars.csv into the working directory.
      - run: python scrape_craigslist.py
      - uses: actions/upload-artifact@v4
        with:
          name: craigslist_used_cars_csv
          # Relative paths resolve against the workspace (the checkout
          # directory), so no runner-specific absolute path or repository-name
          # lookup is needed.
          path: craigslist_used_cars.csv
          # Fail the job instead of silently uploading nothing.
          if-no-files-found: error
