# Cars.com Listing Scraper

Instructions:
* Run the first cell below and select the make of the car.
* Run the second cell, then select the model, min/max year, zip code, and search distance.
* Run the final two cells to display the data frame and the interactive plot.

In [1]:
from bs4 import BeautifulSoup as soup  # ...
from urllib.request import urlopen as uReq  # ...
import ipywidgets as widgets
import pandas as pd
from datetime import datetime
import time
import matplotlib.pyplot as plt
import numpy as np
import math

# Dropdown for the car make. Every option must match a key of car_dict
# (defined in the next cell) exactly, because the selected value is used to
# look up the list of models for that make.
widgetMake = widgets.Dropdown(
    options=[
        'Acura', 'Alfa Romeo', 'Audi', 'BMW', 'Buick', 'Cadillac',
        'Chevrolet', 'Chrysler', 'Dodge', 'Fiat', 'Ford', 'Genesis', 'GMC',
        'Honda', 'Hummer', 'Hyundai', 'Infiniti', 'Jaguar', 'Jeep', 'Kia',
        'Land Rover', 'Lexus', 'Lincoln', 'Maserati', 'Mazda',
        'Mercedes Benz', 'MINI', 'Mitsubishi', 'Nissan', 'Porsche', 'Scion',
        'Subaru', 'Tesla', 'Toyota', 'Volkswagen', 'Volvo',
    ],
    description='Select Make:',
    disabled=False,
)
# Display the widget
display(widgetMake)

Dropdown(description='Select Make:', options=('Acura', 'Alfa Romeo', 'Audi', 'BMW', 'Buick', 'Cadilac', 'Chevr…

In [2]:
# Maps each make (as offered by the make dropdown) to the models that can be
# selected for it. The chosen make and model are lower-cased and have spaces
# replaced by underscores to build the search URL, so spelling matters here.
car_dict = {
    "Acura": ["CL", "ILX", "ILX Hybrid", "Integra", "Legend", "MDX", "MDX Sport Hybrid", "NSX", "RDX", "RL", "RLX", "RLX Sport Hybrid", "RSX", "TL", "TLX", "TSX", "ZDX"],
    "Alfa Romeo": ["Giulia", "Stelvio"],
    "Audi": ["A3", "A4", "A5", "A6", "A7", "A8", "Q3", "Q5", "Q7", "Q8", "R8", "S3", "S4", "S5", "S6", "S7", "S8", "SQ5", "TT", "TTS"],
    "BMW": ["128", "135", "228", "228 Gran Coupe", "230", "320", "325", "328", "328 Gran Turismo", "328d", "330", "330 Gran Turismo", "330e", "335", "335 Gran Turismo", "340", "340 Gran Turismo", "428", "428 Gran Coupe", "430", "430 Gran Coupe", "435", "435 Gran Coupe", "440", "440 Gran Coupe", "525", "528", "530", "530e", "535", "535 Gran Turismo", "535d", "540", "550", "640", "640 Gran Coupe", "640 Gran Turismo", "645", "650", "650 Gran Coupe", "740", "740e", "750", "840", "840 Gran Coupe", "ALPINA B7", "M2", "M235", "M235 Gran Coupe", "M240", "M3", "M340", "M4", "M440", "M5", "M550", "M6", "M6 Gran Coupe", "M760", "M8", "M8 Gran Coupe", "M850", "M850 Gran Coupe", "X1", "X2", "X3", "X3 M", "X3 PHEV", "X4", "X4 M", "X5 M", "X5 PHEV", "X5 eDrive", "X6", "X6 M", "X7", "Z3", "Z4", "Z4 M", "Z8", "i3", "i8"],
    "Buick": ["Enclave", "Encore", "Encore GX", "Envision", "LaCrosse", "Lucerne", "Regal", "Verano"],
    "Cadillac": ["ATS", "CT4", "CT5", "CT6", "CTS", "DTS", "Escalade", "Escalade ESV", "Escalade EXT", "SRX", "XT4", "XT5", "XT6", "XTS"],
    "Chevrolet": ["Avalanche", "Blazer", "Bolt EV", "Camaro", "Colorado", "Corvette", "Corvette Stingray", "Cruze", "Cruze Limited", "Equinox", "Impala", "Impala Limited", "Malibu", "Malibu Limited", "Silverado 1500", "Silverado 1500 LD", "Silverado 2500", "Silverado 3500", "Sonic", "Spark", "Suburban", "Tahoe", "Trailblazer", "Traverse", "Trax", "Volt"],
    "Chrysler": ["200", "300", "300C", "300M", "Aspen", "Crossfire", "PT Cruiser", "Pacifica", "Pacifica Hybrid", "Sebring", "Voyager"],
    "Dodge": ["Avenger", "Caliber", "Caravan", "Challenger", "Charger", "Dakota", "Dart", "Durango", "Grand Caravan", "Journey", "Magnum", "Nitro", "Ram 1500", "Ram 2500", "Ram 3500", "Stratus", "Viper"],
    "Fiat": ["500", "500C", "500L", "500X", "500e"],
    "Ford": ["Bronco", "Bronco Sport", "C-Max Energi", "C-Max Hybrid", "Crown Victoria", "E150", "E250", "E350 Super Duty", "EcoSport", "Edge", "Escape", "Excursion", "Expedition", "Expedition EL", "Expedition Max", "Explorer", "Explorer Sport Trac", "F-150", "F-250", "F-350", "F-450", "Fiesta", "Flex", "Focus", "Focus RS", "Focus ST", "Fusion", "Fusion Energi", "Fusion Hybrid", "Mustang", "Ranger", "Shelby GT350", "Shelby GT500", "Taurus", "Thunderbird", "Transit Connect", "Transit-150", "Transit-250", "Transit-350"],
    "Genesis": ["G70", "G80", "G90", "GV70", "GV80"],
    "GMC": ["Acadia", "Acadia Limited", "Canyon", "Envoy", "Sierra 1500", "Sierra 1500 Limited", "Sierra 2500", "Sierra 3500", "Terrain", "Yukon", "Yukon XL"],
    "Honda": ["Accord", "Accord Hybrid", "CR-V", "CR-V Hybrid", "CR-Z", "Civic", "Civic Hybrid", "Civic Si", "Civic Type R", "Crosstour", "Element", "Fit", "HR-V", "Insight", "Odyssey", "Passport", "Pilot", "Ridgeline"],
    "Hummer": ["H1", "H2", "H3"],
    "Hyundai": ["Accent", "Elantra", "Elantra GT", "Genesis", "Genesis Coupe", "Ioniq EV", "Ioniq Hybrid", "Kona", "Kona EV", "Palisade", "Santa Fe", "Santa Fe Sport", "Sonata", "Sonata Hybrid", "Tucson", "Veloster"],
    "Infiniti": ["EX35", "EX37", "FX35", "FX37", "FX45", "FX50", "G20", "G25", "G25x", "G35", "G35x", "G37", "G37x", "I30", "I35", "JX35", "M30", "M35", "M35x", "M37", "M37x", "M45", "Q40", "Q45", "Q50", "Q50 Hybrid", "Q60", "Q70", "Q70L", "QX30", "QX50", "QX60", "QX60 Hybrid", "QX70", "QX80"],
    "Jaguar": ["E-PACE", "F-PACE", "F-TYPE", "I-PACE", "S-TYPE", "X-TYPE", "XE", "XF", "XJ", "XJ6", "XJ8", "XJR", "XJS", "XK", "XK8", "XKE", "XKR"],
    "Jeep": ["Cherokee", "Commander", "Compass", "Gladiator", "Grand Cherokee", "Liberty", "Patriot", "Renegade", "Wrangler", "Wrangler JK", "Wrangler JK Unlimited", "Wrangler Unlimited"],
    "Kia": ["Amanti", "Borrego", "Cadenza", "Carnival", "Forte", "Forte Koup", "K5", "K900", "Niro", "Niro EV", "Optima", "Optima Hybrid", "Rio", "Rondo", "Sedona", "Seltos", "Sorento", "Soul", "Sportage", "Stinger", "Telluride"],
    "Land Rover": ["Defender", "Discovery", "Discovery Sport", "LR2", "LR3", "LR4", "Range Rover", "Range Rover Evoque", "Range Rover Sport", "Range Rover Velar"],
    "Lexus": ["CT 200h", "ES 250", "ES 300", "ES 300h", "ES 330", "ES 350", "GS 350", "GX 460", "GX 470", "IS 200t", "IS 250", "IS 300", "IS 350", "LC 500", "LS 430", "LS 460", "LS 500", "LX 470", "LX 570", "NX 200t", "NX 300", "NX 300h", "RC 200t", "RC 300", "RC 350", "RC F", "RX 300", "RX 330", "RX 350", "RX 350L", "RX 400h", "RX 450h", "RX 450hL", "SC 430", "UX 200", "UX 250h"],
    "Lincoln": ["Aviator", "Continental", "Corsair", "MKC", "MKS", "MKT", "MKX", "MKZ", "MKZ Hybrid", "Nautilus", "Navigator", "Navigator L", "Town Car"],
    "Maserati": ["Ghibli", "GranTurismo", "Levante", "Quattroporte"],
    "Mazda": ["CX-3", "CX-30", "CX-5", "CX-7", "CX-9", "MX-5 Miata", "MX-5 Miata RF", "Mazda2", "Mazda3", "Mazda5", "Mazda6", "RX-7", "RX-8", "Tribute"],
    "Mercedes Benz": ["A-Class", "AMG C 43", "AMG C 63", "AMG CLA 35", "AMG CLA 45", "AMG CLS 53", "AMG CLS 63", "AMG E 43", "AMG E 53", "AMG E 63", "AMG G 63", "AMG G 65", "AMG GLA 35", "AMG GLA 45", "AMG GLC 43", "AMG GLC 63", "AMG GLE 43", "AMG GLE 53", "AMG GLE 63", "AMG GLS 63", "AMG GT", "AMG S", "C-Class", "CL-Class", "CLA 250", "CLA-Class", "CLK-Class", "CLS-Class", "E-Class", "G-Class", "GL-Class", "GLA 250", "GLA-Class", "GLB 250", "GLB 300", "GLC-Class", "GLE-Class", "GLK-Class", "GLS 450", "GLS 550", "M-Class", "S-Class", "SL-Class", "SLK-Class", "Sprinter"],
    "MINI": ["Clubman", "Convertible", "Cooper", "Cooper S", "Countryman", "Hardtop", "Paceman", "Roadster"],
    "Mitsubishi": ["Eclipse", "Eclipse Cross", "Galant", "Lancer", "Lancer Evolution", "Mirage", "Mirage G4", "Outlander", "Outlander PHEV", "Outlander Sport"],
    "Nissan": ["350Z", "370Z", "Altima", "Altima Hybrid", "Armada", "Cube", "Frontier", "GT-R", "Juke", "Kicks", "Leaf", "Maxima", "Murano", "NV200", "Pathfinder", "Quest", "Rogue", "Rogue Select", "Rogue Sport", "Sentra", "Titan", "Titan XD", "Versa", "Versa Note", "Xterra"],
    "Porsche": ["718 Boxster", "718 Cayman", "911", "Boxster", "Cayenne", "Cayman", "Macan", "Panamera", "Taycan"],
    "Scion": ["FR-S", "iA", "iM", "iQ", "tC", "xA", "xB", "xD"],
    "Subaru": ["Ascent", "BRZ", "Crosstrek", "Forester", "Impreza", "Impreza WRX", "Impreza WRX STI", "Legacy", "Outback", "Tribeca", "WRX", "WRX STI", "XV Crosstrek Hybrid"],
    "Tesla": ["Model 3", "Model S", "Model X", "Model Y", "Roadster"],
    "Toyota": ["4Runner", "Avalon", "Avalon Hybrid", "C-HR", "Camry", "Camry Hybrid", "Corolla", "Corolla Hatchback", "Corolla Hybrid", "Corolla iM", "FJ Cruiser", "Highlander", "Highlander Hybrid", "Land Cruiser", "Matrix", "Prius", "Prius Prime", "Prius c", "Prius v", "RAV4", "RAV4 Hybrid", "Sequoia", "Sienna", "Supra", "Tacoma", "Tundra", "Venza", "Yaris", "Yaris Sedan", "Yaris iA"],
    "Volkswagen": ["Arteon", "Atlas", "Atlas Cross Sport", "Beetle", "CC", "Cabrio", "Eos", "GTI", "Golf", "Golf Alltrack", "Golf GTI", "Golf R", "Golf SportWagen", "Jetta", "Jetta GLI", "Jetta SportWagen", "Passat", "Tiguan", "Tiguan Limited", "Touareg", "e-Golf"],
    "Volvo": ["C30", "C70", "S40", "S60", "S60 Inscription", "S70", "S80", "S90", "V40", "V50", "V60", "V60 Cross Country", "V70", "V90", "V90 Cross Country", "XC40", "XC60", "XC70", "XC90"]
}

# Dropdown for the model, populated with the models of the make that is
# currently selected in widgetMake. Re-run this cell after changing the make.
selected_make = widgetMake.value
if selected_make not in car_dict:
    # Fail with a readable message instead of handing options=None to the
    # widget when the make dropdown and car_dict disagree on a name.
    raise KeyError("No models are listed in car_dict for make %r" % selected_make)
widgetModel = widgets.Dropdown(
    options=car_dict[selected_make],
    description='Select Model:',
    disabled=False,
)
# Display the widget
display(widgetModel)

# Year choices shared by both year dropdowns: "None" (no limit) followed by
# every model year from 2021 down to 1980. Generating the list avoids gaps
# such as a missing year in a hand-typed list.
year_options = ["None"] + [str(year) for year in range(2021, 1979, -1)]

# Create dropdown menu widget for user to select min. year
widgetMinYear = widgets.Dropdown(
    options=year_options,
    description='Min. Year:',
    disabled=False,
)
# Display the widget
display(widgetMinYear)

# Create dropdown menu widget for user to select max. year
widgetMaxYear = widgets.Dropdown(
    options=year_options,
    description='Max. Year:',
    disabled=False,
)
# Display the widget
display(widgetMaxYear)

# Free-text box in which the user types the zip code for the search
widgetZipCode = widgets.Text(disabled=False, description='Zip Code:')
# Show the text box below the cell
display(widgetZipCode)

# Dropdown for the search radius: a nationwide choice followed by fixed radii
# in miles. The first word of the selected label is later used in the URL.
radius_labels = ["All (nationwide)"]
for radius in (10, 20, 30, 40, 50, 75, 100, 150, 200, 250, 500):
    radius_labels.append(str(radius) + " miles")

widgetDistance = widgets.Dropdown(
    disabled=False,
    description='Distance:',
    options=radius_labels,
)
# Show the dropdown below the cell
display(widgetDistance)

Dropdown(description='Select Model:', options=('Avenger', 'Caliber', 'Caravan', 'Challenger', 'Charger', 'Dako…

Dropdown(description='Min. Year:', options=('None', '2021', '2020', '2019', '2018', '2017', '2016', '2015', '2…

Dropdown(description='Max. Year:', options=('None', '2021', '2020', '2019', '2018', '2017', '2016', '2015', '2…

Text(value='', description='Zip Code:')

Dropdown(description='Distance:', options=('All (nationwide)', '10 miles', '20 miles', '30 miles', '40 miles',…

In [5]:
# Build the search URL from the form input.
# Example of a finished URL:
# https://www.cars.com/shopping/results/?list_price_max=&makes[]=audi&maximum_distance=500&models[]=audi-a4&page=1&page_size=250&stock_type=used&year_max=&year_min=2010&zip=98004

# Make and model are lower-cased and spaces become underscores
make_slug = widgetMake.value.lower().replace(" ", "_")
model_slug = widgetModel.value.lower().replace(" ", "_")

# Only the first word of the distance label is sent:
# "All (nationwide)" -> "all", "50 miles" -> "50"
distance_param = widgetDistance.value.split(" ")[0].lower()

# A year selection of "None" is sent as an empty parameter
year_max = '' if widgetMaxYear.value == 'None' else widgetMaxYear.value
year_min = '' if widgetMinYear.value == 'None' else widgetMinYear.value

# Strip stray whitespace from the free-text zip field so it cannot end up
# inside the URL
zip_code = widgetZipCode.value.strip()

page_url = (
    'https://www.cars.com/shopping/results/?list_price_max='
    + '&makes[]=' + make_slug
    + '&maximum_distance=' + distance_param
    + '&models[]=' + make_slug + '-' + model_slug
    + '&page=1&page_size=250&stock_type=used'
    + '&year_max=' + year_max
    + '&year_min=' + year_min
    + '&zip=' + zip_code
)

# Download and parse the first page of results. The context manager closes
# the connection even if reading the response fails.
with uReq(page_url) as uClient:
    page_soup = soup(uClient.read(), "html.parser")

# Determine the total number of listings from the "total-entries" span
counter_span = page_soup.find('span', {'class': "total-entries"})
if counter_span is None:
    # Without the counter the result table cannot be sized; report which URL
    # was requested instead of failing on a None attribute access.
    raise ValueError("No 'total-entries' element found on results page: " + page_url)
# Drop the first character of the span text, then take the first
# space-separated token and remove thousands separators.
counter_span_str = counter_span.get_text()[1:]
counter_span_num = counter_span_str.split(" ")[0]
num_listings = int(counter_span_num.replace(",", ""))

# Pre-allocate one (empty) row per listing; rows are filled in by position
# while the result pages are scraped.
df = pd.DataFrame(columns = ['Year', 'Name', 'Mileage', 'List Price', 'Distance', 'Dealer', 'Link'],  index = range(0, num_listings))



from urllib.parse import urljoin

# Base used to turn the scraped listing hrefs into full links
BASE_URL = 'https://www.cars.com/'
# Number of listings requested per page; matches page_size in page_url
PAGE_SIZE = 250


def _parse_listing(card):
    """Extract one listing from a "vehicle-details" element.

    Returns a dict keyed by the result-table column names, or None when any
    of the elements the fields are read from is missing from the card.
    """
    link_a = card.find('a', {'class': "vehicle-card-link"})
    dealer_div = card.find('div', {'class': "dealer-name"})
    title_h2 = card.find('h2', {'class': "title"})
    miles_div = card.find('div', {'class': "mileage"})
    dist_div = card.find('div', {'class': "miles-from"})
    price_span = card.find('span', {'class': "primary-price"})
    required = (link_a, dealer_div, title_h2, miles_div, dist_div, price_span)
    if any(element is None for element in required):
        return None

    title = title_h2.get_text()

    # Mileage: first space-separated token with thousands separators removed
    mileage = int(miles_div.get_text().split(" ")[0].replace(",", ""))

    # Distance from the zip code; an empty value is stored as the string
    # sentinel '9999', which the clean-up step filters out afterwards.
    dist = dist_div.get_text().split(" ")[0].replace(",", "")
    distance = '9999' if dist == '' else int(dist)

    # Price: kept as the text 'Not Priced' when the listing has no price
    price_str = price_span.get_text()
    if price_str == 'Not Priced':
        price = 'Not Priced'
    else:
        price = int(price_str.replace(",", "").replace("$", ""))

    return {
        # urljoin avoids a doubled slash when the href starts with "/"
        'Link': urljoin(BASE_URL, link_a['href']),
        # the first and last character of the dealer text are dropped
        'Dealer': dealer_div.get_text()[1:-1],
        # the title starts with the 4-character year, then a separator
        'Name': title[5:],
        'Year': title[:4],
        'Mileage': mileage,
        'Distance': distance,
        'List Price': price,
    }


def _store_page(frame, cards, first_row):
    """Write the listings of one results page into frame.

    Each card occupies the row first_row + its position on the page; cards
    with missing elements leave their row empty. Returns the row index at
    which the next page starts.
    """
    for offset, card in enumerate(cards):
        listing = _parse_listing(card)
        if listing is None:
            continue
        for column, value in listing.items():
            frame.at[first_row + offset, column] = value
    return first_row + len(cards)


# retrieve and store all listings from the first page
i_pageResults = page_soup.findAll("div", {"class": "vehicle-details"})
k = _store_page(df, i_pageResults, 0)

# determine how many pages remain to scrape
pages_remaining = math.ceil((num_listings - PAGE_SIZE) / PAGE_SIZE)

# page_url requests page 1; split it around the page number so the number
# can be swapped for each following page
url_head, _, url_tail = page_url.partition("page=")

# if there is more than 1 page of results, continue scraping
for page_num in range(2, 2 + max(pages_remaining, 0)):
    p_url = url_head + 'page=' + str(page_num) + url_tail[1:]
    # the context manager closes the connection even if the read fails
    with uReq(p_url) as pClient:
        p_page_soup = soup(pClient.read(), "html.parser")

    # retrieve and store all listings from this page
    p_pageResults = p_page_soup.findAll("div", {"class": "vehicle-details"})
    k = _store_page(df, p_pageResults, k)


  


# Clean the scraped table:
#   1. keep only rows with at least two non-missing values,
#   2. drop listings without a price,
#   3. drop listings whose distance is the '9999' placeholder,
#   4. renumber the remaining rows from 0.
df_final = (
    df.dropna(thresh=2)
    .loc[lambda frame: frame['List Price'] != 'Not Priced']
    .loc[lambda frame: frame['Distance'] != '9999']
    .reset_index(drop=True)
)

# Show every row and column when printing
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
print(df_final)


     Year                                               Name Mileage  \
0    2011                                  Dodge Ram 1500 ST  101546   
1    2011                                 Dodge Ram 1500 SLT  154514   
2    2010                               Dodge Ram 1500 Sport   75374   
3    2010                                 Dodge Ram 1500 SLT  176386   
4    2006                                 Dodge Ram 1500 SLT  105140   
5    2008                             Dodge Ram 1500 Laramie  120181   
6    2010                             Dodge Ram 1500 Laramie   95702   
7    2008                        Dodge Ram 1500 SLT Quad Cab  132478   
8    2011                                    Dodge Ram 1500   101067   
9    2011                                 Dodge Ram 1500 SLT  157353   
10   2008                                 Dodge Ram 1500 SLT   92909   
11   1999                            Dodge Ram 1500 Club Cab  186016   
12   2008                        Dodge Ram 1500 SLT Quad Cab  15

In [6]:
import plotly.graph_objects as go

# Scatter plot of list price against mileage, coloured by distance.
# `data` is the same object as df_final, so the caption column is added to it.
data = df_final

# Hover caption per listing: "<year> <name> (<dealer> - <distance> mi.)".
# Built in one pass over the columns, so it does not depend on the row
# labels being 0..n-1.
data['Plot Caption'] = [
    year + ' ' + name + ' (' + dealer + ' - ' + str(distance) + ' mi.)'
    for year, name, dealer, distance in zip(
        data['Year'], data['Name'], data['Dealer'], data['Distance']
    )
]

fig = go.Figure(data=go.Scatter(
    x = data['Mileage'],
    y = data['List Price'],
    mode='markers',
    marker=dict(
        size=16,
        color=data['Distance'], #set color equal to a variable
        colorscale='Tealgrn', # one of plotly colorscales
        showscale=True
    ),
    text=data['Plot Caption']
))

fig.show()