In [155]:
import selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import JavascriptException
import pandas as pd
import numpy as np
import csv
import re

In [90]:
# Initialize blank.csv with header. 
# Only needed when cars.csv doesn't exist. If so, rename to cars.csv

def create_blank():
    """Create blank.csv holding nothing but the header row of the listings table.

    The file is written to the current working directory and overwrites any
    existing blank.csv.
    """
    header = (
        "link", "listing_title", "listing_mileage", "primary_price",
        "deal_gauge", "exterior_color", "interior_color", "drivetrain",
        "mpg", "fuel_type", "transmission", "engine",
        "vin", "stock_number", "vehicle_history", "seller_name",
    )

    # newline="" hands control of line endings to the csv module.
    with open("blank.csv", "w", newline="") as blank_file:
        csv.writer(blank_file).writerow(header)

# create_blank()

In [156]:
# Load the listings scraped so far. The scraping cell uses len(df) to decide
# which search-results page to resume from.
df = pd.read_csv("cars.csv")

In [162]:
# Create a Selenium WebDriver
# Use the chromedriver binary that lives inside the project's environment.
chrome_service = Service(executable_path='tools1_env/bin/chromedriver')
chrome_options = Options()
chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36")
chrome_options.add_argument("--headless") # Run without GUI
# The service must be passed explicitly; otherwise the chromedriver path
# configured above is silently ignored.
driver = webdriver.Chrome(service=chrome_service, options=chrome_options)

In [163]:
%%time

# Edit this value:
PAGES_TO_SCRAPE = 1



page_start = len(df)//20 + 1
# If df has 100 rows, we want to start on page 6 since that would be the 101-120 listings.

# Loop through search result pages

# Print total listings to scrape
print(f"~ {(page_start-1)*20 + 1} - {(page-1+PAGES_TO_SCRAPE-1)*20 + 20} ~")
# 1&1 = 1-20  1&2 = 1-40  2&1 = 21-40  ...

f = open('cars.csv', 'a') # append mode
writer = csv.writer(f)

for page in range(page_start, PAGES_TO_SCRAPE+1):
    base_url = f"https://www.cars.com/shopping/results/?page_size=20&page={page}&maximum_distance=10&sort=listed_at&zip=80210"
    # Need sort=listed_at ("Oldest listed") and page_size=20.
    # maximum_distance and zip should be consistent for data integrity.
    driver.get(base_url)
    driver.implicitly_wait(10)
    
    print(f"{(page-1)*20 + 1} - {(page-1)*20 + 20}") # current block of listings
    # 1=1-20 2=21-40 ...
    
    # Find the vehicle card elements, then iterate through them
    vehicle_cards = driver.find_elements(By.CLASS_NAME, "vehicle-card-link")
    for vehicle_card in vehicle_cards:
        # Follow link to individual listing page
        link = vehicle_card.get_attribute("href")
        vehicle_card.send_keys(Keys.CONTROL + Keys.RETURN) # in new tab
        # Switch to the new tab
        driver.switch_to.window(driver.window_handles[1])
        driver.implicitly_wait(5)    
        
        # deal_guage doesn't exist for all listings
        try:
            deal_guage = driver.execute_script('return document.querySelector(".deal-gauge-list-price-description").textContent'),
        except JavascriptException:
            deal_gauge = np.nan
        
        # vehicle_history isn't always populated
        vehicle_history_section = driver.find_element(By.CLASS_NAME, 'sds-page-section.vehicle-history-section')
        if 'fancy-description-list' in vehicle_history_section.get_attribute('innerHTML'):
            vehicle_history = vehicle_history_section.find_element(By.CLASS_NAME, 'fancy-description-list').get_attribute('innerHTML')
            vehicle_history = vehicle_history.replace("\n", " ")
            # Use regex to clean html
            vehicle_history = ' '.join(vehicle_history.split()) # remove whitespace
            vehicle_history = re.sub(r'<\w+[^>]*>', '', vehicle_history) # remove opening HTML tags
            history_items = vehicle_history.split('</dd> ')
            # Create a dictionary to store the key-value pairs
            history_dict = {}
            for item in history_items:
                key, value = item.split('</dt> ')
                key = key.replace('<dt>', '')
                history_dict[key] = value
            # Remove the last </dd> tag from the last value in the dictionary
            last_key = list(history_dict.keys())[-1]
            history_dict[last_key] = history_dict[last_key].replace('</dd>', '')
            vehicle_history = history_dict
            
        else:
            vehicle_history = np.nan
        
        # mpg is listed differently for EVs, and also not always listed
        try:
            mpg = driver.find_element(By.XPATH, '//dt[text()="MPG"]/following-sibling::dd').text,
        except NoSuchElementException:
            key_specs_container = driver.find_element(By.ID, 'key-specs-container')
            mpg_section = key_specs_container.find_element(By.XPATH, './/div[@data-qa="mpge"]')
            city_mpg_element = mpg_section.find_element(By.XPATH, './/strong[contains(@class, "key-spec-value")][1]').text
            hwy_mpg_element = mpg_section.find_element(By.XPATH, './/strong[contains(@class, "key-spec-value")][2]').text
            mpg = f"MPGe {city_mpg_element} city ; {hwy_mpg_element} hwy."
        except Exception:
            mpg = np.nan
        
        # fuel_type not always listed
        try:
            fuel_type = driver.find_element(By.XPATH, '//dt[text()="Fuel type"]/following-sibling::dd').text
        except NoSuchElementException:
            fuel_type = np.nan
            # will want to verify that fuel type is electric from mpg html dump
    
        # Create a dictionary with the extracted data, then append it.
        data_to_append = {
            "link": link,
            "listing_title": driver.find_element(By.CLASS_NAME, "listing-title").text,
            "listing_mileage": driver.find_element(By.CLASS_NAME, "listing-mileage").text,
            "primary_price": driver.execute_script('return document.querySelector(".primary-price").textContent'),
            "deal_gauge": deal_gauge,
            "exterior_color": driver.find_element(By.XPATH, '//dt[text()="Exterior color"]/following-sibling::dd').text,
            "interior_color": driver.find_element(By.XPATH, '//dt[text()="Interior color"]/following-sibling::dd').text,
            "drivetrain": driver.find_element(By.XPATH, '//dt[text()="Drivetrain"]/following-sibling::dd').text,
            "mpg": mpg,
            "fuel_type": fuel_type,
            "transmission": driver.find_element(By.XPATH, '//dt[text()="Transmission"]/following-sibling::dd').text,
            "engine": driver.find_element(By.XPATH, '//dt[text()="Engine"]/following-sibling::dd').text,
            "vin": driver.find_element(By.XPATH, '//dt[text()="VIN"]/following-sibling::dd').text,
            "stock_number": driver.find_element(By.XPATH, '//dt[text()="Stock #"]/following-sibling::dd').text,
            "vehicle_history": vehicle_history,
            "seller_name": driver.find_element(By.CLASS_NAME, 'seller-name').text.strip(),
        }
        
        writer.writerow(data_to_append)
        #print(f"{data_to_append} \n") # DEBUG PRINT 
        
        # Close new tab
        driver.close()
        driver.switch_to.window(driver.window_handles[0])

f.close()
print("Done.")

~ 1 - 20 ~
1 - 20
{'link': 'https://www.cars.com/vehicledetail/5ee3ae04-693c-4b24-87be-fca023ba3120/', 'listing_title': '2018 Hyundai Santa Fe Sport 2.4L', 'listing_mileage': '79,497 mi.', 'primary_price': '$17,899', 'deal_gauge': nan, 'exterior_color': 'Nightfall Blue', 'interior_color': 'Beige', 'drivetrain': 'Front-wheel Drive', 'mpg': ('21–27',), 'fuel_type': 'Gasoline', 'transmission': '6-Speed Automatic', 'engine': '2.4L I4 16V GDI DOHC', 'vin': '5XYZT3LB3JG542718', 'stock_number': '13669', 'vehicle_history': {'Accidents or damage': 'None reported', '1-owner vehicle': 'Yes', 'Personal use only': 'No', 'Open recall': 'At least 1 open recall reported'}, 'seller_name': 'ETHIO Motors'} 

{'link': 'https://www.cars.com/vehicledetail/deafee71-3d69-4870-ad31-a908b509ba44/', 'listing_title': '1949 Dodge Coronet', 'listing_mileage': '73,213 mi.', 'primary_price': '$9,998', 'deal_gauge': nan, 'exterior_color': 'Blue', 'interior_color': 'Gray', 'drivetrain': 'Rear-wheel Drive', 'mpg': ('–',

{'link': 'https://www.cars.com/vehicledetail/3a09e54c-1b82-43a9-bc18-19e46cc5d349/', 'listing_title': '2018 Audi Q5 2.0T Premium Plus', 'listing_mileage': '63,768 mi.', 'primary_price': '$38,950', 'deal_gauge': nan, 'exterior_color': 'Ibis White', 'interior_color': 'Black', 'drivetrain': 'All-wheel Drive', 'mpg': ('23–27',), 'fuel_type': 'Gasoline', 'transmission': '7-Speed Automatic with Auto-Shift', 'engine': '2.0L I4 16V GDI DOHC Turbo', 'vin': 'WA1BNAFY3J2033197', 'stock_number': 'KBB3433', 'vehicle_history': {'Accidents or damage': 'None reported', '1-owner vehicle': 'No', 'Personal use only': 'Yes', 'Open recall': 'At least 1 open recall reported'}, 'seller_name': 'Custom Cars West'} 

{'link': 'https://www.cars.com/vehicledetail/a76ecc5e-1254-47d9-90de-f0bfe1adfb88/', 'listing_title': '2019 Audi Q5 2.0T Premium', 'listing_mileage': '28,520 mi.', 'primary_price': '$27,998', 'deal_gauge': nan, 'exterior_color': 'Matador Red Metallic', 'interior_color': 'Black', 'drivetrain': 'All-

In [None]:
# De-duplicate cars.csv in place: reload it, drop repeated rows, and write
# the result back without the index column.
df = pd.read_csv('cars.csv').drop_duplicates()
df.to_csv('cars.csv', index=False)

TODO:
- Split listing_title into separate make, model, year, etc. columns
- Web scrape listing price history (and list date)
- Focus on used non-EV cars
- If decide to do a secondary dataset with only EVs, can also use new EVs
- Add try/except handling for "Not Priced" listings
