In [1]:
import selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import JavascriptException
import pandas as pd
import numpy as np
import csv
import re

In [2]:
# Column order for csv.DictWriter; this is also the header row of cars.csv.
# The order here fixes the column order of every row that gets written.
field_names = [
    # listing identity
    "link", "listing_title", "listing_mileage",
    # pricing
    "primary_price", "deal_gauge",
    # basic specs
    "exterior_color", "interior_color", "drivetrain", "mpg", "fuel_type",
    "transmission", "engine", "vin", "stock_number",
    # history and seller
    "vehicle_history", "seller_name", "price_history",
]

# Helper function for extracting elements
def extract_element(driver, by, value):
    """Return the text of the first element located by (by, value).

    Args:
        driver: object exposing ``find_element(by, value)`` (a WebDriver or
            a WebElement).
        by: locator strategy, e.g. ``By.CLASS_NAME`` or ``By.XPATH``.
        value: locator expression for that strategy.

    Returns:
        The element's ``.text``, or ``np.nan`` when the lookup raises
        NoSuchElementException.
    """
    try:
        matched = driver.find_element(by, value)
        text = matched.text
    except NoSuchElementException:
        text = np.nan
    return text

# Initialize blank.csv with header.
# Only needed when cars.csv doesn't exist. If so, rename to cars.csv
def create_blank():
    """Create (or overwrite) blank.csv containing only the header row.

    The header is the module-level ``field_names`` list. The file is opened
    with ``newline=""`` so the csv module controls line endings itself.
    """
    with open("blank.csv", "w", newline="") as header_file:
        csv.writer(header_file).writerow(field_names)

# create_blank()

In [9]:
# Create a Selenium WebDriver

# User agent string sent by the browser (a desktop Chrome-on-Windows string).
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"

chrome_options = Options()
chrome_options.add_argument(f"user-agent={USER_AGENT}")
# chrome_options.add_argument("--headless") # Run without GUI
# Running in headless seems to break javascript.

# NOTE(review): chrome_service is constructed but never passed to
# webdriver.Chrome below, so this executable_path is not what launches the
# driver. Confirm whether `service=chrome_service` was intended before wiring
# it in; the chromedriver at this path must match the installed Chrome version.
chrome_service = Service(executable_path='tools1_env/bin/chromedriver')

driver = webdriver.Chrome(options=chrome_options)

In [10]:
%%time

# Edit these values:
PAGES_TO_SCRAPE = 50
page_start = 161

# Results per search page. Used for the page_size URL parameter and for the
# progress arithmetic, so the two cannot drift apart.
LISTINGS_PER_PAGE = 20
# Seconds to wait for Ctrl+Enter to actually produce a new tab.
NEW_TAB_TIMEOUT = 10

# If df has 100 rows, we want to start on page 6 since that would be the 101-120 listings.
#df = pd.read_csv('cars.csv')
#page_start = len(df)//20 + 1
# After running the scraper for a while, the numbers got desynced. Will set page_start manually for the rest.


def _js_text_content(driver, css_selector):
    """Return textContent of the first node matching css_selector, or NaN.

    The lookup runs inside the page via JavaScript. When nothing matches,
    querySelector yields null and the property access throws, which reaches
    Python as JavascriptException and is mapped to NaN here.
    """
    try:
        return driver.execute_script(
            'return document.querySelector(arguments[0]).textContent', css_selector
        )
    except JavascriptException:
        return np.nan


def _parse_vehicle_history(inner_html):
    """Convert the innerHTML of the vehicle-history description list to a dict.

    Expects ``<dt>label</dt> <dd>value</dd>`` pairs separated by whitespace.
    Returns ``{label: value}``, or NaN when no pair could be parsed. Fragments
    without a ``</dt> `` separator are skipped rather than raising, so one
    oddly formatted listing cannot abort the whole run.
    """
    flattened = ' '.join(inner_html.split())  # collapse newlines / repeated whitespace
    flattened = re.sub(r'<\w+[^>]*>', '', flattened)  # remove opening HTML tags
    history = {}
    for fragment in flattened.split('</dd> '):
        label, separator, detail = fragment.partition('</dt> ')
        if not separator:
            continue
        history[label.replace('<dt>', '')] = detail
    if not history:
        return np.nan
    # The split only consumes "</dd> " (with a trailing space), so the last
    # value still ends with its closing tag.
    last_label = list(history)[-1]
    history[last_label] = history[last_label].replace('</dd>', '')
    return history


def _scrape_vehicle_history(driver):
    """Return the vehicle-history dict for the current listing page, or NaN.

    NaN is returned when the history section is absent from the page or when
    it does not contain a 'fancy-description-list'.
    """
    try:
        section = driver.find_element(By.CLASS_NAME, 'sds-page-section.vehicle-history-section')
    except NoSuchElementException:
        return np.nan
    if 'fancy-description-list' not in section.get_attribute('innerHTML'):
        return np.nan
    description_list = section.find_element(By.CLASS_NAME, 'fancy-description-list')
    return _parse_vehicle_history(description_list.get_attribute('innerHTML'))


def _scrape_price_history(driver):
    """Return the price-history table as a list of 3-tuples of cell text.

    Only rows with exactly three <td> cells are kept (assumed to be date,
    price change, list price). An empty list is returned when the table is
    not on the page.
    """
    try:
        table = driver.find_element(By.CSS_SELECTOR, 'div.price-history table')
    except NoSuchElementException:
        return []
    history_rows = []
    for row in table.find_elements(By.TAG_NAME, 'tr'):
        cells = row.find_elements(By.TAG_NAME, 'td')
        if len(cells) == 3:
            history_rows.append(
                tuple(cell.get_attribute('textContent').strip() for cell in cells)
            )
    return history_rows


def _scrape_listing(driver, link):
    """Collect one CSV row (dict keyed by field_names) from the open listing page."""

    def spec(label):
        # Value (<dd>) that follows the <dt> whose text is exactly `label`.
        return extract_element(driver, By.XPATH, f'//dt[text()="{label}"]/following-sibling::dd')

    # These two are read through JavaScript (textContent) rather than .text.
    deal_gauge = _js_text_content(driver, ".deal-gauge-list-price-description")
    primary_price = _js_text_content(driver, ".primary-price")
    vehicle_history = _scrape_vehicle_history(driver)
    price_history = _scrape_price_history(driver)

    return {
        "link": link,
        "listing_title": extract_element(driver, By.CLASS_NAME, "listing-title"),
        "listing_mileage": extract_element(driver, By.CLASS_NAME, "listing-mileage"),
        "primary_price": primary_price,
        "deal_gauge": deal_gauge,
        "exterior_color": spec("Exterior color"),
        "interior_color": spec("Interior color"),
        "drivetrain": spec("Drivetrain"),
        "mpg": spec("MPG"),
        "fuel_type": spec("Fuel type"),
        "transmission": spec("Transmission"),
        "engine": spec("Engine"),
        "vin": spec("VIN"),
        "stock_number": spec("Stock #"),
        "vehicle_history": vehicle_history,
        "seller_name": extract_element(driver, By.CLASS_NAME, 'seller-name'),
        "price_history": price_history,
    }


# Print total listings to scrape
first_listing = (page_start - 1) * LISTINGS_PER_PAGE + 1
last_listing = (page_start - 1 + PAGES_TO_SCRAPE) * LISTINGS_PER_PAGE
print(f"Listings: {first_listing} - {last_listing}")
# 1&1 = 1-20  1&2 = 1-40  2&1 = 21-40  ...

# Keep the path and the open handle under different names so `csvfile` still
# holds the path after the cell finishes.
csvfile = 'cars.csv'
with open(csvfile, 'a', newline='') as out_file: # append mode
    writer = csv.DictWriter(out_file, fieldnames=field_names)

    # Loop through search result pages
    print(f"Pages: {page_start} - {page_start+PAGES_TO_SCRAPE-1}")
    for page in range(page_start, page_start + PAGES_TO_SCRAPE):
        base_url = f"https://www.cars.com/shopping/results/?fuel_slugs[]=gasoline&page_size={LISTINGS_PER_PAGE}&page={page}&maximum_distance=10&sort=listed_at&stock_type=used&zip=80210"
        # Need sort=listed_at ("Oldest listed") and page_size=20.
        # maximum_distance and zip should be consistent for data integrity.
        # fuel_slugs[]=gasoline focus on gasoline vehicles.
        # stock_type=used focus on used vehicles.
        driver.get(base_url)
        driver.implicitly_wait(10)

        # current block of listings: 1=1-20 2=21-40 ...
        print(f"{page} : {(page-1)*LISTINGS_PER_PAGE + 1} - {page*LISTINGS_PER_PAGE}")

        # Remember the results tab so we can always come back to it.
        results_tab = driver.current_window_handle

        # Find the vehicle card elements, then iterate through them
        vehicle_cards = driver.find_elements(By.CLASS_NAME, "vehicle-card-link")
        for vehicle_card in vehicle_cards:
            link = vehicle_card.get_attribute("href")
            if "?attribution_type=premier" in link:
                continue  # Skip sponsored listing

            # Open the listing in a new tab and wait until that tab exists.
            # Comparing against the handles seen beforehand (instead of
            # assuming index 1) also works if a stray tab is already open.
            handles_before = set(driver.window_handles)
            vehicle_card.send_keys(Keys.CONTROL + Keys.RETURN)
            WebDriverWait(driver, NEW_TAB_TIMEOUT).until(
                lambda d: len(d.window_handles) > len(handles_before)
            )
            listing_tab = next(
                handle for handle in driver.window_handles if handle not in handles_before
            )
            driver.switch_to.window(listing_tab)

            try:
                driver.implicitly_wait(2)
                # Build the row from the listing page, then append it to the csv.
                writer.writerow(_scrape_listing(driver, link))
            finally:
                # Always close the listing tab and return to the results tab,
                # even when scraping this listing raised.
                driver.close()
                driver.switch_to.window(results_tab)

print("Done.")

Listings: 3081 - 4080
Pages: 155 - 204
155 : 3081 - 3100
156 : 3101 - 3120
157 : 3121 - 3140
158 : 3141 - 3160
159 : 3161 - 3180
160 : 3181 - 3200
161 : 3201 - 3220


NoSuchWindowException: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=117.0.5938.149)
Stacktrace:
#0 0x55e0cd027933 <unknown>
#1 0x55e0ccd016f7 <unknown>
#2 0x55e0cccda558 <unknown>
#3 0x55e0ccd6f96f <unknown>
#4 0x55e0ccd833ab <unknown>
#5 0x55e0ccd6a3d3 <unknown>
#6 0x55e0ccd3ce64 <unknown>
#7 0x55e0ccd3dc4e <unknown>
#8 0x55e0ccfed558 <unknown>
#9 0x55e0ccff14a0 <unknown>
#10 0x55e0ccffb97c <unknown>
#11 0x55e0ccff20b8 <unknown>
#12 0x55e0ccfbdcdf <unknown>
#13 0x55e0cd016048 <unknown>
#14 0x55e0cd016219 <unknown>
#15 0x55e0cd026ac3 <unknown>
#16 0x7f5278121ac3 <unknown>


In [8]:
# Remove duplicates in cars.csv
# Rows count as duplicates only when every column matches an earlier row;
# the first occurrence is kept and the file is rewritten in place.
df = pd.read_csv('cars.csv')
duplicate_mask = df.duplicated()
print(f"{duplicate_mask.sum()} duplicates removed.")
df = df[~duplicate_mask]
df.to_csv('cars.csv', index=False)

0 duplicates removed.


TODO:
- Separate listing_title out into make model year etc
- For listings that don't have a price history, estimate the range of possible "Listed" dates from the dates of the listings above and below them?

Deal Gauge value based on Avg. market price range:
- Over = "This is a fair deal. Why?"
- Within = "This is a good deal."
- Under = "Great Deal $[...] under"

In [None]:
# Extra code for handling non-gasoline vehicles

# # mpg is listed differently for EVs, and also not always listed
#         try:
#             mpg = driver.find_element(By.XPATH, '//dt[text()="MPG"]/following-sibling::dd').text,
#         except NoSuchElementException:
#             key_specs_container = driver.find_element(By.ID, 'key-specs-container')
#             mpg_section = key_specs_container.find_element(By.XPATH, './/div[@data-qa="mpge"]')
#             city_mpg_element = mpg_section.find_element(By.XPATH, './/strong[contains(@class, "key-spec-value")][1]').text
#             hwy_mpg_element = mpg_section.find_element(By.XPATH, './/strong[contains(@class, "key-spec-value")][2]').text
#             mpg = f"MPGe {city_mpg_element} city ; {hwy_mpg_element} hwy."
#         except Exception:
#             mpg = np.nan
        
#         # fuel_type not always listed
#         try:
#             fuel_type = driver.find_element(By.XPATH, '//dt[text()="Fuel type"]/following-sibling::dd').text
#         except NoSuchElementException:
#             fuel_type = np.nan
#             # will want to verify that fuel type is electric from mpg html dump