In [36]:
import csv
import os
import re

import numpy as np
import pandas as pd
import selenium
from selenium import webdriver
from selenium.common.exceptions import JavascriptException
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [37]:
# columns needed for csv.DictWriter
field_names = [
    "link",
    "listing_title",
    "listing_mileage",
    "primary_price",
    "deal_gauge",
    "exterior_color",
    "interior_color",
    "drivetrain",
    "mpg",
    "fuel_type",
    "transmission",
    "engine",
    "vin",
    "stock_number",
    "vehicle_history",
    "seller_name",
    "price_history_data",
]

# Initialize blank.csv with header. 
# Only needed when cars.csv doesn't exist. If so, rename to cars.csv
def create_blank():
    file_path = "blank.csv"
    with open(file_path, "w", newline="") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(field_names)

# create_blank()

In [38]:
# Create a Selenium WebDriver
chrome_service = Service(executable_path='tools1_env/bin/chromedriver')
chrome_options = Options()
chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36")
# chrome_options.add_argument("--headless") # Run without GUI
# ? Running in headless seems to break javascript needed for deal_gauge.
driver = webdriver.Chrome(options=chrome_options)

In [39]:
%%time

# Edit this value:
PAGES_TO_SCRAPE = 1

# If df has 100 rows, we want to start on page 6 since that would be the 101-120 listings.
df = pd.read_csv('cars.csv')
page_start = len(df)//20 + 1
page_start = 1 #############testing###############################################################

# Print total listings to scrape
print(f"~ {(page_start-1)*20 + 1} - {(page_start-1+PAGES_TO_SCRAPE-1)*20 + 20} ~")
# 1&1 = 1-20  1&2 = 1-40  2&1 = 21-40  ...

csvfile = 'cars.csv'
with open(csvfile, 'a', newline='') as csvfile: # append mode
    writer = csv.DictWriter(csvfile, fieldnames=field_names)

    # Loop through search result pages
    for page in range(page_start, PAGES_TO_SCRAPE+1):
        base_url = f"https://www.cars.com/shopping/results/?fuel_slugs[]=gasoline&page_size=20&page={page}&maximum_distance=10&sort=listed_at&stock_type=used&zip=80210"
        # Need sort=listed_at ("Oldest listed") and page_size=20.
        # maximum_distance and zip should be consistent for data integrity.
        # fuel_slugs[]=gasoline focus on gasoline vehicles.
        # stock_type=used focus on used vehicles.
        driver.get(base_url)
        driver.implicitly_wait(10)

        print(f"{(page-1)*20 + 1} - {(page-1)*20 + 20}") # current block of listings
        # 1=1-20 2=21-40 ...

        # Find the vehicle card elements, then iterate through them
        vehicle_cards = driver.find_elements(By.CLASS_NAME, "vehicle-card-link")
        for vehicle_card in vehicle_cards:
            # Follow link to individual listing page
            link = vehicle_card.get_attribute("href")
            vehicle_card.send_keys(Keys.CONTROL + Keys.RETURN) # in new tab
            # Switch to the new tab
            driver.switch_to.window(driver.window_handles[1])
            driver.implicitly_wait(2)    

            # deal_gauge doesn't exist for all listings
            deal_gauge = np.nan
            try:
                deal_gauge = driver.execute_script('return document.querySelector(".deal-gauge-list-price-description").textContent')
            except JavascriptException:
                pass

            # vehicle_history isn't always populated
            vehicle_history_section = driver.find_element(By.CLASS_NAME, 'sds-page-section.vehicle-history-section')
            if 'fancy-description-list' in vehicle_history_section.get_attribute('innerHTML'):
                vehicle_history = vehicle_history_section.find_element(By.CLASS_NAME, 'fancy-description-list').get_attribute('innerHTML')
                vehicle_history = vehicle_history.replace("\n", " ")
                # Use regex to clean html
                vehicle_history = ' '.join(vehicle_history.split()) # remove whitespace
                vehicle_history = re.sub(r'<\w+[^>]*>', '', vehicle_history) # remove opening HTML tags
                history_items = vehicle_history.split('</dd> ')
                # Create a dictionary to store the key-value pairs
                history_dict = {}
                for item in history_items:
                    key, value = item.split('</dt> ')
                    key = key.replace('<dt>', '')
                    history_dict[key] = value
                # Remove the last </dd> tag from the last value in the dictionary
                last_key = list(history_dict.keys())[-1]
                history_dict[last_key] = history_dict[last_key].replace('</dd>', '')
                vehicle_history = history_dict

            else:
                vehicle_history = np.nan

            # fuel_type not always listed?
            fuel_type = np.nan
            try:
                fuel_type = driver.find_element(By.XPATH, '//dt[text()="Fuel type"]/following-sibling::dd').text
            except NoSuchElementException:
                pass
            
            # Price history table will only exist when price has changed from listing price.
            try:
                price_history_table = driver.find_element(By.CSS_SELECTOR, 'div.price-history table')
                rows = price_history_table.find_elements(By.TAG_NAME, 'tr')
                price_history_data = []
                for row in rows:
                    cells = row.find_elements(By.TAG_NAME, 'td')
                    # Assuming each row has three cells (date, price change, list price)
                    if len(cells) == 3:
                        price_history_date = cells[0].get_attribute('textContent').strip()
                        price_history_price_change = cells[1].get_attribute('textContent').strip()
                        price_history_list_price = cells[2].get_attribute('textContent').strip()
                        
                        row_data = (price_history_date, price_history_price_change, price_history_list_price)
                        price_history_data.append(row_data)
            except NoSuchElementException:
                pass


            # Create a dictionary with the extracted data, then append it to csv.
            data_to_append = {
                "link": link,
                "listing_title": driver.find_element(By.CLASS_NAME, "listing-title").text,
                "listing_mileage": driver.find_element(By.CLASS_NAME, "listing-mileage").text,
                "primary_price": driver.execute_script('return document.querySelector(".primary-price").textContent'),
                "deal_gauge": deal_gauge,
                "exterior_color": driver.find_element(By.XPATH, '//dt[text()="Exterior color"]/following-sibling::dd').text,
                "interior_color": driver.find_element(By.XPATH, '//dt[text()="Interior color"]/following-sibling::dd').text,
                "drivetrain": driver.find_element(By.XPATH, '//dt[text()="Drivetrain"]/following-sibling::dd').text,
                "mpg": driver.find_element(By.XPATH, '//dt[text()="MPG"]/following-sibling::dd').text,
                "fuel_type": fuel_type,
                "transmission": driver.find_element(By.XPATH, '//dt[text()="Transmission"]/following-sibling::dd').text,
                "engine": driver.find_element(By.XPATH, '//dt[text()="Engine"]/following-sibling::dd').text,
                "vin": driver.find_element(By.XPATH, '//dt[text()="VIN"]/following-sibling::dd').text,
                "stock_number": driver.find_element(By.XPATH, '//dt[text()="Stock #"]/following-sibling::dd').text,
                "vehicle_history": vehicle_history,
                "seller_name": driver.find_element(By.CLASS_NAME, 'seller-name').text.strip(),
                "price_history_data": price_history_data,
            }

            writer.writerow(data_to_append)
            print(f"{data_to_append} \n") # DEBUG PRINT 

            # Close new tab
            driver.close()
            driver.switch_to.window(driver.window_handles[0])

print("Done.")

~ 1 - 20 ~
1 - 20
{'link': 'https://www.cars.com/vehicledetail/123efbf0-a9ed-413b-8f20-14534bcd14eb/?attribution_type=premier', 'listing_title': '2019 BMW X5 xDrive40i', 'listing_mileage': '34,637 mi.', 'primary_price': '$39,487', 'deal_gauge': 'This is a good deal.', 'exterior_color': 'Brown', 'interior_color': '–', 'drivetrain': 'All-wheel Drive', 'mpg': '20–26', 'fuel_type': 'Gasoline', 'transmission': '8-Speed Automatic', 'engine': '3.0L I6 24V GDI DOHC Turbo', 'vin': '5UXCR6C59KLL39398', 'stock_number': 'S54248', 'vehicle_history': {'Accidents or damage': 'At least 1 accident or damage reported', '1-owner vehicle': 'No', 'Personal use only': 'No'}, 'seller_name': 'The Sharpest Rides', 'price_history_data': [('6/11/22', 'Listed', '$52,387'), ('7/12/22', '-$1,000', '$51,387'), ('8/11/22', '+$3,000', '$54,387'), ('8/20/22', '-$1,000', '$53,387'), ('8/24/22', '-$2,000', '$51,387'), ('9/21/22', '-$1,000', '$50,387'), ('10/09/22', '-$1,000', '$49,387'), ('11/25/22', '-$3,000', '$46,387'

{'link': 'https://www.cars.com/vehicledetail/4e93540c-a442-428a-86ec-ff0b244c5c90/', 'listing_title': '2014 Chevrolet Malibu 1LS', 'listing_mileage': '82,995 mi.', 'primary_price': '$9,999', 'deal_gauge': 'Great Deal $1,336 under', 'exterior_color': 'Gray', 'interior_color': 'Black', 'drivetrain': 'Front-wheel Drive', 'mpg': '25–36', 'fuel_type': 'Gasoline', 'transmission': '6-Speed Automatic', 'engine': '2.5L I4 16V GDI DOHC', 'vin': '1G11B5SL0EF116413', 'stock_number': '116413', 'vehicle_history': {'Accidents or damage': 'At least 1 accident or damage reported', '1-owner vehicle': 'No', 'Personal use only': 'Yes'}, 'seller_name': 'CTS Auto', 'price_history_data': [('3/12/22', 'Listed', '$12,999'), ('5/17/22', '-$500', '$12,499'), ('5/31/22', '-$500', '$11,999'), ('10/08/22', '-$100', '$11,899'), ('6/16/23', '-$100', '$11,799'), ('9/13/23', '-$1,800', '$9,999')]} 

{'link': 'https://www.cars.com/vehicledetail/4182da36-7d31-4519-9a8a-5315d1591cbf/', 'listing_title': '2019 Honda HR-V Sp

{'link': 'https://www.cars.com/vehicledetail/7c1bce9c-a3ee-4024-b193-52c4f22b562e/', 'listing_title': '2017 Kia Sportage LX', 'listing_mileage': '86,900 mi.', 'primary_price': '$18,750', 'deal_gauge': 'This is a good deal.', 'exterior_color': 'Blue', 'interior_color': 'Black', 'drivetrain': 'All-wheel Drive', 'mpg': '21–25', 'fuel_type': 'Gasoline', 'transmission': '6-Speed Automatic', 'engine': '2.4L I4 16V GDI DOHC', 'vin': 'KNDPMCAC4H7094027', 'stock_number': 'KBB3480', 'vehicle_history': {'Accidents or damage': 'At least 1 accident or damage reported', '1-owner vehicle': 'No', 'Personal use only': 'Yes', 'Open recall': 'At least 1 open recall reported'}, 'seller_name': 'Custom Cars West', 'price_history_data': [('8/04/23', 'Listed', '$26,998'), ('10/17/23', '-$1,000', '$25,998')]} 

{'link': 'https://www.cars.com/vehicledetail/2e313df1-6ee1-42e5-83e5-6c82fbd19dbf/', 'listing_title': '2020 Chevrolet Blazer 2LT', 'listing_mileage': '82,853 mi.', 'primary_price': '$22,998', 'deal_gaug

In [None]:
# Remove duplicates in cars.csv
df = pd.read_csv('cars.csv')
df = df.drop_duplicates()
df.to_csv('cars.csv', index=False)

TODO:
- Separate listing_title out into make model year etc
- Web scrape listing price history (and list date)
- For listings that don't have a price history, estimate the range of possible "Listed" dates from the listings above and below them (results are sorted by listing date)
- try/except for "Not Priced" listings - skip them?
- try block for every variable so scraper never crashes


Deal Gauge value based on Avg. market price range:
- Over = "This is a fair deal. Why?"
- Within = "This is a good deal."
- Under = "Great Deal $[...] under"

In [None]:
# Extra code for handling non-gasoline vehicles

# # mpg is listed differently for EVs, and also not always listed
#         try:
#             mpg = driver.find_element(By.XPATH, '//dt[text()="MPG"]/following-sibling::dd').text
#         except NoSuchElementException:
#             key_specs_container = driver.find_element(By.ID, 'key-specs-container')
#             mpg_section = key_specs_container.find_element(By.XPATH, './/div[@data-qa="mpge"]')
#             city_mpg_element = mpg_section.find_element(By.XPATH, './/strong[contains(@class, "key-spec-value")][1]').text
#             hwy_mpg_element = mpg_section.find_element(By.XPATH, './/strong[contains(@class, "key-spec-value")][2]').text
#             mpg = f"MPGe {city_mpg_element} city ; {hwy_mpg_element} hwy."
#         except Exception:
#             mpg = np.nan
        
#         # fuel_type not always listed
#         try:
#             fuel_type = driver.find_element(By.XPATH, '//dt[text()="Fuel type"]/following-sibling::dd').text
#         except NoSuchElementException:
#             fuel_type = np.nan
#             # will want to verify that fuel type is electric from mpg html dump