In [1]:
import requests
from bs4 import BeautifulSoup
import re
import json
import pandas as pd


In [2]:

def scrape_vehicle_data(url):
    # Send a GET request to the URL
    response = requests.get(url)

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.text, "html.parser")

    # Find all <script> tags with type "application/ld+json"
    json_scripts = soup.find_all("script", type="application/ld+json")

    # Define an empty list to store extracted vehicle information
    vehicles = []

    # Define a set to store seen vehicle names
    seen_vehicles = set()

    # Loop through each <script> tag
    for script in json_scripts:
        # Extract the text content of the <script> tag
        if json_text := script.string:
            # Remove special characters from the JSON text
            json_text_cleaned = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', json_text)

            # Parse the JSON data into a Python dictionary
            json_data = json.loads(json_text_cleaned)

            # Check if the JSON data represents a list of vehicles (ItemList)
            if "@type" in json_data and json_data["@type"] == "ItemList":
                # Get the list of vehicles
                vehicle_list = json_data["itemListElement"]

                # Iterate over each vehicle entry
                for vehicle_item in vehicle_list:
                    # Extract vehicle information
                    vehicle_info = vehicle_item["item"]

                    # Extract the vehicle name
                    vehicle_name = vehicle_info["name"]

                    # If the vehicle name is not already seen, add it to the list and mark it as seen
                    if vehicle_name not in seen_vehicles:
                        vehicles.append(vehicle_info)
                        seen_vehicles.add(vehicle_name)

    # Create a DataFrame from the list of vehicle dictionaries
    df = pd.DataFrame(vehicles)

    # Extract 'name' and 'price' from the 'offers' section, then drop the 'offers' column and rename the 'name' column
    df['price'] = df['offers'].apply(lambda x: x['price'])
    df.drop(columns=['offers'], inplace=True)
    df.rename(columns={'name': 'Vehicle Name'}, inplace=True)

    df = df[['Vehicle Name', 'price']]


    # Adjust DataFrame index to start from 1
    df.index += 1

    # Convert 'price' column to floating-point numbers
    df['price'] = df['price'].astype(float)

    # Convert 'price' column to integers
    df['price'] = df['price'].astype(int)

    return df

In [3]:
# Define the list of URLs to scrape
urls = [
    "https://www.acadiatoyota.com/en/new-inventory",
    "https://www.frederictontoyota.com/en/new-inventory",
    "https://www.summersidetoyota.com/en/new-inventory",
    "https://www.charlottetowntoyota.ca/en/new-catalog",
    "https://www.trurotoyota.com/en/new-inventory",
    "https://www.anchortoyota.ca/vehicles/new/?st=year,desc&view=grid&sc=new",
    "https://www.kentvilletoyota.com/en/shop-online?paymentFrequency=52&purchaseMethodOrder=4352&preferredMake=ALL",
    "https://oreganstoyotabridgewater.com/inventory/?do-search=1",
    "https://oreganstoyotahalifax.com/inventory/?search.vehicle-inventory-type-ids.0=1",
    "https://oreganstoyotadartmouth.com/inventory/?search.vehicle-inventory-type-ids.0=1",
    "https://trimactoyota.ca/new-inventory/",
    "https://www.macdonaldtoyota.ca/new/inventory/search.html",
    "https://www.westerntoyota.com/en/new-catalog",
    "https://www.grandtoyota.ca/en/new-catalog",
    "https://www.gandertoyota.com/new/inventory/search.html",
    "https://www.centraltoyota.ca/new/inventory/search.html",
    "https://www.toyotaplaza.ca/new/",
    "https://www.edmundstontoyota.com/en/new-inventory",
    "https://www.restigouchetoyota.com/en/new-inventory",
    "https://www.bathursttoyota.ca/en/new-inventory",
    "https://www.rousseltoyota.com/en/new-inventory",
    "https://www.amhersttoyota.com/vehicles/new/?view=grid&sc=new",
    "https://www.woodstocknbtoyota.com/en/for-sale/all/new"
]

# Create an empty list to store DataFrames
dfs = []

# Create an empty list to store URLs that couldn't be scraped
failed_urls = []


In [4]:
# Iterate over each URL and scrape vehicle data
for url in urls:
    try:
        df = scrape_vehicle_data(url)  # Assuming scrape_vehicle_data function is defined as mentioned earlier
        df['Source Website'] = url  # Add a column for the source website
        dfs.append(df)
        print(f"Successfully scraped data from {url}")
    except Exception as e:
        print(f"Failed to scrape data from {url}: {e}")
        failed_urls.append(url)

# Concatenate all DataFrames into a single DataFrame
final_df = pd.concat(dfs, ignore_index=True)

# # Create a DataFrame for failed URLs
# failed_df = pd.DataFrame({'Failed URLs': failed_urls})


Successfully scraped data from https://www.acadiatoyota.com/en/new-inventory
Failed to scrape data from https://www.frederictontoyota.com/en/new-inventory: 'offers'
Failed to scrape data from https://www.summersidetoyota.com/en/new-inventory: 'offers'
Successfully scraped data from https://www.charlottetowntoyota.ca/en/new-catalog
Failed to scrape data from https://www.trurotoyota.com/en/new-inventory: 'offers'
Failed to scrape data from https://www.anchortoyota.ca/vehicles/new/?st=year,desc&view=grid&sc=new: 'offers'
Failed to scrape data from https://www.kentvilletoyota.com/en/shop-online?paymentFrequency=52&purchaseMethodOrder=4352&preferredMake=ALL: 'offers'
Failed to scrape data from https://oreganstoyotabridgewater.com/inventory/?do-search=1: 'offers'
Failed to scrape data from https://oreganstoyotahalifax.com/inventory/?search.vehicle-inventory-type-ids.0=1: 'offers'
Failed to scrape data from https://oreganstoyotadartmouth.com/inventory/?search.vehicle-inventory-type-ids.0=1: '

In [5]:
final_df.to_csv('./data_part_1.csv',)

In [6]:
# failed_df.to_csv('./failed_part_1.csv',)