# Web Scraping Used Cars on sgcarmart.com
## 1. Introduction

### This document outlines the process of web scraping data from sgcarmart.com, the largest online car marketplace in Singapore, to analyze the used car market.

### Respecting sgcarmart.com's Rules
The scraping script will adhere to the guidelines outlined in sgcarmart.com's robots.txt file. Here's a summary of the restrictions:

Crawlers must wait at least 5 seconds between requests (Crawl-delay: 5).
Specific directories are off-limits for scraping, including:
cgi-bin/
images/
mail/
dealer/
directory/premium/
includes/
phpads/
update/
upload/

### Data Extraction
The script will focus on extracting the following information for each used car listing:

Car Listing URL 'LISTING_URL', 
Car Brand and Model 'BRAND', 
Price 'PRICE', 
Depreciation Value Yearly 'DEPRE_YEARLY', 
Registered Date 'REG_DATE', 
Mileage in KM 'MILEAGE_KM', 
Year of Manufacture 'MANUFACTURED_YEAR', 
Road Tax Yearly 'ROAD_TAX_YEARLY', 
Automatic or Manual Tranmission 'TRANSMISSION', 
Deregistration Value as of Web Scraping DTD 'DEREG_VALUE_FROM_SCRAPE_DATE', 
Web Scraping DTD 'SCRAPE_DATE', 
Open Market Value (OMV) 'OMV', 
Additional Registration Fee (ARF) 'ARF', 
Certificate of Entitlement (COE) from Web Scraping DTD 'COE_FROM_SCRAPE_DATE', 
Number of Days till COE Expires 'DAYS_OF_COE_LEFT', 
Engine Capacity in CC 'ENGINE_CAPACITY_CC', 
Car Curb Weight in KG 'CURB_WEIGHT_KG', 
Number of Past Owners 'NO_OF_OWNERS', 
Vehicle Type 'VEHICLE_TYPE'

This data will be used for further analysis of the used car market in Singapore.

## 2. Import Libraries

In [16]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import time
import re
from datetime import datetime

## 3. Pre-defined Functions

In [81]:
# A set of functions is defined to extract specific attributes from a parsed individual car listing URL, with each function returning the corresponding attribute.

#Brand Retriever Function
def brand_retrieval(parsed_listing_url):
    """
    Retrieves the model name from the parsed car listing URL.

    Parameters:
    parsed_listing_url (BeautifulSoup object): The parsed HTML content of the car listing page.

    Returns:
    str or np.nan: The model name. Returns np.nan if the model name is not found.
    """
    try:
        # Find all script tags
        script_tags = parsed_listing_url.find_all('script')
        
        for script_tag in script_tags:
            script_text = script_tag.string
            if script_text:
                # Search for the model and vehicleConfiguration keywords
                model_match = re.search(r'"model"\s*:\s*"([^"]+)"', script_text)
                vehicle_config_match = re.search(r'"vehicleConfiguration"\s*:\s*"([^"]+)"', script_text)
                
                if model_match and vehicle_config_match:
                    model_name = model_match.group(1)
                    vehicle_config = vehicle_config_match.group(1)
                    
                    # Ensure the model name is between "model" and "vehicleConfiguration"
                    if model_name and vehicle_config:
                        return model_name
        return np.nan
    except (AttributeError, TypeError):
        return np.nan

# Price Retriever Function
def price_retrieval(parsed_listing_url):
    """
    Retrieves the price from the parsed car listing URL.

    Parameters:
    parsed_listing_url (BeautifulSoup): The parsed HTML of the car listing page.

    Returns:
    int or np.nan: The price of the car. Returns np.nan if the price information is not available or cannot be parsed.
    """
    price_elements = parsed_listing_url.find_all(class_="font_red")
    if price_elements:
        price_text = price_elements[0].text.strip()
        price_parts = price_text.split('$')
        if len(price_parts) < 2:
            return np.nan
        try:
            return int(price_parts[1].replace(',', ''))
        except ValueError:
            return np.nan
    else:
        return np.nan

# Deprecration Value Per Year Retriever Function
def depreciation_yearly_retrieval(parsed_listing_url):
    """
    Retrieves the yearly depreciation value from the parsed car listing URL.

    Parameters:
    parsed_listing_url (BeautifulSoup): The parsed HTML of the car listing page.

    Returns:
    int or np.nan: The yearly depreciation value. Returns np.nan if the depreciation information is not available or cannot be parsed.
    """
    # Find all rows with the class 'row_bg'
#    row_bg_elements = parsed_listing_url.find_all('tr', class_="row_bg")
#    for row in row_bg_elements:
#        # Find the 'Depreciation' label within the row
#        depreciation_label = row.find('td', string='Depreciation')
#        if depreciation_label:
#            # Find the next sibling element which contains the depreciation value
#            depreciation_value = depreciation_label.find_next_sibling('td', valign="top")
#            if depreciation_value:
#                depreciation_value_text = depreciation_value.text.strip()
#                depreciation_parts = depreciation_value_text.split('$')
#                if len(depreciation_parts) < 2:
#                    return np.nan
#                try:
#                    return int(depreciation_parts[1].replace(',', '').split('/')[0])
#                except ValueError:
#                    return np.nan
#    return np.nan

    # Find the <strong>Depreciation</strong> element
    depreciation_label = parsed_listing_url.find('strong', string='Depreciation')
    if depreciation_label:
        # Find the parent <td> element
        depreciation_td = depreciation_label.find_parent('td')
        if depreciation_td:
            # Find the next sibling <td> element with valign='top'
            depreciation_value = depreciation_td.find_next_sibling('td', valign='top')
            if depreciation_value:
                depreciation_value_text = depreciation_value.text.strip()
                depreciation_parts = depreciation_value_text.split('$')
                if len(depreciation_parts) < 2:
                    return np.nan
                try:
                    return int(depreciation_parts[1].replace(',', '').split('/')[0])
                except ValueError:
                    return np.nan
            else:
                return np.nan
        else:
            return np.nan
    else:
        return np.nan

# Road Tax Per Year Retriever
def road_tax_retrieval(parsed_listing_url):
    """
    Retrieves the yearly road tax from the parsed car listing URL.

    Parameters:
    parsed_listing_url (BeautifulSoup): The parsed HTML of the car listing page.

    Returns:
    int or np.nan: The yearly road tax. Returns np.nan if the road tax information is not available or cannot be parsed.
    """
    each_info_elements = parsed_listing_url.find_all(class_='eachInfo')
    for element in each_info_elements:
        row_title = element.find(class_='row_title')
        if row_title and 'Road Tax' in row_title.text:
            row_info = element.find(class_='row_info')
            if row_info:
                road_tax_text = row_info.text.strip()
                if '/yr' in road_tax_text:
                    try:
                        return int(road_tax_text.split('/yr')[0].replace('$', '').replace(',', '').strip())
                    except ValueError:
                        return np.nan
    return np.nan

# Registered Date Retriever
def reg_date_retrieval(parsed_listing_url):
    """
    Retrieves the registration date from the parsed car listing URL.

    Parameters:
    parsed_listing_url (BeautifulSoup): The parsed HTML of the car listing page.

    Returns:
    str or np.nan: The registration date of the car. Returns np.nan if the registration date information is not available or cannot be parsed.
    """
    row_bg_elements = parsed_listing_url.find_all(class_='row_bg')
    if len(row_bg_elements) > 1:
        td_elements = row_bg_elements[1].find_all('td')
        if len(td_elements) > 3:
            reg_date = td_elements[3].text.split()[0].split('(')[0]
            return reg_date
    return np.nan

# Days of COE Retriever
def days_of_coe_retrieval(parsed_listing_url):
    """
    Retrieves the days of COE left from the parsed car listing URL.

    Parameters:
    parsed_listing_url (BeautifulSoup object): The parsed HTML content of the car listing page.

    Returns:
    str or np.nan: The days of COE left. Returns np.nan if the information is not found.
    """
    try:
        row_bg_elements = parsed_listing_url.find_all(class_='row_bg')
        if len(row_bg_elements) > 1:
            td_elements = row_bg_elements[1].find_all('td')
            if len(td_elements) > 3:
                days_of_coe_left_text = td_elements[3].text.split('(')[1].split('COE')[0].strip()
                return yr_mm_dd_cleaner(days_of_coe_left_text)
        return np.nan
    except (IndexError, AttributeError, ValueError):
        return np.nan

def yr_mm_dd_cleaner(str1):
    """
    Accepts a string that may or may include the elements yr, mths, and days,
    and converts the whole string into the number of days.

    Parameters:
    str1 (str): A string representing the duration in the format "Xyr Xmth Xdays".

    Returns:
    int: The number of days.
    """
    days = 0
    for unit in ['yr', 'mth', 'day']:
        value = int(next((int(char) for char in str1 if char.isdigit() and unit in str1), 0))
        days += value * {'yr': 365, 'mth': 30, 'day': 1}[unit]
    return days

# Mileage Retriever
def mileage_retrieval(parsed_listing_url):
    """
    Retrieves the mileage in kilometers from the parsed listing URL.

    Parameters:
    parsed_listing_url (BeautifulSoup): The parsed HTML of the car listing page.

    Returns:
    int or np.nan: The mileage in kilometers. Returns np.nan if the mileage information is not available.
    """
    each_info_elements = parsed_listing_url.find_all(class_='eachInfo')
    for element in each_info_elements:
        row_title = element.find(class_='row_title')
        if row_title and 'Mileage' in row_title.text:
            row_info = element.find(class_='row_info')
            if row_info:
                mileage_text = row_info.text.strip()
                mileage_parts = mileage_text.split('km')
                if len(mileage_parts) < 2:
                    return np.nan
                try:
                    return int(mileage_parts[0].replace(',', '').strip())
                except ValueError:
                    return np.nan
    return np.nan

# Manufactured Year Retriever
def manufactured_year_retrieval(parsed_listing_url):
    """
    Retrieves the manufactured year from the parsed car listing URL.

    Parameters:
    parsed_listing_url (BeautifulSoup): The parsed HTML of the car listing page.

    Returns:
    str or np.nan: The manufactured year of the car. Returns np.nan if the manufactured year information is not available or cannot be parsed.
    """
    row_info_elements = parsed_listing_url.find_all(class_='row_info')
    if len(row_info_elements) > 6:
        manufactured_year = row_info_elements[6].text.strip()
        return manufactured_year.split()[0]
    return np.nan

# Transmission Retriever
def transmission_retrieval(parsed_listing_url):
    """
    Retrieves the transmission type from the parsed car listing URL.

    Parameters:
    parsed_listing_url (BeautifulSoup): The parsed HTML of the car listing page.

    Returns:
    str or np.nan: The transmission type of the car. Returns np.nan if the transmission information is not available or cannot be parsed.
    """
    row_info_elements = parsed_listing_url.find_all(class_='row_info')
    if len(row_info_elements) > 7:
        transmission = row_info_elements[7].text.strip()
        return transmission.split()[0]
    return np.nan

# Deregistration Value Retriever
def dereg_value_retrieval(parsed_listing_url):
    """
    Retrieves the deregistration value from the parsed car listing URL.

    Args:
        parsed_listing_url (BeautifulSoup): The parsed HTML of the car listing.

    Returns:
        int or np.nan: The deregistration value. Returns np.nan if the deregistration value information is not available or cannot be parsed.
    """
    each_info_elements = parsed_listing_url.find_all(class_='eachInfo')
    for element in each_info_elements:
        row_title = element.find(class_='row_title')
        if row_title and 'Dereg Value' in row_title.text:
            row_info = element.find(class_='row_info')
            if row_info:
                dereg_value_text = row_info.text.strip()
                dereg_value_parts = dereg_value_text.split('$')
                if len(dereg_value_parts) < 2:
                    return np.nan
                try:
                    return int(dereg_value_parts[1].replace(',', '').split()[0])
                except ValueError:
                    return np.nan
    return np.nan

# Open Market Value Retriever
def omv_retrieval(parsed_listing_url):
    """
    Retrieves the Open Market Value (OMV) from the parsed car listing URL.

    Args:
        parsed_listing_url (BeautifulSoup): The parsed HTML of the car listing.

    Returns:
        int or np.nan: The Open Market Value (OMV). Returns np.nan if the OMV information is not available or cannot be parsed.
    """
    row_info_elements = parsed_listing_url.find_all(class_='row_info')
    if len(row_info_elements) > 8:
        omv_text = row_info_elements[8].text.strip()
        omv_parts = omv_text.split('$')
        if len(omv_parts) < 2:
            return np.nan
        try:
            return int(omv_parts[1].replace(',', ''))
        except ValueError:
            return np.nan
    else:
        return np.nan
    
# ARF Retriever
def arf_retrieval(parsed_listing_url):
    """
    Retrieves the Additional Registration Fee (ARF) from the parsed car listing URL.

    Args:
        parsed_listing_url (BeautifulSoup): The parsed HTML of the car listing.

    Returns:
        int or np.nan: The Additional Registration Fee (ARF). Returns np.nan if the ARF information is not available or cannot be parsed.
    """
    row_info_elements = parsed_listing_url.find_all(class_='row_info')
    if len(row_info_elements) > 9:
        arf_text = row_info_elements[9].text.strip()
        arf_parts = arf_text.split('$')
        if len(arf_parts) < 2:
            return np.nan
        try:
            return int(arf_parts[1].replace(',', ''))
        except ValueError:
            return np.nan
    else:
        return np.nan

# COE Price retriever 
def coe_retrieval(parsed_listing_url):
    """
    Retrieves the Certificate of Entitlement (COE) price from the parsed car listing URL.

    Args:
        parsed_listing_url (BeautifulSoup): The parsed HTML of the car listing.

    Returns:
        int or np.nan: The Certificate of Entitlement (COE) price. Returns np.nan if the COE price information is not available or cannot be parsed.
    """
    row_info_elements = parsed_listing_url.find_all(class_='row_info')
    if len(row_info_elements) > 3:
        coe_text = row_info_elements[3].text.strip()
        coe_parts = coe_text.split('$')
        if len(coe_parts) < 2:
            return np.nan
        try:
            return int(coe_parts[1].replace(',', ''))
        except ValueError:
            return np.nan
    else:
        return np.nan

# Engine Capacity Retriever
def engine_capacity_retrieval(parsed_listing_url):
    """
    Retrieves the engine capacity in cubic centimeters (CC) from the parsed car listing URL.

    Args:
        parsed_listing_url (BeautifulSoup): The parsed HTML of the car listing.

    Returns:
        int or np.nan: The engine capacity in cubic centimeters (CC). Returns np.nan if the engine capacity information is not available or cannot be parsed.
    """
    row_info_elements = parsed_listing_url.find_all(class_='row_info')
    if len(row_info_elements) > 4:
        engine_capacity_text = row_info_elements[4].text.strip()
        engine_capacity_parts = engine_capacity_text.split('cc')
        if len(engine_capacity_parts) < 2:
            return np.nan
        try:
            return int(engine_capacity_parts[0].replace(',', '').strip())
        except ValueError:
            return np.nan
    else:
        return np.nan

# Curb Weight Retriever
def curb_weight_retrieval(parsed_listing_url):
    """
    Retrieves the curb weight in kilograms (KG) from the parsed car listing URL.

    Args:
        parsed_listing_url (BeautifulSoup): The parsed HTML of the car listing.

    Returns:
        int or np.nan: The curb weight in kilograms (KG). Returns np.nan if the curb weight information is not available or cannot be parsed.
    """
    row_info_elements = parsed_listing_url.find_all(class_='row_info')
    if len(row_info_elements) > 5:
        curb_weight_text = row_info_elements[5].text.strip()
        curb_weight_parts = curb_weight_text.split()
        if len(curb_weight_parts) < 2:
            return np.nan
        try:
            return int(curb_weight_parts[0].replace(',', '').strip())
        except ValueError:
            return np.nan
    else:
        return np.nan

def number_of_owners_retrieval(parsed_listing_url):
    """
    Retrieves the number of owners from the parsed car listing URL.

    Parameters:
    parsed_listing_url (BeautifulSoup object): The parsed HTML content of the car listing page.

    Returns:
    int or np.nan: The number of owners. Returns np.nan if the number of owners is not found.
    """
    try:
        # Find the div containing the "No. of Owners" label
        owner_info_div = parsed_listing_url.find(string="No. of Owners")
        if owner_info_div:
            # The number of owners is in the next sibling div with class 'row_info'
            no_of_owners_text = owner_info_div.find_next("div", class_="row_info").text.strip()
            no_of_owners = int(no_of_owners_text)
        else:
            no_of_owners = np.nan
    except (AttributeError, ValueError, TypeError):
        no_of_owners = np.nan
    return no_of_owners

# Type of Vehicle Retriever
def type_of_vehicle_retrieval(parsed_listing_url):
    """
    Retrieves the type of vehicle from the parsed car listing URL.

    Parameters:
    parsed_listing_url (BeautifulSoup object): The parsed HTML content of the car listing page.

    Returns:
    str or np.nan: The type of vehicle. Returns np.nan if the type of vehicle information is not available or cannot be parsed.
    """
    try:
        type_of_vehicle = parsed_listing_url.find(class_='row_bg1').find_all('td')[1].text.strip()
        return type_of_vehicle
    except (IndexError, AttributeError):
        return np.nan
    
def type_of_fuel_retrieval(parsed_listing_url):
    """
    Retrieves the fuel type from the parsed car listing URL.

    Parameters:
    parsed_listing_url (BeautifulSoup object): The parsed HTML content of the car listing page.

    Returns:
    str or np.nan: The fuel type. Returns np.nan if the fuel type information is not available or cannot be parsed.
    """
    try:
        # Find the script tag containing the JSON-like data
        script_tags = parsed_listing_url.find_all("script")
        for script_tag in script_tags:
            script_text = str(script_tag.string)
            if script_text and '"fuelType": "' in script_text:
                # Find the "fuelType" field
                fuel_type_start = script_text.find('"fuelType": "') + len('"fuelType": "')
                fuel_type_end = script_text.find('"', fuel_type_start)
                fuel_type = script_text[fuel_type_start:fuel_type_end]
                return fuel_type
        return np.nan
    except (AttributeError, IndexError, TypeError):
        return np.nan

## 4. Get Links For All Postings

Links for all the car postings will be stored in a list before accessing them one by one for data extraction

In [18]:
# Create listings URLs to iterate through
main_page_listing_list = [] # create list to store search pages
for idx, link in enumerate(range(500)):
    url = "https://www.sgcarmart.com/used-cars/listing?avl=a&limit=100&vts[]=2&page=" + str(idx+1) #search by of car listings per page
    main_page_listing_list.append(url)

In [19]:
print(main_page_listing_list,'\n','\n', len(main_page_listing_list))

['https://www.sgcarmart.com/used-cars/listing?avl=a&limit=100&vts[]=2&page=1', 'https://www.sgcarmart.com/used-cars/listing?avl=a&limit=100&vts[]=2&page=2', 'https://www.sgcarmart.com/used-cars/listing?avl=a&limit=100&vts[]=2&page=3', 'https://www.sgcarmart.com/used-cars/listing?avl=a&limit=100&vts[]=2&page=4', 'https://www.sgcarmart.com/used-cars/listing?avl=a&limit=100&vts[]=2&page=5', 'https://www.sgcarmart.com/used-cars/listing?avl=a&limit=100&vts[]=2&page=6', 'https://www.sgcarmart.com/used-cars/listing?avl=a&limit=100&vts[]=2&page=7', 'https://www.sgcarmart.com/used-cars/listing?avl=a&limit=100&vts[]=2&page=8', 'https://www.sgcarmart.com/used-cars/listing?avl=a&limit=100&vts[]=2&page=9', 'https://www.sgcarmart.com/used-cars/listing?avl=a&limit=100&vts[]=2&page=10', 'https://www.sgcarmart.com/used-cars/listing?avl=a&limit=100&vts[]=2&page=11', 'https://www.sgcarmart.com/used-cars/listing?avl=a&limit=100&vts[]=2&page=12', 'https://www.sgcarmart.com/used-cars/listing?avl=a&limit=100

## 5. Retrieval of Individual Listing URLs from Search Pages

In [14]:
# Base url, or you can think of this as the individual car listing prefix
base_url = 'https://www.sgcarmart.com/used_cars/'
listing_urls = set()  # Use a set to avoid duplicates

# Acquiring indvidual car listings    
for main_link in main_page_listing_list:
   
    # Make a request to the website and get the object
    content = requests.get(main_link)

    # Parse the HTML text
    soup = BeautifulSoup(content.text, 'lxml')

    # Find every single URL in the webpage , refer to this post: # https://stackoverflow.com/questions/46490626/getting-all-links-from-a-page-beautiful-soup
    # This returns a list of every tag that contains a link in one main link (each element in main page listing)

    # Find all the script tags in the webpage
    script_tags = soup.find_all('script')

    # Extract the desired links from the script tags
    for script_tag in script_tags:
        script_text = str(script_tag.string)
        if script_text:
            # Find all occurrences of 'info.php?ID=' and 'DL=' in the script text
            start_indices = [i for i in range(len(script_text)) if script_text.startswith('info.php?ID=', i)]
            for start_index in start_indices:
                end_index = script_text.find('"', start_index)
                link = script_text[start_index:end_index]
                if 'DL=' in link:
                    # Remove unwanted part if present
                    if 'u0026GASRC=sgcm\\' in link:
                        link = link.replace('u0026GASRC=sgcm\\', '')
                    link = link.replace('\\u0026', '&').replace('%5C%5C', '')
                    listing_urls.add(base_url + link)  # Add to set to ensure uniqueness

    # Add a delay to prevent getting blocked
    time.sleep(5)
    
# Convert the set back to a list
listing_urls = list(listing_urls)

# Print the retrieved links
for link in listing_urls:
    print(link)

https://www.sgcarmart.com/used_cars/info.php?ID=1293490&DL=4524\
https://www.sgcarmart.com/used_cars/info.php?ID=1335107&DL=2876\
https://www.sgcarmart.com/used_cars/info.php?ID=1319958&DL=1188\
https://www.sgcarmart.com/used_cars/info.php?ID=1335957&DL=4014\
https://www.sgcarmart.com/used_cars/info.php?ID=1319609&DL=2875\
https://www.sgcarmart.com/used_cars/info.php?ID=1317859&DL=4516\
https://www.sgcarmart.com/used_cars/info.php?ID=1336144&DL=2547\
https://www.sgcarmart.com/used_cars/info.php?ID=1333407&DL=1136\
https://www.sgcarmart.com/used_cars/info.php?ID=1238413&DL=2958\
https://www.sgcarmart.com/used_cars/info.php?ID=1310552&DL=4731\
https://www.sgcarmart.com/used_cars/info.php?ID=1336120&DL=4298\
https://www.sgcarmart.com/used_cars/info.php?ID=1332855&DL=4343\
https://www.sgcarmart.com/used_cars/info.php?ID=1300697&DL=3392\
https://www.sgcarmart.com/used_cars/info.php?ID=1312030&DL=4111\
https://www.sgcarmart.com/used_cars/info.php?ID=1263451&DL=2339\
https://www.sgcarmart.com

In [15]:
print(len(listing_urls))
#print(len(set(listing_urls)))
#print(len(list(set(listing_urls))))

9926
9926
9926


In [20]:
import csv

def save_urls_to_csv(listing_urls, filename='listing_urls.csv'):
    """
    Save a list of URLs to a CSV file with an index.

    Parameters:
    listing_urls (list): List of URLs to save.
    filename (str): The name of the CSV file. Default is 'listing_urls.csv'.
    """
    with open(filename, 'w', newline='') as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow(['Index', 'URL'])  # Write the header
        for index, url in enumerate(listing_urls, start=1):
            csvwriter.writerow([index, url])
    print(f"Links have been saved to {filename} with indices")

In [21]:
save_urls_to_csv(listing_urls)

Links have been saved to listing_urls.csv with indices


In [22]:
print(listing_urls[:10])

['https://www.sgcarmart.com/used_cars/info.php?ID=1293490&DL=4524\\', 'https://www.sgcarmart.com/used_cars/info.php?ID=1335107&DL=2876\\', 'https://www.sgcarmart.com/used_cars/info.php?ID=1319958&DL=1188\\', 'https://www.sgcarmart.com/used_cars/info.php?ID=1335957&DL=4014\\', 'https://www.sgcarmart.com/used_cars/info.php?ID=1319609&DL=2875\\', 'https://www.sgcarmart.com/used_cars/info.php?ID=1317859&DL=4516\\', 'https://www.sgcarmart.com/used_cars/info.php?ID=1336144&DL=2547\\', 'https://www.sgcarmart.com/used_cars/info.php?ID=1333407&DL=1136\\', 'https://www.sgcarmart.com/used_cars/info.php?ID=1238413&DL=2958\\', 'https://www.sgcarmart.com/used_cars/info.php?ID=1310552&DL=4731\\']


## 6. Create DataFrame

def scrape_car_data(listing_urls, filename='sgcarmart_used_cars_prices'):
    """
    Scrapes car data from the provided list of listing URLs and saves the data to a CSV file.

    Parameters:
    listing_urls (list): A list of URLs for the individual car listings.
    filename (str, optional): The name of the output CSV file (without the .csv extension). Defaults to 'sgcarmart_used_cars_prices'.
    """
    # Creating an empty DataFrame for attributes of interest
    df = pd.DataFrame(columns=[
        'LISTING_URL', 'BRAND', 'PRICE', 'DEPRE_YEARLY', 'REG_DATE', 'MILEAGE_KM',
        'MANUFACTURED_YEAR', 'ROAD_TAX_YEARLY', 'TRANSMISSION', 'DEREG_VALUE_FROM_SCRAPE_DATE',
        'SCRAPE_DATE', 'OMV', 'ARF', 'COE_FROM_SCRAPE_DATE', 'DAYS_OF_COE_LEFT',
        'ENGINE_CAPACITY_CC', 'CURB_WEIGHT_KG', 'NO_OF_OWNERS', 'VEHICLE_TYPE', 'FUEL_TYPE'
    ])

    i = 0 # Indexing rows in the DF
    for i, listingurl in enumerate(listing_urls):
        response = requests.get(listingurl)
        listing_url = BeautifulSoup(response.text, 'lxml')

        df.loc[i, 'LISTING_URL'] = listingurl
        df.loc[i, 'BRAND'] = brand_retrieval(listing_url)
        df.loc[i, 'PRICE'] = price_retrieval(listing_url)
        df.loc[i, 'DEPRE_YEARLY'] = depreciation_yearly_retrieval(listing_url)
        df.loc[i, 'REG_DATE'] = reg_date_retrieval(listing_url)
        df.loc[i, 'MILEAGE_KM'] = mileage_retrieval(listing_url)
        df.loc[i, 'MANUFACTURED_YEAR'] = manufactured_year_retrieval(listing_url)
        df.loc[i, 'ROAD_TAX_YEARLY'] = road_tax_retrieval(listing_url)
        df.loc[i, 'TRANSMISSION'] = transmission_retrieval(listing_url)
        df.loc[i, 'DEREG_VALUE_FROM_SCRAPE_DATE'] = dereg_value_retrieval(listing_url)
        df.loc[i, 'SCRAPE_DATE'] = datetime.now().strftime("%d/%m/%Y")
        df.loc[i, 'OMV'] = omv_retrieval(listing_url)
        df.loc[i, 'ARF'] = arf_retrieval(listing_url)
        df.loc[i, 'COE_FROM_SCRAPE_DATE'] = coe_retrieval(listing_url)
        df.loc[i, 'DAYS_OF_COE_LEFT'] = days_of_coe_retrieval(listing_url)
        df.loc[i, 'ENGINE_CAPACITY_CC'] = engine_capacity_retrieval(listing_url)
        df.loc[i, 'CURB_WEIGHT_KG'] = curb_weight_retrieval(listing_url)
        df.loc[i, 'NO_OF_OWNERS'] = number_of_owners_retrieval(listing_url)
        df.loc[i, 'VEHICLE_TYPE'] = type_of_vehicle_retrieval(listing_url)
        df.loc[i, 'FUEL_TYPE'] = type_of_fuel_retrieval(listing_url)
        
        time.sleep(5)  # Prevent getting blocked by the website

    df.to_csv(f"{filename}.csv", index=True)

In [82]:
def scrape_car_data(listing_urls_csv, start_index, end_index, output_file):
    """
    Scrapes car data from the provided CSV file of listing URLs and saves it to a specified output file.

    Parameters:
    listing_urls_csv (str): The name of the input CSV file containing the URLs.
    start_index (int): The starting index of the URLs to process.
    end_index (int): The ending index of the URLs to process.
    output_file (str): The name of the output CSV file to save the scraped data.
    """
    # Read the CSV file into a DataFrame
    urls_df = pd.read_csv(listing_urls_csv)
    
    # Creating an empty DataFrame for attributes of interest
    df = pd.DataFrame(columns=[
        'LISTING_URL', 'BRAND', 'PRICE', 'DEPRE_YEARLY', 'REG_DATE', 'MILEAGE_KM',
        'MANUFACTURED_YEAR', 'ROAD_TAX_YEARLY', 'TRANSMISSION', 'DEREG_VALUE_FROM_SCRAPE_DATE',
        'SCRAPE_DATE', 'OMV', 'ARF', 'COE_FROM_SCRAPE_DATE', 'DAYS_OF_COE_LEFT',
        'ENGINE_CAPACITY_CC', 'CURB_WEIGHT_KG', 'NO_OF_OWNERS', 'VEHICLE_TYPE', 'FUEL_TYPE'
    ])

    for i, row in urls_df.iloc[start_index:end_index].iterrows():
        listingurl = row['URL']
        response = requests.get(listingurl)
        listing_url = BeautifulSoup(response.text, 'lxml')

        df.loc[i + start_index, 'LISTING_URL'] = listingurl
        df.loc[i + start_index, 'BRAND'] = brand_retrieval(listing_url)
        df.loc[i + start_index, 'PRICE'] = price_retrieval(listing_url)
        df.loc[i + start_index, 'DEPRE_YEARLY'] = depreciation_yearly_retrieval(listing_url)
        df.loc[i + start_index, 'REG_DATE'] = reg_date_retrieval(listing_url)
        df.loc[i + start_index, 'MILEAGE_KM'] = mileage_retrieval(listing_url)
        df.loc[i + start_index, 'MANUFACTURED_YEAR'] = manufactured_year_retrieval(listing_url)
        df.loc[i + start_index, 'ROAD_TAX_YEARLY'] = road_tax_retrieval(listing_url)
        df.loc[i + start_index, 'TRANSMISSION'] = transmission_retrieval(listing_url)
        df.loc[i + start_index, 'DEREG_VALUE_FROM_SCRAPE_DATE'] = dereg_value_retrieval(listing_url)
        df.loc[i + start_index, 'SCRAPE_DATE'] = datetime.now().strftime("%d/%m/%Y")
        df.loc[i + start_index, 'OMV'] = omv_retrieval(listing_url)
        df.loc[i + start_index, 'ARF'] = arf_retrieval(listing_url)
        df.loc[i + start_index, 'COE_FROM_SCRAPE_DATE'] = coe_retrieval(listing_url)
        df.loc[i + start_index, 'DAYS_OF_COE_LEFT'] = days_of_coe_retrieval(listing_url)
        df.loc[i + start_index, 'ENGINE_CAPACITY_CC'] = engine_capacity_retrieval(listing_url)
        df.loc[i + start_index, 'CURB_WEIGHT_KG'] = curb_weight_retrieval(listing_url)
        df.loc[i + start_index, 'NO_OF_OWNERS'] = number_of_owners_retrieval(listing_url)
        df.loc[i + start_index, 'VEHICLE_TYPE'] = type_of_vehicle_retrieval(listing_url)
        df.loc[i + start_index, 'FUEL_TYPE'] = type_of_fuel_retrieval(listing_url)

        time.sleep(1)  # Prevent getting blocked by the website

    df.to_csv(output_file, index=False)

In [83]:
listing_urls_length = len(listing_urls)  # or use len(listing_urls) if listing_urls is defined
interval = 200
file_names = []

for i in range(0, listing_urls_length, interval):
    start = i
    end = min(i + interval, listing_urls_length)
    file_index = (i // interval) + 1
    file_name = f'car_data_part{file_index:02d}.csv'
    file_names.append(file_name)
    scrape_car_data('listing_urls.csv', start, end, file_name)
    time.sleep(5)  # Pause for 5 seconds after each interval

# Concatenate the DataFrames from the generated CSV files
dataframes = [pd.read_csv(file_name) for file_name in file_names]

# Ensure all columns are aligned by reindexing
all_columns = pd.Index([])
for df in dataframes:
    all_columns = all_columns.union(df.columns)

dataframes = [df.reindex(columns=all_columns) for df in dataframes]

combined_df = pd.concat(dataframes, ignore_index=True, join='outer')

# Save the combined DataFrame to a new CSV file
combined_df.to_csv('sgcarmart_used_cars_prices.csv', index=False)

In [84]:
# Concatenate the DataFrames from the generated CSV files
dataframes = [pd.read_csv(file_name) for file_name in file_names]
combined_df = pd.concat(dataframes, ignore_index=True)

# Save the combined DataFrame to a new CSV file
combined_df.to_csv('sgcarmart_used_cars_prices.csv', index=True)

# Check the number of listings
print(f"Total number of listings: {combined_df.shape[0]}")

Total number of listings: 9926


#scrape_car_data(listing_urls)
df = pd.read_csv('sgcarmart_used_cars_prices.csv',index_col=0)
df