# Web Scraping Used Cars on sgcarmart.com
## 1. Introduction

### This document outlines the process of web scraping data from sgcarmart.com, the largest online car marketplace in Singapore, to analyze the used car market.

### Respecting sgcarmart.com's Rules
The scraping script will adhere to the guidelines outlined in sgcarmart.com's robots.txt file. Here's a summary of the restrictions:

Crawlers must wait at least 5 seconds between requests (Crawl-delay: 5).
Specific directories are off-limits for scraping, including:
cgi-bin/
images/
mail/
dealer/
directory/premium/
includes/
phpads/
update/
upload/

### Data Extraction
The script will focus on extracting the following information for each used car listing:

Car Listing URL 'LISTING_URL', 
Car Brand and Model 'BRAND', 
Price 'PRICE', 
Depreciation Value Yearly 'DEPRE_YEARLY', 
Registered Date 'REG_DATE', 
Mileage in KM 'MILEAGE_KM', 
Year of Manufacture 'MANUFACTURED_YEAR', 
Road Tax Yearly 'ROAD_TAX_YEARLY', 
Automatic or Manual Tranmission 'TRANSMISSION', 
Deregistration Value as of Web Scraping DTD 'DEREG_VALUE_FROM_SCRAPE_DATE', 
Web Scraping DTD 'SCRAPE_DATE', 
Open Market Value (OMV) 'OMV', 
Additional Registration Fee (ARF) 'ARF', 
Certificate of Entitlement (COE) from Web Scraping DTD 'COE_FROM_SCRAPE_DATE', 
Number of Days till COE Expires 'DAYS_OF_COE_LEFT', 
Engine Capacity in CC 'ENGINE_CAPACITY_CC', 
Car Curb Weight in KG 'CURB_WEIGHT_KG', 
Number of Past Owners 'NO_OF_OWNERS', 
Vehicle Type 'VEHICLE_TYPE'

This data will be used for further analysis of the used car market in Singapore.

## 2. Import Libraries

In [12]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import time
import re
from datetime import datetime

## 3. Pre-defined Functions

In [13]:
# A set of functions is defined to extract specific attributes from a parsed individual car listing URL, with each function returning the corresponding attribute.

# Brand Retriever Function 
def brand_retrieval(parsed_listing_url):
    """
    Retrieves the brand name from the parsed car listing URL.

    Parameters:
    parsed_listing_url (BeautifulSoup object): The parsed HTML content of the car listing page.

    Returns:
    str or np.nan: The brand name. Returns np.nan if the brand name is not found.
    """
    try:
        brand_tag = parsed_listing_url.find("a", class_="nounderline globaltitle")
        if brand_tag:
            #print('Brand: ', brand_tag.text.strip())
            brand = brand_tag.text.strip()
            clean_brand = re.sub(r'\s*\(.*?\)', '', brand)# Remove any text within parentheses, including the space before the parentheses, from the BRAND column
            return clean_brand
        else:
            return np.nan
    except (AttributeError, TypeError):
        return np.nan

# Price Retriever Function
def price_retrieval(parsed_listing_url):
    """
    Retrieves the price from the parsed car listing URL.

    Parameters:
    parsed_listing_url (BeautifulSoup object): The parsed HTML content of the car listing page.

    Returns:
    int or np.nan: The price of the car. Returns np.nan if the price information is not available or cannot be parsed.
    """
    price_text = parsed_listing_url.find_all(class_="font_red")[0].text.strip()
    price_parts = price_text.split('$')
    if len(price_parts) < 2:
        return np.nan

    try:
        price = int(''.join(price_parts[1].split(',')))
    except (IndexError, ValueError):
        return np.nan
    return price

# Deprecration Value Per Year Retriever Function
def depreciation_yearly_retrieval(parsed_listing_url):
    """
    Retrieves the yearly depreciation value from the parsed car listing URL.

    Parameters:
    parsed_listing_url (BeautifulSoup object): The parsed HTML content of the car listing page.

    Returns:
    int or np.nan: The yearly depreciation value. Returns np.nan if the depreciation information is not available or cannot be parsed.
    """
    depreciation_text = parsed_listing_url.find_all(class_="label")[1].findNextSibling().text.strip()
    depreciation_parts = depreciation_text.split('$')
    if len(depreciation_parts) < 2:
        return np.nan

    try:
        depreciation_yearly = int(''.join(depreciation_parts[1].split('/yr')[0].split(',')))
    except (IndexError, ValueError):
        return np.nan
    return depreciation_yearly

# Road Tax Per Year Retriever
def road_tax_retrieval(parsed_listing_url):
    """
    Retrieves the yearly road tax from the parsed car listing URL.

    Parameters:
    parsed_listing_url (BeautifulSoup object): The parsed HTML content of the car listing page.

    Returns:
    int or np.nan: The yearly road tax. Returns np.nan if the road tax information is not available or cannot be parsed.
    """
    road_tax_text = parsed_listing_url.find_all(class_='row_info')[1].text.strip()
    if '/yr' not in road_tax_text:
        return np.nan

    road_tax_parts = road_tax_text.replace('/yr', '').strip().split('$')
    if len(road_tax_parts) < 2:
        return np.nan

    try:
        road_tax_yearly = int(''.join(road_tax_parts[1].split(',')))
    except (IndexError, ValueError):
        return np.nan
    return road_tax_yearly

# Registered Date Retriever
def reg_date_retrieval(parsed_listing_url):
    reg_date = parsed_listing_url.find_all(class_='row_bg')[1].find_all('td')[3].text.split()[0].split('(')[0]
    #print('Reg date: ', reg_date)
    return reg_date

# Days of COE Retriever
def days_of_coe_retrieval(parsed_listing_url):
    """
    Retrieves the days of COE left from the parsed car listing URL.

    Parameters:
    parsed_listing_url (BeautifulSoup object): The parsed HTML content of the car listing page.

    Returns:
    str or np.nan: The days of COE left. Returns np.nan if the information is not found.
    """
    try:
        row_bg_elements = parsed_listing_url.find_all(class_='row_bg')
        if len(row_bg_elements) > 1:
            td_elements = row_bg_elements[1].find_all('td')
            if len(td_elements) > 3:
                days_of_coe_left_text = td_elements[3].text.split('(')[1].split('COE')[0].strip()
                return yr_mm_dd_cleaner(days_of_coe_left_text)
        return np.nan
    except (IndexError, AttributeError, ValueError):
        return np.nan

#def days_of_coe_retrieval(parsed_listing_url):
#    days_of_coe_left_yy_mm_dd_format_for_cleaner_function=\
#    parsed_listing_url.find_all(class_='row_bg')[1].find_all('td')[3].text.split('(')[1].split('COE')[0].strip()
#    #print('COE :', days_of_coe_left_yy_mm_dd_format_for_cleaner_function)
#    return yr_mm_dd_cleaner(days_of_coe_left_yy_mm_dd_format_for_cleaner_function)


def yr_mm_dd_cleaner(str1):
    """
    Accepts a string that may or may include the elements yr, mths, and days,
    and converts the whole string into the number of days.

    Parameters:
    str1 (str): A string representing the duration in the format "Xyr Xmth Xdays".

    Returns:
    int: The number of days.
    """
    days = 0
    for unit in ['yr', 'mth', 'day']:
        value = int(next((int(char) for char in str1 if char.isdigit() and unit in str1), 0))
        days += value * {'yr': 365, 'mth': 30, 'day': 1}[unit]
    return days

# Mileage Retriever
def mileage_retrieval(parsed_listing_url):
    """
    Retrieves the mileage in kilometers from the parsed listing URL.

    Parameters:
    parsed_listing_url (BeautifulSoup object): The parsed HTML content of the car listing page.

    Returns:
    int or np.nan: The mileage in kilometers. Returns np.nan if the mileage information is not available.
    """
    mileage_text = parsed_listing_url.find_all(class_='row_info')[0].text.strip()
    mileage_parts = mileage_text.split('km')
    if len(mileage_parts) < 2:
        return np.nan

    try:
        mileage_km = int(''.join(mileage_parts[0].strip().replace(',', '')))
    except (IndexError, ValueError):
        return np.nan
    return mileage_km

# Manufactured Year Retriever
def manufactured_year_retrieval(parsed_listing_url):
    manufactured_year = parsed_listing_url.find_all(class_='row_info')[6].text.strip()
    return manufactured_year.split()[0]

# Transmission Retriever
def transmission_retrieval(parsed_listing_url):
    transmission = parsed_listing_url.find_all(class_='row_info')[7].text.strip()
    return transmission.split()[0]

# Deregistration Value Retriever
def dereg_value_retrieval(parsed_listing_url):
    """
    Retrieves the deregistration value from the parsed car listing URL.

    Parameters:
    parsed_listing_url (BeautifulSoup object): The parsed HTML content of the car listing page.

    Returns:
    int or np.nan: The deregistration value. Returns np.nan if the deregistration value information is not available or cannot be parsed.
    """
    dereg_value_text = parsed_listing_url.find_all(class_='row_info')[2].text.strip()
    dereg_value_parts = dereg_value_text.split('$')
    if len(dereg_value_parts) < 2:
        return np.nan

    try:
        dereg_value_from_scrape_date = int(''.join(dereg_value_parts[1].split()[0].split(',')))
    except (IndexError, ValueError):
        return np.nan

    return dereg_value_from_scrape_date

# Open Market Value Retriever
def omv_retrieval(parsed_listing_url):
    """
    Retrieves the Open Market Value (OMV) from the parsed car listing URL.

    Parameters:
    parsed_listing_url (BeautifulSoup object): The parsed HTML content of the car listing page.

    Returns:
    int or np.nan: The Open Market Value (OMV). Returns np.nan if the OMV information is not available or cannot be parsed.
    """
    omv_text = parsed_listing_url.find_all(class_='row_info')[8].text.strip()
    omv_parts = omv_text.split('$')
    if len(omv_parts) < 2:
        return np.nan

    try:
        omv = int(''.join(omv_parts[1].split(',')))
    except (IndexError, ValueError):
        return np.nan

    return omv  

# ARF Retriever
def arf_retrieval(parsed_listing_url):
    """
    Retrieves the Additional Registration Fee (ARF) from the parsed car listing URL.

    Parameters:
    parsed_listing_url (BeautifulSoup object): The parsed HTML content of the car listing page.

    Returns:
    int or np.nan: The Additional Registration Fee (ARF). Returns np.nan if the ARF information is not available or cannot be parsed.
    """
    arf_text = parsed_listing_url.find_all(class_='row_info')[9].text.strip()
    arf_parts = arf_text.split('$')
    if len(arf_parts) < 2:
        return np.nan

    try:
        arf = int(''.join(arf_parts[1].split(',')))
    except (IndexError, ValueError):
        return np.nan

    return arf

# COE Price retriever 
def coe_retrieval(parsed_listing_url):
    """
    Retrieves the Certificate of Entitlement (COE) price from the parsed car listing URL.

    Parameters:
    parsed_listing_url (BeautifulSoup object): The parsed HTML content of the car listing page.

    Returns:
    int or np.nan: The Certificate of Entitlement (COE) price. Returns np.nan if the COE price information is not available or cannot be parsed.
    """
    coe_text = parsed_listing_url.find_all(class_='row_info')[3].text.strip()
    coe_parts = coe_text.split('$')
    if len(coe_parts) < 2:
        return np.nan

    try:
        coe_from_scrape_date = int(''.join(coe_parts[1].split(',')))
    except (IndexError, ValueError):
        return np.nan

    return coe_from_scrape_date

# Engine Capacity Retriever
def engine_capacity_retrieval(parsed_listing_url):
    """
    Retrieves the engine capacity in cubic centimeters (CC) from the parsed car listing URL.

    Parameters:
    parsed_listing_url (BeautifulSoup object): The parsed HTML content of the car listing page.

    Returns:
    int or np.nan: The engine capacity in cubic centimeters (CC). Returns np.nan if the engine capacity information is not available or cannot be parsed.
    """
    engine_capacity_text = parsed_listing_url.find_all(class_='row_info')[4].text.strip()
    engine_capacity_parts = engine_capacity_text.split('cc')
    if len(engine_capacity_parts) < 2:
        return np.nan

    try:
        engine_capacity = int(''.join(engine_capacity_parts[0].strip().replace(',', '')))
    except (IndexError, ValueError):
        return np.nan

    return engine_capacity

# Curb Weight Retriever
def curb_weight_retrieval(parsed_listing_url):
    """
    Retrieves the curb weight in kilograms (KG) from the parsed car listing URL.

    Parameters:
    parsed_listing_url (BeautifulSoup object): The parsed HTML content of the car listing page.

    Returns:
    int or np.nan: The curb weight in kilograms (KG). Returns np.nan if the curb weight information is not available or cannot be parsed.
    """
    curb_weight_text = parsed_listing_url.find_all(class_='row_info')[5].text.strip()
    curb_weight_parts = curb_weight_text.split()
    if len(curb_weight_parts) < 2:
        return np.nan

    try:
        curb_weight = int(''.join(curb_weight_parts[0].replace(',', '')))
    except (IndexError, ValueError):
        return np.nan

    return curb_weight

def number_of_owners_retrieval(parsed_listing_url):
    """
    Retrieves the number of owners from the parsed car listing URL.

    Parameters:
    parsed_listing_url (BeautifulSoup object): The parsed HTML content of the car listing page.

    Returns:
    int or np.nan: The number of owners. Returns np.nan if the number of owners is not found.
    """
    try:
        # Find the div containing the "No. of Owners" label
        owner_info_div = parsed_listing_url.find(string="No. of Owners")
        if owner_info_div:
            # The number of owners is in the next sibling div with class 'row_info'
            no_of_owners_text = owner_info_div.find_next("div", class_="row_info").text.strip()
            no_of_owners = int(no_of_owners_text)
        else:
            no_of_owners = np.nan
    except (AttributeError, ValueError, TypeError):
        no_of_owners = np.nan
    return no_of_owners

# Type of Vehicle Retriever
def type_of_vehicle_retrieval(parsed_listing_url):
    """
    Retrieves the type of vehicle from the parsed car listing URL.

    Parameters:
    parsed_listing_url (BeautifulSoup object): The parsed HTML content of the car listing page.

    Returns:
    str or np.nan: The type of vehicle. Returns np.nan if the type of vehicle information is not available or cannot be parsed.
    """
    try:
        type_of_vehicle = parsed_listing_url.find(class_='row_bg1').find_all('td')[1].text.strip()
        return type_of_vehicle
    except (IndexError, AttributeError):
        return np.nan

## 4. Get Links For All Postings

Links for all the car postings will be stored in a list before accessing them one by one for data extraction

In [14]:
# Create listings URLs to iterate through
main_page_listing_list = [] # create list to store search pages
for idx, link in enumerate(range(100)):
    url = "https://www.sgcarmart.com/used_cars/listing.php?BRSR=" + str(idx * 100) + "&RPG=100&AVL=2&VEH=2" #search by of 100 car listings per page
    main_page_listing_list.append(url)

In [15]:
print(main_page_listing_list,'\n','\n', len(main_page_listing_list))

['https://www.sgcarmart.com/used_cars/listing.php?BRSR=0&RPG=100&AVL=2&VEH=2', 'https://www.sgcarmart.com/used_cars/listing.php?BRSR=100&RPG=100&AVL=2&VEH=2', 'https://www.sgcarmart.com/used_cars/listing.php?BRSR=200&RPG=100&AVL=2&VEH=2', 'https://www.sgcarmart.com/used_cars/listing.php?BRSR=300&RPG=100&AVL=2&VEH=2', 'https://www.sgcarmart.com/used_cars/listing.php?BRSR=400&RPG=100&AVL=2&VEH=2', 'https://www.sgcarmart.com/used_cars/listing.php?BRSR=500&RPG=100&AVL=2&VEH=2', 'https://www.sgcarmart.com/used_cars/listing.php?BRSR=600&RPG=100&AVL=2&VEH=2', 'https://www.sgcarmart.com/used_cars/listing.php?BRSR=700&RPG=100&AVL=2&VEH=2', 'https://www.sgcarmart.com/used_cars/listing.php?BRSR=800&RPG=100&AVL=2&VEH=2', 'https://www.sgcarmart.com/used_cars/listing.php?BRSR=900&RPG=100&AVL=2&VEH=2', 'https://www.sgcarmart.com/used_cars/listing.php?BRSR=1000&RPG=100&AVL=2&VEH=2', 'https://www.sgcarmart.com/used_cars/listing.php?BRSR=1100&RPG=100&AVL=2&VEH=2', 'https://www.sgcarmart.com/used_cars/li

## 5. Retrieval of Individual Listing URLs from Search Pages

In [16]:
# Base url, or you can think of this as the individual car listing prefix
base_url = 'https://www.sgcarmart.com/used_cars/'
listing_urls = set()  # Use a set to avoid duplicates

# Acquiring indvidual car listings    
for main_link in main_page_listing_list:
   
    # Make a request to the website and get the object
    content = requests.get(main_link)

    # Parse the HTML text
    soup = BeautifulSoup(content.text, 'lxml')

    # Find every single URL in the webpage , refer to this post: # https://stackoverflow.com/questions/46490626/getting-all-links-from-a-page-beautiful-soup
    # This returns a list of every tag that contains a link in one main link (each element in main page listing)

    # Find all the script tags in the webpage
    script_tags = soup.find_all('script')

    # Extract the desired links from the script tags
    for script_tag in script_tags:
        script_text = str(script_tag.string)
        if script_text:
            # Find all occurrences of 'info.php?ID=' and 'DL=' in the script text
            start_indices = [i for i in range(len(script_text)) if script_text.startswith('info.php?ID=', i)]
            for start_index in start_indices:
                end_index = script_text.find('"', start_index)
                link = script_text[start_index:end_index]
                if 'DL=' in link:
                    # Remove unwanted part if present
                    if 'u0026GASRC=sgcm\\' in link:
                        link = link.replace('u0026GASRC=sgcm\\', '')
                    link = link.replace('\\u0026', '&').replace('%5C%5C', '')
                    listing_urls.add(base_url + link)  # Add to set to ensure uniqueness

    # Add a delay to prevent getting blocked
    time.sleep(5)
    
# Convert the set back to a list
listing_urls = list(listing_urls)

# Print the retrieved links
for link in listing_urls:
    print(link)

https://www.sgcarmart.com/used_cars/info.php?ID=1316564&DL=2689\
https://www.sgcarmart.com/used_cars/info.php?ID=1286467&DL=1242\
https://www.sgcarmart.com/used_cars/info.php?ID=1227438&DL=3811\
https://www.sgcarmart.com/used_cars/info.php?ID=1333066&DL=1277\
https://www.sgcarmart.com/used_cars/info.php?ID=1333470&DL=3590\
https://www.sgcarmart.com/used_cars/info.php?ID=1247890&DL=3837\
https://www.sgcarmart.com/used_cars/info.php?ID=1311531&DL=3318\
https://www.sgcarmart.com/used_cars/info.php?ID=1315089&DL=3875\
https://www.sgcarmart.com/used_cars/info.php?ID=1276589&DL=2976\
https://www.sgcarmart.com/used_cars/info.php?ID=1334765&DL=1000\
https://www.sgcarmart.com/used_cars/info.php?ID=1331925&DL=3841\
https://www.sgcarmart.com/used_cars/info.php?ID=1303818&DL=3337\
https://www.sgcarmart.com/used_cars/info.php?ID=1223241&DL=4454\
https://www.sgcarmart.com/used_cars/info.php?ID=1326636&DL=1339\
https://www.sgcarmart.com/used_cars/info.php?ID=1288409&DL=3495\
https://www.sgcarmart.com

In [17]:
print(len(listing_urls))
print(len(set(listing_urls)))
print(len(list(set(listing_urls))))

2001
2001
2001


In [18]:
print(listing_urls[:10])

['https://www.sgcarmart.com/used_cars/info.php?ID=1316564&DL=2689\\', 'https://www.sgcarmart.com/used_cars/info.php?ID=1286467&DL=1242\\', 'https://www.sgcarmart.com/used_cars/info.php?ID=1227438&DL=3811\\', 'https://www.sgcarmart.com/used_cars/info.php?ID=1333066&DL=1277\\', 'https://www.sgcarmart.com/used_cars/info.php?ID=1333470&DL=3590\\', 'https://www.sgcarmart.com/used_cars/info.php?ID=1247890&DL=3837\\', 'https://www.sgcarmart.com/used_cars/info.php?ID=1311531&DL=3318\\', 'https://www.sgcarmart.com/used_cars/info.php?ID=1315089&DL=3875\\', 'https://www.sgcarmart.com/used_cars/info.php?ID=1276589&DL=2976\\', 'https://www.sgcarmart.com/used_cars/info.php?ID=1334765&DL=1000\\']


## 6. Create DataFrame

In [19]:
def scrape_car_data(listing_urls, filename='sgcarmart_used_cars_prices'):
    """
    Scrapes car data from the provided list of listing URLs and saves the data to a CSV file.

    Parameters:
    listing_urls (list): A list of URLs for the individual car listings.
    filename (str, optional): The name of the output CSV file (without the .csv extension). Defaults to 'sgcarmart_used_cars_prices'.
    """
    # Creating an empty DataFrame for attributes of interest
    df = pd.DataFrame(columns=[
        'LISTING_URL', 'BRAND', 'PRICE', 'DEPRE_YEARLY', 'REG_DATE', 'MILEAGE_KM',
        'MANUFACTURED_YEAR', 'ROAD_TAX_YEARLY', 'TRANSMISSION', 'DEREG_VALUE_FROM_SCRAPE_DATE',
        'SCRAPE_DATE', 'OMV', 'ARF', 'COE_FROM_SCRAPE_DATE', 'DAYS_OF_COE_LEFT',
        'ENGINE_CAPACITY_CC', 'CURB_WEIGHT_KG', 'NO_OF_OWNERS', 'VEHICLE_TYPE'
    ])

    i = 0 # Indexing rows in the DF
    for i, listingurl in enumerate(listing_urls):
        response = requests.get(listingurl)
        listing_url = BeautifulSoup(response.text, 'lxml')

        df.loc[i, 'LISTING_URL'] = listingurl
        df.loc[i, 'BRAND'] = brand_retrieval(listing_url)
        df.loc[i, 'PRICE'] = price_retrieval(listing_url)
        df.loc[i, 'DEPRE_YEARLY'] = depreciation_yearly_retrieval(listing_url)
        df.loc[i, 'REG_DATE'] = reg_date_retrieval(listing_url)
        df.loc[i, 'MILEAGE_KM'] = mileage_retrieval(listing_url)
        df.loc[i, 'MANUFACTURED_YEAR'] = manufactured_year_retrieval(listing_url)
        df.loc[i, 'ROAD_TAX_YEARLY'] = road_tax_retrieval(listing_url)
        df.loc[i, 'TRANSMISSION'] = transmission_retrieval(listing_url)
        df.loc[i, 'DEREG_VALUE_FROM_SCRAPE_DATE'] = dereg_value_retrieval(listing_url)
        df.loc[i, 'SCRAPE_DATE'] = datetime.now().strftime("%d/%m/%Y")
        df.loc[i, 'OMV'] = omv_retrieval(listing_url)
        df.loc[i, 'ARF'] = arf_retrieval(listing_url)
        df.loc[i, 'COE_FROM_SCRAPE_DATE'] = coe_retrieval(listing_url)
        df.loc[i, 'DAYS_OF_COE_LEFT'] = days_of_coe_retrieval(listing_url)
        df.loc[i, 'ENGINE_CAPACITY_CC'] = engine_capacity_retrieval(listing_url)
        df.loc[i, 'CURB_WEIGHT_KG'] = curb_weight_retrieval(listing_url)
        df.loc[i, 'NO_OF_OWNERS'] = number_of_owners_retrieval(listing_url)
        df.loc[i, 'VEHICLE_TYPE'] = type_of_vehicle_retrieval(listing_url)
        
        time.sleep(5)  # Prevent getting blocked by the website

    df.to_csv(f"{filename}.csv", index=True)

In [20]:
scrape_car_data(listing_urls)
df = pd.read_csv('sgcarmart_used_cars_prices.csv',index_col=0)
df