**Group Data Drillers**

1. ALIATUL IZZAH BINTI JASMAN (A22EC0136)
2. MULYANI BINTI SARIPUDDIN (A22EC0223)
3. MUHAMMAD ANAS BIN MOHD PIKRI (A21SC0464)
4. THEVAN RAJU A/L JEGANATH (A22EC0286)


**Carlist.my Car Listings Web Scraper**

This Python script is designed to scrape car listings from Carlist.my,
extracting detailed information about vehicles for sale across Malaysia.
The crawler uses multithreading to efficiently navigate through multiple
pages of listings and save the extracted data into a structured CSV file.

Key Features:
- Target Website: https://www.carlist.my
- Data Extracted:
    - Car name, price, currency
    - Location and region
    - Brand, model, year
    - Mileage, fuel type, color
    - Body type, seating capacity, condition
    - Image URL, description, listing URL
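- Concurrency: pages are scraped in parallel with ThreadPoolExecutor (8 worker threads)
- Retry logic: failed HTTP requests are retried up to 3 times
- Dynamic detection of the total number of pages (falls back to 100 pages if detection fails)
- Output: carlists_data.csv with a header row followed by one row per car listing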

Usage:
Run all cells in this notebook in order; the scraper creates carlists_data.csv in the current working directory and fills it with every scraped car listing.


In [None]:
!pip install beautifulsoup4
!pip install requests




In [None]:
import requests
import time
import os
from bs4 import BeautifulSoup
import json
import csv
import concurrent.futures
from bs4 import BeautifulSoup



In [None]:
# Root URL of the Carlist.my listings; the scraper appends the paging query string to it.
base_url = 'https://www.carlist.my/cars-for-sale/malaysia'

In [None]:
# Exploratory request for the listings page (no timeout or status check is applied here).
page =requests.get(base_url)

In [None]:
# Parse the fetched HTML; as the last expression in the cell, the parsed
# document is displayed as the cell output so its structure can be inspected.
BeautifulSoup(page.text,'html')

<!DOCTYPE html>
<!--[if IE 8]><html lang="en" class="ie8"><![endif]--><!--[if !IE 8]><!--><html lang="en"><!--<![endif]-->
<head>
<meta charset="utf-8"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta content="text/html;charset=utf-8" http-equiv="Content-Type"/>
<meta content="width=device-width, initial-scale=1, user-scalable=no" name="viewport"/>
<title>Find All New, Used/Second Hand &amp; Recon Cars for Sale | Carlist.my</title>
<!-- Global site tag (gtag.js) - Google Analytics -->
<script async="" defer="" src="https://www.googletagmanager.com/gtag/js?id=UA-7749517-1"></script>
<script>
    window.dataLayer = window.dataLayer || [];

    function gtag() {
        dataLayer.push(arguments);
    }

    gtag('js', new Date());

    gtag('config', 'UA-7749517-1', {'optimize_id': 'GTM-PRFGB2M', 'send_page_view': false});
</script> <meta content="_csrf" name="csrf-param"/>
<meta content="NNPPzS5oeoYXnSYqN_-DA_6Zt9EoMOYPQPgXgFMBGWB8oPv5YQkvxFTRUG9kzdwutdrFvUxdtl46j0PGJzgrFA=

In [None]:
# Scraper configuration, kept at module level and read by the functions below.
page_size = 50  # Number of cars per page
max_retries = 3  # Retry limit for failed requests
max_workers = 8  # Max concurrent workers to scrape pages

total_rows = 0  # Running count of data rows written; incremented by save_to_csv
csv_file = 'carlists_data.csv'  # Output path, relative to the working directory

# Create CSV with headers
# mode='w' truncates any existing file, so every run starts from a CSV that
# holds only the header row; data rows are appended later by save_to_csv.
with open(csv_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    # Column order must match the row layout built in parse_car_data.
    writer.writerow([
        'Car Name', 'Price (MYR)', 'Currency', 'Location', 'Region',
        'Brand', 'Model', 'Year', 'Mileage', 'Fuel Type', 'Color',
        'Body Type', 'Seating Capacity', 'Condition',
        'Image', 'Description', 'URL',
    ])

# Define function to fetch a page and handle retries
def fetch_page(url, retries=0):
    """Download ``url`` and return the response body as text.

    Any ``requests`` failure (including the error ``raise_for_status`` raises
    for 4xx/5xx responses) triggers an immediate retry until the retry
    counter reaches the module-level ``max_retries``. After that the failure
    is printed and ``None`` is returned.

    Args:
        url: Address to fetch.
        retries: Number of retries already consumed; callers normally leave
            this at its default of 0.

    Returns:
        The response text, or ``None`` once the retries are exhausted.
    """
    attempt = retries
    while True:
        try:
            reply = requests.get(url, timeout=10)
            reply.raise_for_status()
            return reply.text
        except requests.exceptions.RequestException as err:
            # Out of retries: report the last error and give up.
            if attempt >= max_retries:
                print(f"Failed to fetch {url}: {err}")
                return None
            print(f"Retrying {url}... Attempt {attempt + 1}")
            attempt += 1

# Define function to parse car data from the page
def _car_to_row(car):
    """Build one CSV row (list) from a single ``itemListElement`` entry.

    Raises KeyError/TypeError/IndexError/AttributeError when the entry does
    not have the expected structure; the caller decides how to handle that.
    """
    item = car['item']
    offer = item['offers']
    address = offer['seller']['homeLocation']['address']
    brand = item['brand']['name']
    model = item['model']

    # 'image' may be a single value or a list; keep only the first one.
    image = item['image']
    if isinstance(image, list):
        image = image[0]

    # Order must match the CSV header row.
    return [
        f"{brand} {model}", offer['price'], offer['priceCurrency'],
        address['addressLocality'], address['addressRegion'],
        brand, model, item['vehicleModelDate'],
        item['mileageFromOdometer']['value'], item['fuelType'],
        item['color'], item['bodyType'], item['seatingCapacity'],
        # itemCondition is split on '/' and only the last segment is kept.
        item['itemCondition'].split('/')[-1],
        image, item['description'].strip(), item['url'],
    ]


def parse_car_data(listings):
    """Extract car rows from JSON-LD ``<script>`` tags.

    Each element of ``listings`` must expose a ``.string`` attribute holding
    JSON text. Only payloads that decode to a list are considered; within
    them, every dict carrying an ``itemListElement`` list contributes one row
    per car.

    Errors are isolated per car: a car with a missing or malformed field is
    reported and skipped, and the remaining cars of the same script are still
    returned. A script whose content is missing or is not valid JSON is
    reported and skipped as a whole.

    Args:
        listings: Iterable of tag-like objects with a ``.string`` attribute.

    Returns:
        A list of rows, each a list of 17 values in CSV column order.
    """
    car_data = []
    for listing in listings:
        try:
            car_info = json.loads(listing.string.strip())
        except (AttributeError, TypeError, ValueError) as e:
            # Empty script tag (.string is None) or invalid JSON.
            print(f"Error parsing car data: {e}")
            continue

        if not isinstance(car_info, list):
            continue

        for item in car_info:
            if not isinstance(item, dict):
                continue
            elements = item.get('itemListElement')
            if not isinstance(elements, list):
                continue
            for car in elements:
                try:
                    car_data.append(_car_to_row(car))
                except (KeyError, TypeError, IndexError, AttributeError) as e:
                    # Skip only this car; keep the rest of the page.
                    print(f"Error parsing car data: {e}")
    return car_data

# Define function to scrape a page
def scrape_page(page):
    """Scrape one results page and return its car rows.

    Builds the paged URL from ``base_url`` and ``page_size``, downloads it
    with ``fetch_page`` and hands every ``application/ld+json`` script tag to
    ``parse_car_data``.

    Args:
        page: Page number to request.

    Returns:
        The rows produced by ``parse_car_data``, or an empty list when the
        download failed or the page has no JSON-LD script tags.
    """
    url = f'{base_url}?page_size={page_size}&page_number={page}'
    print(f"Scraping page {page}...")

    html = fetch_page(url)
    if not html:
        return []

    document = BeautifulSoup(html, 'html.parser')
    json_ld_tags = document.find_all('script', type='application/ld+json')
    if not json_ld_tags:
        return []

    return parse_car_data(json_ld_tags)

# Function to save car data to CSV
def save_to_csv(car_data):
    """Append rows to the output CSV and update the global row counter.

    Args:
        car_data: Iterable of rows (sequences of cell values) to append to
            ``csv_file``.

    Side effects:
        Opens ``csv_file`` in append mode and increments the module-level
        ``total_rows`` once for every row written.
    """
    global total_rows
    with open(csv_file, mode='a', newline='', encoding='utf-8') as out:
        write_row = csv.writer(out).writerow
        for record in car_data:
            write_row(record)
            # Count after each successful write so the total reflects rows on disk.
            total_rows += 1

# Function to detect the total number of pages
def get_total_pages():
    """Return the number of listing pages to scrape.

    Downloads the first results page through ``fetch_page`` (so the request
    has a timeout and is retried like every other page) and reads the page
    number from the link inside the last ``<li>`` of ``ul.pagination``.

    Returns:
        The detected page number, or 100 when the page cannot be downloaded
        or no ``page_number=<digits>`` value is found in the link. The reason
        for falling back is printed instead of being silently swallowed.
    """
    import re  # local import: only this function needs it

    fallback_pages = 100  # used whenever detection fails
    url = f"{base_url}?page_size={page_size}&page_number=1"

    html = fetch_page(url)
    if not html:
        print(f"Could not load {url}; assuming {fallback_pages} pages")
        return fallback_pages

    soup = BeautifulSoup(html, 'html.parser')
    last_page_tag = soup.select_one("ul.pagination li:last-child a")
    href = last_page_tag.get("href") if last_page_tag is not None else None

    # Match the digits only, so a trailing "&page_size=..." (or any other
    # parameter after page_number) no longer breaks the conversion.
    matches = re.findall(r"page_number=(\d+)", href) if href else []
    if matches:
        return int(matches[-1])

    print(f"Could not detect the total number of pages; assuming {fallback_pages}")
    return fallback_pages

# Main function to manage the scraping process
def main():
    """Run the full scrape and print a summary.

    Determines the page count, scrapes all pages concurrently on a thread
    pool of ``max_workers`` threads, appends each page's rows to the CSV as
    soon as that page finishes, and finally reports the row count, elapsed
    time and output file size.
    """
    start_time = time.time()

    total_pages = get_total_pages()

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as pool:
        # Map each submitted job back to its page number for reporting.
        pending = {}
        for page_no in range(1, total_pages + 1):
            pending[pool.submit(scrape_page, page_no)] = page_no

        # Results are handled in completion order, in this thread only.
        for finished in concurrent.futures.as_completed(pending):
            page_no = pending[finished]
            try:
                rows = finished.result()
                if not rows:
                    continue
                save_to_csv(rows)
                print(f"Page {page_no} scraped and saved, total rows: {total_rows}")
            except Exception as e:
                print(f"Error scraping page {page_no}: {e}")

    execution_time = time.time() - start_time
    file_size_kb = os.path.getsize(csv_file) / 1024

    print(f"\nDone scraping.")
    print(f"Total rows: {total_rows}")
    print(f"Time taken: {execution_time:.2f} seconds")
    print(f"File size: {file_size_kb:.2f} KB")

# Entry point: start the scrape only when this code is executed directly
# (``__name__`` is "__main__"), not when it is imported as a module.
if __name__ == "__main__":
    main()


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Page 1055 scraped and saved, total rows: 52600
Scraping page 1061...
Page 1053 scraped and saved, total rows: 52650
Scraping page 1062...
Page 1052 scraped and saved, total rows: 52700
Scraping page 1063...Page 1057 scraped and saved, total rows: 52750

Scraping page 1064...
Page 1058 scraped and saved, total rows: 52800
Scraping page 1065...
Scraping page 1066...
Page 1062 scraped and saved, total rows: 52850
Page 1059 scraped and saved, total rows: 52900
Scraping page 1067...
Scraping page 1068...
Scraping page 1069...
Scraping page 1070...
Page 1061 scraped and saved, total rows: 52950
Page 1060 scraped and saved, total rows: 53000
Page 1048 scraped and saved, total rows: 53050
Page 1056 scraped and saved, total rows: 53100
Scraping page 1071...
Page 1063 scraped and saved, total rows: 53150
Scraping page 1072...
Page 1064 scraped and saved, total rows: 53200
Scraping page 1073...
Page 1065 scraped and saved, total row