# Scraping Audi Data from PakWheels.com

## Importing necessary libraries

In [4]:
import os
from datetime import datetime
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

## Method for writing logs to a separate file

In [5]:
def logging_progress(message):
    """Append a timestamped progress message to ``./logs/code_logs.txt``.

    The ``./logs`` directory is created on first use so that a fresh checkout
    (or a different working directory) does not fail with FileNotFoundError.

    Args:
        message: Text to record; it is written as
            ``"<current datetime> : <message>"`` followed by a newline.
    """
    log_path = './logs/code_logs.txt'
    # Opening in append mode creates the file but not its parent directory.
    os.makedirs(os.path.dirname(log_path), exist_ok=True)
    # Explicit UTF-8 so non-ASCII text (e.g. scraped names, exception
    # messages) does not depend on the platform's default encoding.
    with open(log_path, 'a', encoding='utf-8') as file:
        file.write(f"{datetime.now()} : {message}\n")


## Method to extract the URLs of all Audi models

In [21]:
def extract_url_of_each_car(url, timeout=30):
    """Collect the detail-page URL of every car card found on a listing page.

    The page at *url* is downloaded and parsed; inside every element with
    class ``generic-car-widgets-container`` each ``cards`` element contributes
    the ``href`` of its first ``<a>`` tag, resolved against
    ``https://www.pakwheels.com``.

    Args:
        url: Address of the listing page to scrape.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A list of absolute URLs (possibly empty). Any failure is logged and
        results in an empty list rather than an exception.
    """
    try:
        logging_progress('Intializing Extracting of URLS of each audi.')
        # A timeout keeps the pipeline from hanging forever on a stalled
        # connection; raise_for_status avoids parsing an HTTP error page.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        urls = []
        for audi_container in soup.find_all(class_="generic-car-widgets-container"):
            for car_container in audi_container.find_all(class_='cards'):
                tag = car_container.find('a')
                href = tag.get("href") if tag is not None else None
                if not href:
                    # A card without a link must not discard the other URLs.
                    continue
                # urljoin handles root-relative, relative and absolute hrefs,
                # where plain string concatenation would corrupt the latter two.
                urls.append(urljoin("https://www.pakwheels.com", href))
        logging_progress('URLS extraction completed.')
        return urls
    except Exception as e:
        # Best-effort boundary: callers expect a list, never an exception,
        # but the cause is recorded so failures can be diagnosed.
        logging_progress(f"Error occurred during url extracting!!!!! ({e!r})")
        return []

## Method to extract the data of each Audi model

In [8]:
def extract_data(urls, timeout=30):
    """Scrape the specification table of every car page into a DataFrame.

    For each URL the page heading (element with class ``mb0`` inside
    ``sect-heading-cont``) becomes the ``Model`` value, and the ``<td>`` cells
    inside the element with id ``model-detailed-specs`` are read as
    alternating label/value pairs.

    Pages that cannot be downloaded or that lack the expected markup are
    logged and skipped, so one bad page does not abort the whole run.

    Args:
        urls: Iterable of car detail-page URLs.
        timeout: Seconds to wait for each HTTP response before giving up.

    Returns:
        A pandas DataFrame with one row per successfully scraped page.
    """
    cars_data_list = []
    logging_progress('Initializing Extracting of each audi data.')
    for u in urls:
        try:
            response = requests.get(u, timeout=timeout)
            response.raise_for_status()
        except requests.RequestException as e:
            logging_progress(f"Skipping {u}: request failed ({e!r})")
            continue

        soup = BeautifulSoup(response.content, 'html.parser')
        heading_container = soup.find(class_='sect-heading-cont')
        title = heading_container.find(class_='mb0') if heading_container is not None else None
        car_info_block = soup.find(id='model-detailed-specs')
        if title is None or car_info_block is None:
            logging_progress(f"Skipping {u}: expected specification markup not found.")
            continue

        # Strip AFTER removing the suffix; stripping first leaves the
        # whitespace that separated the name from 'Specifications'.
        car_info_dict = {'Model': title.text.replace('Specifications', '').strip()}

        cells = car_info_block.find_all('td')
        # Pair label/value cells; zip drops a trailing unpaired cell instead
        # of raising IndexError.
        for label_cell, value_cell in zip(cells[::2], cells[1::2]):
            car_info_dict[label_cell.text.strip()] = value_cell.text.strip()
        cars_data_list.append(car_info_dict)
    logging_progress('Data extraction completed.')
    return pd.DataFrame(cars_data_list)

## Method to transform data (deriving a new "Fuel Efficiency" column from the mileage)

In [9]:
def transform_data(df):
    """Add ``Average_Mileage`` and ``Fuel Efficiency`` columns to *df* in place.

    The ``Mileage`` column is expected to hold text containing a range such as
    ``"12 - 14"`` (decimals allowed). The two bounds are averaged into
    ``Average_Mileage``, which is then bucketed:

    * above 15        -> "High Efficiency"
    * 10 to 15 incl.  -> "Medium Efficiency"
    * anything else   -> "Low Efficiency" (this includes rows whose mileage
      could not be parsed, whose average is NaN)

    Args:
        df: DataFrame with a string ``Mileage`` column; it is modified in place.

    Returns:
        None.

    Raises:
        KeyError: If *df* has no ``Mileage`` column.
    """
    logging_progress('Initializing transforming data(including new column fuel effieciency according to mileage).')
    # Accept decimal bounds; an integer-only pattern would read
    # "12.5 - 14.5" as the range 5-14.
    number = r'\d+(?:\.\d+)?'
    mileage_bounds = df['Mileage'].str.extract(rf'({number})\s*-\s*({number})')
    mileage_bounds = mileage_bounds.astype(float)
    df['Average_Mileage'] = mileage_bounds.mean(axis=1)

    def categorize_efficiency(mileage):
        """Map an average mileage to its efficiency label."""
        if mileage > 15:
            return "High Efficiency"
        # Inclusive upper bound: exactly 15 previously fell through to "Low".
        if mileage >= 10:
            return "Medium Efficiency"
        # NaN compares False above, so unparsed mileages land here.
        return "Low Efficiency"

    df['Fuel Efficiency'] = df['Average_Mileage'].apply(categorize_efficiency)
    logging_progress('Data transformation completed.')

## Loading data to a CSV file

In [10]:
def load_data(df):
    """Write *df* to ``./output/Audi-data.csv``.

    The ``./output`` directory is created if it does not exist, because
    ``DataFrame.to_csv`` does not create missing parent directories.

    Args:
        df: The DataFrame to persist; it is written with pandas' default
            CSV options.
    """
    logging_progress('Initializing Loading of transform data.')
    output_path = './output/Audi-data.csv'
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    df.to_csv(output_path)
    logging_progress('Data loaded.')

## Executing Pipeline

In [18]:
if __name__ == '__main__':
    # Define the base URL for extracting car data (Audi cars in this case)
    url = 'https://www.pakwheels.com/new-cars/audi/'
    # Extract URLs of individual car models from the main page
    urls = extract_url_of_each_car(url)
    if not urls:
        # extract_url_of_each_car returns [] on failure; continuing would only
        # crash later with KeyError: 'Mileage' on an empty DataFrame.
        logging_progress('ETL Process aborted: no car URLs were extracted.')
    else:
        # Scrape detailed specifications for each car using the extracted URLs
        data = extract_data(urls)
        if data.empty:
            logging_progress('ETL Process aborted: no car data was extracted.')
        else:
            # Derive Average_Mileage and Fuel Efficiency columns (in place)
            transform_data(data)
            # Save the transformed data to the CSV output file
            load_data(data)
            logging_progress('ETL Process completed.')
