# **WEBSCRAPING CODE**

In [1]:
!pip install bs4

Defaulting to user installation because normal site-packages is not writeable
Collecting bs4
  Downloading bs4-0.0.2-py2.py3-none-any.whl.metadata (411 bytes)
Downloading bs4-0.0.2-py2.py3-none-any.whl (1.2 kB)
Installing collected packages: bs4
Successfully installed bs4-0.0.2


# Mumbai

In [None]:
# Import necessary libraries
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
import pandas as pd
import time

# --- 1. Setup and Load Page ---
# Open the Cars24 Tata/Mumbai listing page in a maximised Edge window.
my_url = "https://www.cars24.com/buy-used-tata-cars-mumbai/?sort=bestmatch&serveWarrantyCount=true&listingSource=Homepage_Filters"
print("Opening browser...")
driver = webdriver.Edge()  # webdriver.Chrome() / webdriver.Firefox() can be swapped in here
driver.get(my_url)
driver.maximize_window()
print("Page loaded. Waiting for initial content...")

# Accumulates one dictionary per scraped car.
car_data = []

# --- STABLE CONTAINER SELECTOR ---
# Primary selector keys off 'data-testid'; the class-based one is only used
# when the primary selector never shows up.
container_xpath = "//div[@data-testid='car-listing']"
fallback_xpath = "//a[contains(@class, 'carCardWrapper')]"

# A single waiter serves both attempts; every until() call waits up to 20 s.
listing_wait = WebDriverWait(driver, 20)

try:
    listing_wait.until(EC.presence_of_element_located((By.XPATH, container_xpath)))
except TimeoutException:
    print("'data-testid' timed out. Trying fallback selector (class*='carCardWrapper')...")
    # --- FALLBACK SELECTOR ---
    # container_xpath is reassigned so later steps query with whichever selector worked.
    container_xpath = fallback_xpath
    try:
        listing_wait.until(EC.presence_of_element_located((By.XPATH, container_xpath)))
    except TimeoutException:
        print("Page timed out AGAIN. Neither selector worked. Site structure has likely changed.")
        driver.quit()
        raise SystemExit("Could not load initial listings.")  # Stop notebook execution
    else:
        print("Initial listings found using fallback selector. Starting to scroll...")
    # --- END FALLBACK ---
else:
    print("Initial listings found using 'data-testid'. Starting to scroll...")

# --- 2. Scroll to Bottom to Load All Cars ---
# Keep jumping to the end of the document until its height stops changing
# between two consecutive scrolls.
print("Scrolling to the bottom...")
height_script = "return document.body.scrollHeight"
last_height = driver.execute_script(height_script)
bottom_reached = False
while not bottom_reached:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(2.5)  # pause before re-measuring the page height
    new_height = driver.execute_script(height_script)
    bottom_reached = new_height == last_height
    last_height = new_height
print("Reached bottom of the page.")

# --- 3. Find ALL Containers and Extract Data Directly ---
def _first_text(parent, xpath, default='N/A'):
    """Return the stripped text of the first element under `parent` matching `xpath`.

    Returns `default` when no element matches (NoSuchElementException).
    """
    try:
        return parent.find_element(By.XPATH, xpath).text.strip()
    except NoSuchElementException:
        return default


# Initialised before the try block so the summary print at the end cannot hit
# a NameError when find_elements itself fails.
successful_extractions = 0

try:
    # Find all containers using the XPath that successfully loaded
    containers = driver.find_elements(By.XPATH, container_xpath)
    print(f"Found {len(containers)} total car listings after scrolling.")
    print("Extracting data directly using Selenium...")

    for i, container in enumerate(containers):
        try:
            # A card without a title is not a car card; a title without
            # 'Tata' is an ad or a non-Tata result. Both are skipped.
            full_name = _first_text(container, ".//h2[@data-testid='car-title']", default=None)
            if full_name is None or 'Tata' not in full_name:
                continue

            # Cards without a price are skipped as well.
            price = _first_text(container, ".//strong[@data-testid='car-price']", default=None)
            if price is None:
                continue

            # The remaining fields are optional and fall back to 'N/A'.
            car_data.append({
                'Year of Manufacture': full_name.split(' ')[0],  # first token of the title
                'Car Name': full_name,
                'Kilometers Driven': _first_text(container, ".//li[@data-testid='car-kms-driven']"),
                'Fuel Type': _first_text(container, ".//li[@data-testid='car-fuel-type']"),
                'Transmission': _first_text(container, ".//li[@data-testid='car-transmission']"),
                'Price': price
            })
            successful_extractions += 1  # Count successful ones

        except Exception as e:
            # Catch unexpected errors during parsing of a single container
            # so one bad card does not abort the whole run.
            print(f"Unexpected error parsing container {i+1}: {e}")

except Exception as e:
    print(f"Error finding car containers after scrolling: {e}")
finally:
    # --- 4. Close the Browser ---
    driver.quit()
    print("Browser closed.")

print(f"Finished processing. Extracted data for {successful_extractions} cars.")

# --- 5. Convert Data to DataFrame and Save ---
print("\n---------------------------------")
print("Excel Conversion Process:")

if not car_data:
    print("No valid car data was captured. No Excel file was created.")
else:
    print(f"Successfully parsed {len(car_data)} car listings. Converting to Excel...")

    # Build the frame from the scraped dictionaries and pin the column order.
    column_order = [
        'Year of Manufacture',
        'Car Name',
        'Kilometers Driven',
        'Fuel Type',
        'Transmission',
        'Price',
    ]
    df = pd.DataFrame(car_data)[column_order]

    # Write the frame to a single-sheet workbook, then preview it.
    excel_filename = 'cars24_tata_data.xlsx'
    try:
        df.to_excel(excel_filename, index=False, sheet_name='Tata Cars')
        print(f"Data successfully saved to {excel_filename}")
        print("\nDataFrame Head (first 5 rows):")
        display(df.head())  # display() gives the rich table rendering in Jupyter
    except Exception as e:
        print(f"Error saving to Excel file '{excel_filename}': {e}")
        print("Make sure the file is not open in Excel.")

# Delhi

In [None]:
!pip install selenium
!pip install google-colab-selenium
import google_colab_selenium as gs
from bs4 import BeautifulSoup


# Load the Cars24 Tata/New Delhi listing page and snapshot its HTML.
driver = gs.Chrome()
try:
    driver.get("https://www.cars24.com/buy-used-tata-cars-new-delhi/?sort=bestmatch&serveWarrantyCount=true&listingSource=Homepage_Filters")
    # NOTE(review): the HTML is captured right after get() with no wait or
    # scrolling; if the listing loads cards lazily, some may be missing - confirm.
    html = driver.page_source
finally:
    # Everything below parses the HTML snapshot only, so the browser is
    # released immediately instead of being left running.
    driver.quit()
soup = BeautifulSoup(html, 'html.parser')
# Collect the name/type pair shown on each car card.
carNameType = []
cars = soup.find_all('div', class_="sc-fLVwEd hRljRx")
for car in cars:
    name_tag = car.find('span', class_='sc-braxZu kjFjan')
    type_tag = car.find('span', class_='sc-braxZu lmmumg')
    # find() returns None when a span is absent. Store None instead of
    # crashing or skipping, so rows stay positionally aligned with the
    # per-card detail lists that are zipped against this list later.
    carNameType.append({
        'Name': name_tag.get_text(strip=True) if name_tag is not None else None,
        "Type": type_tag.get_text(strip=True) if type_tag is not None else None
    })
carNameType
# One list of <p> tags per detail <ul> found on the page.
containers = soup.find_all("ul", class_="sc-huvEkS gkjlEH")
all_cars = [
    container.find_all("p", class_="sc-braxZu kvfdZL")
    for container in containers
]
all_cars
headings = ['Mileage', 'Fuel', 'Transmission', 'Registration']

# Pair each heading with the tag at the same position; zip stops at the
# shorter of the two, so short rows simply yield fewer keys.
structured_data = [dict(zip(headings, row)) for row in all_cars]
structured_data
# Collect the actual / discounted price text from every price wrapper.
# NOTE(review): carPrice is not merged into the rows that are exported
# further down - confirm whether prices should be part of the CSV.
carPrice = []
Prices = soup.find_all('div', class_="styles_priceWrap__VwWBV")
for Price in Prices:
    # Query the current price wrapper. The earlier code searched the stale
    # `car` variable left over from the name/type loop, so every entry
    # repeated the last card's lookup.
    actual_tag = Price.find('span', class_='sc-braxZu gbxhkm')
    discounted_tag = Price.find('span', class_='sc-braxZu cyPhJl')
    carPrice.append({
        'Actual_Price': actual_tag.get_text(strip=True) if actual_tag is not None else None,
        "Discounted_price": discounted_tag.get_text(strip=True) if discounted_tag is not None else None
    })

# Was `allPrice`, a name that is never defined (NameError).
carPrice
# Combine each name/type record with its detail record, position by position.
# Detail values exposing get_text (BeautifulSoup tags) are reduced to their
# stripped text; anything else is copied through unchanged.
merged = [
    {
        **info,
        **{
            key: (value.get_text(strip=True) if hasattr(value, "get_text") else value)
            for key, value in details.items()
        },
    }
    for info, details in zip(carNameType, structured_data)
]
merged
import csv

def export_to_csv(records, filename="output.csv"):
    """Write a list of dictionaries to a UTF-8 CSV file with a header row.

    Args:
        records: Sequence of dicts, one per row. Rows may have different keys.
        filename: Destination path; an existing file is overwritten.

    Returns:
        None. Prints a message and writes nothing when `records` is empty.
    """
    if not records:
        print("No records to write.")
        return

    # Build the header from the keys of ALL records, in first-seen order.
    # Using only the first record's keys makes DictWriter raise ValueError
    # as soon as a later record carries a key the first one lacks.
    fieldnames = list(dict.fromkeys(key for record in records for key in record))

    with open(filename, mode='w', newline='', encoding='utf-8') as csvfile:
        # Keys missing from a record are written as empty cells (DictWriter restval default).
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()    # Write header row
        writer.writerows(records)  # Write data rows

    print(f"Successfully wrote {len(records)} records to {filename}")


# Persist the merged rows to CSV.
# NOTE(review): the Bengaluru cell writes to this same filename, so running
# both cells leaves only the later result on disk.
export_to_csv(merged, "Tata_Used_Car.csv")
import os

# Report the working directory, i.e. where the relative CSV path was written.
current_directory = os.getcwd()
print("Current Working Directory:", current_directory)

# Bengaluru

In [None]:
!pip install selenium
!pip install google-colab-selenium
import google_colab_selenium as gs
from bs4 import BeautifulSoup


# Load the Cars24 Tata/Bengaluru listing page and snapshot its HTML.
driver = gs.Chrome()
try:
    # NOTE(review): the city slug in this URL is spelled 'banglore'; verify it
    # actually resolves to the Bengaluru listings rather than a fallback page.
    driver.get("https://www.cars24.com/buy-used-tata-cars-banglore/?sort=bestmatch&serveWarrantyCount=true&listingSource=Homepage_Filters")
    # NOTE(review): the HTML is captured right after get() with no wait or
    # scrolling; if the listing loads cards lazily, some may be missing - confirm.
    html = driver.page_source
finally:
    # Everything below parses the HTML snapshot only, so the browser is
    # released immediately instead of being left running.
    driver.quit()
soup = BeautifulSoup(html, 'html.parser')
# Collect the name/type pair shown on each car card.
carNameType = []
cars = soup.find_all('div', class_="sc-fLVwEd hRljRx")
for car in cars:
    name_tag = car.find('span', class_='sc-braxZu kjFjan')
    type_tag = car.find('span', class_='sc-braxZu lmmumg')
    # find() returns None when a span is absent. Store None instead of
    # crashing or skipping, so rows stay positionally aligned with the
    # per-card detail lists that are zipped against this list later.
    carNameType.append({
        'Name': name_tag.get_text(strip=True) if name_tag is not None else None,
        "Type": type_tag.get_text(strip=True) if type_tag is not None else None
    })
carNameType
# One list of <p> tags per detail <ul> found on the page.
containers = soup.find_all("ul", class_="sc-huvEkS gkjlEH")
all_cars = [
    container.find_all("p", class_="sc-braxZu kvfdZL")
    for container in containers
]
all_cars
headings = ['Mileage', 'Fuel', 'Transmission', 'Registration']

# Pair each heading with the tag at the same position; zip stops at the
# shorter of the two, so short rows simply yield fewer keys.
structured_data = [dict(zip(headings, row)) for row in all_cars]
structured_data
# Collect the actual / discounted price text from every price wrapper.
# NOTE(review): carPrice is not merged into the rows that are exported
# further down - confirm whether prices should be part of the CSV.
carPrice = []
Prices = soup.find_all('div', class_="styles_priceWrap__VwWBV")
for Price in Prices:
    # Query the current price wrapper. The earlier code searched the stale
    # `car` variable left over from the name/type loop, so every entry
    # repeated the last card's lookup.
    actual_tag = Price.find('span', class_='sc-braxZu gbxhkm')
    discounted_tag = Price.find('span', class_='sc-braxZu cyPhJl')
    carPrice.append({
        'Actual_Price': actual_tag.get_text(strip=True) if actual_tag is not None else None,
        "Discounted_price": discounted_tag.get_text(strip=True) if discounted_tag is not None else None
    })

# Was `allPrice`, a name that is never defined (NameError).
carPrice
# Combine each name/type record with its detail record, position by position.
# Detail values exposing get_text (BeautifulSoup tags) are reduced to their
# stripped text; anything else is copied through unchanged.
merged = [
    {
        **info,
        **{
            key: (value.get_text(strip=True) if hasattr(value, "get_text") else value)
            for key, value in details.items()
        },
    }
    for info, details in zip(carNameType, structured_data)
]
merged
import csv

def export_to_csv(records, filename="output.csv"):
    """Write a list of dictionaries to a UTF-8 CSV file with a header row.

    Args:
        records: Sequence of dicts, one per row. Rows may have different keys.
        filename: Destination path; an existing file is overwritten.

    Returns:
        None. Prints a message and writes nothing when `records` is empty.
    """
    if not records:
        print("No records to write.")
        return

    # Build the header from the keys of ALL records, in first-seen order.
    # Using only the first record's keys makes DictWriter raise ValueError
    # as soon as a later record carries a key the first one lacks.
    fieldnames = list(dict.fromkeys(key for record in records for key in record))

    with open(filename, mode='w', newline='', encoding='utf-8') as csvfile:
        # Keys missing from a record are written as empty cells (DictWriter restval default).
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()    # Write header row
        writer.writerows(records)  # Write data rows

    print(f"Successfully wrote {len(records)} records to {filename}")


# Persist the merged rows to CSV.
# NOTE(review): the Delhi cell writes to this same filename, so running both
# cells leaves only the later result on disk.
export_to_csv(merged, "Tata_Used_Car.csv")
import os

# Report the working directory, i.e. where the relative CSV path was written.
current_directory = os.getcwd()
print("Current Working Directory:", current_directory)

# EDA

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

# LOAD DATA
# Read the combined listings workbook from the working directory.
df = pd.read_excel("Tata Cars 24.xlsx")

# QUICK CLEAN / INSPECTION
# Print the first rows, then the column index, as plain text.
for preview in (df.head(), df.columns):
    print(preview)


   Centre          car_name  Year_of_Manufacture Kilometers_Driven Fuel_Type  \
0  Mumbai    2015 Tata Zest                 2015         40.97k km    Petrol   
1  Mumbai   2024 Tata NEXON                 2024          8.33k km    Petrol   
2  Mumbai   2023 Tata PUNCH                 2023         42.18k km    Petrol   
3  Mumbai   2019 Tata TIGOR                 2019         39.65k km    Petrol   
4  Mumbai  2024 Tata ALTROZ                 2024          2.93k km    Petrol   

  Transmission      Price  
0       Manual  2.45 lakh  
1         Auto      9.86L  
2       Manual      6.27L  
3       Manual      4.24L  
4       Manual      8.67L  
Index(['Centre', 'car_name', 'Year_of_Manufacture', 'Kilometers_Driven',
       'Fuel_Type', 'Transmission', 'Price'],
      dtype='object')


# Centre-wise Fuel Type Count

In [None]:
# Count listings per (centre, fuel type). The column names must match the
# loaded frame exactly - its header is 'Centre' / 'Fuel_Type', so the
# lowercase names used before raised KeyError.
# fill_value=0 turns centre/fuel combinations with no listings into 0 instead of NaN.
fuel_count = df.groupby(["Centre", "Fuel_Type"]).size().unstack(fill_value=0)

fuel_count.plot(kind='bar')
plt.title("Centre wise Fuel Type Availability")
plt.xlabel("Centre")
plt.ylabel("Count of Cars")
plt.show()


# Centre-wise Transmission Type

In [None]:
# Count listings per (centre, transmission). The column names must match the
# loaded frame exactly - its header is 'Centre' / 'Transmission', so the
# lowercase names used before raised KeyError.
# fill_value=0 turns combinations with no listings into 0 instead of NaN.
trans_count = df.groupby(["Centre", "Transmission"]).size().unstack(fill_value=0)

trans_count.plot(kind='bar')
plt.title("Centre wise Transmission Availability")
plt.xlabel("Centre")
plt.ylabel("Count of Cars")
plt.show()


# Correlation Between Price & KM Driven

In [None]:
# The loaded frame has no 'price' / 'kms_driven' columns. Its 'Price' and
# 'Kilometers_Driven' columns hold display strings such as "9.86L",
# "2.45 lakh" and "40.97k km", so they are converted to numbers here
# (without modifying df) before correlating.
number_pattern = r"(\d+(?:\.\d+)?)"

# assumes every price is quoted in lakh, as in the previewed rows
# - TODO confirm there are no crore-denominated prices
price_text = df["Price"].astype(str).str.replace(",", "", regex=False)
price_inr = pd.to_numeric(
    price_text.str.extract(number_pattern, expand=False), errors="coerce"
) * 100000  # 1 lakh = 100,000 rupees, matching the axis label below

kms_text = df["Kilometers_Driven"].astype(str).str.lower().str.replace(",", "", regex=False)
kms_value = pd.to_numeric(
    kms_text.str.extract(number_pattern, expand=False), errors="coerce"
)
# A 'k' directly after the number (e.g. "40.97k km") means thousands;
# a bare "km" unit does not trigger the multiplier.
in_thousands = kms_text.str.contains(r"\d\s*k\b", regex=True)
kms_driven = kms_value.where(~in_thousands, kms_value * 1000)

# Series.corr ignores pairs where either value is missing.
corr = price_inr.corr(kms_driven)
print("Correlation between Price & Kms Driven:", corr)

plt.scatter(kms_driven, price_inr)
plt.title("Price vs Kilometres Driven")
plt.xlabel("Kilometres Driven")
plt.ylabel("Price (in ₹)")
plt.show()


# Distribution of Price vs Fuel Type

In [None]:
import seaborn as sns

# The loaded frame has no 'price' / 'fuel_type' columns: the header is
# 'Price' / 'Fuel_Type', and 'Price' holds strings such as "9.86L" or
# "2.45 lakh". A numeric 'Price_Lakh' column is derived on a copy so the
# histogram gets a numeric x axis and df itself stays untouched.
# assumes every price is quoted in lakh, as in the previewed rows
# - TODO confirm there are no crore-denominated prices
price_by_fuel = df.assign(
    Price_Lakh=pd.to_numeric(
        df["Price"]
        .astype(str)
        .str.replace(",", "", regex=False)
        .str.extract(r"(\d+(?:\.\d+)?)", expand=False),
        errors="coerce",
    )
)

sns.histplot(data=price_by_fuel, x="Price_Lakh", hue="Fuel_Type", bins=10)
plt.title("Distribution of Price Among Various Fuel Types")
plt.xlabel("Price (in ₹ Lakh)")
plt.ylabel("Number of Cars")
plt.show()
