In [4]:
from bs4 import BeautifulSoup
import numpy as np
import time
from selenium.webdriver import Edge
import pandas as pd
import random
# Listing-page URL prefix; a city slug from CITIES is appended to it.
URL = 'https://www.cars24.com/buy-used-cars-'

# City slugs to scrape, visited in this order.
CITIES = [
    'new-delhi',
    'hyderabad',
    'mumbai',
    'bangalore',
    'chennai',
    'jaipur',
    'ahmedabad',
    'kolkata',
    'noida',
    'nagpur',
    'indore',
    'gurgaon',
    'agra',
    'nashik',
    'chandigarh',
    'coimbatore',
    'ghaziabad',
]

# Scrolling controls.
Scroll_Pause = 3   # seconds to sleep after every scroll
Max_Scrolls = 30   # upper bound on scrolls per city

# One accumulator list per output column; each scraped card appends one
# value to every list, so they stay the same length.
car = []
varient = []
km = []
fuel = []
transmission = []
registration = []
price = []
location = []
# CSS class that marks one listing card on the page.
CARD_CLASS = "styles_contentWrap__9oSrl"


def _text_or_nan(node, tag, css_class):
    """Return the text of the first `tag` with `css_class` under `node`.

    Returns ``np.nan`` when no such element exists, so a missing field ends
    up as a missing value in the resulting table.
    """
    found = node.find(tag, class_=css_class)
    return found.text if found is not None else np.nan


def _parse_cards(page_source):
    """Parse `page_source` and return every listing-card element found in it."""
    return BeautifulSoup(page_source, 'lxml').find_all('div', class_=CARD_CLASS)


driver = Edge()
total_start_time = time.time()
try:
    for city in CITIES:
        Target_Records = random.randint(300, 500)  # minimum records to load before scraping
        driver.get(URL + city)
        time.sleep(5)  # give the initial page time to render
        city_start_time = time.time()
        print(f"\n Loading city: {city}")

        # Parse once up front so `cards` and `current_count` are always
        # defined, even when no scrolling happens at all.
        cards = _parse_cards(driver.page_source)
        current_count = len(cards)
        print(f"   -> Loaded cards: {current_count}")

        scroll_count = 0
        last_height = driver.execute_script('return document.body.scrollHeight')
        while 0 < current_count < Target_Records and scroll_count < Max_Scrolls:
            live_cards = driver.find_elements("css selector", "div." + CARD_CLASS)
            if not live_cards:
                # The live DOM no longer matches the parsed snapshot; stop
                # rather than index into an empty list.
                break

            # Bring the last card into view; the page is expected to append
            # more cards in response.
            driver.execute_script(
                "arguments[0].scrollIntoView({block:'center'});", live_cards[-1]
            )
            time.sleep(Scroll_Pause)
            scroll_count += 1
            new_height = driver.execute_script("return document.body.scrollHeight")

            # Re-parse right after the scroll so that whatever reason ends
            # the loop, `cards` reflects the page as it is now.
            previous_count = current_count
            cards = _parse_cards(driver.page_source)
            current_count = len(cards)
            print(f"   -> Loaded cards: {current_count}")

            # Stop if the scroll produced no new cards or the page did not grow.
            if current_count == previous_count or new_height == last_height:
                break
            last_height = new_height

        print(f"   Finished loading {current_count} cards")

        for c in cards:
            car.append(_text_or_nan(c, 'span', "sc-bcXHqh bAcffq"))
            varient.append(_text_or_nan(c, 'span', "sc-bcXHqh bKVBht"))

            # The four detail paragraphs are read positionally as km, fuel,
            # transmission and registration; pad with NaN when fewer exist.
            info = [p.text for p in c.find_all('p', class_="sc-bcXHqh kNDBvu")]
            info += [np.nan] * (4 - len(info))
            km.append(info[0])
            fuel.append(info[1])
            transmission.append(info[2])
            registration.append(info[3])

            price.append(_text_or_nan(c, 'p', "sc-bcXHqh hvRpEM"))
            location.append(city)

        city_records = len(cards)
        city_time = time.time() - city_start_time
        print(f"   Records scraped: {city_records}")
        print(f"   Time taken: {city_time:.2f} seconds")
finally:
    # Always shut the browser down, even if a city fails midway, so no
    # Edge/driver process is left running.
    driver.quit()
# Assemble the per-column accumulator lists into one table.
df = pd.DataFrame({
    'car': car,
    'varient': varient,
    'km': km,
    'fuel': fuel,
    'transmission': transmission,
    'registration': registration,
    'price': price,
    'location': location
})

# Remove exact duplicate rows; `df` itself keeps rows with missing fields.
df = df.drop_duplicates(ignore_index=True)

# Only rows with every field present are written to disk.
saved_df = df.dropna(ignore_index=True)
saved_df.to_csv('Raw_Data.csv', index=False)

total_time = time.time() - total_start_time
print(f"\nTOTAL TIME: {total_time:.2f} seconds")
# Report the number of rows actually written to Raw_Data.csv; len(df) would
# also count the incomplete rows that dropna() excluded from the file.
print(f"TOTAL RECORDS: {len(saved_df)}")
print("Scraping completed successfully")


 Loading city: new-delhi
   -> Loaded cards: 40
   -> Loaded cards: 60
   -> Loaded cards: 80
   -> Loaded cards: 100
   -> Loaded cards: 120
   -> Loaded cards: 140
   -> Loaded cards: 160
   -> Loaded cards: 180
   -> Loaded cards: 200
   -> Loaded cards: 220
   -> Loaded cards: 240
   -> Loaded cards: 260
   -> Loaded cards: 280
   -> Loaded cards: 300
   -> Loaded cards: 320
   -> Loaded cards: 340
   -> Loaded cards: 360
   -> Loaded cards: 380
   -> Loaded cards: 400
   -> Loaded cards: 420
   -> Loaded cards: 440
   Finished loading 440 cards
   Records scraped: 440
   Time taken: 86.62 seconds

 Loading city: hyderabad
   -> Loaded cards: 20
   -> Loaded cards: 40
   -> Loaded cards: 60
   -> Loaded cards: 80
   -> Loaded cards: 100
   -> Loaded cards: 120
   -> Loaded cards: 140
   -> Loaded cards: 160
   -> Loaded cards: 180
   -> Loaded cards: 200
   -> Loaded cards: 220
   -> Loaded cards: 240
   -> Loaded cards: 260
   -> Loaded cards: 280
   -> Loaded cards: 300
   -> Lo