# Used Car Price Prediction Project Data Collection

### `Data Collection Sources:`

> - Cars 24 website - https://www.cars24.com/
> - Car Dekho website - https://www.cardekho.com/
> - Ola Cars website - https://www.ola.cars/
> - Olx website - https://www.olx.in/


I will mostly be scraping data from these four websites (after checking the legality) for all major locations in India. Once I have the intended data, I will export it and save it in CSV format, so that it can be opened in Microsoft Excel and used to build our **Used Car Price Prediction Project**.

In [1]:
import warnings
warnings.simplefilter("ignore")
warnings.filterwarnings("ignore")

import time, sys
import tqdm.notebook as tqdm
import pandas as pd
import numpy as np

import requests
import selenium
from bs4 import BeautifulSoup
from selenium import webdriver
from urllib.parse import urljoin
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec
from selenium.common.exceptions import TimeoutException, StaleElementReferenceException
from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException

Importing all the necessary libraries/dependencies here.

In [2]:
# Entry-point URLs, one per candidate website (Cars 24, Car Dekho, Ola Cars, OLX cars category).
url1, url2, url3, url4 = (
    "https://www.cars24.com/",
    "https://www.cardekho.com/",
    "https://www.ola.cars/",
    "https://www.olx.in/cars_c84",
)

Assigned the `URL`s to 4 different variables, one for each of the 4 websites we are using to scrape our used car information.

**Collecting Data from Cars 24 webpage**

In [3]:
# Probe the Cars 24 landing page; the printed Response object shows the HTTP status code.
# The timeout keeps the notebook from hanging forever if the site never answers.
page = requests.get(url1, timeout=30)
print("Legality Response number from Cars 24 URL is:", page) # to show the response output from the webpage
# Name the parser explicitly so the parse does not depend on which parsers happen to be installed.
soup = BeautifulSoup(page.content, 'html.parser')

Legality Response number from Cars 24 URL is: <Response [200]>


In [4]:
# Open the Cars 24 home page in Chrome and, for every city in `loc`, pick the
# city, open the "Buy Used Car" listing, scroll to the bottom so that every
# listing is loaded, and collect the link of each car into `urls`.
driver = webdriver.Chrome('chromedriver.exe')
driver.get(url1)
driver.maximize_window()

loc=["Delhi", "Mumbai", "Pune", "Bangalore", "Hyderabad", "Chennai", "Kolkata"]

# Created once, before the city loop, so that the links of every city
# accumulate instead of being thrown away at the start of each iteration.
urls = []
seen_urls = set()

# obtaining the required data in the empty lists
location = []
year = []
kms_driven = []
car_model = []
owners = []
transmission = []
fuel_type = []
price = []

scroll_pause_time = 2
listing_xpaths = ("//div[@class='col-4']//a", "//div[@class='_1l4fi']//a")

for place in loc:
    # NOTE(review): the page is only loaded once, before this loop. Confirm that the
    # "SELECT MANUALLY" button is still reachable from the listing page on the 2nd+ city.

    # select location manually option
    WebDriverWait(driver,2).until(ec.element_to_be_clickable((By.XPATH,'//button[contains(text(),"SELECT MANUALLY")]'))).click()

    # search for location
    searchbox = WebDriverWait(driver,2).until(ec.presence_of_element_located((By.XPATH,'//div[@class="_6QaMX"]/input')))
    searchbox.send_keys(place)

    # confirm click on searched location icon
    WebDriverWait(driver,2).until(ec.element_to_be_clickable((By.XPATH,'//ul[@class="_16Bvy"]/li[1]'))).click()

    # choose the "Buy Used Car" option on the webpage
    WebDriverWait(driver,2).until(ec.element_to_be_clickable((By.XPATH,'//a[contains(text(),"Buy Used Car")]'))).click()

    screen_height = driver.execute_script("return window.screen.height;")

    screens_scrolled = 1
    while True:
        # scroll one screen height each time
        driver.execute_script("window.scrollTo(0, {screen_height}*{i});".format(screen_height=screen_height, i=screens_scrolled))
        screens_scrolled += 1
        time.sleep(scroll_pause_time)
        # update scroll height each time after scrolled, as the scroll height can change after we scrolled the page
        scroll_height = driver.execute_script("return document.body.scrollHeight;")
        # break the loop when the height we need to scroll to is larger than the total scroll height
        if screen_height * screens_scrolled > scroll_height:
            break

    # fetching urls of every used car on the website
    for xpath in listing_xpaths:
        for link in driver.find_elements(By.XPATH, xpath):
            href = link.get_attribute("href")
            # skip anchors without a target and links that were already collected
            if href and href not in seen_urls:
                seen_urls.add(href)
                urls.append(href)

SessionNotCreatedException: Message: session not created: This version of ChromeDriver only supports Chrome version 94
Current browser version is 96.0.4664.45 with binary path C:\Program Files (x86)\Google\Chrome\Application\chrome.exe


In [None]:
# Visit every collected car page and read one value per attribute.
# Each attribute is appended to its OWN list (previously everything after the
# location was appended to `year`, leaving the other lists empty), so all
# lists end up with exactly one entry per visited URL.
detail_fields = [
    # (target list, XPath of the element holding the value)
    (location, "//p[@class='_2NHUv']//span"),             # location of the used car
    (year, "//strong[@class='category']//span"),          # manufacturing year
    (kms_driven, "//div[@class='keyword']//span"),        # number of kms driven
    (car_model, "//a[@class='_1UhVsV']//span"),           # model name
    (owners, "//span[@class='_1FH0tX']//span"),           # owner details
    (transmission, "//h2[@class='yhB1nd']//span"),        # transmission details
    (fuel_type, "//div[@class='fMghEO']//span"),          # fuel type
    # NOTE(review): this selector puts an absolute XPath inside a class predicate, so it
    # presumably never matches and price will always be 'None' - replace with the real selector.
    (price, "//p[@class='/html/body/div[1]/div/div[1]/div[1]/div[2]/div[2]/form/div/']//span"),
]

for car_url in urls:
    driver.get(car_url)
    time.sleep(2)  # give the page time to render before querying it

    for target, xpath in detail_fields:
        try:
            value = driver.find_element(By.XPATH, xpath).text
        except NoSuchElementException:
            # keep the lists aligned when a page lacks the element
            value = 'None'
        target.append(value)

In [None]:
# Creating a dataframe and checking the data extracted
cars=pd.DataFrame({'place':location,'model':brand,'year':year,'price':price,'km_driven':km,'owners':owners,
                   'fuel':fuel,'transmission':transmission})

In [None]:
# Probe the Car Dekho landing page; the printed Response object shows the HTTP status code.
# The timeout keeps the notebook from hanging forever if the site never answers.
page = requests.get(url2, timeout=30)
print("Legality Response number from Car Dekho URL is:", page) # to show the response output from the webpage
# Name the parser explicitly so the parse does not depend on which parsers happen to be installed.
soup = BeautifulSoup(page.content, 'html.parser')

Since the legality response (the HTTP status code) from the website is not 200, we cannot perform web scraping here, and therefore we skip the rest of the process for this website.

In [None]:
# Probe the Ola Cars landing page; the printed Response object shows the HTTP status code.
# The timeout keeps the notebook from hanging forever if the site never answers.
page = requests.get(url3, timeout=30)
print("Legality Response number from Ola Cars URL is:", page) # to show the response output from the webpage
# Name the parser explicitly so the parse does not depend on which parsers happen to be installed.
soup = BeautifulSoup(page.content, 'html.parser')

Here, even though the legality response is 200, we are unable to inspect the webpage because right click is disabled on the website; therefore, web scraping is not possible in this scenario.

**Collecting Data from OLX Cars webpage**

In [None]:
# Probe the OLX cars category page with a browser-like User-Agent header;
# the printed Response object shows the HTTP status code.
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36'}
# The timeout keeps the notebook from hanging forever if the site never answers.
page = requests.get(url4, headers=headers, timeout=30)
print("Legality Response number from Olx Cars URL is:", page) # to show the response output from the webpage
soup = BeautifulSoup(page.content, 'html.parser')

In [None]:
# Open the OLX cars category in Chrome, collect the link of every listed car
# (pressing the next button up to 500 times), then visit each link and read
# the car attributes into the result lists.
driver = webdriver.Chrome('chromedriver.exe')
driver.get(url4)
driver.maximize_window()

# creating empty lists
location = []
year = []
kms_driven = []
car_model = []
owners = []
transmission = []
fuel_type = []
price = []

# Lists this cell appends to and the dataframe cell reads; they were never
# initialised before, which raised NameError on the first append.
prod_URL = []
brand = []
fuel = []
km = []

# getting the URL of the cars
seen_urls = set()
for _ in range(0,500):
    for link in driver.find_elements(By.XPATH, "//ul[@class='rl3f9 _3mXOU']/li/a"):
        href = link.get_attribute('href')
        # the same anchors are found again on every pass, so keep each link only once
        if href and href not in seen_urls:
            seen_urls.add(href)
            prod_URL.append(href)
    try:
        driver.find_element(By.XPATH, "//button[@class='rui-39-wj rui-3evoE rui-1JPTg']").click()
    except NoSuchElementException:
        # no next button left: stop instead of re-reading the same page
        break
    # give the newly requested listings time to appear before the next pass
    time.sleep(2)

olx_fields = [
    # (target list, XPath of the element holding the value)
    (brand, "//div[@class='_3_knn']/div/span[2]"),                  # car name
    (year, "//span[@data-aut-id='value_year']"),                    # year
    (fuel, "//span[@data-aut-id='value_petrol']"),                  # fuel consumed
    (transmission, "//span[@data-aut-id='value_transmission']"),    # transmission
    (km, "//span[@data-aut-id='value_mileage']"),                   # km driven
]

for car_url in prod_URL:
    driver.get(car_url)

    for target, xpath in olx_fields:
        try:
            value = driver.find_element(By.XPATH, xpath).text
        except NoSuchElementException:
            value = "-"
        target.append(value)

    #Extracting the price details (currency symbol stripped)
    try:
        price.append(driver.find_element(By.XPATH, "//span[@data-aut-id='itemPrice']").text.replace('₹',''))
    except NoSuchElementException:
        price.append("-")

    # NOTE(review): no selector for location / owners exists in this cell. A "-"
    # placeholder is appended so these lists stay the same length as the others
    # and the dataframe can be built - replace with real selectors when known.
    location.append("-")
    owners.append("-")

In [None]:
# Creating a dataframe and checking the data extracted
cars=pd.DataFrame({'place':location,'model':brand,'year':year,'price':price,'km_driven':km,'owners':owners,
                   'fuel':fuel,'transmission':transmission})

In [None]:
# Write the current `cars` dataframe to a CSV file in the working directory.
cars.to_csv(path_or_buf="Used_Car_Data.csv")

*Exported the collected used car details from dataframe to csv format to be used for machine learning model building.*