# Predicting car prices for used cars using `Car Dekho` data

## Table Of Contents

In [1]:
! pip install pandas
! pip install scikit-learn
! pip install matplotlib
! pip install plotly
! pip install numpy
! pip install seaborn



In [2]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from math import sqrt
## Relax pandas' display limits so long and wide frames are shown in full.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.width = 1000

In [3]:
## Load the three CSV files (all UTF-8 encoded) into separate DataFrames.
car_data, car_details, car_dekho_details = (
    pd.read_csv(csv_path, encoding="utf-8")
    for csv_path in (
        "./data/car_data.csv",
        "./data/car_details_v3.csv",
        "./data/car_details_from_car_dekho.csv",
    )
)


In [4]:
## Begin the exploration with `car_data`:
## preview its first 5 rows.
car_data.head(n=5)

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,ritz,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0
1,sx4,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0
2,ciaz,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0
3,wagon r,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0
4,swift,2014,4.6,6.87,42450,Diesel,Dealer,Manual,0


In [5]:
## Overall shape of `car_data` as (number of rows, number of columns).
car_data.shape

(301, 9)

In [6]:
## Column data types of `car_data`, shown as a one-column table.
car_data.dtypes.to_frame(name="DataType")

Unnamed: 0,DataType
Car_Name,object
Year,int64
Selling_Price,float64
Present_Price,float64
Kms_Driven,int64
Fuel_Type,object
Seller_Type,object
Transmission,object
Owner,int64


In [7]:
## Notes
## Observations
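### Year, Selling_Price, Present_Price and Kms_Driven are numeric (int64/float64). The int64 columns cannot hold NaN, but the float64 columns could, so missing data still has to be checked explicitly.
### Fuel_Type and Seller_Type look like categorical strings with a small set of fixed values.
### Not sure what the Owner field means. Does it only take 0/1 values?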
## Required Changes
## Need to change column names to more readable lower case formats
## Need to change Year values to datetime instead of int64


In [8]:
## Fraction of missing values in each column of `car_data`.
car_data.isna().mean()

Car_Name         0.0
Year             0.0
Selling_Price    0.0
Present_Price    0.0
Kms_Driven       0.0
Fuel_Type        0.0
Seller_Type      0.0
Transmission     0.0
Owner            0.0
dtype: float64

In [9]:
## Great news! No missing data. Let's repeat these steps for the other data files.

In [10]:
## Continue the exploration with `car_details`:
## preview its first 5 rows.
car_details.head(n=5)

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,torque,seats
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.4 kmpl,1248 CC,74 bhp,190Nm@ 2000rpm,5.0
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14 kmpl,1498 CC,103.52 bhp,250Nm@ 1500-2500rpm,5.0
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.7 kmpl,1497 CC,78 bhp,"12.7@ 2,700(kgm@ rpm)",5.0
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner,23.0 kmpl,1396 CC,90 bhp,22.4 kgm at 1750-2750rpm,5.0
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner,16.1 kmpl,1298 CC,88.2 bhp,"11.5@ 4,500(kgm@ rpm)",5.0


In [11]:
## Overall shape of `car_details` as (number of rows, number of columns).
car_details.shape

(8128, 13)

In [12]:
## Notes
### This file has more rows than `car_data`; not sure whether the data in the two files is related.

In [13]:
## Column data types of `car_details`, shown as a one-column table.
car_details.dtypes.to_frame(name="DataType")

Unnamed: 0,DataType
name,object
year,int64
selling_price,int64
km_driven,int64
fuel,object
seller_type,object
transmission,object
owner,object
mileage,object
engine,object


In [14]:
## Notes
## Observations
## Required Changes
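### mileage, engine and max_power can be converted to int/float by removing the units; torque also needs parsing because its format varies between rows (e.g. "190Nm@ 2000rpm" vs "22.4 kgm at 1750-2750rpm").
### fuel, seller_type and transmission can be converted to numeric columns using one-hot encoding.
### owner field can be converted to the number of previous owners.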

In [15]:
## Fraction of missing values in each column of `car_details`.
car_details.isna().mean()

name             0.000000
year             0.000000
selling_price    0.000000
km_driven        0.000000
fuel             0.000000
seller_type      0.000000
transmission     0.000000
owner            0.000000
mileage          0.027190
engine           0.027190
max_power        0.026452
torque           0.027313
seats            0.027190
dtype: float64

In [16]:
## Unfortunately mileage, engine, max_power, torque and seats each have ~2.7% missing data.
## We'll have to see how many records are lost if we drop the rows with missing values.


In [17]:
## Check how much data we would lose by dropping every row that has a missing value.
total_rows = car_details.shape[0]
complete_rows = car_details.dropna().shape[0]  # computed once and reused below

## Label fixed: the first figure is the size of the whole table, not the
## number of rows that contain missing values.
print('total number of rows: ', total_rows)
print('number of rows without missing values: ', complete_rows)

print('data lost on dropna : ', (total_rows - complete_rows) / total_rows)


number of rows with missing values:  8128
number of rows without missing values:  7906
data lost on dropna :  0.027312992125984252


In [18]:
### So we lose a little less than 3% of the data, which I think should be OK.


In [19]:
### Explore the last file, `car_dekho_details`:
## preview its first 5 rows.
car_dekho_details.head(n=5)

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner
0,Maruti 800 AC,2007,60000,70000,Petrol,Individual,Manual,First Owner
1,Maruti Wagon R LXI Minor,2007,135000,50000,Petrol,Individual,Manual,First Owner
2,Hyundai Verna 1.6 SX,2012,600000,100000,Diesel,Individual,Manual,First Owner
3,Datsun RediGO T Option,2017,250000,46000,Petrol,Individual,Manual,First Owner
4,Honda Amaze VX i-DTEC,2014,450000,141000,Diesel,Individual,Manual,Second Owner


In [20]:
## Overall shape of `car_dekho_details` as (number of rows, number of columns).
car_dekho_details.shape

(4340, 8)

In [21]:
## Column data types of `car_dekho_details`, shown as a one-column table.
car_dekho_details.dtypes.to_frame(name="DataType")

Unnamed: 0,DataType
name,object
year,int64
selling_price,int64
km_driven,int64
fuel,object
seller_type,object
transmission,object
owner,object


In [22]:
## Judging by its shape, this data file has fewer rows and fewer columns than `car_details`.
## The three data files do not appear to be related, and `car_details` looks like the most suitable dataset for building a price-prediction model.

In [23]:
## Data Cleaning and feature engineering 
## We are going to work with data from `car_details_v3.csv` 

In [24]:
## Preview the first 5 rows of `car_details` again before cleaning.
car_details.head(n=5)

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,torque,seats
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.4 kmpl,1248 CC,74 bhp,190Nm@ 2000rpm,5.0
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14 kmpl,1498 CC,103.52 bhp,250Nm@ 1500-2500rpm,5.0
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.7 kmpl,1497 CC,78 bhp,"12.7@ 2,700(kgm@ rpm)",5.0
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner,23.0 kmpl,1396 CC,90 bhp,22.4 kgm at 1750-2750rpm,5.0
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner,16.1 kmpl,1298 CC,88.2 bhp,"11.5@ 4,500(kgm@ rpm)",5.0


In [25]:
## Column data types of `car_details`, shown as a one-column table.
car_details.dtypes.to_frame(name="DataType")

Unnamed: 0,DataType
name,object
year,int64
selling_price,int64
km_driven,int64
fuel,object
seller_type,object
transmission,object
owner,object
mileage,object
engine,object


In [26]:
## Fraction of missing values in each column of `car_details`, before cleaning.
car_details.isna().mean()

name             0.000000
year             0.000000
selling_price    0.000000
km_driven        0.000000
fuel             0.000000
seller_type      0.000000
transmission     0.000000
owner            0.000000
mileage          0.027190
engine           0.027190
max_power        0.026452
torque           0.027313
seats            0.027190
dtype: float64

In [27]:
## Drop, in place, every row that has at least one missing value (as analysed earlier).
## The surviving rows keep their original index labels.
car_details.dropna(axis="index", how="any", inplace=True)

In [28]:
## Confirm that no column of `car_details` has missing values any more.
car_details.isna().mean()

name             0.0
year             0.0
selling_price    0.0
km_driven        0.0
fuel             0.0
seller_type      0.0
transmission     0.0
owner            0.0
mileage          0.0
engine           0.0
max_power        0.0
torque           0.0
seats            0.0
dtype: float64

In [29]:
## Let's explore and clean up the columns one at a time,
## starting with `name`: summary statistics (count, unique, top, freq).
car_details["name"].describe()

count                       7906
unique                      1982
top       Maruti Swift Dzire VDI
freq                         129
Name: name, dtype: object

In [30]:
## Derive the car's make from the first space-separated word of its name
## (e.g. "Maruti Swift Dzire VDI" -> "Maruti").
## The vectorised `.str` accessor replaces the per-row lambda, and the
## intermediate copy of the `name` column is no longer needed.
car_details["make"] = car_details["name"].str.split(" ").str[0]

In [31]:
## Preview the first 5 rows to check the new `make` column.
car_details.head(n=5)

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,torque,seats,make
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.4 kmpl,1248 CC,74 bhp,190Nm@ 2000rpm,5.0,Maruti
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14 kmpl,1498 CC,103.52 bhp,250Nm@ 1500-2500rpm,5.0,Skoda
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.7 kmpl,1497 CC,78 bhp,"12.7@ 2,700(kgm@ rpm)",5.0,Honda
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner,23.0 kmpl,1396 CC,90 bhp,22.4 kgm at 1750-2750rpm,5.0,Hyundai
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner,16.1 kmpl,1298 CC,88.2 bhp,"11.5@ 4,500(kgm@ rpm)",5.0,Maruti


In [32]:
## Display the derived `make` column.
car_details["make"] 

0        Maruti
1         Skoda
2         Honda
3       Hyundai
4        Maruti
         ...   
8123    Hyundai
8124    Hyundai
8125     Maruti
8126       Tata
8127       Tata
Name: make, Length: 7906, dtype: object

In [43]:
## Notes
### There are way too many car makes to one-hot encode. Instead, add a binary
### `top_brand` flag that marks whether a car belongs to one of the most common makes.
TOP_BRAND_COUNT = 20  # how many of the most frequent makes count as a "top brand"

## Makes ranked by number of rows, most frequent first; keep the first TOP_BRAND_COUNT.
top_brand_cars = (
    car_details.groupby("make")["make"]
    .count()
    .sort_values(ascending=False)
    .head(TOP_BRAND_COUNT)
    .index.tolist()
)

## Vectorised membership test: 1 if the make is a top brand, 0 otherwise.
car_details["top_brand"] = car_details["make"].isin(top_brand_cars).astype("int64")
