# Used Cars Price Prediction Project

## Problem Statement

- Build a model to predict a used car's price from the features available in the dataset.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocessing
from sklearn.model_selection import train_test_split, RandomizedSearchCV

# metrics and models
from sklearn.metrics import r2_score, mean_squared_error
from xgboost import XGBRegressor

import warnings
warnings.filterwarnings("ignore")

### Read datasets

In [2]:
# Load the used-car listings into a DataFrame and preview the first rows.
df = pd.read_csv(filepath_or_buffer = 'Used Cars Price.csv')
df.head(5)

Unnamed: 0.1,Unnamed: 0,id,price,year,manufacturer,model,condition,cylinders,fuel,odometer,title_status,transmission,drive,size,type,paint_color,county,state,lat,long
0,55483,7315914053,0,2018.0,ram,promaster 2500,excellent,,gas,44244.0,clean,automatic,,,van,,,ca,32.7928,-116.9665
1,162368,7310885048,13995,2017.0,mazda,cx-3,,4 cylinders,gas,7037.0,rebuilt,automatic,,,SUV,white,,ia,41.207382,-96.023096
2,234393,7308243856,19990,2019.0,mitsubishi,eclipse cross sp,good,,gas,35313.0,clean,other,4wd,,hatchback,white,,nc,35.19,-80.83
3,276110,7315817729,0,2019.0,honda,cr-v,,,gas,25626.0,clean,automatic,,,SUV,orange,,ny,40.854573,-74.120219
4,349033,7301620999,42900,2015.0,chevrolet,corvette,excellent,8 cylinders,gas,29000.0,clean,automatic,,,convertible,black,,sc,34.755562,-82.906419


In [3]:
# Dimensions of the freshly loaded frame as (rows, columns).
df.shape

(64032, 20)

In [4]:
# Drop the columns judged unimportant for predicting price.
drop_columns = [
    'Unnamed: 0',
    'id',
    'title_status',
    'size',
    'lat',
    'long',
    'county',
]
# `columns=` already targets the column axis, so no separate axis argument is needed.
df = df.drop(columns = drop_columns)

In [5]:
# Dimensions after removing the unwanted columns.
df.shape

(64032, 13)

In [6]:
# Count the missing values in each column.
df.isnull().sum()

price               0
year              158
manufacturer     2569
model             802
condition       26097
cylinders       26511
fuel              424
odometer          669
transmission      353
drive           19471
type            13785
paint_color     19505
state               0
dtype: int64

In [7]:
# Keep only fully populated rows: a row is removed if any of its values is missing.
df = df.dropna(axis = 0, how = 'any')
df.head()

Unnamed: 0,price,year,manufacturer,model,condition,cylinders,fuel,odometer,transmission,drive,type,paint_color,state
5,0,2006.0,chrysler,300,like new,8 cylinders,gas,149000.0,automatic,rwd,sedan,white,fl
9,20995,2011.0,chevrolet,silverado 1500,excellent,8 cylinders,gas,92001.0,automatic,4wd,truck,blue,wi
15,50995,2017.0,gmc,yukon denali,like new,8 cylinders,gas,70227.0,automatic,4wd,SUV,grey,ak
22,13500,2014.0,chevrolet,tahoe,good,8 cylinders,gas,96007.0,automatic,rwd,SUV,white,fl
29,34990,2016.0,gmc,canyon crew cab sle pickup,good,6 cylinders,gas,34425.0,other,4wd,pickup,red,ma


In [8]:
# Dimensions after discarding rows with missing values.
df.shape

(17491, 13)

In [9]:
df.describe()  # Summary statistics; on this mixed-dtype frame only the numeric columns are reported

Unnamed: 0,price,year,odometer
count,17491.0,17491.0,17491.0
mean,16325.45,2009.535247,112261.6
std,129442.9,9.596493,215514.9
min,0.0,1918.0,0.0
25%,5600.0,2006.0,55787.0
50%,10950.0,2012.0,102567.0
75%,22500.0,2015.0,148000.0
max,17000000.0,2022.0,10000000.0


### Observations

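- The standard deviation of odometer (and of price) is greater than its mean, which points to extreme outliers.
- A minimum value of 0 is not realistic for odometer or price.
- The maximum values of price (17,000,000) and odometer (10,000,000) are also not realistic.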

### Check for duplicate rows and remove any that exist

In [10]:
# Discard exact duplicate rows (the first occurrence is kept), then show the new dimensions.
df = df.drop_duplicates(keep = 'first')
df.shape

(16399, 13)

### Filter categorical features

In [11]:
# Numeric dtype names: every signed-integer and float width listed below.
numerics = [f'int{bits}' for bits in (8, 16, 32, 64)] + [f'float{bits}' for bits in (16, 32, 64)]

# Any column whose dtype is not in `numerics` is treated as categorical.
categorical_columns = list(df.select_dtypes(exclude = numerics).columns)

In [12]:
# Display the names of the columns identified as categorical.
categorical_columns

['manufacturer',
 'model',
 'condition',
 'cylinders',
 'fuel',
 'transmission',
 'drive',
 'type',
 'paint_color',
 'state']

### Encoding categorical columns using get_dummies.

- We use `get_dummies` (one-hot encoding) rather than a label encoder: a label encoder assigns integer codes 0, 1, 2, 3, 4, …, and the model could treat that arbitrary increasing order as meaningful and become biased.

In [13]:
# One-hot encode every categorical column as 0/1 integers.
# drop_first omits the first level of each column to avoid a redundant indicator.
# Note: each distinct value of every categorical column becomes its own column,
# so the resulting frame is very wide.
df = pd.get_dummies(
    data = df,
    columns = categorical_columns,
    drop_first = True,
    dtype = 'int',
)
df

Unnamed: 0,price,year,odometer,manufacturer_alfa-romeo,manufacturer_audi,manufacturer_bmw,manufacturer_buick,manufacturer_cadillac,manufacturer_chevrolet,manufacturer_chrysler,...,state_sd,state_tn,state_tx,state_ut,state_va,state_vt,state_wa,state_wi,state_wv,state_wy
5,0,2006.0,149000.0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
9,20995,2011.0,92001.0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
15,50995,2017.0,70227.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
22,13500,2014.0,96007.0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
29,34990,2016.0,34425.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64015,8999,2010.0,125989.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
64020,0,2007.0,182935.0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
64024,27990,2017.0,31479.0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
64027,3495,1997.0,106253.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Selecting realistic data. Domain knowledge helps a lot in deciding sensible upper and lower price limits.

#### Let's use the price limits below in this example

In [14]:
# Keep only listings with a genuine-looking price: strictly above 1,000 and strictly below 40,000.
df = df[(df['price'] > 1000) & (df['price'] < 40000)]

In [15]:
# Dimensions after encoding and price filtering.
df.shape

(14742, 4320)

### Divide dataset into features and label

In [16]:
# The target is the price column; every remaining column is a feature.
y = df['price']
x = df.drop(columns = ['price'])

In [17]:
# Hold out 25% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y,
    test_size = 0.25,
    random_state = 10,
)

In [18]:
# Baseline regressor using the library's default hyperparameters.
xgb = XGBRegressor()

In [19]:
# Train the baseline model on the training split.
xgb.fit(x_train,y_train)

In [20]:
# Predict prices for the held-out test split.
y_pred = xgb.predict(x_test)

In [21]:
# R² of the baseline predictions against the true test prices.
r2_score(y_true = y_test, y_pred = y_pred)

0.8554238868384089

### Hyperparameter Tuning

In [22]:
# Candidate hyperparameter values for the search: number of trees and learning rate.
param = dict(
    n_estimators = [200, 300, 400, 500],
    learning_rate = [0.2, 0.3, 0.4, 0.5],
)

In [23]:
# Randomized hyperparameter search over the values in `param`, starting from the `xgb` estimator.
# With the default n_iter=10, only 10 of the 16 possible combinations are sampled, so the
# sampling is seeded: without random_state the chosen candidates (and hence best_params_)
# can differ on every Restart & Run All.
rnd = RandomizedSearchCV(xgb, param_distributions = param, random_state = 10)

# Cross-validate each sampled candidate on the training split only.
rnd.fit(x_train,y_train)

In [24]:
# Hyperparameter combination that scored best during the search.
rnd.best_params_

{'n_estimators': 500, 'learning_rate': 0.2}

In [25]:
# Build the final model directly from the search result instead of retyping the values,
# so this cell cannot drift out of sync with rnd.best_params_ when the search is re-run.
xgb = XGBRegressor(**rnd.best_params_)

# Retrain on the full training split with the tuned hyperparameters.
xgb.fit(x_train,y_train)

In [26]:
# Score the tuned model on the held-out test split (for a regressor, score() reports R²).
xgb.score(x_test,y_test)

0.8625742200696966

## Our final R² score on the test set is about 0.86