# Car Price Prediction

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the car listings from a CSV in the kernel's working directory and
# preview the first rows to sanity-check the parsed columns.
train_df = pd.read_csv('car_price.csv')
train_df.head()

Unnamed: 0,full_name,selling_price,year,seller_type,km_driven,fuel_type,transmission_type,mileage,engine,max_power,seats
0,Maruti Alto Std,1.2,2012.0,Individual,120000,Petrol,Manual,19.7,796.0,46.3,5.0
1,Hyundai Grand i10 Asta,5.5,2016.0,Individual,20000,Petrol,Manual,18.9,1197.0,82.0,5.0
2,Hyundai i20 Asta,2.15,2010.0,Individual,60000,Petrol,Manual,17.0,1197.0,80.0,5.0
3,Maruti Alto K10 2010-2014 VXI,2.26,2012.0,Individual,37000,Petrol,Manual,20.92,998.0,67.1,5.0
4,Ford Ecosport 2015-2021 1.5 TDCi Titanium BSIV,5.7,2015.0,Dealer,30000,Diesel,Manual,22.77,1498.0,98.59,5.0


In [8]:
# Print each column's dtype and non-null count to see which columns are
# still text (object) and whether any values are missing.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19980 entries, 0 to 19979
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0    full_name         19980 non-null  object 
 1   selling_price      19980 non-null  float64
 2   year               19980 non-null  float64
 3   seller_type        19980 non-null  object 
 4   km_driven          19980 non-null  int64  
 5   fuel_type          19980 non-null  object 
 6   transmission_type  19980 non-null  object 
 7   mileage            19980 non-null  float64
 8   engine             19980 non-null  float64
 9   max_power          19980 non-null  float64
 10  seats              19980 non-null  float64
dtypes: float64(6), int64(1), object(4)
memory usage: 1.7+ MB


## Integer (Label) Encoding of Categorical Variables

In [9]:
def _encode_categories(series, codes):
    """Map the category labels in ``series`` to the integer codes in ``codes``.

    Parameters
    ----------
    series : pandas.Series
        Column holding category labels (or codes, when the cell is re-run).
    codes : dict
        Mapping from category label to integer code.

    Returns
    -------
    pandas.Series
        ``series`` unchanged if every non-null value is already one of the
        codes, otherwise ``series.map(codes)``.

    Raises
    ------
    ValueError
        If a non-null value is neither a key nor a value of ``codes``; a plain
        ``.map()`` would turn such values into NaN without any warning.
    """
    present = series.dropna()
    # Re-running the cell must be a no-op: pushing already-encoded integers
    # through a string-keyed dict would silently turn the whole column into NaN.
    if present.isin(list(codes.values())).all():
        return series
    unmapped = sorted(set(present.unique()) - set(codes), key=str)
    if unmapped:
        raise ValueError(f"Unmapped values in column {series.name!r}: {unmapped}")
    return series.map(codes)


# NOTE(review): the fuel_type codes are arbitrary labels, but a linear model
# treats them as an ordered numeric scale; one-hot encoding may be a better
# fit. The integer codes are kept so the resulting columns stay the same.
train_df['fuel_type'] = _encode_categories(
    train_df['fuel_type'],
    {'Diesel': 1, 'Petrol': 2, 'CNG': 3, 'LPG': 4, 'Electric': 5},
)
train_df['transmission_type'] = _encode_categories(
    train_df['transmission_type'],
    {'Manual': 1, 'Automatic': 2},
)

In [10]:
# Drop the columns that are not used as model features. The name column really
# is ' full_name' with a leading space, as the train_df.info() output shows.
# Rebinding the name (instead of inplace=True) together with errors='ignore'
# lets this cell be re-run without a KeyError once the columns are gone.
train_df = train_df.drop(columns=['seller_type', ' full_name'], errors='ignore')

In [11]:
# Preview the frame to confirm that fuel_type / transmission_type now hold
# integer codes and that the dropped columns are gone.
train_df.head()

Unnamed: 0,selling_price,year,km_driven,fuel_type,transmission_type,mileage,engine,max_power,seats
0,1.2,2012.0,120000,2,1,19.7,796.0,46.3,5.0
1,5.5,2016.0,20000,2,1,18.9,1197.0,82.0,5.0
2,2.15,2010.0,60000,2,1,17.0,1197.0,80.0,5.0
3,2.26,2012.0,37000,2,1,20.92,998.0,67.1,5.0
4,5.7,2015.0,30000,1,1,22.77,1498.0,98.59,5.0


## Checking for Missing Values

In [12]:
# Count missing values per column; any non-zero entry needs handling before
# the frame is passed to the model.
train_df.isna().sum()

selling_price        0
year                 0
km_driven            0
fuel_type            0
transmission_type    0
mileage              0
engine               0
max_power            0
seats                0
dtype: int64

## Segregating the target variable from the features

In [13]:
# Target is the selling price; every remaining column is used as a feature.
y = train_df["selling_price"]
X = train_df.drop(columns=["selling_price"])
X.shape, y.shape

((19980, 8), (19980,))

## Splitting the data

In [14]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed random_state keeps the
# split identical across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=5,
)

## Model Training

In [15]:
from sklearn import linear_model

# Fit a linear regression on the training split. The fit call stays as the
# last expression so the notebook still displays the fitted estimator.
model = linear_model.LinearRegression()
model.fit(X=X_train, y=y_train)

## Model Evaluation (Hold-out Validation)

In [16]:
# Validation Score
from sklearn.metrics import r2_score

# R^2 of the fitted model on the held-out validation split.
pred_val = model.predict(X=X_val)
r2_score(y_true=y_val, y_pred=pred_val)

0.5178203827512744

In [17]:
# Training Score
# R^2 on the rows the model was fitted on, for comparison with the
# validation score.
pred_train = model.predict(X=X_train)
r2_score(y_true=y_train, y_pred=pred_train)

0.6243522296980586

## Saving the model

In [18]:
# saving the model
import pickle

# The context manager closes the file even if pickle.dump raises.
# NOTE(review): only the fitted estimator is written; the integer codes used
# for fuel_type / transmission_type are not part of the pickle, so whatever
# loads model.pkl has to apply the same encoding before calling predict.
with open("model.pkl", mode="wb") as pickle_out:
    pickle.dump(model, pickle_out)