# Multiple Linear Regression - Car dataset

## Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

## Importing the dataset

In [2]:
# Load the car listings; "CAR.csv" is resolved relative to the notebook's working directory.
dataset=pd.read_csv("CAR.csv")
# Preview the first rows to check that the file parsed into the expected columns.
dataset.head()

Unnamed: 0,year,selling_price,km_driven,fuel,seller_type,transmission,owner
0,2007,60000,70000,Petrol,Individual,Manual,First Owner
1,2007,135000,50000,Petrol,Individual,Manual,First Owner
2,2012,600000,100000,Diesel,Individual,Manual,First Owner
3,2017,250000,46000,Petrol,Individual,Manual,First Owner
4,2014,450000,141000,Diesel,Individual,Manual,Second Owner


In [3]:
# Count missing values per column (isna is the same check as isnull).
dataset.isna().sum()

year             0
selling_price    0
km_driven        0
fuel             0
seller_type      0
transmission     0
owner            0
dtype: int64

In [5]:
# Show dtypes and non-null counts; the object-dtype columns are the ones encoded later.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4337 entries, 0 to 4336
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   year           4337 non-null   int64 
 1   selling_price  4337 non-null   int64 
 2   km_driven      4337 non-null   int64 
 3   fuel           4337 non-null   object
 4   seller_type    4337 non-null   object
 5   transmission   4337 non-null   object
 6   owner          4337 non-null   object
dtypes: int64(3), object(4)
memory usage: 237.3+ KB


### Split the dataset into independent and Dependent variables

In [6]:
# List the column names so the feature/target selection below can name them explicitly.
dataset.columns

Index(['year', 'selling_price', 'km_driven', 'fuel', 'seller_type',
       'transmission', 'owner'],
      dtype='object')

In [7]:
# Features: every column except the target. At this point X still holds the raw
# categorical strings; the encoding cell below rebuilds X as a numeric frame.
X = dataset.loc[:, ['year', 'km_driven', 'fuel', 'seller_type', 'transmission', 'owner']]
# Target kept as a one-column DataFrame (list selector), not a Series.
y = dataset.loc[:, ['selling_price']]

### Work with the categorical data

In [20]:
# One-hot encode the categorical columns (fuel, seller_type, transmission, owner);
# the numeric columns year and km_driven pass through unchanged.
# drop_first=True removes the first level of each category, which then acts as
# the all-zeros baseline and avoids perfectly collinear dummy columns.
# dtype=int pins the dummies to 0/1 integers: pandas >= 2.0 defaults to bool,
# which would display True/False instead of the 0/1 coding described below and
# would turn X.values into a mixed object array.
X = pd.get_dummies(
    dataset[['year', 'km_driven', 'fuel', 'seller_type', 'transmission', 'owner']],
    drop_first=True,
    dtype=int,
)
X.head()

Unnamed: 0,year,km_driven,fuel_Diesel,fuel_Electric,fuel_LPG,fuel_Petrol,seller_type_Individual,seller_type_Trustmark Dealer,transmission_Manual,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,2007,70000,0,0,0,1,1,0,1,0,0,0,0
1,2007,50000,0,0,0,1,1,0,1,0,0,0,0
2,2012,100000,1,0,0,0,1,0,1,0,0,0,0
3,2017,46000,0,0,0,1,1,0,1,0,0,0,0
4,2014,141000,1,0,0,0,1,0,1,0,1,0,0


### Coding:
 - fuel_CNG -> 0000
 - fuel_Diesel -> 1000
 - fuel_Electric -> 0100 
 - fuel_LPG -> 0010
 - fuel_Petrol -> 0001
 - 
 - seller_type_Dealer -> 00
 - seller_type_Individual -> 10
 - seller_type_Trustmark Dealer -> 01
 - 
 - transmission_Automatic -> 0
 - transmission_Manual -> 1
 - 
 - owner_First Owner -> 0000
 - owner_Fourth & Above Owner -> 1000
 - owner_Second Owner -> 0100
 - owner_Test Drive Car -> 0010
 - owner_Third Owner -> 0001

## Splitting the dataset into the Training set and Test set
- Random State 20

In [21]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows (X_rest / y_rest); the other 80% form the training set.
# random_state is fixed so the split is reproducible.
X_train, X_rest, y_train, y_rest = train_test_split(
    X, y, test_size=0.2, random_state=20
)

In [22]:
# Divide the hold-out again. train_test_split returns the (1 - test_size) share
# first, so X_test / y_test receive 20% of the hold-out and X_val / y_val 80%.
X_test, X_val, y_test, y_val = train_test_split(
    X_rest, y_rest, test_size=0.8, random_state=20
)

## Training the Multiple Linear Regression model on the Training set

In [23]:
from sklearn.linear_model import LinearRegression
# Ordinary least-squares model. It is fitted on a plain NumPy array (no column
# names), matching the array / list inputs passed to predict() further down.
regressor = LinearRegression()
regressor.fit(X_train.to_numpy(), y_train)

LinearRegression()

## Intercept and Coefficient

In [24]:
# coef_ has one entry per feature column, in the column order of X_train;
# intercept_ is the constant term of the fitted equation.
print(f"Coefficient {regressor.coef_}")
print(f"Intercept {regressor.intercept_}")

Coefficient [[ 3.62245575e+04 -8.31584195e-01  2.88702246e+05 -7.27595761e-11
   4.65464254e+04  2.64057958e+03 -6.04980455e+04  1.71882689e+05
  -8.64323880e+05 -3.50851884e+03 -4.04890692e+04  1.83178786e+05
  -2.83903020e+04]]
Intercept [-71683645.58006921]


In [25]:
# Column order of the training matrix; the coefficients printed above follow this order.
X_train.columns

Index(['year', 'km_driven', 'fuel_Diesel', 'fuel_Electric', 'fuel_LPG',
       'fuel_Petrol', 'seller_type_Individual', 'seller_type_Trustmark Dealer',
       'transmission_Manual', 'owner_Fourth & Above Owner',
       'owner_Second Owner', 'owner_Test Drive Car', 'owner_Third Owner'],
      dtype='object')

### Therefore our equation is:
##### selling_price = -71,683,646 + (year)*(3.62x10^4) - (km_driven)*(8.32x10^-1) + (fuel_Diesel)*(2.89x10^5) - (fuel_Electric)*(7.28x10^-11) + (fuel_LPG)*(4.65x10^4) + (fuel_Petrol)*(2.64x10^3) - (seller_type_Individual)*(6.05x10^4) + (seller_type_Trustmark Dealer)*(1.72x10^5) - (transmission_Manual)*(8.64x10^5) - (owner_Fourth & Above Owner)*(3.51x10^3) - (owner_Second Owner)*(4.05x10^4) + (owner_Test Drive Car)*(1.83x10^5) - (owner_Third Owner)*(2.84x10^4)

## Predicting the Validation and Test set results

In [26]:
# True validation prices as a NumPy array, and the model's predictions for the
# same rows, so the two can be compared side by side.
y_validate = y_val.to_numpy()
y_predval = regressor.predict(X_val.to_numpy())

In [27]:
# Column 0 = predicted price, column 1 = actual price (rounded to whole units).
np.concatenate((y_predval, y_validate), axis=1).round()

array([[280944., 350000.],
       [821100., 780000.],
       [465916., 434999.],
       ...,
       [733369., 910000.],
       [748128., 630000.],
       [575889., 700000.]])

In [28]:
# Predictions for the test rows; these feed the metrics computed below.
y_pred = regressor.predict(X_test.to_numpy())

### Calculate RMSE, R-Square

In [29]:
from sklearn.metrics import mean_squared_error, r2_score 
import math 
# Score the test-set predictions. The MSE is computed once and reused for the RMSE.
test_r2 = r2_score(y_test, y_pred)
test_mse = mean_squared_error(y_test, y_pred)
print(f"r-square: {test_r2:.2f}")
print(f"MSE: {test_mse:.2f}")
print(f"RMSE: {math.sqrt(test_mse):.2f}")

r-square: 0.51
MSE: 168456064524.40
RMSE: 410434.00


## Validation case scenario:
#### 1. Predict the selling price of a car with:
- year 2014 
- 70000 km driven 
- fuel type Diesel
- Seller type Dealer
- manual transmission
- first owner

**465000** actual


In [30]:
# Peek at encoded test rows to see the column layout a prediction input must follow.
X_test.head()

Unnamed: 0,year,km_driven,fuel_Diesel,fuel_Electric,fuel_LPG,fuel_Petrol,seller_type_Individual,seller_type_Trustmark Dealer,transmission_Manual,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
1218,2011,60000,0,0,0,1,1,0,1,0,0,0,0
2409,2014,90000,1,0,0,0,1,0,1,0,1,0,0
4010,2018,6000,0,0,0,1,1,0,1,0,0,0,0
797,2013,41988,1,0,0,0,0,0,1,0,0,0,0
723,2010,47564,0,0,0,1,0,0,1,0,1,0,0


In [32]:
# Describe the scenario by column name rather than as a positional 0/1 vector, so
# the input cannot silently drift out of sync with the dummy-column order.
# Columns not listed are filled with 0, which selects the dropped baseline levels
# (seller_type Dealer, owner First Owner).
scenario = {"year": 2014, "km_driven": 70000, "fuel_Diesel": 1, "transmission_Manual": 1}

# reindex would silently drop a misspelled key, so reject unknown names up front.
unknown = set(scenario) - set(X_train.columns)
assert not unknown, f"unknown feature columns: {sorted(unknown)}"

# Align to the training column order; the model was fitted on a plain array,
# so pass an array here as well.
scenario_row = pd.DataFrame([scenario]).reindex(columns=X_train.columns, fill_value=0)
regressor.predict(scenario_row.to_numpy())

array([[638780.60333154]])

#### From this validation case we can see that our model did not do very well, as it predicted the selling price to be 638,781 and the actual was 465,000. Some adjustments should be made to improve this model.