# Multiple Linear Regression - Car Dataset

## Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

## Importing the dataset

In [2]:
# Load the car listings from the CSV file located next to this notebook.
car = pd.read_csv(filepath_or_buffer="CAR.csv")

In [3]:
# Preview the first five rows of the raw data.
car.head(n=5)

Unnamed: 0,year,selling_price,km_driven,fuel,seller_type,transmission,owner
0,2007,60000,70000,Petrol,Individual,Manual,First Owner
1,2007,135000,50000,Petrol,Individual,Manual,First Owner
2,2012,600000,100000,Diesel,Individual,Manual,First Owner
3,2017,250000,46000,Petrol,Individual,Manual,First Owner
4,2014,450000,141000,Diesel,Individual,Manual,Second Owner


In [4]:
# Summarise column dtypes and non-null counts to check for missing values.
car.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4337 entries, 0 to 4336
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   year           4337 non-null   int64 
 1   selling_price  4337 non-null   int64 
 2   km_driven      4337 non-null   int64 
 3   fuel           4337 non-null   object
 4   seller_type    4337 non-null   object
 5   transmission   4337 non-null   object
 6   owner          4337 non-null   object
dtypes: int64(3), object(4)
memory usage: 237.3+ KB


### Split the dataset into independent and dependent variables

In [5]:
# Target: selling_price, kept as a one-column DataFrame.
y = car.loc[:, ["selling_price"]]
# Features: every remaining column.
X = car.drop(columns="selling_price")

In [6]:
# Preview the first five rows of the feature frame.
X.head(n=5)

Unnamed: 0,year,km_driven,fuel,seller_type,transmission,owner
0,2007,70000,Petrol,Individual,Manual,First Owner
1,2007,50000,Petrol,Individual,Manual,First Owner
2,2012,100000,Diesel,Individual,Manual,First Owner
3,2017,46000,Petrol,Individual,Manual,First Owner
4,2014,141000,Diesel,Individual,Manual,Second Owner


### Work with the categorical data

In [7]:
# One-hot encode the categorical columns (fuel, seller_type, transmission, owner);
# the numeric columns (year, km_driven) pass through unchanged.
# drop_first=True drops the first level of each category so it acts as the
# all-zeros baseline. dtype=int pins the indicator columns to 0/1 integers:
# pandas >= 2.0 otherwise produces booleans, which would not match the 0/1 values
# shown below and would turn X.values into a mixed int/bool object array.
feature_columns = ['year', 'km_driven', 'fuel', 'seller_type', 'transmission', 'owner']
X = pd.get_dummies(car[feature_columns], drop_first=True, dtype=int)

In [8]:
# Preview the first five rows of the encoded feature frame.
X.head(n=5)

Unnamed: 0,year,km_driven,fuel_Diesel,fuel_Electric,fuel_LPG,fuel_Petrol,seller_type_Individual,seller_type_Trustmark Dealer,transmission_Manual,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,2007,70000,0,0,0,1,1,0,1,0,0,0,0
1,2007,50000,0,0,0,1,1,0,1,0,0,0,0
2,2012,100000,1,0,0,0,1,0,1,0,0,0,0
3,2017,46000,0,0,0,1,1,0,1,0,0,0,0
4,2014,141000,1,0,0,0,1,0,1,0,1,0,0


## Splitting the dataset into the Training set and Test set
- Random State 20

In [11]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=20)
X_train, X_test, y_train, y_test = split_parts

## Training the Multiple Linear Regression model on the Training set

In [12]:
from sklearn.linear_model import LinearRegression

# Fit ordinary least squares on the training rows. The features are passed as a
# plain array (no column names), so later predict() calls also take plain arrays.
regressor = LinearRegression()
regressor.fit(X=X_train.to_numpy(), y=y_train)

LinearRegression()

## Intercept and Coefficient

In [13]:
# Report the fitted parameters, one labelled line each.
fitted_parameters = (
    ("Coefficients: ", regressor.coef_),
    ("Intercept: ", regressor.intercept_),
)
for label, value in fitted_parameters:
    print(label, value)

Coefficients:  [[ 3.62245575e+04 -8.31584195e-01  2.88702246e+05  1.60071068e-10
   4.65464254e+04  2.64057958e+03 -6.04980455e+04  1.71882689e+05
  -8.64323880e+05 -3.50851884e+03 -4.04890692e+04  1.83178786e+05
  -2.83903020e+04]]
Intercept:  [-71683645.58006921]


In [14]:
# Show the feature order; the model was fitted on X_train.values, so the
# entries of regressor.coef_ follow this same column order.
X_train.columns

Index(['year', 'km_driven', 'fuel_Diesel', 'fuel_Electric', 'fuel_LPG',
       'fuel_Petrol', 'seller_type_Individual', 'seller_type_Trustmark Dealer',
       'transmission_Manual', 'owner_Fourth & Above Owner',
       'owner_Second Owner', 'owner_Test Drive Car', 'owner_Third Owner'],
      dtype='object')

### Formula
selling price = -71683646 + (3.622*10^04)(year) - (8.316*10^-01)(km driven) + (2.887*10^05)(diesel fuel) + (1.6*10^-10)(electric fuel) + (4.655*10^04)(LPG fuel) + (2.641*10^03)(petrol fuel) - (6.05*10^04)(individual seller) + (1.719*10^05)(trustmark dealer seller) - (8.643*10^05)(manual transmission) - (3.509*10^03)(fourth and above owner) - (4.049*10^04)(second owner) + (1.832*10^05)(test drive car owner) - (2.839*10^04)(third owner) 

## Predicting the Test set results

In [15]:
# Predict selling prices for the held-out rows, passing a plain array as was done for fit().
test_features = X_test.to_numpy()
y_pred = regressor.predict(test_features)

### Calculate RMSE, R-Square

In [16]:
from sklearn.metrics import mean_squared_error, r2_score
import math

# Compute each metric exactly once. RMSE is derived from the same MSE value
# that is printed, instead of calling mean_squared_error a second time.
r_squared = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = math.sqrt(mse)

print(f"R-Square: {r_squared:.2f}")
print(f"MSE: {mse:.2f}")
print(f"RMSE: {rmse:.2f}")

R-Square: 0.52
MSE: 142267011638.16
RMSE: 377182.99


## Validation case scenario:
#### 1. Predict the selling price of a car with the following attributes:
- year 2014 
- 70000 km driven 
- fuel type Diesel
- Seller type Dealer
- manual transmission
- first owner

**465000** actual


#### Prediction Numbers
- cng 0 0 0 0 
- diesel 1 0 0 0 
- electric 0 1 0 0 
- lpg 0 0 1 0 
- petrol 0 0 0 1 

- dealer 0 0 
- individual 1 0 
- trustmark dealer 0 1 

- automatic 0 
- manual 1 

- first 0 0 0 0 
- fourth and above 1 0 0 0 
- second 0 1 0 0 
- test drive car 0 0 1 0 
- third 0 0 0 1 

In [17]:
# Preview the first five held-out rows to confirm the feature column order.
X_test.head(n=5)

Unnamed: 0,year,km_driven,fuel_Diesel,fuel_Electric,fuel_LPG,fuel_Petrol,seller_type_Individual,seller_type_Trustmark Dealer,transmission_Manual,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
406,2012,80000,1,0,0,0,1,0,1,0,0,0,1
3027,2013,100000,1,0,0,0,1,0,1,0,0,0,0
2277,2016,30000,0,0,0,1,1,0,1,0,0,0,0
799,2017,7658,0,0,0,1,0,0,0,0,0,0,0
2738,2012,110000,0,0,0,1,1,0,1,0,1,0,0


In [18]:
# Feature vector for the validation scenario, laid out in the X_train column order.
scenario = [
    2014, 70000,   # year, km_driven
    1, 0, 0, 0,    # fuel_Diesel, fuel_Electric, fuel_LPG, fuel_Petrol -> Diesel
    0, 0,          # seller_type_Individual, seller_type_Trustmark Dealer -> Dealer (all zeros)
    1,             # transmission_Manual -> Manual
    0, 0, 0, 0,    # owner_Fourth & Above / Second / Test Drive Car / Third -> First Owner (all zeros)
]
regressor.predict([scenario])

array([[638780.60333154]])

In [19]:
# Format the message from the model's own prediction instead of a hard-coded
# number, so the text cannot go stale if the data, split seed or model changes.
# Scenario: year 2014, 70000 km, Diesel, Dealer seller, Manual, First Owner
# (values in the X_train column order).
scenario_features = [[2014, 70000, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]]
# predict() returns a single-element array here; .item() extracts it as a Python float.
predicted_price = regressor.predict(scenario_features).item()
print(f"Predicted price is ${predicted_price:,.2f}")

Predicted price is $638,780.60


#### This model is not a good predictor of a car's selling price. The R-squared value is 0.52, meaning only 52% of the variation in selling price is explained by the features. In addition, for the validation scenario the model predicts a selling price of 638,780, while the actual price is only 465,000.