# Multiple Linear Regression Model with categorical variables
### Author : MD. Mehedi Hassan Galib
### Date : 02 August, 2020
<br/>
<br/>

## Loading necessary libraries

In [9]:
import os

import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

<br/>

## Loading dataset

In [45]:
# Location of the car dataset. The original absolute path stays as the default,
# but it can be overridden with the CAR_CSV_PATH environment variable so the
# notebook also runs on machines where that path does not exist.
CAR_CSV_PATH = os.environ.get(
    'CAR_CSV_PATH',
    'C:/Users/Mehedi Hassan Galib/Desktop/Python/datas/car.csv',
)

df = pd.read_csv(CAR_CSV_PATH)

# Preview the first rows to confirm the file was parsed as expected.
df.head()

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,ritz,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0
1,sx4,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0
2,ciaz,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0
3,wagon r,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0
4,swift,2014,4.6,6.87,42450,Diesel,Dealer,Manual,0


<br/>

## Categorical variable to Dummy variable

In [59]:
# One-hot encode the two categorical predictors.
# drop_first=True drops the first level of each encoded column, so the
# remaining dummies are not perfectly collinear with the intercept.
# dtype=int keeps the dummy columns as 0/1 integers; without it, pandas >= 2.0
# produces True/False columns and the displayed table no longer matches.
df2 = pd.get_dummies(
    df,
    columns=['Transmission', 'Fuel_Type'],
    drop_first=True,
    dtype=int,
)
df2

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Kms_Driven,Seller_Type,Owner,Transmission_Manual,Fuel_Type_Diesel,Fuel_Type_Petrol
0,ritz,2014,3.35,5.59,27000,Dealer,0,1,0,1
1,sx4,2013,4.75,9.54,43000,Dealer,0,1,1,0
2,ciaz,2017,7.25,9.85,6900,Dealer,0,1,0,1
3,wagon r,2011,2.85,4.15,5200,Dealer,0,1,0,1
4,swift,2014,4.60,6.87,42450,Dealer,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...
296,city,2016,9.50,11.60,33988,Dealer,0,1,1,0
297,brio,2015,4.00,5.90,60000,Dealer,0,1,0,1
298,city,2009,3.35,11.00,87934,Dealer,0,1,0,1
299,city,2017,11.50,12.50,9000,Dealer,0,1,1,0


<br/>

## Observing the relationship between variables

In [78]:
# df2 still contains string columns (Car_Name, Seller_Type). Since pandas 2.0
# DataFrame.corr() no longer drops non-numeric columns silently and raises
# instead, so restrict the frame to numeric/boolean columns explicitly.
# This gives the same matrix on older pandas versions as well.
df2.select_dtypes(include=['number', 'bool']).corr()

Unnamed: 0,Year,Selling_Price,Present_Price,Kms_Driven,Owner,Transmission_Manual,Fuel_Type_Diesel,Fuel_Type_Petrol
Year,1.0,0.236141,-0.047584,-0.524342,-0.182104,0.000394,0.064315,-0.059959
Selling_Price,0.236141,1.0,0.878983,0.029187,-0.088344,-0.367128,0.552339,-0.540571
Present_Price,-0.047584,0.878983,1.0,0.203647,0.008057,-0.348715,0.473306,-0.465244
Kms_Driven,-0.524342,0.029187,0.203647,1.0,0.089216,-0.16251,0.172515,-0.172874
Owner,-0.182104,-0.088344,0.008057,0.089216,1.0,-0.050316,-0.053469,0.055687
Transmission_Manual,0.000394,-0.367128,-0.348715,-0.16251,-0.050316,1.0,-0.098643,0.091013
Fuel_Type_Diesel,0.064315,0.552339,0.473306,0.172515,-0.053469,-0.098643,1.0,-0.979648
Fuel_Type_Petrol,-0.059959,-0.540571,-0.465244,-0.172874,0.055687,0.091013,-0.979648,1.0


<br/>

## Splitting Explanatory and Response variable

In [70]:
# Explanatory variables: the present price plus the dummy-encoded
# transmission and fuel-type indicators (kept as a DataFrame).
X = df2[
    [
        'Present_Price',
        'Transmission_Manual',
        'Fuel_Type_Diesel',
        'Fuel_Type_Petrol',
    ]
]

# Response variable, kept as a single-column DataFrame rather than a Series.
y = df2[['Selling_Price']]

<br/>

## Splitting Train & Test dataset

In [72]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=5,
)

<br/>

## Model build up

In [73]:
# Fit an ordinary least squares model on the training split.
lm = LinearRegression()
lm.fit(X_train, y_train)

# fit() returns the estimator itself, so `model` is the same object as `lm`.
model = lm

<br/>

## R-Squared value (training set)

In [74]:
# Coefficient of determination (R^2) of the fitted model on the training split.
lm.score(X=X_train, y=y_train)

0.7973657496540842

<br/>

## Prediction

In [84]:
# Predict selling prices for the held-out test rows. Because the model was
# fitted on a single-column DataFrame target, the result is a 2-D array.
y_pred = lm.predict(X=X_test)
y_pred

array([[12.32919552],
       [12.32919552],
       [ 3.78724227],
       [ 6.59789122],
       [ 2.63206109],
       [ 9.45239297],
       [ 6.7309472 ],
       [ 1.05316481],
       [17.6099969 ],
       [ 3.2074216 ],
       [ 6.80305854],
       [ 4.5454693 ],
       [ 3.36352717],
       [ 9.21154439],
       [ 1.08884608],
       [ 2.64098141],
       [ 8.96548685],
       [ 1.13344767],
       [ 8.96548685],
       [ 0.91043972],
       [ 8.96548685],
       [17.6099969 ],
       [19.99618196],
       [ 1.32077435],
       [ 3.85414465],
       [ 3.90766656],
       [ 4.0994534 ],
       [ 4.85768043],
       [ 4.10391356],
       [ 6.79338943],
       [ 3.2074216 ],
       [ 6.7309472 ],
       [ 4.45623926],
       [ 0.95504131],
       [ 0.98626243],
       [ 7.40071984],
       [ 5.55789853],
       [ 3.21188176],
       [ 1.51256119],
       [ 2.20388583],
       [ 1.08884608],
       [ 3.25202319],
       [11.73967888],
       [ 1.6463391 ],
       [ 1.32077435],
       [ 3

<br/>

## Prediction of a single data point

In [87]:
# Build the single observation as a one-row DataFrame that carries the same
# column names (and order) as the training features. Passing a bare nested
# list to a model fitted on a DataFrame makes scikit-learn warn that
# "X does not have valid feature names" and relies on implicit column order.
# Values: Present_Price=5.59, Transmission_Manual=1, Fuel_Type_Diesel=0,
# Fuel_Type_Petrol=1.
single_car = pd.DataFrame([[5.59, 1, 0, 1]], columns=X.columns)

# predict() returns a 2-D array here, so take the single scalar and round it.
lm.predict(single_car)[0][0].round(2)

3.16