## Dummy variables are used when a feature is text (categorical) data. A linear regression model needs numbers, so each category is encoded as 0/1 columns that can be put into the regression equation

In [2]:
import pandas as pd
import numpy as np
from sklearn import linear_model

In [23]:
# Load the home-price data; the file is tab-separated, hence sep='\t'.
df = pd.read_csv('homeprices2.csv', sep='\t')
df

Unnamed: 0,town,area,price
0,monroe township,2600,550000
1,monroe township,3000,565000
2,monroe township,3200,610000
3,monroe township,3600,680000
4,monroe township,4000,725000
5,west windsor,2600,585000
6,west windsor,2800,615000
7,west windsor,3300,650000
8,west windsor,3600,710000
9,robinsville,2600,575000


## Step 1: Create those dummy columns for all towns

### This can be done using pandas' *get_dummies*

In [26]:
# One indicator column per distinct town.
# dtype=int keeps the indicators as 0/1 integers on every pandas version
# (pandas >= 2.0 would otherwise return True/False booleans by default).
dummies = pd.get_dummies(df['town'], dtype=int)
dummies


Unnamed: 0,monroe township,robinsville,west windsor
0,1,0,0
1,1,0,0
2,1,0,0
3,1,0,0
4,1,0,0
5,0,0,1
6,0,0,1
7,0,0,1
8,0,0,1
9,0,1,0


In [28]:
# Place the dummy columns beside the original columns. The axis has to be given
# explicitly: by default concat stacks the frames row-wise (one below the other).
merged = pd.concat([df, dummies], axis=1)
merged

Unnamed: 0,town,area,price,monroe township,robinsville,west windsor
0,monroe township,2600,550000,1,0,0
1,monroe township,3000,565000,1,0,0
2,monroe township,3200,610000,1,0,0
3,monroe township,3600,680000,1,0,0
4,monroe township,4000,725000,1,0,0
5,west windsor,2600,585000,0,0,1
6,west windsor,2800,615000,0,0,1
7,west windsor,3300,650000,0,0,1
8,west windsor,3600,710000,0,0,1
9,robinsville,2600,575000,0,1,0


In [29]:
# Preview only: drop() returns a new frame and the result is not assigned here,
# so `merged` itself still contains the 'town' column after this cell.
merged.drop(columns=['town'])

Unnamed: 0,area,price,monroe township,robinsville,west windsor
0,2600,550000,1,0,0
1,3000,565000,1,0,0
2,3200,610000,1,0,0
3,3600,680000,1,0,0
4,4000,725000,1,0,0
5,2600,585000,0,0,1
6,2800,615000,0,0,1
7,3300,650000,0,0,1
8,3600,710000,0,0,1
9,2600,575000,0,1,0




### First drop the `town` column: the dummy columns were created from it, and as text data it is of no further use to the model



## We also need to drop one of the dummy columns (one out of the 3 here) to avoid the *Dummy Variable Trap*. Whenever one variable can be derived from the rest of the variables, these variables are said to be *multicollinear*. This multicollinearity is what causes the dummy variable trap, which can mess up the Machine Learning model.


## Hence rule is to drop one of the created dummy columns.


In [32]:
# Remove the text column and one dummy ('west windsor') in a single step;
# a row with 0 in both remaining dummies then stands for west windsor.
final = merged.drop(columns=['town', 'west windsor'])
final

Unnamed: 0,area,price,monroe township,robinsville
0,2600,550000,1,0
1,3000,565000,1,0
2,3200,610000,1,0
3,3600,680000,1,0
4,4000,725000,1,0
5,2600,585000,0,0
6,2800,615000,0,0
7,3300,650000,0,0
8,3600,710000,0,0
9,2600,575000,0,1


In [33]:
from sklearn.linear_model import LinearRegression

# Create an unfitted estimator; it is trained further down with model.fit(X, Y).
model = LinearRegression()             # now we have made an object of that class

In [34]:
# Independent variables: every column of `final` except the target 'price'.
X = final.drop(columns=['price'])
X

Unnamed: 0,area,monroe township,robinsville
0,2600,1,0
1,3000,1,0
2,3200,1,0
3,3600,1,0
4,4000,1,0
5,2600,0,0
6,2800,0,0
7,3300,0,0
8,3600,0,0
9,2600,0,1


In [36]:
# Dependent variable (target): the 'price' column.
# NOTE(review): the saved output of this cell lists 13 prices (index 0-12) while the
# tables displayed earlier show 10 rows — re-run to confirm X and Y have the same length.
Y = final['price']
Y

0     550000
1     565000
2     610000
3     680000
4     725000
5     585000
6     615000
7     650000
8     710000
9     575000
10    600000
11    620000
12    695000
Name: price, dtype: int64

In [37]:
# Fit the regression of price (Y) on the columns of X: area plus the two remaining town dummies.
model.fit(X,Y)                                   # training the model

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [38]:
# One coefficient per column of X, in X's column order
# (area, monroe township, robinsville).
model.coef_

array([   126.89744141, -40013.97548914, -14327.56396474])

In [40]:
# Predicted price of a house with area 2800 in robinsville.
# The input row must follow the column order of X (area, monroe township, robinsville).
# Wrapping it in a DataFrame that reuses X.columns makes that order explicit and
# passes predict() the same feature names the model was fitted with.
model.predict(pd.DataFrame([[2800, 0, 1]], columns=X.columns))

array([590775.63964739])

In [41]:
# Predicted price of a house with area 3400 in west windsor.
# Both dummies are 0 because 'west windsor' is the dummy column that was dropped,
# so it is represented by "neither monroe township nor robinsville".
# The DataFrame reuses X.columns so the input carries the fitted feature names.
model.predict(pd.DataFrame([[3400, 0, 0]], columns=X.columns))

array([681241.66845839])

In [43]:
# score() on a regressor returns R^2 (coefficient of determination). It is evaluated
# here on the same X, Y the model was fitted on, so it is not a held-out estimate.
model.score(X,Y)

0.9573929037221873

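### model.score() returns the R² (coefficient of determination) of the model on the data passed to it — here the training data; 1.0 would be a perfect fit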

### We can similarly use one-hot encoding for the same operation

In [45]:
# exercise (cars and models and mileage)




# Exercise

In [56]:
# Load the car-price data (tab-separated).
# Note: this rebinds `df`, replacing the home-price frame loaded earlier in the notebook.
df = pd.read_csv('carprices.csv', sep='\t')
df

Unnamed: 0,Car Model,Mileage,Sell Price($),Age(yrs)
0,BMW X5,69000,18000,6
1,BMW X5,35000,34000,3
2,BMW X5,57000,26100,5
3,BMW X5,22500,40000,2
4,BMW X5,46000,31500,4
5,Audi A5,59000,29400,5
6,Audi A5,52000,32000,5
7,Audi A5,72000,19300,6
8,Audi A5,91000,12000,8
9,Mercedez Benz C class,67000,22000,6


In [57]:
# Give the columns short, code-friendly names.
# The renamed frame is assigned back instead of using inplace=True, so the cell
# reads as "new value from old value" and can be used in a method chain.
df = df.rename(columns={
    'Age(yrs)': 'age',
    'Sell Price($)': 'price',
    'Car Model': 'carmodel',
})
df

Unnamed: 0,carmodel,Mileage,price,age
0,BMW X5,69000,18000,6
1,BMW X5,35000,34000,3
2,BMW X5,57000,26100,5
3,BMW X5,22500,40000,2
4,BMW X5,46000,31500,4
5,Audi A5,59000,29400,5
6,Audi A5,52000,32000,5
7,Audi A5,72000,19300,6
8,Audi A5,91000,12000,8
9,Mercedez Benz C class,67000,22000,6


In [58]:
# One indicator column per distinct car model.
# dtype=int keeps the indicators as 0/1 integers on every pandas version
# (pandas >= 2.0 would otherwise return True/False booleans by default).
dummies = pd.get_dummies(df['carmodel'], dtype=int)
dummies

Unnamed: 0,Audi A5,BMW X5,Mercedez Benz C class
0,0,1,0
1,0,1,0
2,0,1,0
3,0,1,0
4,0,1,0
5,1,0,0
6,1,0,0
7,1,0,0
8,1,0,0
9,0,0,1


In [59]:
# Append the dummy columns to the right of the existing columns.
# Caution: `df` is overwritten with its own widened copy, so running this cell a
# second time appends the dummy columns again.
df = pd.concat([df, dummies], axis = 'columns')
df

Unnamed: 0,carmodel,Mileage,price,age,Audi A5,BMW X5,Mercedez Benz C class
0,BMW X5,69000,18000,6,0,1,0
1,BMW X5,35000,34000,3,0,1,0
2,BMW X5,57000,26100,5,0,1,0
3,BMW X5,22500,40000,2,0,1,0
4,BMW X5,46000,31500,4,0,1,0
5,Audi A5,59000,29400,5,1,0,0
6,Audi A5,52000,32000,5,1,0,0
7,Audi A5,72000,19300,6,1,0,0
8,Audi A5,91000,12000,8,1,0,0
9,Mercedez Benz C class,67000,22000,6,0,0,1


In [62]:
# Shorten the dummy column names to plain brand names.
# The renamed frame is assigned back instead of using inplace=True, so the cell
# reads as "new value from old value" and can be used in a method chain.
df = df.rename(columns={
    'Audi A5': 'audi',
    'BMW X5': 'bmw',
    'Mercedez Benz C class': 'mercedes',
})
df

Unnamed: 0,carmodel,Mileage,price,age,audi,bmw,mercedes
0,BMW X5,69000,18000,6,0,1,0
1,BMW X5,35000,34000,3,0,1,0
2,BMW X5,57000,26100,5,0,1,0
3,BMW X5,22500,40000,2,0,1,0
4,BMW X5,46000,31500,4,0,1,0
5,Audi A5,59000,29400,5,1,0,0
6,Audi A5,52000,32000,5,1,0,0
7,Audi A5,72000,19300,6,1,0,0
8,Audi A5,91000,12000,8,1,0,0
9,Mercedez Benz C class,67000,22000,6,0,0,1


In [65]:
# Numeric-only view of the car data: the text column and one dummy ('mercedes') removed.
# Stored under its own name rather than `model`, because `model` already refers to the
# LinearRegression fitted on the home-price data and would otherwise be replaced by
# this DataFrame.
cars_final = df.drop(['carmodel', 'mercedes'], axis='columns')
cars_final

Unnamed: 0,Mileage,price,age,audi,bmw
0,69000,18000,6,0,1
1,35000,34000,3,0,1
2,57000,26100,5,0,1
3,22500,40000,2,0,1
4,46000,31500,4,0,1
5,59000,29400,5,1,0
6,52000,32000,5,1,0
7,72000,19300,6,1,0
8,91000,12000,8,1,0
9,67000,22000,6,0,0


In [66]:
# Features: mileage, age and two of the three brand dummies ('mercedes' is left out).
X = df.loc[:, ['Mileage', 'age', 'audi', 'bmw']]

# Target: the selling price.
Y = df.loc[:, 'price']

# Train a fresh estimator on the car data; fit() returns the estimator,
# so the fitted model is displayed as the cell output.
reg = LinearRegression()
reg.fit(X, Y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [67]:
# One coefficient per column of X, in X's column order: Mileage, age, audi, bmw.
reg.coef_

array([-3.70122094e-01, -1.33245363e+03, -2.45354074e+03, -6.73820733e+03])

In [72]:
# Predicted price of a bmw with mileage 70000 and age 8.
# The row follows X's column order (Mileage, age, audi, bmw); the DataFrame reuses
# X.columns so predict() receives the same feature names the model was fitted with.
reg.predict(pd.DataFrame([[70000, 8, 0, 1]], columns=X.columns))

array([15670.24301479])

In [73]:
# R^2 (coefficient of determination) on the training data — the same X, Y passed to reg.fit.
reg.score(X,Y)

0.9417050937281083

In [74]:
# Predicted price of a mercedes with mileage 45000 and age 4.
# audi = 0 and bmw = 0 selects the brand whose dummy is not part of X ('mercedes').
# The DataFrame reuses X.columns so predict() receives the fitted feature names.
reg.predict(pd.DataFrame([[45000, 4, 0, 0]], columns=X.columns))

array([36991.31721061])