In [1]:
import pandas as pd


In [2]:
# Load the home-price dataset into a DataFrame.
# NOTE(review): this is an absolute, machine-specific Windows path; the cell
# only runs where this exact directory layout exists.
homeprices_csv = r"D:\Data analytics projects\ML Basics\Linear Regression with dummy variables\homeprices.csv"
df = pd.read_csv(homeprices_csv)

In [3]:
# Display the loaded DataFrame to inspect its columns (town, area, price).
df

Unnamed: 0,town,area,price
0,monroe township,2600,550000
1,monroe township,3000,565000
2,monroe township,3200,610000
3,monroe township,3600,680000
4,monroe township,4000,725000
5,west windsor,2600,585000
6,west windsor,2800,615000
7,west windsor,3300,650000
8,west windsor,3600,710000
9,robinsville,2600,575000


The `town` column above holds categorical data. A regression model needs numerical inputs, so we must convert it to numbers. For this we use the concept of dummy variables.

# Dummy variables

A dummy variable (aka, an indicator variable) is a numeric variable that represents categorical data, such as gender, race, political affiliation, etc.

Researchers use dummy variables to analyze regression equations when one or more independent variables are categorical. The key to the analysis is to express categorical variables as dummy variables.

Technically, dummy variables are dichotomous, quantitative variables; they can take on any two quantitative values. As a practical matter, regression results are easier to interpret when dummy variables take on two specific values, 1 or 0. Typically, 1 represents the presence of a qualitative attribute, and 0 represents the absence.

The number of dummy variables required to represent a particular categorical variable depends on the number of values that the categorical variable can assume. To represent a categorical variable that can assume k different values, a researcher would need to define k - 1 dummy variables.

In [4]:
# Preview the one-hot (dummy) encoding of the town column: one indicator
# column per distinct town.
pd.get_dummies(df["town"])

Unnamed: 0,monroe township,robinsville,west windsor
0,1,0,0
1,1,0,0
2,1,0,0
3,1,0,0
4,1,0,0
5,0,0,1
6,0,0,1
7,0,0,1
8,0,0,1
9,0,1,0


In [5]:
# Keep the town indicator columns so they can be joined back onto the data.
dummies = pd.get_dummies(df["town"])

In [7]:
# Append the dummy columns side by side with the original columns
# (rows are aligned on the shared index).
df2 = pd.concat([df, dummies], axis="columns")

In [8]:
# Display the original columns together with the appended town dummies.
df2

Unnamed: 0,town,area,price,monroe township,robinsville,west windsor
0,monroe township,2600,550000,1,0,0
1,monroe township,3000,565000,1,0,0
2,monroe township,3200,610000,1,0,0
3,monroe township,3600,680000,1,0,0
4,monroe township,4000,725000,1,0,0
5,west windsor,2600,585000,0,0,1
6,west windsor,2800,615000,0,0,1
7,west windsor,3300,650000,0,0,1
8,west windsor,3600,710000,0,0,1
9,robinsville,2600,575000,0,1,0


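Now we need to remove the `town` column and also one of the dummy columns to avoid the dummy variable trap: with all k dummy columns present, any one of them can be derived exactly from the others, which makes the features perfectly collinear.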

In [9]:
# Drop the raw text column and one dummy ('west windsor'), which becomes the
# baseline category represented by both remaining dummies being 0.
final = df2.drop(columns=["town", "west windsor"])

In [10]:
# Display the modelling table: area, price and the two retained town dummies.
final

Unnamed: 0,area,price,monroe township,robinsville
0,2600,550000,1,0
1,3000,565000,1,0
2,3200,610000,1,0
3,3600,680000,1,0
4,4000,725000,1,0
5,2600,585000,0,0
6,2800,615000,0,0
7,3300,650000,0,0
8,3600,710000,0,0
9,2600,575000,0,1


In [11]:
# Create an unfitted scikit-learn linear regression estimator with default settings.
# NOTE(review): this import sits mid-notebook; conventionally it would live in
# the first cell alongside `import pandas as pd`.
from sklearn.linear_model import LinearRegression
model = LinearRegression()

In [12]:
# Display the estimator's repr (it has not been fitted at this point).
model

LinearRegression()

In [13]:
# Feature matrix: every column of `final` except the target, i.e. the area
# plus the two retained town dummies.
X = final.drop(columns="price")
X

Unnamed: 0,area,monroe township,robinsville
0,2600,1,0
1,3000,1,0
2,3200,1,0
3,3600,1,0
4,4000,1,0
5,2600,0,0
6,2800,0,0
7,3300,0,0
8,3600,0,0
9,2600,0,1


In [14]:
# Target vector: the sale price to be predicted (dependent variable).
Y = final["price"]

In [16]:
# Display the target Series.
Y


0     550000
1     565000
2     610000
3     680000
4     725000
5     585000
6     615000
7     650000
8     710000
9     575000
10    600000
11    620000
12    695000
Name: price, dtype: int64

In [17]:
# Fit the regression of price (Y) on area and the two town dummies (X).
model.fit(X,Y)

LinearRegression()

In [20]:
# Predict the price of a 2800 sq. ft. home in Robinsville.
# The model was fitted on a DataFrame, so the query is built as a DataFrame
# keyed by feature name and aligned to X's column order. This avoids relying
# on positional order in a bare list and avoids scikit-learn's
# "X does not have valid feature names" warning.
query = pd.DataFrame(
    [{"area": 2800, "monroe township": 0, "robinsville": 1}],
    columns=X.columns,
)
model.predict(query)

array([590775.63964739])

In [23]:
# R^2 (coefficient of determination) expressed as a percentage. This is not a
# classification "accuracy", and because it is computed on the same X, Y used
# for fitting it measures in-sample fit, not performance on unseen data.
model.score(X,Y)*100

95.73929037221873