# Predict the price of houses using Machine Learning

## 1. Import the required libraries

In [None]:
import numpy as np   # linear algebra library
import pandas as pd  # DataFrame handling and CSV loading
import matplotlib.pyplot as plt  # to plot graphs
import seaborn as sns  # to plot graphs
sns.set()  # apply seaborn's default theme to subsequent plots

import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation notices raised by the libraries used below.
warnings.filterwarnings('ignore')

## 2. Read the input data

In [None]:
# Load the housing dataset from its CSV file into a DataFrame.
housing_csv = '/content/Housing.csv'
data = pd.read_csv(housing_csv)
data.head()   # preview the first five rows

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


## 3. Understand your data

In [None]:
# Print each column's name, non-null count and dtype, plus memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   price             545 non-null    int64 
 1   area              545 non-null    int64 
 2   bedrooms          545 non-null    int64 
 3   bathrooms         545 non-null    int64 
 4   stories           545 non-null    int64 
 5   mainroad          545 non-null    object
 6   guestroom         545 non-null    object
 7   basement          545 non-null    object
 8   hotwaterheating   545 non-null    object
 9   airconditioning   545 non-null    object
 10  parking           545 non-null    int64 
 11  prefarea          545 non-null    object
 12  furnishingstatus  545 non-null    object
dtypes: int64(6), object(7)
memory usage: 55.5+ KB


In [None]:
# include='all' summarizes every column, not only the numeric ones; a statistic
# that does not apply to a column's dtype is reported as NaN.
data.describe(include ='all')

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
count,545.0,545.0,545.0,545.0,545.0,545,545,545,545,545,545.0,545,545
unique,,,,,,2,2,2,2,2,,2,3
top,,,,,,yes,no,no,no,no,,no,semi-furnished
freq,,,,,,468,448,354,520,373,,417,227
mean,4766729.0,5150.541284,2.965138,1.286239,1.805505,,,,,,0.693578,,
std,1870440.0,2170.141023,0.738064,0.50247,0.867492,,,,,,0.861586,,
min,1750000.0,1650.0,1.0,1.0,1.0,,,,,,0.0,,
25%,3430000.0,3600.0,2.0,1.0,1.0,,,,,,0.0,,
50%,4340000.0,4600.0,3.0,1.0,2.0,,,,,,0.0,,
75%,5740000.0,6360.0,3.0,2.0,2.0,,,,,,1.0,,


## 4. Check for NULL values

In [None]:
# Count the missing values in each column (every count is 0 for this dataset).
data.isnull().sum()

price               0
area                0
bedrooms            0
bathrooms           0
stories             0
mainroad            0
guestroom           0
basement            0
hotwaterheating     0
airconditioning     0
parking             0
prefarea            0
furnishingstatus    0
dtype: int64

## 5. Data Preparation

### a) YES/NO categories

Several columns hold the categorical values 'yes' or 'no'. We need to convert them to numbers: 'yes' becomes 1 and 'no' becomes 0.

In [None]:
# Columns whose values are the strings 'yes' / 'no'.
categorical = [
    'mainroad',
    'guestroom',
    'basement',
    'hotwaterheating',
    'airconditioning',
    'prefarea',
]
# Helper that converts a column of yes/no labels to 1/0.
def binary_map(x):
    """Map a Series of yes/no labels to 1/0.

    Labels are matched case-insensitively and surrounding whitespace is
    ignored, so 'yes', 'YES' and ' Yes ' all become 1. Any value that is
    not a yes/no label maps to NaN.

    Parameters
    ----------
    x : pandas.Series
        Column of yes/no labels.

    Returns
    -------
    pandas.Series
        Series with the same index as ``x`` holding 1, 0 or NaN.
    """
    # astype(str) keeps the .str accessor usable even when the column holds
    # non-string values; those simply miss the lookup below and become NaN.
    normalized = x.astype(str).str.strip().str.lower()
    return normalized.map({'yes': 1, 'no': 0})

# Encode each yes/no column as 1/0 and write the result back into the dataset.
# NOTE: run this cell only once. On a second run the columns already hold 1/0,
# which are not yes/no labels, so binary_map would turn them into NaN.
encoded_flags = data[categorical].apply(binary_map)
data[categorical] = encoded_flags



In [None]:
# Preview the data to confirm the yes/no columns now hold 1/0.
data.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,1,0,0,0,1,2,1,furnished
1,12250000,8960,4,4,4,1,0,0,0,1,3,0,furnished
2,12250000,9960,3,2,2,1,0,1,0,0,2,1,semi-furnished
3,12215000,7500,4,2,2,1,0,1,0,1,3,1,furnished
4,11410000,7420,4,1,2,1,1,1,0,1,2,0,furnished


### b) Dummy variable

Dummy variables: the last column (furnishingstatus) has three categories: furnished, semi-furnished and unfurnished. We need to convert it to numbers as well.

In [None]:
# One-hot encode furnishingstatus: one indicator column per category.
# dtype=int keeps the indicators as 0/1; pandas >= 2.0 would otherwise
# return True/False columns.
table = pd.get_dummies(data['furnishingstatus'], dtype=int)
table.head()

Unnamed: 0,furnished,semi-furnished,unfurnished
0,1,0,0
1,1,0,0
2,0,1,0
3,1,0,0
4,1,0,0


To avoid redundancy we drop the first column (furnished): a row with 0 in both remaining columns is furnished (0, 0), semi-furnished is encoded as (1, 0) and unfurnished as (0, 1).

In [None]:
# Recreate the dummy table without its first category column (furnished);
# a row with 0 in both remaining columns therefore means "furnished".
# dtype=int keeps the indicators as 0/1; pandas >= 2.0 would otherwise
# return True/False columns.
table = pd.get_dummies(data['furnishingstatus'], drop_first=True, dtype=int)
table.head()

Unnamed: 0,semi-furnished,unfurnished
0,0,0
1,0,0
2,1,0
3,0,0
4,0,0


In [None]:
# Append the dummy columns to the right of the existing columns.
data = pd.concat(objs=[data, table], axis='columns')
data.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus,semi-furnished,unfurnished
0,13300000,7420,4,2,3,1,0,0,0,1,2,1,furnished,0,0
1,12250000,8960,4,4,4,1,0,0,0,1,3,0,furnished,0,0
2,12250000,9960,3,2,2,1,0,1,0,0,2,1,semi-furnished,1,0
3,12215000,7500,4,2,2,1,0,1,0,1,3,1,furnished,0,0
4,11410000,7420,4,1,2,1,1,1,0,1,2,0,furnished,0,0


In [None]:
# The dummy columns now carry this information, so remove the text column in place.
data.drop(columns=['furnishingstatus'], inplace=True)
data.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,semi-furnished,unfurnished
0,13300000,7420,4,2,3,1,0,0,0,1,2,1,0,0
1,12250000,8960,4,4,4,1,0,0,0,1,3,0,0,0
2,12250000,9960,3,2,2,1,0,1,0,0,2,1,1,0
3,12215000,7500,4,2,2,1,0,1,0,1,3,1,0,0
4,11410000,7420,4,1,2,1,1,1,0,1,2,0,0,0


## 6. Split data into Training and Testing data

In [None]:
# List the columns available after encoding.
data.columns

Index(['price', 'area', 'bedrooms', 'bathrooms', 'stories', 'mainroad',
       'guestroom', 'basement', 'hotwaterheating', 'airconditioning',
       'parking', 'prefarea', 'semi-furnished', 'unfurnished'],
      dtype='object')

In [None]:
# Separate the target from the predictors.

y = data['price']                # target: the house price
X = data.drop(columns='price')   # features: every remaining column

In [None]:
# Preview the feature matrix (every column except price).
X.head()

Unnamed: 0,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,semi-furnished,unfurnished
0,7420,4,2,3,1,0,0,0,1,2,1,0,0
1,8960,4,4,4,1,0,0,0,1,3,0,0,0
2,9960,3,2,2,1,0,1,0,0,2,1,1,0
3,7500,4,2,2,1,0,1,0,1,3,1,0,0
4,7420,4,1,2,1,1,1,0,1,2,0,0,0


In [None]:
# Preview the target values (price).
y.head()

0    13300000
1    12250000
2    12250000
3    12215000
4    11410000
Name: price, dtype: int64

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

In [None]:
# Report the (rows, columns) shape of the training and testing features.
for feature_split in (X_train, X_test):
    print(feature_split.shape)

(408, 13)
(137, 13)


## Models

### Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression   # linear regression model
from sklearn.metrics import r2_score

model_1 = LinearRegression()  # create the model

model_1.fit(X_train, y_train)  # train on the training split

# Predict prices for the held-out test features.
predictions = model_1.predict(X_test)

# R^2 of the predictions against the true test prices.
scores_1 = r2_score(y_test, predictions)

# Print the stored score, as the other model cells do. A regressor's
# score(X_test, y_test) is the same R^2, so the reported value is unchanged
# and the test set is no longer predicted a second time.
print("Scores of Linear Regression", scores_1)

Scores of Linear Regression 0.6956489291232408


### DecisionTree Regressor

In [None]:
# Decision tree regressor: fit on the training split, score on the test split.
from sklearn.metrics import r2_score
from sklearn.tree import DecisionTreeRegressor

# Fixed seed so the fitted tree is reproducible; fit() returns the estimator itself.
model_2 = DecisionTreeRegressor(random_state=0).fit(X_train, y_train)

# R^2 of the test-set predictions against the true prices.
predictions = model_2.predict(X_test)
scores_2 = r2_score(y_true=y_test, y_pred=predictions)

print("Scores of Decision Tree Regressor", scores_2)

Scores of Decision Tree Regressor 0.06290090674478188


### RandomForestRegressor

In [None]:
# Random forest regressor: fit on the training split, score on the test split.
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor

# Ten trees with a fixed seed for reproducibility; fit() returns the estimator itself.
model_3 = RandomForestRegressor(
    n_estimators=10,
    random_state=0,
).fit(X_train, y_train)

# R^2 of the test-set predictions against the true prices.
predictions = model_3.predict(X_test)
scores_3 = r2_score(y_true=y_test, y_pred=predictions)

print("Scores of Random Forest Regressor", scores_3)

Scores of Random Forest Regressor 0.5608886175094021


### XGBRegressor

In [None]:
# Gradient-boosted regressor from xgboost: fit on the training split,
# score on the test split.
from sklearn.metrics import r2_score
from xgboost import XGBRegressor

# Fixed seed for reproducibility.
model_4 = XGBRegressor(random_state=0)
model_4.fit(X_train, y_train)

# R^2 of the test-set predictions against the true prices.
predictions = model_4.predict(X_test)
scores_4 = r2_score(y_true=y_test, y_pred=predictions)

print("Scores of XGBRegressor", scores_4)

Scores of XGBRegressor 0.4755137768651555
