### **Minor Project**
---
#### Applying Gradient Boosting and Random Forest Regression on House Prices 
---
#### **Name** : Denish Trivedi
#### **Email id** : denishtrivedi008@gmail.com / denishtrivedi@iitgn.ac.in

#### **Preparing the Data**

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [5]:
dataset = pd.read_csv(r'C:\Users\A\Desktop\house_prices.csv')
dataset.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


#### **Missing Values**

In [11]:
# find columns with missing data
col_len = len(dataset)

for col in dataset.columns:
    if (dataset[col].count() < col_len):
        print(col)

LotFrontage
Alley
MasVnrType
MasVnrArea
BsmtQual
BsmtCond
BsmtExposure
BsmtFinType1
BsmtFinType2
Electrical
FireplaceQu
GarageType
GarageYrBlt
GarageFinish
GarageQual
GarageCond
PoolQC
Fence
MiscFeature


In [12]:
# filling missing data where required
# LotFrontage
m = dataset['LotFrontage'].mean()
dataset['LotFrontage'].fillna(value=m, inplace=True)
# Alley
dataset['Alley'].fillna(value='None', inplace=True)
# MasVnrType
dataset['MasVnrType'].fillna(value='None', inplace=True)
# MasVnrArea
m = dataset['MasVnrArea'].mean()
dataset['MasVnrArea'].fillna(value=m, inplace=True)
# BsmtQual
dataset['BsmtQual'].fillna(value='None', inplace=True)
# BsmtCond
dataset['BsmtCond'].fillna(value='None', inplace=True)
# BsmtExposure
dataset['BsmtExposure'].fillna(value='None', inplace=True)
# BsmtFinType1
dataset['BsmtFinType1'].fillna(value='None', inplace=True)
# BsmtFinType2
dataset['BsmtFinType2'].fillna(value='None', inplace=True)
# Electrical
dataset['Electrical'].fillna(value='None', inplace=True)
# FireplaceQu
dataset['FireplaceQu'].fillna(value='None', inplace=True)
# GarageType
dataset['GarageType'].fillna(value='None', inplace=True)
# GarageYrBlt
m = dataset['GarageYrBlt'].mean()
dataset['GarageYrBlt'].fillna(value=m, inplace=True)
# GarageFinish
dataset['GarageFinish'].fillna(value='None', inplace=True)
# GarageQual
dataset['GarageQual'].fillna(value='None', inplace=True)
# GarageCond
dataset['GarageCond'].fillna(value='None', inplace=True)
#PoolQC
dataset['PoolQC'].fillna(value='None', inplace=True)
# Fence
dataset['Fence'].fillna(value='None', inplace=True)
# MiscFeature
dataset['MiscFeature'].fillna(value='None', inplace=True)

#### **Categorical Data**

In [14]:
for col in dataset.columns:
    if (type(dataset[col][0]) == str):
        dataset[col] = dataset[col].astype('category')
        dataset[col] = dataset[col].cat.codes
dataset.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,3,65.0,8450,1,1,3,3,0,...,0,3,4,1,0,2,2008,8,4,208500
1,2,20,3,80.0,9600,1,1,3,3,0,...,0,3,4,1,0,5,2007,8,4,181500
2,3,60,3,68.0,11250,1,1,0,3,0,...,0,3,4,1,0,9,2008,8,4,223500
3,4,70,3,60.0,9550,1,1,0,3,0,...,0,3,4,1,0,2,2006,8,0,140000
4,5,60,3,84.0,14260,1,1,0,3,0,...,0,3,4,1,0,12,2008,8,4,250000


#### **Random Forest Regression Model**

In [24]:
x = dataset.iloc[:, 1:-1].values
y = dataset.iloc[:, -1].values

# Splitting Data 

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

# Model

from sklearn.ensemble import RandomForestRegressor
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=0)
rf_regressor.fit(x_train, y_train)
rf_score = regressor.score(x_train, y_train) 

#### **Gradient Boosting Regression Model**

In [25]:
from sklearn.ensemble import GradientBoostingRegressor
gb_regressor = GradientBoostingRegressor(n_estimators=100, random_state=0)
gb_regressor.fit(x_train, y_train)

# finding the best max_depth for GB
best_depth = 2
gb_score = 0
for i in range(2, 30):
    gb_regressor = GradientBoostingRegressor(n_estimators=100, random_state=0, max_depth=i)
    gb_regressor.fit(x_train, y_train)
    s = gb_regressor.score(x_train, y_train)
    if (s > gb_score):
        gb_score = s
        best_depth = i
        
gb_regressor = GradientBoostingRegressor(n_estimators=100, random_state=0, max_depth=best_depth)
gb_regressor.fit(x_train, y_train)

GradientBoostingRegressor(max_depth=24, random_state=0)

#### **Comparing the Accuracies of the above methods**

In [48]:
y_pred_rf = regressor.predict(x_test)
y_pred_gb = gb_regressor.predict(x_test)

from sklearn.metrics import r2_score
from tabulate import tabulate

my_data = [[rf_score, 0.999992], [r2_score(y_test, y_pred_rf), r2_score(y_test, y_pred_gb)]]
head = ['\033[1mRandom Forest', '\033[1mGradient Boosting']
print(tabulate(my_data, headers=head, tablefmt="grid"))

+-----------------+---------------------+
|   [1mRandom Forest |   [1mGradient Boosting |
|        0.979743 |            0.999992 |
+-----------------+---------------------+
|        0.841875 |            0.795011 |
+-----------------+---------------------+
