In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

from utils import save_submission

In [2]:
# Read the training and test CSVs into `data` and `test`, then preview the training frame.
data, test = (
    pd.read_csv(csv_path)
    for csv_path in ('dataset/house-prices/train.csv',
                     'dataset/house-prices/test.csv')
)
data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


### Feature engineering

In [4]:
# Categorical columns whose missing entries are replaced by the explicit
# placeholder category 'na' in both frames.
NA_list=['Alley', 'Fence', 'FireplaceQu', 'GarageType','GarageCond','GarageQual','GarageFinish','BsmtFinType2',
         'BsmtExposure','BsmtQual','BsmtCond','BsmtFinType1','MasVnrType']
data[NA_list]=data[NA_list].fillna('na')
test[NA_list]=test[NA_list].fillna('na')

# Electrical: fill missing entries with the most frequent value of the same frame.
# The rows to fill are located on each frame separately: the row labels of the
# training gaps (`ind`) say nothing about which rows of `test` are missing, and
# reusing them would overwrite valid test rows while leaving real gaps untouched.
ind=data.index[data['Electrical'].isnull()]
data.loc[ind,'Electrical']=data['Electrical'].value_counts().keys()[0]
test.loc[test['Electrical'].isnull(),'Electrical']=test['Electrical'].value_counts().keys()[0]

# LotFrontage: missing values become 0.
data['LotFrontage']=data['LotFrontage'].fillna(0)
test['LotFrontage']=test['LotFrontage'].fillna(0)

# MasVnrArea: missing values become 0 (row labels kept in ind_train / ind_test).
ind_train=data.index[data['MasVnrArea'].isnull()]
data.loc[ind_train,'MasVnrArea']=0

ind_test=test.index[test['MasVnrArea'].isnull()]
test.loc[ind_test,'MasVnrArea']=0

# Columns that are replaced by integer label codes.
# NOTE(review): 'MasVnrArea' is filled with the number 0 earlier in this cell, so it
# looks numeric rather than categorical -- confirm it is meant to be label-encoded.
cols = ('FireplaceQu', 'BsmtQual', 'BsmtCond', 'Utilities', 'GarageQual', 'GarageCond', 
        'ExterQual', 'ExterCond','HeatingQC', 'PoolQC', 'KitchenQual', 'BsmtFinType1', 
        'BsmtFinType2', 'Functional', 'Fence', 'BsmtExposure', 'GarageFinish', 'LandSlope',
        'LotShape', 'PavedDrive', 'Street', 'Alley', 'CentralAir', 'MSSubClass', 'OverallCond', 
        'YrSold', 'MoSold', 'MSZoning', 'LandContour', 'LotConfig', 'Neighborhood',
        'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
        'Exterior2nd', 'MasVnrType', 'MasVnrArea', 'Foundation', 'GarageType', 'MiscFeature', 
        'SaleType', 'SaleCondition', 'Electrical', 'Heating')


def _fit_label_encoders(frames, cols):
    """Fit one LabelEncoder per column on the values of all ``frames`` combined.

    Args:
        frames: iterable of DataFrames that each contain every column in ``cols``.
        cols: iterable of column names to fit encoders for.

    Returns:
        dict mapping column name -> fitted LabelEncoder.
    """
    encoders = {}
    for c in cols:
        # NOTE(review): values are handed over as a plain Python list, exactly as in
        # encode_labels; this presumably lets numpy coerce mixed str/NaN columns to
        # strings -- confirm before passing the raw array instead.
        combined = [v for frame in frames for v in frame[c].values]
        encoders[c] = LabelEncoder().fit(combined)
    return encoders


def encode_labels(dataset, cols, encoders=None):
    """Replace each column in ``cols`` of ``dataset`` with integer label codes.

    Args:
        dataset: DataFrame to encode; it is modified in place and also returned.
        cols: iterable of column names to encode.
        encoders: optional dict of column name -> already fitted LabelEncoder.
            When given, those encoders are used, so several frames encoded with
            the same dict share one category -> code mapping. When omitted, a
            fresh encoder is fitted on ``dataset`` alone for every column.

    Returns:
        The same ``dataset`` object with the columns encoded.
    """
    for c in cols:
        values = list(dataset[c].values)
        if encoders is None:
            lbl = LabelEncoder()
            lbl.fit(values)
        else:
            lbl = encoders[c]
        dataset[c] = lbl.transform(values)
    return dataset


# Fit the encoders once on train + test so that a category receives the same code
# in both frames; fitting each frame separately yields mismatched codes whenever
# the two frames do not contain exactly the same set of categories.
label_encoders = _fit_label_encoders((data, test), cols)
data = encode_labels(data, cols, label_encoders)
test = encode_labels(test, cols, label_encoders)

In [5]:
# Training-set columns that still contain missing values, most affected first.
features_na = data.isna().sum().sort_values(ascending=False)
features_na[features_na.gt(0)]

GarageYrBlt    81
dtype: int64

In [6]:
# Test-set columns that still contain missing values, most affected first.
features_na = test.isna().sum().sort_values(ascending=False)
features_na[features_na.gt(0)]

GarageYrBlt     78
BsmtFullBath     2
BsmtHalfBath     2
BsmtFinSF2       1
BsmtUnfSF        1
TotalBsmtSF      1
BsmtFinSF1       1
GarageCars       1
GarageArea       1
dtype: int64

In [7]:
# Fill the remaining gaps with each column's most frequent value, computed on
# the frame that is being filled (train first, then test).
remaining_gaps = (
    (data, ['GarageYrBlt']),
    (test, ['GarageYrBlt', 'BsmtFullBath', 'BsmtHalfBath', 'BsmtFinSF2', 'BsmtUnfSF',
            'TotalBsmtSF', 'BsmtFinSF1', 'GarageCars', 'GarageArea']),
)
for frame, gap_cols in remaining_gaps:
    for col in gap_cols:
        most_frequent = frame[col].value_counts().keys()[0]
        missing_rows = frame[frame[col].isnull()].index
        frame.loc[missing_rows, col] = most_frequent

### Preparing data for prediction

In [9]:
# Target vector y is the SalePrice column; feature matrix X is every other column.
# NOTE(review): 'Id' is not removed here, so it is part of X -- confirm that is intended.
X = data.drop('SalePrice', axis=1).values
y = data['SalePrice'].values

### Build model and predict house prices

In [10]:
# Extra-trees baseline: fit on the first 1000 rows, evaluate on the remaining rows.
# random_state is pinned so that re-running the cell reproduces the same score.
ETR = ExtraTreesRegressor(n_estimators=100, max_depth=4, random_state=1)
ETR.fit(X[:1000], y[:1000])
# NOTE: for scikit-learn regressors .score() is the R^2 coefficient, not a
# classification accuracy, despite the wording of the message.
print(f"Accuracy --> {ETR.score(X[1000:], y[1000:])*100}%")

Accuracy --> 76.49035969927989%


In [11]:
# Random-forest baseline: fit on the first 1000 rows, evaluate on the remaining rows.
RFR = RandomForestRegressor(
    n_estimators=10,
    max_depth=4,
    random_state=1,
).fit(X[:1000], y[:1000])
print(f"Accuracy --> {RFR.score(X[1000:], y[1000:])*100}%")

Accuracy --> 78.37489194572147%


In [12]:
# Gradient-boosting baseline: fit on the first 1000 rows, evaluate on the remaining rows.
# random_state is pinned so that re-running the cell reproduces the same score.
GBR = GradientBoostingRegressor(n_estimators=100, max_depth=4, random_state=1)
GBR.fit(X[:1000], y[:1000])
# NOTE: for scikit-learn regressors .score() is the R^2 coefficient, not a
# classification accuracy, despite the wording of the message.
print(f"Accuracy --> {GBR.score(X[1000:], y[1000:])*100}%")

Accuracy --> 86.84070126017602%


### Predict house prices on test dataset, send submission to kaggle.com

In [13]:
# Refit the gradient-boosting model on every labelled row, then predict the test frame.
GBR.fit(X, y)
X_test = test.values
y_predicted = GBR.predict(X_test)

In [14]:
# Pass the test-set Id column and the predictions to the project's save_submission helper.
save_submission(test['Id'], y_predicted)

'submissions/submission-20190610-020741.csv'