In [1]:
import numpy as np
import pandas as pd
import math
import matplotlib
import matplotlib.pyplot as plt
from scipy import stats
import seaborn as sns

%matplotlib inline

# Read the training and test sets from the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

# Keep the Id columns around: preprocessing drops 'Id' from both frames, and
# test_id is needed later to label the submission rows.
train_id, test_id = train['Id'], test['Id']

In [2]:
# Columns holding codes / calendar values that are cast to 'category'.
CATEGORICAL_COLUMNS = ['MSSubClass', 'MoSold', 'YrSold', 'YearBuilt', 'YearRemodAdd']

# Training rows excluded from the fit, identified by Id
# (presumably outliers -- TODO confirm and document the reason).
DROPPED_TRAIN_IDS = [524, 1299]


def engineer_features(houses):
    """Return a cleaned copy of ``houses`` with the engineered columns added.

    The same steps are applied to the training and the test frame so the two
    pipelines cannot drift apart:

    * ``hasBsmt`` / ``hasGarage`` / ``wasRemod`` yes/no flags,
    * the columns in ``CATEGORICAL_COLUMNS`` cast to ``category``,
    * ``TotalSF`` = basement + first floor + second floor area,
    * ``PoolArea`` and ``Id`` dropped,
    * missing values in int64/float64 columns replaced by 0, and float64
      columns cast to int.

    Parameters
    ----------
    houses : pandas.DataFrame
        Frame as read from ``train.csv`` or ``test.csv``; must still contain
        the ``Id`` and ``PoolArea`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is not modified.
    """
    houses = houses.copy()

    # A missing basement area is treated as 0 sq ft. Comparing the raw value
    # with 0 would label NaN rows 'yes', and the NaN would propagate into
    # TotalSF, which the numeric fill below would then reset to 0.
    basement_sf = houses['TotalBsmtSF'].fillna(0)

    houses['hasBsmt'] = np.where(basement_sf == 0, 'no', 'yes')
    houses['hasGarage'] = np.where(houses['GarageYrBlt'].isnull(), 'no', 'yes')
    # Compared on the raw year values, i.e. before the category conversion below.
    houses['wasRemod'] = np.where(houses['YearRemodAdd'] == houses['YearBuilt'], 'no', 'yes')

    for column in CATEGORICAL_COLUMNS:
        houses[column] = houses[column].astype('category')

    houses['TotalSF'] = basement_sf + houses['1stFlrSF'] + houses['2ndFlrSF']
    houses = houses.drop(labels = ['PoolArea', 'Id'], axis = 1)

    for column in houses.columns:
        if houses[column].dtype == 'int64' or houses[column].dtype == 'float64':
            filled = houses[column].fillna(0)
            # Only float columns need the cast; it truncates any fractional part.
            houses[column] = filled.astype(int) if filled.dtype == 'float64' else filled

    return houses


# The excluded rows are removed while 'Id' is still available.
train = engineer_features(train[~train['Id'].isin(DROPPED_TRAIN_IDS)])
test = engineer_features(test)

X_train = train.drop(labels = 'SalePrice', axis = 1)
y_train = train.SalePrice

In [3]:
from sklearn.preprocessing import LabelEncoder

# Encode every string column with ONE encoder fitted on the union of the
# training and test values. Fitting a separate encoder per frame would assign
# integer codes independently, so the same category could receive different
# codes in X_train and test and the model's predictions on test would be
# based on mismatched features.
for c in X_train.columns:
    if X_train[c].dtype == 'object' or test[c].dtype == 'object':
        # Need to convert the column type to string in order to encode missing values
        train_values = X_train[c].astype(str)
        test_values = test[c].astype(str)

        le = LabelEncoder()
        le.fit(pd.concat([train_values, test_values]))

        X_train[c] = le.transform(train_values)
        test[c] = le.transform(test_values)

In [4]:
from sklearn import ensemble

# Baseline forest. The seed is fixed so that the fitted trees, and therefore
# the feature importances and scores computed from this model, are the same
# on every run of the notebook.
randomForest = ensemble.RandomForestRegressor(
    n_estimators = 100,
    min_samples_split = 2,
    min_samples_leaf = 1,
    random_state = 42)

In [5]:
# Fit on the full training set and display the score on that same data.
# This is an in-sample figure, so it says nothing about generalisation.
randomForest.fit(X_train, y_train).score(X_train, y_train)

0.98539064467796389

In [6]:
# Eliminate unimportant features
# Features whose importance in the fitted forest is at or below this value
# are discarded.
IMPORTANCE_THRESHOLD = 1e-4

feature_importance = list(zip(X_train.columns, randomForest.feature_importances_))
important_features = [name for name, importance in feature_importance
                      if importance > IMPORTANCE_THRESHOLD]

# Select the surviving columns in one step instead of dropping the others one
# by one (each drop copies the whole frame). Indexing test with the same list
# also guarantees that it has exactly the columns of X_train, in the same order.
X_train = X_train[important_features]
test = test[important_features]

X_train.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Alley,LotShape,LandContour,LotConfig,LandSlope,Neighborhood,...,3SsnPorch,ScreenPorch,PoolQC,Fence,MoSold,YrSold,SaleType,SaleCondition,wasRemod,TotalSF
0,60,3,65,8450,2,3,3,4,0,5,...,0,0,3,4,2,2008,8,4,0,2566
1,20,3,80,9600,2,3,3,2,0,24,...,0,0,3,4,5,2007,8,4,0,2524
2,60,3,68,11250,2,0,3,4,0,5,...,0,0,3,4,9,2008,8,4,1,2706
3,70,3,60,9550,2,0,3,0,0,6,...,0,0,3,4,2,2006,8,0,1,2473
4,60,3,84,14260,2,0,3,2,0,15,...,0,0,3,4,12,2008,8,4,0,3343


In [10]:
# sklearn.cross_validation was removed in scikit-learn 0.20; the function
# lives in sklearn.model_selection.
from sklearn.model_selection import cross_val_score

# Mean 5-fold negative MSE of the baseline forest. `scoring` is passed by
# keyword: in model_selection.cross_val_score the argument after `y` is not
# `scoring`, so a positional string would be misinterpreted or rejected.
cross_val_score(
    ensemble.RandomForestRegressor(
        n_estimators = 100, min_samples_split = 2, min_samples_leaf = 1,
        random_state = 42),
    X_train, y_train, scoring = 'neg_mean_squared_error', cv = 5
).mean()



-692575446.77453959

In [11]:
from sklearn.metrics import mean_squared_error

def rmsle(y, y_pred):
    """Root mean squared error between the natural logs of ``y`` and ``y_pred``.

    Both inputs are passed through ``np.log`` (not ``log1p``), so every value
    is expected to be strictly positive.
    """
    log_actual = np.log(y)
    log_predicted = np.log(y_pred)
    mean_squared_log_error = mean_squared_error(log_actual, log_predicted)
    return np.sqrt(mean_squared_log_error)

In [12]:
# Refit on the current X_train and evaluate the log-scale error on the
# training data itself (in-sample, hence optimistic).
y_pred = randomForest.fit(X_train, y_train).predict(X_train)
rmsle(y_train, y_pred)

0.054631777543359995

In [21]:
# Predict sale prices for the preprocessed test frame with the fitted forest.
test_pred = randomForest.predict(X = test)

In [22]:
# Two-column submission table: the test-set Ids saved before preprocessing,
# next to the predicted prices. Written without the index column.
RFSubmission = test_id.to_frame(name = 'Id').assign(SalePrice = test_pred)
RFSubmission.to_csv('RFSubmission.csv', index = False)

In [22]:
#Parameter tuning using Bayesian Optimization
from bayes_opt import BayesianOptimization

def rfrcv(n_estimators, min_samples_leaf, min_samples_split):
    """Objective for the optimiser: mean 5-fold negative MSE of a random forest.

    The three hyper-parameters arrive as floats and are truncated to integers
    with ``int()`` before being handed to ``RandomForestRegressor``. The forest
    is seeded, so the same parameters always give the same score.

    Returns
    -------
    float
        Mean of the per-fold ``neg_mean_squared_error`` scores on
        ``X_train`` / ``y_train`` (higher is better).
    """
    # Imported here from sklearn.model_selection: the legacy
    # sklearn.cross_validation module was removed in scikit-learn 0.20.
    from sklearn.model_selection import cross_val_score

    forest = ensemble.RandomForestRegressor(
        n_estimators = int(n_estimators),
        min_samples_leaf = int(min_samples_leaf),
        min_samples_split = int(min_samples_split),
        random_state = 42)

    # `scoring` must be given by keyword: the argument following `y` in
    # model_selection.cross_val_score is not `scoring`.
    fold_scores = cross_val_score(
        forest, X_train, y_train,
        scoring = 'neg_mean_squared_error', cv = 5)
    return fold_scores.mean()

# Optimiser over the three forest hyper-parameters; each entry is the
# (lower, upper) bound explored for that argument of rfrcv. Because rfrcv
# returns a negative MSE, maximising it minimises the cross-validated error.
rfrBo = BayesianOptimization(
    rfrcv,
    dict(
        n_estimators = (1, 100),
        min_samples_leaf = (1, 25),
        min_samples_split = (2, 25),
    ),
)

rfrBo.maximize(n_iter = 10)

[31mInitialization[0m
[94m-----------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   min_samples_leaf |   min_samples_split |   n_estimators | 
    1 | 00m12s | [35m-1078622978.98972[0m | [32m           13.2294[0m | [32m            15.4737[0m | [32m       58.4121[0m | 
    2 | 00m14s | -1123955728.43137 |            16.5971 |             19.3374 |        72.1824 | 
    3 | 00m05s | [35m-1067942012.80769[0m | [32m           11.2193[0m | [32m             3.1613[0m | [32m       36.0996[0m | 
    4 | 00m12s | -1174065413.25404 |            18.9783 |             20.5932 |        75.7847 | 
    5 | 00m08s | -1083430487.92001 |            13.1442 |              3.5506 |        49.1210 | 
[31mBayesian Optimization[0m
[94m-----------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   min_samples_leaf |   min_samples_split |   n_estimators | 
   

AttributeError: 'dict' object has no attribute 'x'