In [1]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split,RandomizedSearchCV

In [2]:
X = pd.read_csv('train.csv')
X_test = pd.read_csv('test.csv')

(1459, 80)

In [3]:
X = X.drop('Id', axis = 1)
Id = X_test['Id']
X_test = X_test.drop('Id', axis = 1)

(1460, 80)

In [4]:
y = X['SalePrice']
X = X.drop('SalePrice', axis = 1)

In [5]:
X.shape , y.shape, X_test.shape

((1460, 79), (1460,), (1459, 79))

In [6]:
#Removing the features that have more than 1000 null values like PoolQC, Miscfeature
X.dropna(thresh = 1000, inplace = True, axis = 'columns')
X_test.dropna(thresh = 1000 , inplace = True, axis = 'columns')
X_test.shape

(1459, 74)

In [7]:
#Filling the mean value at places where there is NaN
X.fillna(X.mean() , inplace = True)
X_test.fillna(X.mean() , inplace = True)
np.sum(X.isnull().max())

11

In [8]:
# we still have some missing values
X.fillna(value = "Missing", inplace = True)
X_test.fillna(value = "Missing", inplace = True)
np.sum(X.isnull().max())

0

In [9]:
#Label Encoding for categorical features
le = preprocessing.LabelEncoder()
X = X.apply(le.fit_transform)
X_test = X_test.apply(le.fit_transform)
X.head(20)

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,5,3,36,327,1,3,3,0,4,0,...,49,0,0,0,0,0,1,2,8,4
1,0,3,52,498,1,3,3,0,2,0,...,0,0,0,0,0,0,4,1,8,4
2,5,3,39,702,1,0,3,0,4,0,...,30,0,0,0,0,0,8,2,8,4
3,6,3,31,489,1,0,3,0,0,0,...,24,108,0,0,0,0,1,0,8,0
4,5,3,56,925,1,0,3,0,2,0,...,70,0,0,0,0,0,11,2,8,4
5,4,3,57,915,1,0,3,0,4,0,...,20,0,17,0,0,10,9,3,8,4
6,0,3,47,552,1,3,3,0,4,0,...,45,0,0,0,0,0,7,1,8,4
7,5,3,42,593,1,0,3,0,0,0,...,156,94,0,0,0,2,10,3,8,4
8,4,4,22,138,1,3,3,0,4,0,...,0,83,0,0,0,0,3,2,8,0
9,14,3,21,222,1,3,3,0,0,0,...,1,0,0,0,0,0,0,2,8,4


In [10]:
X_train,X_valid,y_train,y_valid = train_test_split(X,y,test_size = 0.3,random_state = 17)

In [11]:
rand_forest = RandomForestRegressor(n_jobs = 1, random_state = 17,max_features = 22, max_depth = 14)
rand_forest.fit(X_train,y_train)

RandomForestRegressor(max_depth=14, max_features=22, n_jobs=1, random_state=17)

In [12]:
pred = rand_forest.predict(X_valid)
print("MSE for training set %.3f " %  np.sqrt(mean_squared_error(y_train, rand_forest.predict(X_train))))
print("MSE for validation set %.3f " %  np.sqrt(mean_squared_error(y_valid, pred)))

MSE for training set 11511.717 
MSE for validation set 27763.102 


### XGBoost and RandomizedCV

In [22]:
from xgboost import XGBRegressor

In [23]:
xgboost = XGBRegressor(random_state = 17, n_estimators = 1000,learning_rate = 0.005)
xgboost.fit(X_train,y_train)
print("MSE for training set %.3f " %  np.sqrt(mean_squared_error(y_train, xgboost.predict(X_train))))
print("MSE for validation set %.3f " %  np.sqrt(mean_squared_error(y_valid, xgboost.predict(X_valid))))
params = {'learning_rate': np.logspace(-1,-5), 'subsample': [0.1,0.2,0.4,0.5,1]}


MSE for training set 7714.773 
MSE for validation set 28016.100 


In [24]:
randomized= RandomizedSearchCV(xgboost, params,verbose = True, random_state = 17)
randomized.fit(X,y)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  4.6min finished


RandomizedSearchCV(estimator=XGBRegressor(base_score=0.5, booster='gbtree',
                                          colsample_bylevel=1,
                                          colsample_bynode=1,
                                          colsample_bytree=1, gamma=0,
                                          gpu_id=-1, importance_type='gain',
                                          interaction_constraints='',
                                          learning_rate=0.005, max_delta_step=0,
                                          max_depth=6, min_child_weight=1,
                                          missing=nan,
                                          monotone_constraints='()',
                                          n_estimators=1000, n_jobs=0,
                                          num_parallel_tree...
       5.17947468e-04, 4.29193426e-04, 3.55648031e-04, 2.94705170e-04,
       2.44205309e-04, 2.02358965e-04, 1.67683294e-04, 1.38949549e-04,
       1.15139540e-04, 9.

In [25]:
randomized.best_estimator_

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.018420699693267165, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=1000, n_jobs=0, num_parallel_tree=1, random_state=17,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=0.5,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [26]:
prediction3 = randomized.predict(X_valid)
print("MSE for training set %.3f " %  np.sqrt(mean_squared_error(y_train, randomized.predict(X_train))))
print("MSE for validation set %.3f " %  np.sqrt(mean_squared_error(y_valid, prediction3)))

MSE for training set 4278.331 
MSE for validation set 4529.126 


In [30]:
p = randomized.predict(X_test)
submit= pd.DataFrame()
submit['Id'] = Id 
submit['SalePrice'] = p
submit.to_csv('submission_XGBoostTrees.csv', index = False)
submit.head(20)

Unnamed: 0,Id,SalePrice
0,1461,127741.679688
1,1462,159128.4375
2,1463,179058.421875
3,1464,186858.4375
4,1465,190846.703125
5,1466,168710.625
6,1467,169307.15625
7,1468,163263.90625
8,1469,183145.328125
9,1470,128788.859375
