In [1]:
import seaborn as sns
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
import pickle

In [2]:
# read the train data
train = pd.read_csv('train_m.csv')

In [3]:
# read the test data
test = pd.read_csv('test_m.csv')

In [4]:
# the train target
train_target = train['SalePrice']
train = train.drop('SalePrice', axis='columns')

In [5]:
# the test id for constructing the submission file
test_dummy = pd.read_csv('test.csv')
test_id = test_dummy['Id']

In [6]:
# peek the train data
train.head()

Unnamed: 0,LotFrontage,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,TotalBsmtSF,GrLivArea,TotRmsAbvGrd,Fireplaces,GarageYrBlt,...,OverallQual_9,OverallQual_10,ExterQual_Ex,ExterQual_Fa,ExterQual_Gd,ExterQual_TA,KitchenQual_Ex,KitchenQual_Fa,KitchenQual_Gd,KitchenQual_TA
0,-0.078896,2003,2003,1.209763,0.779431,0.002513,0.529194,8,0,2003,...,0,0,0,0,1,0,0,0,1,0
1,0.572719,1976,1976,-0.802835,0.888257,0.341114,-0.381965,6,1,1976,...,0,0,0,0,0,1,0,0,0,1
2,0.062541,2001,2002,1.137592,0.654803,0.065397,0.659631,6,1,2001,...,0,0,0,0,1,0,0,0,1,0
3,-0.329561,1915,1970,-0.802835,0.384539,-0.105819,0.541448,7,1,1998,...,0,0,0,0,0,1,0,0,1,0
4,0.726089,2000,2000,1.429789,0.7544,0.256237,1.282295,9,1,2000,...,0,0,0,0,1,0,0,0,1,0


In [7]:
# peek the test data
test.head()

Unnamed: 0,LotFrontage,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,TotalBsmtSF,GrLivArea,TotRmsAbvGrd,Fireplaces,GarageYrBlt,...,OverallQual_9,OverallQual_10,ExterQual_Ex,ExterQual_Fa,ExterQual_Gd,ExterQual_TA,KitchenQual_Ex,KitchenQual_Fa,KitchenQual_Gd,KitchenQual_TA
0,0.623823,1961,1961,-0.778788,0.649628,0.059314,-1.442112,5,0,1961,...,0,0,0,0,0,1,0,0,0,1
1,0.66141,1958,1958,1.017606,0.877826,0.395671,-0.194908,6,0,1958,...,0,0,0,0,0,1,0,0,1,0
2,0.388069,1997,1998,-0.778788,0.825951,0.101016,0.449155,6,1,1997,...,0,0,0,0,0,1,0,0,0,1
3,0.547237,1998,1998,0.387009,0.734201,0.099246,0.400213,7,1,1998,...,0,0,0,0,0,1,0,0,1,0
4,-1.245577,1992,1992,-0.778788,0.456245,0.364847,-0.313771,5,0,1992,...,0,0,0,0,1,0,0,0,1,0


In [8]:
# the model selection process
# we try the linear regression, ridge regression and random forest
# using the first 1000 rows to train the model and 
# validate the model using the remaining rows
# the scoring method used is root-mean-squared-error (RMSE)

In [9]:
linear_regression = LinearRegression()
ridge = Ridge()
random_forest = RandomForestRegressor(n_estimators=600, 
                                      min_samples_split=3, 
                                      min_samples_leaf=1, 
                                      max_features='sqrt',
                                      max_depth=20,
                                      bootstrap=False)

In [10]:
lr_model = linear_regression.fit(train.iloc[:1000], train_target[:1000])
pred = np.expm1(lr_model.predict(train[1000:]))
np.sqrt(np.average(np.square(pred-train_target[1000:])))

194046.2618221252

In [11]:
rg_model = ridge.fit(train.iloc[:1000], train_target[:1000])
pred = np.expm1(rg_model.predict(train[1000:]))
np.sqrt(np.average(np.square(pred-train_target[1000:])))

193704.14148325913

In [12]:
rf_model = random_forest.fit(train.iloc[:1000], train_target[:1000])
pred = np.expm1(rf_model.predict(train[1000:]))
np.sqrt(np.average(np.square(pred-train_target[1000:])))

186486.0917853075

In [13]:
# it seems that random forest performs better than other
# also train the linear regression model to compare their performance
# train the ranmdom forest and linear regression model

In [14]:
# using random search to find better parameters
rsearch_ridge_grid = {'alpha': [x for x in np.linspace(0.1, 10, num=100)],
                      'max_iter': [int(x) for x in np.linspace(1000, 20000, num=20)],
                      'random_state': [4432]}
ridge_rand_search = RandomizedSearchCV(estimator=ridge,
                                       param_distributions=rsearch_ridge_grid,
                                       n_iter=50,
                                       scoring='neg_root_mean_squared_error',
                                       n_jobs=-1)
ridge_rand_search.fit(train, train_target)
ridge_rand_search.best_params_

{'random_state': 4432, 'max_iter': 9000, 'alpha': 0.9}

In [19]:
rsearch_ranfor_grid = {'n_estimators': [int(x) for x in np.linspace(100, 1000, num=10)],
                       'min_samples_split': [2, 3, 4],
                       'min_samples_leaf': [1, 2, 3],
                       'max_features': ['auto', 'sqrt'],
                       'max_depth': [int(x) for x in np.linspace(1, 50, num=50)],
                       'bootstrap': [True, False]}
ranfor_rand_search = RandomizedSearchCV(estimator=random_forest,
                                        param_distributions=rsearch_ranfor_grid,
                                        scoring = 'neg_root_mean_squared_error',
                                        n_jobs=-1)
ranfor_rand_search.fit(train, train_target)
ranfor_rand_search.best_params_

{'n_estimators': 400,
 'min_samples_split': 3,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 43,
 'bootstrap': False}

In [20]:
# train the models, get the predictions, and save them to csv file
rf_model = ranfor_rand_search.best_estimator_
pred = np.expm1(rf_model.predict(test))
result = pd.DataFrame({'Id':test_id, 'SalePrice':pred})
result.to_csv('rf_pred.csv', index=False)

In [21]:
rg_model = ridge_rand_search.best_estimator_
pred = np.expm1(rg_model.predict(test))
result = pd.DataFrame({'Id':test_id, 'SalePrice':pred})
result.to_csv('rg_pred.csv', index=False)

In [22]:
lr_model = linear_regression.fit(train, train_target)
pred = np.expm1(lr_model.predict(test))
result = pd.DataFrame({'Id':test_id, 'SalePrice':pred})
result.to_csv('lr_pred.csv', index=False)

In [24]:
# save the models for reuse
with open('rf_model.mdat', 'wb') as f:
    pickle.dump(rf_model, f)
with open('rd_model.mdat', 'wb') as f:
    pickle.dump(rg_model, f)