In [2]:
# Import the libraries
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns
import plotly.express as px
from scipy import stats
%matplotlib inline
# Empty placeholder frame; none of the visible cells below read `df`.
df = pd.DataFrame()

In [4]:
# Read the raw training and test tables from the local data/ directory.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('data/train.csv', 'data/test.csv')
)

In [52]:
# Average SalePrice over the training rows (shown as the cell output).
df_train.loc[:, 'SalePrice'].mean()

180921.19589041095

In [5]:
# Build the model matrices from the raw train/test tables:
#   * one-hot encode the categorical columns on the stacked train+test frame, so
#     both splits share the same dummy columns (dummy_na=True also adds a column
#     per categorical variable that flags missing values),
#   * fill every remaining missing value with zero,
#   * hold out 20% of the training rows for validation.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

# Remember where the training rows end so the stacked frame can be split back by
# row position. Testing SalePrice against the fillna(0) placeholder instead would
# silently move any training row with a missing or zero price into the test split.
n_train = len(df_train)
df_onehot = pd.get_dummies(pd.concat([df_train, df_test], axis=0), dummy_na=True).fillna(0)
train_rows = df_onehot.iloc[:n_train]

# The target and Id are excluded from the feature matrix.
X = train_rows.drop(['SalePrice', 'Id'], axis=1)
y = train_rows['SalePrice']

# NOTE: this rebinds df_test to the encoded test rows (Id is kept because the
# submission cells read it), so re-run the data-loading cell before running this
# cell a second time.
df_test = df_onehot.iloc[n_train:].drop(['SalePrice'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
from sklearn.linear_model import LinearRegression,Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
import xgboost as xgb
from xgboost import XGBRegressor
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

In [None]:
# XGBoost hyperparameter grid search.
# The grid is the full Cartesian product of the lists below:
# 6 * 5 * 3 * 4 * 4 * 3 = 4320 candidates, i.e. 21600 fits with cv=5.
param_xgb = {'n_estimators': [200, 300, 400, 500, 600, 700],
             'max_depth': [2, 3, 4, 5, 6],
             'learning_rate': [0.05, 0.1, 0.2],
             'gamma': [0.01, 0.05, 0.1, 0.2],
             'reg_alpha': [0.001, 0.005, 0.01, 0.1],
             'subsample': [0.5, 0.6, 0.7]}

# Seed the searched estimator with the same random_state as the final
# XGBRegressor fitted after this search: with subsample < 1 the rows drawn for
# each tree depend on the seed, so the scores compared here should come from the
# same configuration that is refit afterwards.
grid_xgb = GridSearchCV(estimator=XGBRegressor(random_state=42),
                        param_grid=param_xgb,
                        cv=5,
                        verbose=2,
                        n_jobs=-1)

# Time the search and report the elapsed seconds.
start_time = time.time()
grid_xgb.fit(X_train, y_train)
end_time = time.time()
print("{:.2f} seconds".format(end_time - start_time))
grid_xgb.best_params_

Fitting 5 folds for each of 4320 candidates, totalling 21600 fits


In [7]:
# Fit the final XGBoost regressor on the training split with fixed hyperparameters.
xgb_model = XGBRegressor(
    n_estimators=400,
    max_depth=3,
    learning_rate=0.1,
    gamma=0.05,
    reg_alpha=0.005,
    subsample=0.6,
    random_state=42,
)
xgb_model.fit(X_train, y_train)

# Show the hold-out predictions next to the true prices. X_test already excludes
# 'Id' and 'SalePrice' (both are dropped when X is built) and the targets live in
# y_test, so dropping 'Id' or indexing 'SalePrice' on X_test raises a KeyError.
xgb_model.predict(X_test), y_test

In [17]:
# XGBoost submission: predict the encoded test rows and write Id/SalePrice to CSV.
test_features = df_test.drop(['Id'], axis=1)
submission = pd.DataFrame({
    'Id': df_test['Id'],
    'SalePrice': xgb_model.predict(test_features),
})
submission.to_csv('submission.csv', index=False)

In [19]:
# Random forest hyperparameter grid search (5-fold CV over every combination below;
# random_state is pinned to a single value so every candidate uses the same seed).
params_rf = {
    'n_estimators': [20, 40, 60, 80, 100, 120, 140],
    'max_features': [0.2, 0.4, 0.6, 0.8, 1.0],
    'max_depth': [2, 3, 4, None],
    'min_samples_leaf': [1, 2, 3, 4, 5, 6],
    'random_state': [24],
}
rf_estimator = RandomForestRegressor()
grid_rf = GridSearchCV(rf_estimator, params_rf, cv=5, verbose=2, n_jobs=-1)
grid_rf.fit(X_train, y_train)

# Display the winning parameter combination.
grid_rf.best_params_

Fitting 5 folds for each of 840 candidates, totalling 4200 fits


{'max_depth': None,
 'max_features': 0.2,
 'min_samples_leaf': 1,
 'n_estimators': 80,
 'random_state': 24}

In [20]:
# Random forest fitted on the training split with hand-set hyperparameters.
# NOTE(review): these values differ from the grid-search best_params_ printed by
# the preceding cell (n_estimators=80, max_features=0.2, min_samples_leaf=1) —
# confirm the difference is intentional.
rf_settings = dict(
    n_estimators=120,
    max_features=0.4,
    max_depth=None,
    min_samples_leaf=2,
    random_state=24,
)
RF = RandomForestRegressor(**rf_settings)
RF.fit(X_train, y_train)

In [21]:
# Random forest submission: attach the predictions to the test Ids and write the CSV.
rf_predictions = RF.predict(df_test.drop(['Id'], axis=1))
submission = df_test[['Id']].assign(SalePrice=rf_predictions)
submission.to_csv('submission.csv', index=False)

In [26]:
# Lasso hyperparameter grid search over alpha and max_iter (5-fold CV).
params_Lasso = {
    'alpha': [5, 10, 20, 30, 40, 50, 100, 150, 200, 250, 300, 500],
    'max_iter': [1000, 1500, 2000],
}
lasso_estimator = Lasso()
grid_Lasso = GridSearchCV(lasso_estimator, params_Lasso, cv=5, verbose=2, n_jobs=-1)

# Time the search and report the elapsed seconds.
start_time = time.time()
grid_Lasso.fit(X_train, y_train)
end_time = time.time()
print("{:.2f} seconds".format(end_time - start_time))

# Display the winning parameter combination.
grid_Lasso.best_params_

Fitting 5 folds for each of 36 candidates, totalling 180 fits
1.84 seconds


{'alpha': 200, 'max_iter': 1000}

In [8]:
# Lasso regression fitted on the training split with the chosen alpha / max_iter.
lasso_settings = dict(alpha=200, max_iter=1000)
LR_Lasso = Lasso(**lasso_settings)
LR_Lasso.fit(X_train, y_train)

In [29]:
# Lasso submission: predict the encoded test rows and write Id/SalePrice to CSV.
test_features = df_test.drop(['Id'], axis=1)
submission = pd.DataFrame({
    'Id': df_test['Id'],
    'SalePrice': LR_Lasso.predict(test_features),
})
submission.to_csv('submission.csv', index=False)

In [58]:
# LightGBM hyperparameter grid search (5-fold CV). Only n_estimators and
# bagging_freq take several values; every other parameter is pinned to one value.
params_LGB = {
    'num_leaves': [30],
    'max_depth': [30],
    'learning_rate': [0.03],
    'n_estimators': [200, 300, 400],
    'feature_fraction': [0.8],
    'bagging_fraction': [0.8],
    'bagging_freq': [3, 4, 5, 6, 7],
}
lgb_estimator = lgb.LGBMRegressor()
grid_LGB = GridSearchCV(lgb_estimator, params_LGB, cv=5, verbose=2, n_jobs=-1)

# Time the search and report the elapsed seconds.
start_time = time.time()
grid_LGB.fit(X_train, y_train)
end_time = time.time()
print("{:.2f} seconds".format(end_time - start_time))

# Display the winning parameter combination.
grid_LGB.best_params_

Fitting 5 folds for each of 15 candidates, totalling 75 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005630 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3218
[LightGBM] [Info] Number of data points in the train set: 1168, number of used features: 193
[LightGBM] [Info] Start training from score 181441.541952
58.29 seconds


{'bagging_fraction': 0.8,
 'bagging_freq': 5,
 'feature_fraction': 0.8,
 'learning_rate': 0.03,
 'max_depth': 30,
 'n_estimators': 300,
 'num_leaves': 30}

In [10]:
# LightGBM regressor fitted on the training split with the chosen hyperparameters.
lgb_settings = dict(
    num_leaves=30,
    learning_rate=0.03,
    max_depth=30,
    n_estimators=300,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=5,
)
LGB = lgb.LGBMRegressor(**lgb_settings)
LGB.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003826 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3218
[LightGBM] [Info] Number of data points in the train set: 1168, number of used features: 193
[LightGBM] [Info] Start training from score 181441.541952


In [61]:
# LightGBM submission: attach the predictions to the test Ids and write the CSV.
lgb_predictions = LGB.predict(df_test.drop(['Id'], axis=1))
submission = df_test[['Id']].assign(SalePrice=lgb_predictions)
submission.to_csv('submission.csv', index=False)



In [21]:
# Test-set predictions from the LightGBM (a) and XGBoost (b) models; neither name
# is read by the visible cells that follow.
test_features = df_test.drop(['Id'], axis=1)
a = LGB.predict(test_features)
b = xgb_model.predict(test_features)



In [28]:
# Equal-weight ensemble: element-wise mean of the LightGBM, XGBoost and Lasso
# test predictions, written out as the submission file.
test_features = df_test.drop(['Id'], axis=1)
model_preds = [model.predict(test_features) for model in (LGB, xgb_model, LR_Lasso)]
submission = pd.DataFrame({
    'Id': df_test['Id'],
    'SalePrice': np.mean(model_preds, axis=0),
})
submission.to_csv('submission.csv', index=False)



In [None]:
# Equal-weight ensemble of the LightGBM, XGBoost and Lasso test predictions.
# NOTE(review): this cell repeats the preceding equal-weight ensemble cell
# statement for statement — confirm whether it is still needed.
test_features = df_test.drop(['Id'], axis=1)
lgb_pred = LGB.predict(test_features)
xgb_pred = xgb_model.predict(test_features)
lasso_pred = LR_Lasso.predict(test_features)
submission = df_test[['Id']].assign(
    SalePrice=np.mean([lgb_pred, xgb_pred, lasso_pred], axis=0)
)
submission.to_csv('submission.csv', index=False)

In [30]:
# Weighted ensemble: 0.7 * LightGBM + 0.2 * XGBoost + 0.1 * Lasso test predictions.
test_features = df_test.drop(['Id'], axis=1)
lgb_pred = LGB.predict(test_features)
xgb_pred = xgb_model.predict(test_features)
lasso_pred = LR_Lasso.predict(test_features)
submission = pd.DataFrame({
    'Id': df_test['Id'],
    'SalePrice': 0.7*lgb_pred + 0.2*xgb_pred + 0.1*lasso_pred,
})
submission.to_csv('submission.csv', index=False)



In [32]:
# Two-model ensemble: 0.5 * LightGBM + 0.5 * Lasso test predictions.
test_features = df_test.drop(['Id'], axis=1)
lgb_pred = LGB.predict(test_features)
lasso_pred = LR_Lasso.predict(test_features)
submission = df_test[['Id']].assign(SalePrice=0.5*lgb_pred + 0.5*lasso_pred)
submission.to_csv('submission.csv', index=False)

