In [35]:
# import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor

from sklearn.model_selection import PredefinedSplit, RandomizedSearchCV
from sklearn.metrics import r2_score, mean_squared_error

from scipy.stats import loguniform as sp_loguniform
from scipy.stats import randint as sp_randint

import statsmodels.api as sm
import statsmodels.formula.api as smf

import warnings
warnings.filterwarnings('ignore')

In [36]:
# get training, validation and test data
X_train_sf = pd.read_csv('X_train_sf.csv')
X_train_sf.index = X_train_sf['Unnamed: 0']
X_train_sf = X_train_sf.drop(['Unnamed: 0'], axis=1)
X_val_sf = pd.read_csv('X_val_sf.csv')
X_val_sf.index = X_val_sf['Unnamed: 0']
X_val_sf = X_val_sf.drop(['Unnamed: 0'], axis=1)
X_test_sf = pd.read_csv('X_test_sf.csv')
X_test_sf.index = X_test_sf['Unnamed: 0']
X_test_sf = X_test_sf.drop(['Unnamed: 0'], axis=1)
y_train_sf = pd.read_csv('y_train_sf.csv')
y_train_sf.index = y_train_sf['Unnamed: 0']
y_train_sf = y_train_sf.drop(['Unnamed: 0'], axis=1)
y_val_sf = pd.read_csv('y_val_sf.csv')
y_val_sf.index = y_val_sf['Unnamed: 0']
y_val_sf = y_val_sf.drop(['Unnamed: 0'], axis=1)
y_test_sf = pd.read_csv('y_test_sf.csv')
y_test_sf.index = y_test_sf['Unnamed: 0']
y_test_sf = y_test_sf.drop(['Unnamed: 0'], axis=1)

In [8]:
# create a predefined validation set for following random search
y_train_plus_val_sf = pd.concat([y_train_sf, y_val_sf])
y_train_plus_val_sf_copy = y_train_plus_val_sf.copy()
y_train_plus_val_sf_copy.columns = ['train_val_split']
y_train_plus_val_sf_copy.loc[y_train_sf.index,'train_val_split'] = -1
y_train_plus_val_sf_copy.loc[y_val_sf.index,'train_val_split'] = 0
val_fold_sf = np.array(y_train_plus_val_sf_copy)
ps_sf = PredefinedSplit(val_fold_sf)

# get training plus validation set
X_train_plus_val_sf = pd.concat([X_train_sf, X_val_sf])

In [30]:
lr = LinearRegression().fit(X_train_sf,y_train_sf)
lr.score(X_train_sf,y_train_sf)
mean_squared_error(y_test_sf, lr.predict(X_test_sf),squared=False)

269.84328197000224

In [37]:
y_train_plus_val_sf

Unnamed: 0_level_0,sale_price
Unnamed: 0,Unnamed: 1_level_1
222564,5.0
197559,90.0
241099,3.0
160125,445.0
102261,75.0
...,...
240777,71.5
189764,114.9
188542,147.0
256875,60.0


In [28]:
y_val_sf

Unnamed: 0_level_0,sale_price
Unnamed: 0,Unnamed: 1_level_1
71651,35.0
84240,330.0
179897,194.0
87412,758.0
92786,257.0
...,...
240777,71.5
189764,114.9
188542,147.0
256875,60.0


Unnamed: 0_level_0,basements_None,basements_full,basements_partial,central_air_N,central_air_Y,exterior_condition_above average,exterior_condition_average,exterior_condition_below average,exterior_condition_new,exterior_condition_none,...,lat,lng,sale_year,sale_month,sale_week,sale_dow,age,bath_total_ratio,bed_total_ratio,livable_area_ratio
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
71651,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.284588,0.001694,1.311388,-1.734310,1.488049,-0.846561,0.023948,-0.292255,0.067536,0.004251
84240,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.817876,0.296815,0.946969,0.395043,-0.036652,1.127286,-0.097513,3.076335,-0.823611,-0.013471
179897,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.440902,1.168605,-0.146287,-0.821730,0.725699,1.127286,-0.291852,-0.292255,0.067536,-0.014201
87412,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,-0.227349,-1.065587,0.946969,0.090850,-0.036652,1.127286,-0.401167,1.238922,-0.823611,0.014571
92786,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,-0.534336,-1.329952,0.946969,-0.213343,-1.561352,-1.504510,0.145410,-0.292255,0.067536,0.007626
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
240777,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,-0.650649,-0.224057,-1.239542,-1.430117,-0.799002,0.469337,0.266872,-0.292255,0.067536,0.003967
189764,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.064414,0.095087,-0.510705,-1.430117,0.725699,0.469337,0.206141,-0.292255,0.067536,-0.005223
188542,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,-1.671015,-1.986651,-0.146287,-1.430117,0.725699,0.469337,-0.279705,-0.292255,0.067536,-0.011695
256875,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.857962,0.249235,-1.968380,1.003430,1.488049,1.127286,0.023948,-0.598490,0.958684,-0.017725


### Model 1: Linear regression

In [24]:
# random search for linear regression
param_lr_sf = {'fit_intercept': [True,False],
               'normalize':[True,False]}

lr_cv_sf = RandomizedSearchCV(
           LinearRegression(),
           param_distributions=param_lr_sf,
           n_iter=20,
           cv=ps_sf,
           scoring='neg_root_mean_squared_error', 
           n_jobs=-1,
           random_state=123
).fit(X_train_plus_val_sf, np.array(y_train_plus_val_sf).ravel())

# find best model hyperparameters 
print(lr_cv_sf.best_params_)
print(lr_cv_sf.best_estimator_)
print('Min RMSE is: {}'.format(-lr_cv_sf.best_score_))

{'normalize': True, 'fit_intercept': False}
LinearRegression(fit_intercept=False, normalize=True)
Min RMSE is: 23479006242.85987


In [33]:
lr_r2_sf = r2_score(y_val_sf, lr_cv_sf.best_estimator_.predict(X_val_sf))
print(lr_r2_sf)

0.3389078027320038


### Model 2: Lasso

In [34]:
# random search for lasso
param_la_sf = {'alpha': sp_uniform(1e-4,1e2),
            'fit_intercept':[True,False],
            'normalize':[True,False]}

la_cv_sf = RandomizedSearchCV(
        Lasso(),
        param_distributions=param_la_sf,
        n_iter=10,
        cv=ps_sf,
        scoring='neg_root_mean_squared_error', 
        n_jobs=-1,
        random_state=123
).fit(X_train_plus_val_sf, np.array(y_train_plus_val_sf).ravel())

# find best model hyperparameters 
print(la_cv_sf.best_params_)
print(la_cv_sf.best_estimator_)
print('Min RMSE is: {}'.format(-la_cv_sf.best_score_))

{'alpha': 0.07684071705306554, 'fit_intercept': False, 'normalize': True}
Lasso(alpha=0.07684071705306554, fit_intercept=False, normalize=True)
Min RMSE is: 237.02107253707626


In [7]:
la_r2 = r2_score(y_val, la_cv.best_estimator_.predict(X_val))
print(la_r2)

0.305349511720147


### Model 3: Ridge

In [8]:
# random search for ridge
param_rd = {'alpha': sp_loguniform(1e-4,1e2),
            'fit_intercept':[True,False],
            'normalize':[True,False]}

rd_cv = RandomizedSearchCV(
        Ridge(),
        param_distributions=param_rd,
        n_iter=10,
        cv=ps,
        scoring='neg_root_mean_squared_error', 
        n_jobs=-1,
        random_state=123
).fit(X_train_plus_val, np.array(y_train_plus_val).ravel())

# Find best model hyperparameters 
print(rd_cv.best_params_)
print(rd_cv.best_estimator_)
print('Min RMSE is: {}'.format(-rd_cv.best_score_))

{'alpha': 2.074024196289186, 'fit_intercept': False, 'normalize': False}
Ridge(alpha=2.074024196289186, fit_intercept=False)
Min RMSE is: 291.2205283061082


In [9]:
rd_r2 = r2_score(y_val, rd_cv.best_estimator_.predict(X_val))
print(rd_r2)

0.3053048717781678


### Model 4: KNeighborsRegressor

In [10]:
# random search for KNeighborsRegressor
param_knn = {'n_neighbors': sp_randint(1,21),
             'weights': ['uniform', 'distance'],
             'leaf_size': sp_randint(1,21)}

knn_cv = RandomizedSearchCV(
         KNeighborsRegressor(),
         param_distributions=param_knn,
         n_iter=10,
         cv=ps,
         scoring='neg_root_mean_squared_error', 
         n_jobs=-1,
         random_state=123
).fit(X_train_plus_val, np.array(y_train_plus_val).ravel())

# find best model hyperparameters 
print(knn_cv.best_params_)
print(knn_cv.best_estimator_)
print('Min RMSE is: {}'.format(-knn_cv.best_score_))

{'leaf_size': 20, 'n_neighbors': 11, 'weights': 'distance'}
KNeighborsRegressor(leaf_size=20, n_neighbors=11, weights='distance')
Min RMSE is: 196.4906417029585


In [15]:
knn_r2 = knn_cv.best_estimator_.score(X_val, y_val)
print(knn_r2)

0.999991040751846


In [21]:
mean_squared_error(y_test, knn_cv.best_estimator_.predict(X_test),squared=False)

305.46086041884803

### Model 5: RandomForestRegressor

In [None]:
# random search for RandomForestRegressor
param_rf = {'n_estimators': [100, 120, 200, 300, 500, 800, 1200],
            'max_depth': [None, 5, 8, 15, 25, 30],
            'min_samples_split': [1, 2, 5, 10 ,15, 100],
            'min_samples_leaf': [1, 2, 5, 10],
            'max_features': ['auto', 'sqrt', 'log2']}

rf_cv = RandomizedSearchCV(
        RandomForestRegressor(),
        param_distributions=param_rf,
        n_iter=10,
        cv=ps,
        scoring='neg_root_mean_squared_error', 
        n_jobs=-1,
        random_state=123
).fit(X_train_plus_val, np.array(y_train_plus_val).ravel())

# find best model hyperparameters 
print(rf_cv.best_params_)
print(rf_cv.best_estimator_)
print('Min RMSE is: {}'.format(-rf_cv.best_score_))

In [None]:
rf_r2 = r2_score(y_val, rf_cv.best_estimator_.predict(X_val))
print(rf_r2)

### Model 6: MLPRegressor

In [None]:
# random search for MLPRegressor
hl = []
for i in [1, 2, 3, 4, 5, 10, 15, 25, 30]:
    hl.append((i))
    hl.append((i,i))

param_mlp = {'hidden_layer_sizes': hl,
             'learning_rate_init': sp_loguniform(1e-4, 1e2),
             'alpha': sp_loguniform(1e-4, 1e2),
             'batch_size': [1, 3, 5, 10, 20, 50, 100, 250, 500]}

mlp_cv = RandomizedSearchCV(
         MLPRegressor(),
         param_distributions=param_mlp,
         n_iter=10,
         cv=ps,
         scoring='neg_root_mean_squared_error', 
         n_jobs=-1,
         random_state=123
).fit(X_train_plus_val, np.array(y_train_plus_val).ravel())

# find best model hyperparameters 
print(mlp_cv.best_params_)
print(mlp_cv.best_estimator_)
print('Min RMSE is: {}'.format(-mlp_cv.best_score_))

In [None]:
mlp_r2 = r2_score(y_val, mlp_cv.best_estimator_.predict(X_val))
print(mlp_r2)

In [18]:
X_train['market_value']

Unnamed: 0
256855    0.033895
215549    0.319771
262666   -0.176805
274362   -0.138698
263877   -0.018298
            ...   
268018   -0.230777
222089   -0.081760
66666    -0.299281
83247    -0.139143
63518    -0.207498
Name: market_value, Length: 120455, dtype: float64