# Model Building and Evaluation

In [1]:
import numpy as np # Linear algebra
import pandas as pd # Data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt  # Matlab-style plotting
# Make sure plot shows immediately
%matplotlib inline 
import seaborn as sns # Library for plotting
color = sns.color_palette()
sns.set_style('darkgrid')

from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, mean_squared_error
from xgboost import XGBRegressor
from joblib import dump, load

In [2]:
def rmsle(y_true, y_pred):
    return np.sqrt(np.mean(np.power(np.log(y_pred + 1) - np.log(y_true + 1),2)))

In [3]:
def show_gd_result(gd_result):
    print("Best params: ", gd_result.best_params_)
    print("Best score: ", gd_result.best_score_)

In [4]:
def tune_xgb(X_train, y_train):
    xgb_params = {
        'learning_rate': [ 0.01, 0.1, 1],
        'max_depth': [3, 5, 7],
        'subsample': [0.3, 0.5, 0.7],
        'n_estimators' : [100, 300, 500 ],
    }

    xgb_search = GridSearchCV(estimator = XGBRegressor(objective='reg:squarederror'),
            param_grid = xgb_params,                        
            scoring = make_scorer(rmsle, greater_is_better=False),
            cv = 5,
            n_jobs = -1,
            verbose = 4)

    xgb_search.fit(X_train,y_train)
    return xgb_search

In [18]:
def tune_svr(X_train, y_train):
    svr_params = {
        'C': [0.01, 0.1, 1, 10],
        'epsilon': [0.0001, 0.001, 0.01, 0.1, 1],
        'gamma': ['auto', 'scale']
    }

    svr_search = GridSearchCV(estimator = SVR(kernel="rbf"),
            param_grid = svr_params,                        
            scoring = make_scorer(rmsle, greater_is_better=False),
            cv = 5,
            n_jobs = -1,
            verbose = 4)

    svr_search.fit(X_train, y_train)
    return svr_search

# Strategy A

In [6]:
X_a_train = pd.read_csv("./processed/strategy-a/train/X_train.csv", index_col=["MemberID"])
y_a_train = pd.read_csv("./processed/strategy-a/train/y_train.csv", index_col=["MemberID"])
X_a_test = pd.read_csv("./processed/strategy-a/test/X_test.csv", index_col=["MemberID"])
y_a_test = pd.read_csv("./processed/strategy-a/test/y_test.csv", index_col=["MemberID"])

## XGBoost

Tuning for XGBoost

In [7]:
%%time
a_xgb_search = tune_xgb(X_a_train.sample(5000, random_state=1).to_numpy(),y_a_train.sample(5000, random_state=1).to_numpy())

Fitting 5 folds for each of 81 candidates, totalling 405 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   37.8s
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:  7.3min
[Parallel(n_jobs=-1)]: Done 213 tasks      | elapsed: 17.0min
[Parallel(n_jobs=-1)]: Done 384 tasks      | elapsed: 32.7min
[Parallel(n_jobs=-1)]: Done 405 out of 405 | elapsed: 35.5min finished
CPU times: user 4.35 s, sys: 249 ms, total: 4.6 s
Wall time: 35min 30s


In [8]:
show_gd_result(a_xgb_search)

Best params:  {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.3}
Best score:  -0.538593650095859


In [9]:
rmsle(y_a_test.to_numpy().T, a_xgb_search.best_estimator_.predict(X_a_test.to_numpy()))

0.5078447373821157

In [10]:
df_a_xgb_result = pd.DataFrame(a_xgb_search.cv_results_)
df_a_xgb_result.to_csv("./processed/strategy-a/result/xgb_result.csv", index=False)

Re-train with best params 

In [25]:
xgb_a_best_param = {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.3}

In [26]:
xgb_a_final = XGBRegressor(objective='reg:squarederror', **xgb_a_best_param)

In [27]:
%%time
xgb_a_final.fit(X_a_train.sample(10000, random_state=2).to_numpy(),y_a_train.sample(10000, random_state=2).to_numpy())

CPU times: user 3.67 s, sys: 85.5 ms, total: 3.75 s
Wall time: 2.78 s


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.01, max_delta_step=0, max_depth=3,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=4, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=0.3,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [28]:
y_a_pred = xgb_a_final.predict(X_a_test.to_numpy())

In [29]:
rmsle(y_a_test.to_numpy().T, y_a_pred)

0.5029061268185858

In [30]:
dump(xgb_a_final, "./processed/strategy-a/result/xgb.model")

['./processed/strategy-a/result/xgb.model']

## Support Vector Regression

Tuning for Support Vector Regression

In [19]:
%%time
a_svr_search = tune_svr(X_a_train.sample(5000, random_state=1).to_numpy(),y_a_train.sample(5000, random_state=1).to_numpy())

Fitting 5 folds for each of 40 candidates, totalling 200 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    9.5s
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:   49.1s
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:  4.0min finished
CPU times: user 2.68 s, sys: 105 ms, total: 2.79 s
Wall time: 3min 59s


In [20]:
show_gd_result(a_svr_search)

Best params:  {'C': 0.1, 'epsilon': 0.1, 'gamma': 'scale'}
Best score:  -0.4967324995325847


In [21]:
rmsle(y_a_test.to_numpy().T, a_svr_search.best_estimator_.predict(X_a_test.to_numpy()))

0.478414322491396

In [22]:
df_a_svr_result = pd.DataFrame(a_svr_search.cv_results_)
df_a_svr_result.to_csv("./processed/strategy-a/result/svr_result.csv", index=False)

Re-train with best params

In [31]:
svr_a_best_param = {'C': 0.1, 'epsilon': 0.1, 'gamma': 'scale'}

In [32]:
svr_a_final = SVR(**svr_a_best_param)

In [33]:
%%time
svr_a_final.fit(X_a_train.sample(10000, random_state=2).to_numpy(),y_a_train.sample(10000, random_state=2).to_numpy())

CPU times: user 7.58 s, sys: 197 ms, total: 7.78 s
Wall time: 9.21 s


SVR(C=0.1)

In [34]:
y_a_pred = svr_a_final.predict(X_a_test.to_numpy())

In [35]:
rmsle(y_a_test.to_numpy().T, y_a_pred)

0.4803696068160331

In [36]:
dump(svr_a_final, "./processed/strategy-a/result/svr.model")

['./processed/strategy-a/result/svr.model']

# Strategy B

In [23]:
X_b_train = pd.read_csv("./processed/strategy-b/train/X_train.csv", index_col=["MemberID"])
y_b_train = pd.read_csv("./processed/strategy-b/train/y_train.csv", index_col=["MemberID"])
X_b_test = pd.read_csv("./processed/strategy-b/test/X_test.csv", index_col=["MemberID"])
y_b_test = pd.read_csv("./processed/strategy-b/test/y_test.csv", index_col=["MemberID"])

## XGBoost

Tuning for XGBoost

In [37]:
%%time
b_xgb_search = tune_xgb(X_b_train.sample(5000, random_state=1).to_numpy(),y_b_train.sample(5000, random_state=1).to_numpy())

Fitting 5 folds for each of 81 candidates, totalling 405 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   39.4s
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:  6.8min
[Parallel(n_jobs=-1)]: Done 213 tasks      | elapsed: 21.6min
[Parallel(n_jobs=-1)]: Done 384 tasks      | elapsed: 44.9min
[Parallel(n_jobs=-1)]: Done 405 out of 405 | elapsed: 49.8min finished
CPU times: user 4.84 s, sys: 288 ms, total: 5.13 s
Wall time: 49min 49s


In [38]:
show_gd_result(b_xgb_search)

Best params:  {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.3}
Best score:  -0.5189132992905694


In [39]:
rmsle(y_b_test.to_numpy().T, b_xgb_search.best_estimator_.predict(X_b_test.to_numpy()))

0.4945377380003405

In [40]:
df_b_xgb_result = pd.DataFrame(b_xgb_search.cv_results_)
df_b_xgb_result.to_csv("./processed/strategy-b/result/xgb_result.csv", index=False)

Re-train with best params

In [41]:
xgb_b_best_param = {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.3}

In [42]:
xgb_b_final = XGBRegressor(objective='reg:squarederror', **xgb_b_best_param)

In [43]:
%%time
xgb_b_final.fit(X_b_train.sample(10000, random_state=2).to_numpy(),y_b_train.sample(10000, random_state=2).to_numpy())

CPU times: user 4.67 s, sys: 107 ms, total: 4.78 s
Wall time: 3.19 s


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.01, max_delta_step=0, max_depth=3,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=4, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=0.3,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [44]:
y_b_pred = xgb_b_final.predict(X_b_test.to_numpy())

In [45]:
rmsle(y_b_test.to_numpy().T, y_b_pred)

0.49550572912214236

Save model

In [46]:
dump(xgb_b_final, "./processed/strategy-b/result/xgb.model")

['./processed/strategy-b/result/xgb.model']

## Support Vector Regression

Tuning for Support Vector Regression

In [47]:
%%time
b_svr_search = tune_svr(X_b_train.sample(5000, random_state=1).to_numpy(),y_b_train.sample(5000, random_state=1).to_numpy())

Fitting 5 folds for each of 40 candidates, totalling 200 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   39.6s
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:  7.5min finished
CPU times: user 7.18 s, sys: 249 ms, total: 7.43 s
Wall time: 7min 37s


In [48]:
show_gd_result(b_svr_search)

Best params:  {'C': 1, 'epsilon': 0.01, 'gamma': 'auto'}
Best score:  -0.4727802739832475


In [49]:
rmsle(y_b_test.to_numpy().T, b_svr_search.best_estimator_.predict(X_b_test.to_numpy()))

0.4741860679910001

In [50]:
df_b_svr_result = pd.DataFrame(b_svr_search.cv_results_)
df_b_svr_result.to_csv("./processed/strategy-b/result/svr_result.csv", index=False)

Re-train with best params

In [51]:
svr_b_best_param = {'C': 1, 'epsilon': 0.01, 'gamma': 'auto'}

In [52]:
svr_b_final = SVR(**svr_b_best_param)

In [53]:
%%time
svr_b_final.fit(X_b_train.sample(10000, random_state=2).to_numpy(),y_b_train.sample(10000, random_state=2).to_numpy())

CPU times: user 1min 33s, sys: 1.68 s, total: 1min 35s
Wall time: 2min 13s


SVR(C=1, epsilon=0.01, gamma='auto')

In [54]:
y_b_pred = svr_b_final.predict(X_b_test.to_numpy())

In [55]:
rmsle(y_b_test.to_numpy().T, y_b_pred)

0.47406410624280426

In [56]:
dump(svr_b_final, "./processed/strategy-b/result/svr.model")

['./processed/strategy-b/result/svr.model']