# Model Building and Evaluation

In [25]:
import numpy as np # Linear algebra
import pandas as pd # Data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt  # Matlab-style plotting
# Make sure plot shows immediately
%matplotlib inline 
import seaborn as sns # Library for plotting
color = sns.color_palette()
sns.set_style('darkgrid')

from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, mean_squared_error
from xgboost import XGBRegressor

In [31]:
def rmsle(y_true, y_pred):
    return np.sqrt(np.mean(np.power(np.log(y_true + 1) - np.log(y_pred + 1),2)))

In [64]:
def show_gd_result(gd_result):
    print("Best params: ", gd_result.best_params_)
    print("Best score: ", gd_result.best_score_)

# Strategy A

In [30]:
X_a_train = pd.read_csv("./processed/strategy-a/train/X_train.csv", index_col=["MemberID"])
y_a_train = pd.read_csv("./processed/strategy-a/train/y_train.csv", index_col=["MemberID"])
X_a_test = pd.read_csv("./processed/strategy-a/test/X_test.csv", index_col=["MemberID"])
y_a_test = pd.read_csv("./processed/strategy-a/test/y_test.csv", index_col=["MemberID"])

## XGBoost

In [48]:
%%time
a_xgb_params = {
    'learning_rate': [0.01, 0.1, 1],
    'max_depth': [3, 5, 7],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.5, 0.7],
    'colsample_bytree': [0.5, 0.7],
    'n_estimators' : [100, 200, 500],
    'objective': ['reg:squarederror']
}

a_xgb_search = GridSearchCV(estimator = XGBRegressor(),
        param_grid = a_xgb_params,                        
        scoring = make_scorer(rmsle, greater_is_better=False),
        cv = 5,
        n_jobs = -1,
        verbose = 4)

a_xgb_search.fit(X_a_train.to_numpy()[:10],y_a_train.to_numpy()[:10])

Fitting 5 folds for each of 324 candidates, totalling 1620 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:   14.8s
[Parallel(n_jobs=-1)]: Done 213 tasks      | elapsed:   35.1s
[Parallel(n_jobs=-1)]: Done 384 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 605 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 874 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 1193 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 1560 tasks      | elapsed:  3.8min
CPU times: user 5.79 s, sys: 315 ms, total: 6.1 s
Wall time: 3min 54s
[Parallel(n_jobs=-1)]: Done 1620 out of 1620 | elapsed:  3.9min finished


GridSearchCV(cv=5,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None, gamma=None,
                                    gpu_id=None, importance_type='gain',
                                    interaction_constraints=None,
                                    learning_rate=None, max_delta_step=None,
                                    max_depth=None, min_child_weight=None,
                                    missing=nan, monotone_constraints=None,
                                    n_estimators=100, n_jobs=...
                                    scale_pos_weight=None, subsample=None,
                                    tree_method=None, validate_parameters=None,
                                    verbosity=None),
             n_jobs=-1,
             param_grid={'colsample_bytree': [0.5, 0.7],
   

In [65]:
show_gd_result(a_xgb_search)

Best params:  {'colsample_bytree': 0.5, 'learning_rate': 1, 'max_depth': 3, 'min_child_weight': 5, 'n_estimators': 100, 'objective': 'reg:squarederror', 'subsample': 0.7}
Best score:  -0.16280960430938485


In [51]:
rmsle(y_a_test.to_numpy().T, a_xgb_search.best_estimator_.predict(X_a_test.to_numpy()))

0.4913529736240679

In [63]:
df_a_xgb_result = pd.DataFrame(a_xgb_search.cv_results_)
df_a_xgb_result.to_csv("./processed/strategy-a/result/xgb_result.csv", index=False)

## Support Vector Regression

In [45]:
%%time
a_svr_params = {
    'C': [0.01, 0.1, 1, 10],
    'epsilon': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10],
    'gamma': [0.0001, 0.001, 0.005, 0.1, 1, 3, 5]
}

a_svr_search = GridSearchCV(estimator = SVR(kernel="rbf"),
        param_grid = a_svr_params,                        
        scoring = make_scorer(rmsle, greater_is_better=False),
        cv = 5,
        n_jobs = -1,
        verbose = 4)

a_svr_search.fit(X_a_train.to_numpy()[:1000],y_a_train.to_numpy()[:1000])

Fitting 5 folds for each of 308 candidates, totalling 1540 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done 172 tasks      | elapsed:    7.6s
[Parallel(n_jobs=-1)]: Done 443 tasks      | elapsed:   18.7s
[Parallel(n_jobs=-1)]: Done 620 tasks      | elapsed:   28.9s
[Parallel(n_jobs=-1)]: Done 961 tasks      | elapsed:   50.4s
[Parallel(n_jobs=-1)]: Done 1310 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 1533 out of 1540 | elapsed:  1.6min remaining:    0.4s
[Parallel(n_jobs=-1)]: Done 1540 out of 1540 | elapsed:  1.6min finished
CPU times: user 6.08 s, sys: 411 ms, total: 6.49 s
Wall time: 1min 36s


GridSearchCV(cv=5, estimator=SVR(), n_jobs=-1,
             param_grid={'C': [0.01, 0.1, 1, 10],
                         'epsilon': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05,
                                     0.1, 0.5, 1, 5, 10],
                         'gamma': [0.0001, 0.001, 0.005, 0.1, 1, 3, 5]},
             scoring=make_scorer(rmsle, greater_is_better=False), verbose=4)

In [66]:
show_gd_result(a_svr_search)

Best params:  {'C': 0.1, 'epsilon': 0.1, 'gamma': 5}
Best score:  -0.4790569299851186


In [47]:
rmsle(y_a_test.to_numpy().T, a_svr_search.best_estimator_.predict(X_a_test.to_numpy()))

0.4773990273135189

In [67]:
df_a_svr_result = pd.DataFrame(a_svr_search.cv_results_)
df_a_svr_result.to_csv("./processed/strategy-a/result/svr_result.csv", index=False)

# Strategy B

In [5]:
X_b_train = pd.read_csv("./processed/strategy-b/train/X_train.csv", index_col=["MemberID"])
y_b_train = pd.read_csv("./processed/strategy-b/train/y_train.csv", index_col=["MemberID"])
X_b_test = pd.read_csv("./processed/strategy-b/test/X_test.csv", index_col=["MemberID"])
y_b_test = pd.read_csv("./processed/strategy-b/test/y_test.csv", index_col=["MemberID"])

## XGBoost

In [37]:
%%time
b_xgb_params = {
    'learning_rate': [0.01, 0.1, 1],
    'max_depth': [3, 5, 7],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.5, 0.7],
    'colsample_bytree': [0.5, 0.7],
    'n_estimators' : [100, 200, 500],
    'objective': ['reg:squarederror']
}

b_xgb_search = GridSearchCV(estimator = XGBRegressor(),
        param_grid = b_xgb_params,                        
        scoring = make_scorer(rmsle, greater_is_better=False),
        cv = 5,
        n_jobs = -1,
        verbose = 4)

b_xgb_search.fit(X_b_train.to_numpy()[:10],y_b_train.to_numpy()[:10])

Fitting 5 folds for each of 324 candidates, totalling 1620 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:   15.4s
[Parallel(n_jobs=-1)]: Done 213 tasks      | elapsed:   35.8s
[Parallel(n_jobs=-1)]: Done 384 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 605 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 874 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 1193 tasks      | elapsed:  4.2min
[Parallel(n_jobs=-1)]: Done 1560 tasks      | elapsed:  5.8min
CPU times: user 6.67 s, sys: 466 ms, total: 7.14 s
Wall time: 6min 9s
[Parallel(n_jobs=-1)]: Done 1620 out of 1620 | elapsed:  6.2min finished


GridSearchCV(cv=5,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None, gamma=None,
                                    gpu_id=None, importance_type='gain',
                                    interaction_constraints=None,
                                    learning_rate=None, max_delta_step=None,
                                    max_depth=None, min_child_weight=None,
                                    missing=nan, monotone_constraints=None,
                                    n_estimators=100, n_jobs=...
                                    scale_pos_weight=None, subsample=None,
                                    tree_method=None, validate_parameters=None,
                                    verbosity=None),
             n_jobs=-1,
             param_grid={'colsample_bytree': [0.5, 0.7],
   

In [68]:
show_gd_result(b_xgb_search)

Best params:  {'colsample_bytree': 0.5, 'learning_rate': 0.01, 'max_depth': 3, 'min_child_weight': 5, 'n_estimators': 100, 'objective': 'reg:squarederror', 'subsample': 0.5}
Best score:  -0.5278092811202181


In [53]:
rmsle(y_b_test.to_numpy().T, b_xgb_search.best_estimator_.predict(X_b_test.to_numpy()))

0.5281696912615652

In [70]:
df_b_xgb_result = pd.DataFrame(b_xgb_search.cv_results_)
df_b_xgb_result.to_csv("./processed/strategy-b/result/xgb_result.csv", index=False)

## Support Vector Regression

In [55]:
%%time
b_svr_params = {
    'C': [0.01, 0.1, 1, 10],
    'epsilon': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10],
    'gamma': [0.0001, 0.001, 0.005, 0.1, 1, 3, 5]
}

b_svr_search = GridSearchCV(estimator = SVR(),
        param_grid = b_svr_params,                        
        scoring = make_scorer(rmsle, greater_is_better=False),
        cv = 5,
        n_jobs = -1,
        verbose = 4)

b_svr_search.fit(X_b_train.to_numpy()[:10],y_b_train.to_numpy()[:10])

Fitting 5 folds for each of 308 candidates, totalling 1540 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done 1228 tasks      | elapsed:    3.8s
CPU times: user 1.05 s, sys: 91.7 ms, total: 1.14 s
Wall time: 4.21 s
[Parallel(n_jobs=-1)]: Done 1540 out of 1540 | elapsed:    4.1s finished


GridSearchCV(cv=5, estimator=SVR(), n_jobs=-1,
             param_grid={'C': [0.01, 0.1, 1, 10],
                         'epsilon': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05,
                                     0.1, 0.5, 1, 5, 10],
                         'gamma': [0.0001, 0.001, 0.005, 0.1, 1, 3, 5]},
             scoring=make_scorer(rmsle, greater_is_better=False), verbose=4)

In [69]:
show_gd_result(b_svr_search)

Best params:  {'C': 0.01, 'epsilon': 0.0001, 'gamma': 0.0001}
Best score:  -0.35253388861232143


In [58]:
rmsle(y_b_test.to_numpy().T, b_svr_search.best_estimator_.predict(X_b_test.to_numpy()))

0.5062718790980713

In [71]:
df_b_svr_result = pd.DataFrame(b_svr_search.cv_results_)
df_b_svr_result.to_csv("./processed/strategy-b/result/svr_result.csv", index=False)