# Other

In [1]:
import numpy as np # Linear algebra
import pandas as pd # Data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt  # Matlab-style plotting
# Make sure plot shows immediately
%matplotlib inline 
import seaborn as sns # Library for plotting
color = sns.color_palette()
sns.set_style('darkgrid')

from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, mean_squared_error
from xgboost import XGBRegressor

In [2]:
def rmsle(y_true, y_pred):
    """Return the root mean squared logarithmic error (RMSLE).

    Both inputs are flattened to 1-D before they are compared, so a column
    vector of shape (n, 1) paired with a flat vector of shape (n,) is scored
    element-wise instead of silently broadcasting to an (n, n) matrix.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted target values.

    Returns
    -------
    numpy.float64
        sqrt(mean((log(1 + y_pred) - log(1 + y_true)) ** 2)).
        Values at or below -1 produce nan/-inf from the logarithm.
    """
    actual = np.asarray(y_true, dtype=float).ravel()
    predicted = np.asarray(y_pred, dtype=float).ravel()
    # log1p(x) == log(1 + x), but keeps precision when x is close to zero.
    log_diff = np.log1p(predicted) - np.log1p(actual)
    return np.sqrt(np.mean(np.square(log_diff)))

# Learning Curve

In [3]:
# Training-subset sizes used by every learning-curve loop in this notebook:
# 10, then 100..1000 in steps of 100, then 2000..6000 in steps of 1000.
n_sample = [10] + list(range(100, 1001, 100)) + list(range(2000, 6001, 1000))

## Strategy A

In [4]:
# Load the strategy-A train/test feature and target tables, each indexed by MemberID.
X_a_train, y_a_train, X_a_test, y_a_test = (
    pd.read_csv(csv_path, index_col=["MemberID"])
    for csv_path in (
        "./processed/strategy-a/train/X_train.csv",
        "./processed/strategy-a/train/y_train.csv",
        "./processed/strategy-a/test/X_test.csv",
        "./processed/strategy-a/test/y_test.csv",
    )
)

### XGBoost

In [5]:
# Keyword arguments forwarded to every XGBRegressor fitted on strategy A.
a_xgb_params = dict(
    learning_rate=0.01,
    max_depth=3,
    n_estimators=100,
    subsample=0.3,
)

In [6]:
%%time
# Learning curve for XGBoost on strategy A: refit on progressively larger
# training subsamples and score each fit on the full test set.
a_xgb_result = []
for n_rows in n_sample:
    # The same random_state is used for X and y so both draws pick the same row
    # positions (assumes both frames share length and row order -- TODO confirm).
    X_subset = X_a_train.sample(n=n_rows, random_state=1).to_numpy()
    y_subset = y_a_train.sample(n=n_rows, random_state=1).to_numpy()

    a_xgb = XGBRegressor(objective='reg:squarederror', **a_xgb_params)
    a_xgb.fit(X_subset, y_subset)

    y_pred = a_xgb.predict(X_a_test.to_numpy())
    # The target frame is transposed so it lines up with the flat prediction vector.
    score = rmsle(y_a_test.to_numpy().T, y_pred)
    print("Sample: {}, rmsle: {}".format(n_rows, score))
    a_xgb_result.append(score)


Sample: 10, rmsle: 0.4823575692333976
Sample: 100, rmsle: 0.48185417129002456
Sample: 200, rmsle: 0.4988620842657617
Sample: 300, rmsle: 0.5209147106124357
Sample: 400, rmsle: 0.5151518939764415
Sample: 500, rmsle: 0.5247630466736724
Sample: 600, rmsle: 0.5157140293177316
Sample: 700, rmsle: 0.5109233021284056
Sample: 800, rmsle: 0.5100568819305227
Sample: 900, rmsle: 0.509550115665655
Sample: 1000, rmsle: 0.5105598367046137
Sample: 2000, rmsle: 0.5072139428010072
Sample: 3000, rmsle: 0.5063631500908787
Sample: 4000, rmsle: 0.5079755000459188
Sample: 5000, rmsle: 0.5078447373821157
Sample: 6000, rmsle: 0.5068801043769587
CPU times: user 21.9 s, sys: 1.45 s, total: 23.3 s
Wall time: 12.2 s


### Support Vector Regression

In [7]:
# Keyword arguments forwarded to every SVR fitted on strategy A.
a_svr_params = dict(
    C=0.1,
    epsilon=0.1,
    gamma='scale',
)

In [8]:
%%time
# Learning curve for SVR on strategy A: refit on progressively larger
# training subsamples and score each fit on the full test set.
a_svr_result = []
for i in n_sample:
    a_svr = SVR(**a_svr_params)
    # The same random_state is used for X and y so both draws pick the same row
    # positions (assumes both frames share length and row order -- TODO confirm).
    a_svr.fit(
        X_a_train.sample(n=i, random_state=1).to_numpy(),
        # SVR expects a 1-D target; ravel the (n, 1) column to avoid the
        # column-vector conversion warning.
        y_a_train.sample(n=i, random_state=1).to_numpy().ravel(),
    )
    y_pred = a_svr.predict(X_a_test.to_numpy())
    rs = rmsle(y_a_test.to_numpy().T, y_pred)
    print("Sample: {}, rmsle: {}".format(i, rs))
    # Record the score in the SVR list; appending to a_xgb_result would leave
    # a_svr_result empty and mix SVR scores into the XGBoost curve.
    a_svr_result.append(rs)


Sample: 10, rmsle: 0.47960091516026193
Sample: 100, rmsle: 0.4809984710566387
Sample: 200, rmsle: 0.4809187323744621
Sample: 300, rmsle: 0.4805788488376517
Sample: 400, rmsle: 0.48051325167109027
Sample: 500, rmsle: 0.48018212766500157
Sample: 600, rmsle: 0.4802885857504097
Sample: 700, rmsle: 0.4802321927623156
Sample: 800, rmsle: 0.48025930095004565
Sample: 900, rmsle: 0.48005178365960716
Sample: 1000, rmsle: 0.4801557755174278
Sample: 2000, rmsle: 0.47987002985386934
Sample: 3000, rmsle: 0.47928212412515475
Sample: 4000, rmsle: 0.4789058058071127
Sample: 5000, rmsle: 0.478414322491396
Sample: 6000, rmsle: 0.4785960686950652
CPU times: user 1min 52s, sys: 1.66 s, total: 1min 54s
Wall time: 2min 11s


## Strategy B

In [9]:
# Load the strategy-B train/test feature and target tables, each indexed by MemberID.
X_b_train, y_b_train, X_b_test, y_b_test = (
    pd.read_csv(csv_path, index_col=["MemberID"])
    for csv_path in (
        "./processed/strategy-b/train/X_train.csv",
        "./processed/strategy-b/train/y_train.csv",
        "./processed/strategy-b/test/X_test.csv",
        "./processed/strategy-b/test/y_test.csv",
    )
)

### XGBoost

In [10]:
# Keyword arguments forwarded to every XGBRegressor fitted on strategy B.
b_xgb_params = dict(
    learning_rate=0.01,
    max_depth=3,
    n_estimators=100,
    subsample=0.3,
)

In [11]:
%%time
# Learning curve for XGBoost on strategy B: refit on progressively larger
# training subsamples and score each fit on the full test set.
b_xgb_result = []
for n_rows in n_sample:
    # The same random_state is used for X and y so both draws pick the same row
    # positions (assumes both frames share length and row order -- TODO confirm).
    X_subset = X_b_train.sample(n=n_rows, random_state=1).to_numpy()
    y_subset = y_b_train.sample(n=n_rows, random_state=1).to_numpy()

    b_xgb = XGBRegressor(objective='reg:squarederror', **b_xgb_params)
    b_xgb.fit(X_subset, y_subset)

    y_pred = b_xgb.predict(X_b_test.to_numpy())
    # The target frame is transposed so it lines up with the flat prediction vector.
    score = rmsle(y_b_test.to_numpy().T, y_pred)
    print("Sample: {}, rmsle: {}".format(n_rows, score))
    b_xgb_result.append(score)


Sample: 10, rmsle: 0.5096456446894639
Sample: 100, rmsle: 0.4985007140501991
Sample: 200, rmsle: 0.496012703915103
Sample: 300, rmsle: 0.4988683810343194
Sample: 400, rmsle: 0.49799336558359497
Sample: 500, rmsle: 0.5099611762340597
Sample: 600, rmsle: 0.5029850704354204
Sample: 700, rmsle: 0.5036604754304911
Sample: 800, rmsle: 0.5021589983025834
Sample: 900, rmsle: 0.5013844273980206
Sample: 1000, rmsle: 0.5041692028333487
Sample: 2000, rmsle: 0.5002407297535661
Sample: 3000, rmsle: 0.4966788076907899
Sample: 4000, rmsle: 0.49757707355750336
Sample: 5000, rmsle: 0.4945377380003405
Sample: 6000, rmsle: 0.4928877888414815
CPU times: user 18.4 s, sys: 834 ms, total: 19.3 s
Wall time: 11.5 s


### Support Vector Regression

In [12]:
# Keyword arguments forwarded to every SVR fitted on strategy B.
b_svr_params = dict(
    C=1,
    epsilon=0.01,
    gamma='auto',
)

In [13]:
%%time
# Learning curve for SVR on strategy B: refit on progressively larger
# training subsamples and score each fit on the full test set.
b_svr_result = []
for i in n_sample:
    b_svr = SVR(**b_svr_params)
    # The same random_state is used for X and y so both draws pick the same row
    # positions (assumes both frames share length and row order -- TODO confirm).
    b_svr.fit(
        X_b_train.sample(n=i, random_state=1).to_numpy(),
        # SVR expects a 1-D target; ravel the (n, 1) column to avoid the
        # column-vector conversion warning.
        y_b_train.sample(n=i, random_state=1).to_numpy().ravel(),
    )
    y_pred = b_svr.predict(X_b_test.to_numpy())
    rs = rmsle(y_b_test.to_numpy().T, y_pred)
    print("Sample: {}, rmsle: {}".format(i, rs))
    # Record the score in the SVR list; appending to b_xgb_result would leave
    # b_svr_result empty and mix SVR scores into the XGBoost curve.
    b_svr_result.append(rs)


Sample: 10, rmsle: 0.4742723324792723
Sample: 100, rmsle: 0.47390571084557
Sample: 200, rmsle: 0.47389037155141556
Sample: 300, rmsle: 0.4741929623240851
Sample: 400, rmsle: 0.4746361425855459
Sample: 500, rmsle: 0.4747840422402774
Sample: 600, rmsle: 0.47482468641688247
Sample: 700, rmsle: 0.4745421212788578
Sample: 800, rmsle: 0.47454428516869596
Sample: 900, rmsle: 0.4744053296628215
Sample: 1000, rmsle: 0.47406067119613715
Sample: 2000, rmsle: 0.47406974857372275
Sample: 3000, rmsle: 0.47367724975233616
Sample: 4000, rmsle: 0.4738133511974307
Sample: 5000, rmsle: 0.4741860679910001
Sample: 6000, rmsle: 0.47416639028513735
CPU times: user 1min 28s, sys: 1.46 s, total: 1min 30s
Wall time: 1min 58s
