In [11]:
import numpy as np
from sklearn import preprocessing
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.externals import joblib
import pandas as pd

In [2]:
x = np.load("data/x.npy")
y = np.load("data/y.npy")
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
np.save("data/x_normalized.npy", x_scaled)

x_test = np.load("data/x_test.npy")
x_test_scaled = min_max_scaler.fit_transform(x_test)
np.save("data/x_test_normalized.npy", x_test_scaled)

# 5fold
x_train, x_test, y_train, y_test = train_test_split(x_scaled, y, test_size=0.33, random_state=42)



In [4]:
params = {'n_estimators': [50, 100, 500], 'algorithm':['SAMME.R', 'SAMME'],
'learning_rate': np.logspace(-4, 0, 10)}

adab = AdaBoostClassifier()
clf = GridSearchCV(adab, scoring='accuracy', param_grid=params, n_jobs=2, verbose=2)
clf.fit(x_train, y_train)

print('Best Error: ' + str(clf.best_score_))
print('Best Model: ' + str(clf.best_params_))

[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


Fitting 3 folds for each of 80 candidates, totalling 240 fits


[Parallel(n_jobs=2)]: Done  37 tasks      | elapsed: 573.5min
[Parallel(n_jobs=2)]: Done 158 tasks      | elapsed: 675.3min
[Parallel(n_jobs=2)]: Done 240 out of 240 | elapsed: 732.7min finished


Best Error: 0.779440
Best Model: {'algorithm': 'SAMME.R', 'learning_rate': 0.3593813663804626, 'n_estimators': 500}


In [5]:
clf = AdaBoostClassifier(algorithm='SAMME.R', learning_rate=0.3593813663804626, n_estimators=500)

In [15]:
clf.fit(x_scaled, y)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=0.3593813663804626, n_estimators=500,
          random_state=None)

In [16]:
print(clf.score(x_scaled, y))

0.7839701857206922


In [17]:
from sklearn.metrics import roc_auc_score
prob_preds = clf.predict_proba(x_scaled)
performance = roc_auc_score(y, prob_preds[:, 1])
print(performance)

0.7907737655563116


In [9]:
def generate_csv(model, x_data, name):
    ids = [i for i, _ in enumerate(x_data)]
    ids = np.array(ids)
    
    predictions = model.predict(x_data).flatten()
    
    df = pd.DataFrame()
    df['id'] = ids
    df['target'] = predictions
    df.to_csv(name + ".csv", index=False)

In [12]:
generate_csv(clf, x_test_scaled, "submissions/adaboost1")
# 0.54037

Model with rank: 1
Mean validation score: 0.78258 (std: 0.00059)
Parameters: {'learning_rate': 0.1, 'max_depth': 5, 'max_features': 50, 'subsample': 0.9}

Model with rank: 2
Mean validation score: 0.78249 (std: 0.00123)
Parameters: {'learning_rate': 0.1, 'max_depth': 5, 'max_features': 100, 'subsample': 0.8}

Model with rank: 3
Mean validation score: 0.78142 (std: 0.00082)
Parameters: {'learning_rate': 0.1, 'max_depth': 5, 'max_features': 50, 'subsample': 0.8}

Model with rank: 4
Mean validation score: 0.78140 (std: 0.00116)
Parameters: {'learning_rate': 0.1, 'max_depth': 5, 'max_features': 20, 'subsample': 0.9}

Model with rank: 5
Mean validation score: 0.78062 (std: 0.00122)
Parameters: {'learning_rate': 0.1, 'max_depth': 5, 'max_features': 100, 'subsample': 0.9}

Model with rank: 6
Mean validation score: 0.78034 (std: 0.00102)
Parameters: {'learning_rate': 0.1, 'max_depth': 5, 'max_features': 20, 'subsample': 0.8}

Model with rank: 7
Mean validation score: 0.77956 (std: 0.00198)
Parameters: {'learning_rate': 0.5, 'max_depth': 2, 'max_features': 50, 'subsample': 0.9}

Model with rank: 8
Mean validation score: 0.77935 (std: 0.00050)
Parameters: {'learning_rate': 0.5, 'max_depth': 2, 'max_features': 100, 'subsample': 0.9}

Model with rank: 9
Mean validation score: 0.77905 (std: 0.00084)
Parameters: {'learning_rate': 0.5, 'max_depth': 2, 'max_features': 20, 'subsample': 0.9}

Model with rank: 10
Mean validation score: 0.77893 (std: 0.00104)
Parameters: {'learning_rate': 0.1, 'max_depth': 10, 'max_features': 20, 'subsample': 0.9}

Best Error: 0.782579
Best Model: {'learning_rate': 0.1, 'max_depth': 5, 'max_features': 50, 'subsample': 0.9}

