Alycia's attempt at a gradient boosting classifier using scikit-learn's ensemble models

In [16]:
import numpy as np

from sklearn import preprocessing
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.externals import joblib
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.metrics import roc_auc_score

In [2]:
# Load the labelled features and their targets.
x = np.load("data/x.npy")
y = np.load("data/y.npy")

# Fit the min-max scaler on the labelled features only; this one fitted
# scaler is then reused so every dataset is mapped with the same min/max.
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
np.save("data/x_normalized.npy", x_scaled)

# Scale the test features with the statistics learned above. Calling
# fit_transform here would refit the scaler on the test set, putting the
# two datasets on different scales (and leaving the scaler fit on test data).
x_test = np.load("data/x_test.npy")
x_test_scaled = min_max_scaler.transform(x_test)
np.save("data/x_test_normalized.npy", x_test_scaled)

# Hold out 33% of the labelled rows for validation (fixed seed for a
# repeatable split). NOTE: this rebinds x_test -- from here on it is the
# held-out split of x_scaled, not the array loaded from data/x_test.npy.
x_train, x_test, y_train, y_test = train_test_split(x_scaled, y, test_size=0.33, random_state=42)



In [3]:
# Hyper-parameter grid: 5 learning rates x 4 depths x 3 subsample ratios
# x 4 max_features settings = 240 candidate models.
params = {
    'learning_rate': [0.01, 0.05, 0.1, 0.25, 0.5],
    'max_depth': [2, 5, 10, 15],
    'subsample': [0.8, 0.85, 0.9],
    'max_features': [20, 50, 100, 150],
}

# Exhaustive cross-validated search over the grid, scored by accuracy and
# run on 4 parallel workers; fitted on the training split only.
gbclass = GradientBoostingClassifier()
clf = GridSearchCV(
    estimator=gbclass,
    param_grid=params,
    scoring='accuracy',
    n_jobs=4,
    verbose=2,
)
clf.fit(x_train, y_train)

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 3 folds for each of 240 candidates, totalling 720 fits


[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:  2.9min
[Parallel(n_jobs=4)]: Done 154 tasks      | elapsed: 772.5min
[Parallel(n_jobs=4)]: Done 357 tasks      | elapsed: 941.3min
[Parallel(n_jobs=4)]: Done 640 tasks      | elapsed: 1147.0min
[Parallel(n_jobs=4)]: Done 720 out of 720 | elapsed: 1213.7min finished


Rank: 


NameError: name 'results' is not defined

In [5]:
# Output the top 10 ranked models from the grid search, best first.
for i in range(1, 11):
    # Several candidates can share a rank when their mean scores tie, so
    # collect every index holding rank i.
    models = np.flatnonzero(clf.cv_results_['rank_test_score'] == i)
    for m in models:
        # The format string needs a placeholder, otherwise the rank number
        # is silently dropped and only "Rank: " is printed.
        print("Rank: {0}".format(i))
        print("Mean validation score: {0:.5f} (std: {1:.5f})".format(
              clf.cv_results_['mean_test_score'][m],
              clf.cv_results_['std_test_score'][m]))
        print("Parameters: " + str(clf.cv_results_['params'][m]) + '\n')

# best_score_ is the mean cross-validated score of the best candidate
# (higher is better), so it is labelled as a score rather than an error.
print('Best Score: ' + str(clf.best_score_))
print('Best Model: ' + str(clf.best_params_))

Rank: 
Mean validation score: 0.78290 (std: 0.00113)
Parameters: {'learning_rate': 0.1, 'max_depth': 5, 'max_features': 150, 'subsample': 0.8}

Rank: 
Mean validation score: 0.78205 (std: 0.00111)
Parameters: {'learning_rate': 0.1, 'max_depth': 5, 'max_features': 100, 'subsample': 0.9}

Rank: 
Mean validation score: 0.78191 (std: 0.00154)
Parameters: {'learning_rate': 0.05, 'max_depth': 10, 'max_features': 100, 'subsample': 0.9}

Rank: 
Mean validation score: 0.78179 (std: 0.00083)
Parameters: {'learning_rate': 0.1, 'max_depth': 5, 'max_features': 50, 'subsample': 0.8}

Rank: 
Mean validation score: 0.78166 (std: 0.00196)
Parameters: {'learning_rate': 0.05, 'max_depth': 5, 'max_features': 150, 'subsample': 0.9}

Rank: 
Mean validation score: 0.78166 (std: 0.00097)
Parameters: {'learning_rate': 0.1, 'max_depth': 5, 'max_features': 150, 'subsample': 0.85}

Rank: 
Mean validation score: 0.78156 (std: 0.00057)
Parameters: {'learning_rate': 0.1, 'max_depth': 5, 'max_features': 100, 'subsamp

In [6]:
def generate_csv(model, x_data, name):
    """Write a two-column prediction file for ``x_data``.

    Args:
        model: Object exposing ``predict(x_data)`` whose result supports
            ``.flatten()`` (e.g. returns a numpy array).
        x_data: Sized collection of samples; row ``k`` is given id ``k``.
        name: Output path without extension; ``".csv"`` is appended.

    Returns:
        None. The file ``name + ".csv"`` is written with columns ``id`` and
        ``target`` and no index column.
    """
    # Ids are simply the zero-based positions of the samples.
    submission = pd.DataFrame({
        'id': np.arange(len(x_data)),
        'target': model.predict(x_data).flatten(),
    })
    submission.to_csv(name + ".csv", index=False)

In [7]:
# Build a fresh classifier with the top-ranked grid-search parameters:
# 'learning_rate': 0.1, 'max_depth': 5, 'max_features': 150, 'subsample': 0.8
# Note: this rebinds clf, replacing the GridSearchCV object.
clf = GradientBoostingClassifier(
    learning_rate=0.1,
    max_depth=5,
    max_features=150,
    subsample=0.8,
)

In [None]:
# Reload the complete label array from disk. This rebinds y_train, replacing
# the training split produced by train_test_split with the full label set.
y_train = np.load("data/y.npy")

In [14]:
# Fit the classifier on the entire scaled, labelled dataset (x_scaled, y),
# i.e. on both the train and the held-out rows of the earlier split.
clf.fit(x_scaled, y)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=5,
              max_features=150, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=0.8, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)

In [None]:
# Predict class labels for x_test.
# NOTE(review): x_test is rebound several times in this notebook (raw test
# file, then the held-out split of x_scaled) -- confirm which one is bound
# when this cell runs.
predictions = clf.predict(x_test)

In [15]:
# Score of clf on (x_scaled, y) -- the same data clf was fit on, so this is
# a training-set figure rather than an estimate of generalisation.
print(clf.score(x_scaled, y))

0.8020628759645568


In [17]:
# ROC AUC on (x_scaled, y), the data clf was fit on -- a training-set figure.
# roc_auc_score is already imported in the notebook's first cell, so it is
# not re-imported here.
prob_preds = clf.predict_proba(x_scaled)
# Column 1 of predict_proba is the probability of the second class in
# clf.classes_, which is what roc_auc_score expects for a binary target.
performance = roc_auc_score(y, prob_preds[:, 1])
print(performance)

0.8195007639799631


In [None]:
# Score of clf on the held-out split (x_test, y_test).
# NOTE(review): clf was fit on all of x_scaled, which includes these rows,
# so this is not an independent validation score -- fit on x_train/y_train
# first if an unbiased estimate is wanted.
print(clf.score(x_test, y_test))

In [19]:
# Predict on the min-max-scaled test features saved earlier in the notebook.
# clf was fit on scaled inputs (x_scaled), so feeding it the raw values from
# data/x_test.npy would compare unscaled features against split thresholds
# learned on the scaled range.
x_test = np.load("data/x_test_normalized.npy")
generate_csv(clf, x_test, "submissions/gbclassifier2")