In [1]:
import time

import pandas as pd
import numpy as np
import joblib

from sklearn.ensemble import GradientBoostingClassifier, ExtraTreesClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import xgboost

## MERGE TRAIN PREDS

In [2]:
# Only the TARGET column is needed from the training file, so restrict the
# parse to that column instead of loading every column into memory.
trainDataFrame = pd.read_csv('./data/train.csv', usecols=['TARGET'])
trainLabels = trainDataFrame['TARGET']
# Drop the reference to the frame; only the label Series is used afterwards.
trainDataFrame = None

In [3]:
# Sanity check: display the shape of the label Series (one entry per training row).
trainLabels.shape

(76020,)

In [7]:
# Registry of the level-1 models whose saved train predictions are merged in the next cell.
#   model_id - prefix of the saved prediction file names ("<model_id>_<fold>_of_19.joblib")
#   clf      - the estimator configuration (a plain description string for the Keras net);
#              the merge loop only reads "model_id" and "n_models", not this entry
#   n_models - number of per-fold prediction files to load (folds 0 .. n_models-1)
level1Clfs = [
    {"model_id": "gbc_mf10", "clf": GradientBoostingClassifier(max_features=10), "n_models":20},
    # NOTE(review): the id says "lr0.005" but the estimator is built with learning_rate=0.05 -
    # confirm which is intended. The id is also the file-name prefix, so do not rename it
    # without renaming the saved prediction files.
    {"model_id": "xgb_lr0.005_ne125", "clf": xgboost.XGBClassifier(learning_rate=0.05, n_estimators=125), "n_models":20},
    {"model_id": "etc_ne100_md6_mf30", "clf": ExtraTreesClassifier(n_estimators=100, max_depth=6, max_features=30), "n_models":20},
    {"model_id": "rfc_ne125_md6", "clf": RandomForestClassifier(n_estimators=125, max_depth=6), "n_models":20},
    {"model_id": "dtc_md5_mf75", "clf": DecisionTreeClassifier(max_depth=5, max_features=75), "n_models":20},
    {"model_id": "bnb_a0.001", "clf": BernoulliNB(alpha=0.001), "n_models":20},
    {"model_id": "lr_c0.001", "clf": LogisticRegression(C=0.001), "n_models":20},
    {"model_id": "svm_c0.25_proba_true", "clf": SVC(C=0.25, probability=True), "n_models":20},
    # Only 8 models (folds 0-7) were trained for KNN: training one model on the
    # full training set takes ~45 minutes.
    {"model_id": "knn_nn75", "clf": KNeighborsClassifier(n_neighbors=75), "n_models":8},
    {"model_id": "fnn_keras_h100sig_o2softm_regl20.01_bs10_ep100_sgd", "clf": "Keras [3920, 100 sigmoid (l2(0.01)), 2 softmax (l2(0.01))], batch:10, epochs:100, optimizer: sgd, regularizers: (l2, l2), best_model_metric: None (Last Model of iteration)", "n_models":20} 
]

In [11]:
# Merge the saved level-1 train predictions of every registered model into a
# single frame: one column per (model, fold) pair, named "<short id>_<fold>".
#
# The frame is sized from the labels rather than a hard-coded row count, so the
# cell keeps working if the training set changes (same approach as the
# test-prediction merge, which sizes its frame from testIDs).
trainLevel1Preds = pd.DataFrame(index=range(trainLabels.shape[0]))
for m in level1Clfs:
    start = time.time()
    model_id = m["model_id"]
    # Column prefix: the part of the model id before the first underscore (e.g. "gbc").
    model_id_short = model_id.split("_")[0]
    n_models = m["n_models"]
    for fold in range(n_models):
        # File names always carry the "_of_19" suffix, regardless of n_models.
        preds = joblib.load('./data/level1_trn_preds/%s_%d_of_19.joblib' % (model_id, fold))
        col_name = "%s_%d" % (model_id_short, fold)
        trainLevel1Preds[col_name] = preds
    # A parenthesised single-argument print behaves the same on Python 2 and 3.
    print("Finished processing %s model in %.1f mins" % (model_id_short, (time.time()-start)/60.))

# The shape is reported before TARGET is appended, i.e. prediction columns only.
print("trainLevel1Preds.shape: %s" % str(trainLevel1Preds.shape))
# Append the labels as the last column and persist without the index column.
trainLevel1Preds['TARGET'] = trainLabels
trainLevel1Preds.to_csv('./data/trainLevel1Preds.csv', index=False)
print("saved data to './data/trainLevel1Preds.csv'")

Finished processing gbc model in 0.0 mins
Finished processing xgb model in 0.0 mins
Finished processing etc model in 0.0 mins
Finished processing rfc model in 0.0 mins
Finished processing dtc model in 0.0 mins
Finished processing bnb model in 0.0 mins
Finished processing lr model in 0.0 mins
Finished processing svm model in 0.0 mins
Finished processing knn model in 0.0 mins
Finished processing fnn model in 0.0 mins
trainLevel1Preds.shape: (76020, 188)
saved data to './data/trainLevel1Preds.csv'


In [12]:
# Read-back check: reload the CSV written by the merge cell and inspect its
# shape and first rows.
df = pd.read_csv('./data/trainLevel1Preds.csv')
# Parenthesised form prints the same tuple on Python 2 and also runs on Python 3.
print(df.shape)
df.head()

(76020, 189)


Unnamed: 0,gbc_0,gbc_1,gbc_2,gbc_3,gbc_4,gbc_5,gbc_6,gbc_7,gbc_8,gbc_9,...,fnn_11,fnn_12,fnn_13,fnn_14,fnn_15,fnn_16,fnn_17,fnn_18,fnn_19,TARGET
0,0.342172,0.414435,0.323783,0.44965,0.31551,0.371713,0.344857,0.363904,0.328568,0.3931,...,0.265872,0.141779,0.301909,0.207389,0.251576,0.398962,0.358739,0.329578,0.329215,0
1,0.212977,0.179875,0.16836,0.277437,0.185459,0.194718,0.210133,0.172217,0.229657,0.183873,...,0.157858,0.087183,0.192595,0.130645,0.144881,0.225168,0.20009,0.181024,0.18028,0
2,0.060077,0.056295,0.058152,0.060049,0.052826,0.065247,0.058802,0.062107,0.05423,0.049104,...,0.059168,0.034401,0.075563,0.050149,0.062954,0.08157,0.09539,0.068544,0.065753,0
3,0.35101,0.46445,0.381325,0.449268,0.410513,0.451058,0.449003,0.489866,0.399165,0.416468,...,0.425196,0.286204,0.484742,0.342633,0.3808,0.434669,0.517109,0.455566,0.469757,0
4,0.279508,0.122038,0.199128,0.21148,0.158174,0.204654,0.17854,0.141302,0.146963,0.243928,...,0.103202,0.069171,0.140853,0.104212,0.10064,0.169018,0.165605,0.150597,0.102972,0


## MERGE TEST PREDS

In [2]:
# Only the ID column is needed from the test file, so restrict the parse to
# that column instead of loading every column into memory.
testDataFrame = pd.read_csv('./data/test.csv', usecols=['ID'])
testIDs = testDataFrame['ID']
# Drop the reference to the frame; only the ID Series is used afterwards.
testDataFrame = None

In [3]:
# Sanity check: display the shape of the ID Series (one entry per test row).
testIDs.shape

(75818,)

In [4]:
# Registry of the level-1 models whose saved test predictions are merged in the next cell.
# This list duplicates the one in the MERGE TRAIN PREDS section; keep the two in sync.
#   model_id - prefix of the saved prediction file names ("<model_id>_<fold>_of_19.joblib")
#   clf      - the estimator configuration (a plain description string for the Keras net);
#              the merge loop only reads "model_id" and "n_models", not this entry
#   n_models - number of per-fold prediction files to load (folds 0 .. n_models-1)
level1Clfs = [
    {"model_id": "gbc_mf10", "clf": GradientBoostingClassifier(max_features=10), "n_models":20},
    # NOTE(review): the id says "lr0.005" but the estimator is built with learning_rate=0.05 -
    # confirm which is intended. The id is also the file-name prefix, so do not rename it
    # without renaming the saved prediction files.
    {"model_id": "xgb_lr0.005_ne125", "clf": xgboost.XGBClassifier(learning_rate=0.05, n_estimators=125), "n_models":20},
    {"model_id": "etc_ne100_md6_mf30", "clf": ExtraTreesClassifier(n_estimators=100, max_depth=6, max_features=30), "n_models":20},
    {"model_id": "rfc_ne125_md6", "clf": RandomForestClassifier(n_estimators=125, max_depth=6), "n_models":20},
    {"model_id": "dtc_md5_mf75", "clf": DecisionTreeClassifier(max_depth=5, max_features=75), "n_models":20},
    {"model_id": "bnb_a0.001", "clf": BernoulliNB(alpha=0.001), "n_models":20},
    {"model_id": "lr_c0.001", "clf": LogisticRegression(C=0.001), "n_models":20},
    {"model_id": "svm_c0.25_proba_true", "clf": SVC(C=0.25, probability=True), "n_models":20},
    # Only 8 models (folds 0-7) were trained for KNN: training one model on the
    # full training set takes ~45 minutes.
    {"model_id": "knn_nn75", "clf": KNeighborsClassifier(n_neighbors=75), "n_models":8},
    {"model_id": "fnn_keras_h100sig_o2softm_regl20.01_bs10_ep100_sgd", "clf": "Keras [3920, 100 sigmoid (l2(0.01)), 2 softmax (l2(0.01))], batch:10, epochs:100, optimizer: sgd, regularizers: (l2, l2), best_model_metric: None (Last Model of iteration)", "n_models":20} 
]

In [5]:
# Merge the saved level-1 test predictions of every registered model into a
# single frame: one column per (model, fold) pair, named "<short id>_<fold>".
# The frame has one row per test ID.
testLevel1Preds = pd.DataFrame(index=range(testIDs.shape[0]))
for m in level1Clfs:
    start = time.time()
    model_id = m["model_id"]
    # Column prefix: the part of the model id before the first underscore (e.g. "gbc").
    model_id_short = model_id.split("_")[0]
    n_models = m["n_models"]
    for fold in range(n_models):
        # File names always carry the "_of_19" suffix, regardless of n_models.
        preds = joblib.load('./data/level1_test_preds/%s_%d_of_19.joblib' % (model_id, fold))
        col_name = "%s_%d" % (model_id_short, fold)
        testLevel1Preds[col_name] = preds
    # A parenthesised single-argument print behaves the same on Python 2 and 3.
    print("Finished processing %s model in %.1f mins" % (model_id_short, (time.time()-start)/60.))

# The shape is reported before ID is appended, i.e. prediction columns only.
print("testLevel1Preds.shape: %s" % str(testLevel1Preds.shape))
# Append the IDs as the last column and persist without the index column.
testLevel1Preds['ID'] = testIDs
testLevel1Preds.to_csv('./data/testLevel1Preds.csv', index=False)
print("saved data to './data/testLevel1Preds.csv'")

Finished processing gbc model in 0.0 mins
Finished processing xgb model in 0.0 mins
Finished processing etc model in 0.0 mins
Finished processing rfc model in 0.0 mins
Finished processing dtc model in 0.0 mins
Finished processing bnb model in 0.0 mins
Finished processing lr model in 0.0 mins
Finished processing svm model in 0.0 mins
Finished processing knn model in 0.0 mins
Finished processing fnn model in 0.0 mins
testLevel1Preds.shape: (75818, 188)
saved data to './data/testLevel1Preds.csv'
