In [1]:
import time

import pandas as pd
import numpy as np

import joblib 

from sklearn.ensemble import GradientBoostingClassifier, ExtraTreesClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import xgboost
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.callbacks import ModelCheckpoint
from keras.regularizers import l2
from keras.models import model_from_json


Using Theano backend.


In [2]:
# Registry of the level-1 (base) models whose per-fold fits are scored below.
#   model_id : stem of the saved model files (and of the prediction files written later)
#   clf      : the estimator configuration, kept here as a record of the settings;
#              for the Keras network it is a free-text description, not an estimator
#   n_models : how many per-fold models are iterated over for this model_id
level1Clfs = [
    dict(model_id="gbc_mf10",
         clf=GradientBoostingClassifier(max_features=10),
         n_models=20),
    # NOTE(review): the id reads "lr0.005" but the estimator is built with
    # learning_rate=0.05. The id is also used as the file stem when loading
    # models, so confirm which value is right before renaming anything.
    dict(model_id="xgb_lr0.005_ne125",
         clf=xgboost.XGBClassifier(learning_rate=0.05, n_estimators=125),
         n_models=20),
    dict(model_id="etc_ne100_md6_mf30",
         clf=ExtraTreesClassifier(n_estimators=100, max_depth=6, max_features=30),
         n_models=20),
    dict(model_id="rfc_ne125_md6",
         clf=RandomForestClassifier(n_estimators=125, max_depth=6),
         n_models=20),
    dict(model_id="dtc_md5_mf75",
         clf=DecisionTreeClassifier(max_depth=5, max_features=75),
         n_models=20),
    dict(model_id="bnb_a0.001",
         clf=BernoulliNB(alpha=0.001),
         n_models=20),
    dict(model_id="lr_c0.001",
         clf=LogisticRegression(C=0.001),
         n_models=20),
    dict(model_id="svm_c0.25_proba_true",
         clf=SVC(C=0.25, probability=True),
         n_models=20),
    # Only folds 0-7 were trained for k-NN: fitting a single model on the
    # full training set takes roughly 45 minutes.
    dict(model_id="knn_nn75",
         clf=KNeighborsClassifier(n_neighbors=75),
         n_models=8),
    dict(model_id="fnn_keras_h100sig_o2softm_regl20.01_bs10_ep100_sgd",
         clf="Keras [3920, 100 sigmoid (l2(0.01)), 2 softmax (l2(0.01))], batch:10, epochs:100, optimizer: sgd, regularizers: (l2, l2), best_model_metric: None (Last Model of iteration)",
         n_models=20),
]

In [4]:
# Load the already-transformed test set; the path is relative to the notebook's working directory.
testTransformedDF = pd.read_csv('./data/testTransformed.csv')

In [7]:
# Sanity check: (rows, columns) of the loaded test frame, 'ID' column included.
print(testTransformedDF.shape)

(75818, 3921)


In [6]:
# Model inputs are every column except the row identifier 'ID'.
testFeatures = testTransformedDF.drop(labels=['ID'], axis=1)

In [8]:
# Display (rows, columns) of the feature frame to confirm 'ID' was dropped.
testFeatures.shape

(75818, 3920)

In [9]:
# Score the test set with every saved level-1 model and persist the
# positive-class probabilities, one joblib file per (model_id, fold).
#
# Reads : ./models/level1/<model_id>_<fold>_of_19.joblib   (non-Keras estimators)
#         ./models/level1/<model_id>.joblib                (Keras architecture JSON)
#         ./models/level1/<model_id>_<fold>_of_19.HDF5     (Keras per-fold weights)
# Writes: ./data/level1_test_preds/<model_id>_<fold>_of_19.joblib
#
# The "_of_19" suffix is part of the on-disk file names and is kept verbatim,
# including for model ids that have fewer than 20 folds.
for m in level1Clfs:
    model_id = m["model_id"]
    n_models = m["n_models"]
    print("Processing model_id: %s" % model_id)

    is_keras = "keras" in model_id
    if is_keras:
        # The architecture is stored once per model_id and only the weights
        # differ between folds, so build and compile the network a single
        # time and swap the weights inside the fold loop.
        model_json = joblib.load("./models/level1/%s.joblib" % (model_id))
        clf = model_from_json(model_json)
        clf.compile(loss='binary_crossentropy', optimizer='sgd', metrics=['accuracy'])
        # Keras is fed a plain array; convert once instead of once per fold.
        testMatrix = testFeatures.as_matrix()

    for fold in range(n_models):
        start = time.time()
        if is_keras:
            clf.load_weights("./models/level1/%s_%d_of_19.HDF5" % (model_id, fold))
            preds = clf.predict_proba(testMatrix)[:,1]
        else:
            # NOTE(review): joblib.load unpickles the file; only load model
            # files produced by a trusted training run.
            clf = joblib.load('./models/level1/%s_%d_of_19.joblib' % (model_id, fold))
            preds = clf.predict_proba(testFeatures)[:,1]

        joblib.dump(preds, "./data/level1_test_preds/%s_%d_of_19.joblib" % (model_id, fold))
        # Report against the real fold count for this model (knn has 8, not 20).
        print("Finished processing fold %d of %d at %.1f mins." % (fold+1, n_models, (time.time()-start)/60.))

        

Processing model_id: gbc_mf10
Finished processing fold 1 of 20 at 0.2 mins.
Finished processing fold 2 of 20 at 0.1 mins.
Finished processing fold 3 of 20 at 0.1 mins.
Finished processing fold 4 of 20 at 0.1 mins.
Finished processing fold 5 of 20 at 0.1 mins.
Finished processing fold 6 of 20 at 0.1 mins.
Finished processing fold 7 of 20 at 0.1 mins.
Finished processing fold 8 of 20 at 0.1 mins.
Finished processing fold 9 of 20 at 0.2 mins.
Finished processing fold 10 of 20 at 0.1 mins.
Finished processing fold 11 of 20 at 0.2 mins.
Finished processing fold 12 of 20 at 0.1 mins.
Finished processing fold 13 of 20 at 0.2 mins.
Finished processing fold 14 of 20 at 0.1 mins.
Finished processing fold 15 of 20 at 0.2 mins.
Finished processing fold 16 of 20 at 0.2 mins.
Finished processing fold 17 of 20 at 0.2 mins.
Finished processing fold 18 of 20 at 0.1 mins.
Finished processing fold 19 of 20 at 0.1 mins.
Finished processing fold 20 of 20 at 0.1 mins.
Processing model_id: xgb_lr0.005_ne125


  **kwargs)
