In [1]:
import time

import pandas as pd
import numpy as np

import joblib 

from sklearn.ensemble import GradientBoostingClassifier, ExtraTreesClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import xgboost

In [None]:
# Level-1 classifiers. Each entry pairs a "model_id" (used to build the file
# names of saved models and predictions) with the estimator stored under "clf".
level1Clfs = [
    dict(model_id="gbc_mf10", clf=GradientBoostingClassifier(max_features=10)),
    # NOTE(review): the id says lr0.005 but learning_rate is 0.05 -- confirm which one is intended.
    dict(model_id="xgb_lr0.005_ne125", clf=xgboost.XGBClassifier(learning_rate=0.05, n_estimators=125)),
    dict(model_id="etc_ne100_md6_mf30", clf=ExtraTreesClassifier(n_estimators=100, max_depth=6, max_features=30)),
    dict(model_id="rfc_ne125_md6", clf=RandomForestClassifier(n_estimators=125, max_depth=6)),
    dict(model_id="dtc_md5_mf75", clf=DecisionTreeClassifier(max_depth=5, max_features=75)),
    dict(model_id="bnb_a0.001", clf=BernoulliNB(alpha=0.001)),
    dict(model_id="lr_c0.001", clf=LogisticRegression(C=0.001)),
    dict(model_id="svm_c0.25_proba_true", clf=SVC(C=0.25, probability=True)),
    # Only 8 models (folds 0-7) were trained for KNN: a single model takes
    # ~45 mins to train on the full training set.
    dict(model_id="knn_nn75", clf=KNeighborsClassifier(n_neighbors=75)),
    # The Keras network is described by a string here, not an estimator object.
    dict(
        model_id="fnn_keras_h100sig_o2softm_regl20.01_bs10_ep100_sgd",
        clf="Keras [3920, 100 sigmoid (l2(0.01)), 2 softmax (l2(0.01))], batch:10, epochs:100, optimizer: sgd, regularizers: (l2, l2), best_model_metric: None (Last Model of iteration)",
    ),
]



In [2]:
# Transformed training data, produced from the schema
# ./models/features/merged_schema_best_3920_features.joblib
trainTransformedDF = pd.read_csv('./data/trainTransformed.csv')
print(trainTransformedDF.shape)

(76020, 3921)


In [3]:
# Every column except the TARGET label; this is the input passed to
# predict_proba when level-1 predictions are generated for the train set.
trainFeatures = trainTransformedDF.drop('TARGET', axis=1)

In [4]:
# Number of TARGET == 0 rows that getBalancedTrainSetFold takes per fold.
fold_size = 3650

# Split the training frame by class label.
dataTarget0 = trainTransformedDF.loc[trainTransformedDF['TARGET'] == 0]
dataTarget1 = trainTransformedDF.loc[trainTransformedDF['TARGET'] == 1]

def getBalancedTrainSetFold(fold, n_folds=20, negatives=None, positives=None, size=None):
    """Build the training set for one fold: one slice of negatives plus all positives.

    The negative rows are cut into ``n_folds`` consecutive slices of ``size``
    rows; the last fold takes every remaining row. Each fold's slice is
    concatenated with the complete set of positive rows.

    Parameters
    ----------
    fold : int
        Zero-based fold index, ``0 <= fold < n_folds``.
    n_folds : int, optional
        Total number of folds (default 20).
    negatives : pandas.DataFrame, optional
        Rows with TARGET == 0; defaults to the module-level ``dataTarget0``.
    positives : pandas.DataFrame, optional
        Rows with TARGET == 1; defaults to the module-level ``dataTarget1``.
    size : int, optional
        Negative rows per fold; defaults to the module-level ``fold_size``.

    Returns
    -------
    (X_train, y_train) : (pandas.DataFrame, pandas.Series)
        Features without the 'TARGET' column, and the 'TARGET' column.

    Raises
    ------
    ValueError
        If ``fold`` is outside ``[0, n_folds)``. Previously such a fold
        silently produced a training set containing positives only.
    """
    # Fall back to the notebook-level globals so existing one-argument calls keep working.
    if negatives is None:
        negatives = dataTarget0
    if positives is None:
        positives = dataTarget1
    if size is None:
        size = fold_size

    if not 0 <= fold < n_folds:
        raise ValueError("fold must be in [0, %d), got %r" % (n_folds, fold))

    first = fold * size
    # The last fold is open-ended so leftover negative rows are not dropped.
    last = None if fold == n_folds - 1 else first + size
    trn0 = negatives.iloc[first:last]

    trn = pd.concat([trn0, positives])
    y_train = trn['TARGET']
    X_train = trn.drop(['TARGET'], axis=1)

    return X_train, y_train

In [None]:
# Train one model per fold for every level-1 classifier, saving each fitted
# model and its predicted positive-class probabilities on the full train set.
for m in level1Clfs:
    model_id = m["model_id"]
    clf = m["clf"]
    # Entries whose "clf" is not an estimator (the Keras entry is only a
    # description string) cannot be fitted here; skip them instead of
    # crashing with AttributeError on clf.fit.
    if not hasattr(clf, "fit"):
        print("Skipping model_id: %s (no fit method; not trained in this loop)" % model_id)
        continue
    print("Processing model_id: %s" % model_id)
    for fold in range(20): # 20 folds: one model for each fold
        start = time.time()
        X, y = getBalancedTrainSetFold(fold)
        clf.fit(X, y)
        # save the model
        joblib.dump(clf, "./models/level1/%s_%d_of_19.joblib" % (model_id, fold))
        # column 1 of predict_proba is taken as the level-1 prediction
        preds = clf.predict_proba(trainFeatures)[:,1]
        joblib.dump(preds, "./data/level1_trn_preds/%s_%d_of_19.joblib" % (model_id, fold))
        print("Finished processing fold %d of 20 at %.1f mins." % (fold+1, (time.time()-start)/60.))

        

## Generate Keras models

In [5]:
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.callbacks import ModelCheckpoint
from keras.regularizers import l2
from keras.models import model_from_json


Using Theano backend.


In [6]:
def vectorized_result(j, n_classes=2):
    """Return a one-hot column vector of shape (n_classes, 1).

    The vector holds 1.0 in position ``j`` and zeroes elsewhere. It is used
    to convert a class label into the desired output of the neural network.

    Parameters
    ----------
    j : int
        Index of the class to mark.
    n_classes : int, optional
        Length of the vector (default 2, i.e. a binary label).

    Returns
    -------
    numpy.ndarray
        Float array of shape (n_classes, 1).
    """
    e = np.zeros((n_classes, 1))
    e[j] = 1.0
    return e

In [None]:
# Create the network architecture and save it as JSON so a new model can be
# rebuilt from it for every fold.
# model_id is defined in this cell so that it does not depend on a value left
# behind in the kernel by an earlier cell.
model_id = "fnn_keras_h100sig_o2softm_regl20.01_bs10_ep100_sgd"

model = Sequential()
# 1. hidden layer: 3920 inputs -> 100 sigmoid units, L2 weight regularizer 0.01
model.add(Dense(output_dim=100, input_dim=3920, W_regularizer=l2(0.01)))
model.add(Activation('sigmoid'))
# output layer: 2 softmax units, L2 weight regularizer 0.01
model.add(Dense(output_dim=2, W_regularizer=l2(0.01)))
model.add(Activation("softmax"))

joblib.dump(model.to_json(), "./models/level1/%s.joblib" % (model_id))


In [7]:
# Train one Keras network per fold, saving its weights and its predicted
# positive-class probabilities on the full train set.
model_id = "fnn_keras_h100sig_o2softm_regl20.01_bs10_ep100_sgd"
print("Processing model_id: %s" % model_id)

# The saved JSON architecture is the same for every fold, so load it once.
model_json = joblib.load("./models/level1/%s.joblib" % (model_id))

for fold in range(20): # 20 folds: one model for each fold
    start = time.time()
    X, y = getBalancedTrainSetFold(fold)
    # One-hot encode the labels into shape (n_samples, 2) for the 2-unit output layer.
    y_vect = np.array([vectorized_result(x) for x in y])
    y_vect = np.reshape(y_vect, (y_vect.shape[0], 2))

    # Rebuild the network from its JSON architecture for each fold.
    clf = model_from_json(model_json)

    # NOTE(review): binary_crossentropy is used with a 2-unit softmax output;
    # left unchanged so that results stay comparable with the saved models.
    clf.compile(loss='binary_crossentropy', optimizer='sgd', metrics=['accuracy'])

    # .values replaces the deprecated DataFrame.as_matrix().
    clf.fit(X.values, y_vect, verbose=0, batch_size=10,
                 nb_epoch=100)

    # save the model (weights)
    clf.save_weights("./models/level1/%s_%d_of_19.HDF5" % (model_id, fold))
    preds = clf.predict_proba(trainFeatures.values)[:,1]
    # save keras level1 preds for train set:
    joblib.dump(preds, "./data/level1_trn_preds/%s_%d_of_19.joblib" % (model_id, fold))
    print("Finished processing fold %d of 20 at %.1f mins." % (fold+1, (time.time()-start)/60.))
    

Processing model_id: fnn_keras_h100sig_o2softm_regl20.01_bs10_ep100_sgd
Finished processing fold 16 of 20 at 6.0 mins.
Finished processing fold 17 of 20 at 5.9 mins.
Finished processing fold 18 of 20 at 5.9 mins.
Finished processing fold 19 of 20 at 5.8 mins.
Finished processing fold 20 of 20 at 5.9 mins.


  **kwargs)


### Test saved model and preds (non-keras)

In [None]:
# Load the fold-0 gradient boosting model. The training loop writes models as
# "<model_id>_<fold>_of_19.joblib", so the file name must end in "_of_19".
test_clf = joblib.load('./models/level1/gbc_mf10_0_of_19.joblib')

In [None]:
# Display the loaded model as the cell output.
test_clf

In [None]:
# Load the fold-0 gradient boosting predictions. The training loop writes
# predictions as "<model_id>_<fold>_of_19.joblib", so the name ends in "_of_19".
test_clf_preds = joblib.load('./data/level1_trn_preds/gbc_mf10_0_of_19.joblib')

In [None]:
# Display the Python type of the loaded predictions object.
type(test_clf_preds)

In [None]:
# True labels of the training set, used to score the saved predictions.
trainLabels = trainTransformedDF.TARGET

In [None]:
from sklearn.metrics import roc_auc_score

# ROC AUC of the loaded predictions against the training labels.
roc_auc_score(y_true=trainLabels, y_score=test_clf_preds)

### Test saved model and preds (keras)

In [None]:
# Rebuild the Keras network from the JSON architecture saved under model_id.
model_json = joblib.load("./models/level1/%s.joblib" % model_id)
test_clf = model_from_json(model_json)

In [None]:
# Display the rebuilt Keras model as the cell output.
test_clf

In [None]:
# Compile with the same loss, optimizer and metrics as the training loop.
test_clf.compile(
    loss='binary_crossentropy',
    optimizer='sgd',
    metrics=['accuracy'],
)

In [None]:
# Restore the weights saved for fold 0 of this model_id.
test_clf.load_weights("./models/level1/%s_0_of_19.HDF5" % model_id)

In [None]:
# Positive-class probabilities on the full train set.
# .values replaces the deprecated DataFrame.as_matrix().
test_clf_preds = test_clf.predict_proba(trainFeatures.values)[:,1]

In [None]:
from sklearn.metrics import roc_auc_score

# ROC AUC of the Keras predictions against the training labels. The labels are
# read straight from trainTransformedDF so this cell does not rely on
# trainLabels having been defined by the non-Keras test cells.
roc_auc_score(trainTransformedDF['TARGET'], test_clf_preds)