In [1]:
import pandas as pd
import numpy as np
import pickle
import argparse

from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.layers import Dense, Embedding, Input, Flatten, Activation, concatenate, BatchNormalization, Dropout
from keras.models import Model
from keras.models import Sequential
import keras.backend as K
import keras

import tensorflow as tf

from sklearn.metrics import roc_auc_score, precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix

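# project-local model code: handset_model_current is imported from the parent directory, then we return to cmtnn/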
import os
os.chdir("..")
import handset_model_current as handset_model
os.chdir("cmtnn")

Using TensorFlow backend.


In [2]:
def bool_arg(string):
    """argparse ``type=`` converter for boolean command-line options.

    The text 'true' or 'false' (any letter case) is mapped to the matching
    bool; every other value raises ``argparse.ArgumentTypeError``.
    """
    recognised = {'true': True, 'false': False}
    parsed = recognised.get(string.lower())
    if parsed is None:
        raise argparse.ArgumentTypeError("Expected True or False, but got {}".format(string))
    return parsed

# minimal preprocessing
# Experiment configuration, expressed as an argparse parser so the resulting
# `args` namespace can be handed to handset_model.load_and_preprocess_data.
# Help texts report the real defaults through argparse's %(default)s
# placeholder, so they cannot drift from the `default=` values again.
parser = argparse.ArgumentParser()

# model hyperparameters
# small number of epochs for experimentation
parser.add_argument('--epochs', default=10, type=int,
                    help="Nr of epochs. Default is %(default)s", dest="epochs")
parser.add_argument('--batch_size', default=256, type=int,
                    help="Batch size. Default is %(default)s", dest="batch_size")
parser.add_argument('--earlystop', default=3, type=int,
                    help="Number of epochs with no improvement after which training will be stopped.",
                    dest="earlystop")
parser.add_argument('--verbose', default=True, type=bool_arg, help="If True (default), verbose output",
                    dest="verbose")

# cross_val is not ready to be used
parser.add_argument('--cross_val', default=0, type=int,
                    help="Number of folds (if bigger than 0) to use for cross validation. Default is 0.",
                    dest="cross_val")

# no applying class weights
parser.add_argument('--apply_class_weights', default=False, type=bool_arg,
                    help="If True, apply different loss weights (based on frequency of samples) to different "
                         "classes.",
                    dest="apply_class_weights")

# no smooth factor
parser.add_argument('--smooth_factor', default=0, type=float,
                    help="Smooth factor to be used when calculating class weights, so that highly unfrequent "
                    "classes do not get huge weights.",
                    dest="smooth_factor")

# oversampling with neg to pos ratio=3
# NOTE(review): --ratio is parsed here, but the training calls visible in this
# notebook never pass args.ratio on, and their logged output reports
# neg/pos = 1 -- confirm which ratio is intended.
parser.add_argument('--oversample', default=True, type=bool_arg,
                    help="If True (default), apply oversampling to generate balanced batches.",
                    dest="oversample")
parser.add_argument('--ratio', default=3, type=int,
                    help="Ratio of negative to positive samples to use for balanced batch generation "
                         "(if oversample=True)",
                    dest="ratio")

# activation: prelu
parser.add_argument('--activation', default='prelu',
                    help="NN activation to be used. Default is prelu",
                    dest="activation")

# no x_vars
parser.add_argument('--x_vars', default=False, type=bool_arg,
                    help="If True, include X variables. Default is %(default)s.",
                    dest="x_vars")

# standardize numerical data
parser.add_argument('--std', default=True, type=bool_arg, help="If True (default), standardize data.",
                    dest="std")

# no pca
parser.add_argument('--pca_whiten', default=False, type=bool_arg,
                    help="If True, PCA-whiten data. Default is %(default)s.",
                    dest="pca_whiten")
parser.add_argument('--pca_reduce', default=0, type=float,
                    help="{0, 1, 0<x<1} If 0, no dimensionality reduction is done. If 1, Thomas P. Minka's method "
                         "('Automatic Choice of Dimensionality for PCA'. NIPS 2000) is used to determine the "
                         "number of dimensions to keep. If 0 < pca_reduce < 1, enough number of dimensions will "
                         "be kept to keep 'pca_reduce' percentage of variance explained. "
                         "Default is %(default)s.",
                    dest="pca_reduce")

# one-hot encode cat data (embeddings are not used)
parser.add_argument('--cat_enc', default='one-hot',
                    help="Encoding to be used for categorical variables. Default is '%(default)s'. "
                         "Alternatives: 'integer' (embedding layers will then be used), 'hashing_char', "
                         "'hashing_all', 'one-hot'.",
                    dest="cat_enc")

# no log transform
parser.add_argument('--log_xform', default=False, type=bool_arg,
                    help="If True, log-transform data. Default is %(default)s.",
                    dest="log_xform")

# encode categorical and binary data as 1/0
parser.add_argument('--binary_enc', default=True, type=bool_arg,
                    help="If False, the negative cases of binary variables will be represented as -1 "
                         "instead of 0. Default is %(default)s.", dest="binary_enc")

# id for saving/ loading
parser.add_argument('--data_split_id', default=2, type=int,
                    help="Id for the train-test data split to be used. If a new id is given, a new data split "
                         "will be generated and saved to disk with the given id. If id is 0, a new "
                         "split will be generated, but not saved to disk. If a previously used id is given, "
                         "a previously generated and saved data split with that id will be used. "
                         "Default is %(default)s.",
                    dest="data_split_id")
# Catch-all "-f" option: presumably absorbs the `-f <connection file>` argument the
# notebook kernel is started with, so parse_args() does not reject it -- TODO confirm.
parser.add_argument("-f")
args = parser.parse_args()

In [3]:
# Load (or reuse) the pre-processed train/test split. The loader is run from the
# parent directory; try/finally guarantees we return to cmtnn/ even when loading
# fails, so re-running this cell never starts from the wrong working directory.
os.chdir("..")
try:
    data_train, data_test, cat_levels = handset_model.load_and_preprocess_data(args)  # split_id=2 for f_classif features
finally:
    os.chdir("cmtnn")

generating dictionary with levels of catagorical variables...
Reusing data split with id=2
Loading previously pre-processed numerical data...
Loading previously pre-processed categorical data...


In [4]:
# Sanity check on the loaded split: the 2nd dim should be 7 (num) and 235 (cat)
tuple(data_train[part].shape for part in ('num', 'cat'))

((466632, 7), (466632, 235))

In [5]:
# Truth model: built from the numerical / categorical column lists and trained
# further down on the original labels.
model = handset_model.create_model(
    data_train["num"].columns,
    data_train["cat"].columns,
    cat_encoding=args.cat_enc,
    cat_emb_dim=handset_model.CAT_EMB_DIM,
    cat_levels=cat_levels,
    include_x_vars=args.x_vars,
    activation=args.activation,
)

In [6]:
# Train the truth model with the library training routine; the checkpoint file
# name is handed to that routine via chkp_file.
# NOTE(review): args.ratio is parsed but not forwarded in this call, and the
# logged output below reports neg/pos = 1 -- confirm that is intended.
chkp_file = "handset_weights.best.hdf5"
handset_model.train_and_evaluate_model(
    model, data_train, data_test,
    nb_epochs=args.epochs,
    batch_size=args.batch_size,
    earlystop_pat=args.earlystop,
    chkp_file=chkp_file,
    verbose=args.verbose,
    oversample=args.oversample,
    apply_class_weights=args.apply_class_weights,
    smooth_factor=args.smooth_factor,
)

Using class_weights:  {0: 1.0, 1: 1.0}
Oversampling with ration neg/pos= 1
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [7]:
# Separate containers for the falsity model, so its labels can be replaced
# without touching data_train / data_test.
# (presumably dicts, in which case .copy() is shallow and the "num"/"cat"
# frames stay shared with the originals -- TODO confirm)
fdata_train, fdata_test = data_train.copy(), data_test.copy()

In [8]:
# Falsity labels: XOR with 1 swaps 0 <-> 1. Always derived from the original
# labels, so re-running this cell gives the same result.
fdata_test['labels'] = 1 ^ data_test['labels']
fdata_train['labels'] = 1 ^ data_train['labels']

In [9]:
# Falsity model: same create_model arguments as the truth model, but a separate
# instance that is trained on the flipped labels.
fmodel = handset_model.create_model(
    fdata_train["num"].columns,
    fdata_train["cat"].columns,
    cat_encoding=args.cat_enc,
    cat_emb_dim=handset_model.CAT_EMB_DIM,
    cat_levels=cat_levels,
    include_x_vars=args.x_vars,
    activation=args.activation,
)

In [10]:
from sklearn.utils import shuffle

# one small change to adapt the generator to the falsity model: reverse 0/1
class OverSamplingBatchGenerator:
    """Endless generator of fixed-composition training batches (falsity variant).

    The "neg"/"pos" names are kept from the generator this one was adapted from,
    but the 0/1 labels are reversed here: ``neg_idx`` holds the rows whose label
    is 1 and ``pos_idx`` the rows whose label is 0. Every batch stacks
    ``n_batch_pos`` rows sampled at random from ``pos_idx`` on top of the next
    ``n_batch_neg`` rows of the (per-epoch shuffled) ``neg_idx``, then shuffles
    the rows of the batch.
    """

    def __init__(self, data_train, batch_size=32, r=1):
        """Index both classes and work out the batch layout.

        :param data_train: container with "num", "cat" and "labels" entries; the
            "labels" entry is filtered on ``handset_model.LABEL_COL`` and the
            "num"/"cat" entries are looked up with ``.loc`` on its index values.
        :param batch_size: total number of rows per batch.
        :param r: ratio ``n_batch_neg / n_batch_pos`` inside a batch.
        :raises ValueError: if the arguments cannot produce a usable batch
            layout (see ``_batch_layout``).
        """
        self.data_train = data_train
        label_values = data_train["labels"][handset_model.LABEL_COL]
        # .copy() gives this object private, writable index arrays: neg_idx is
        # shuffled in place by generator().
        self.neg_idx = data_train["labels"][label_values == 1].index.values.copy()  # reverse 0 to 1
        self.pos_idx = data_train["labels"][label_values == 0].index.values.copy()  # reverse 1 to 0
        self.n_batch_pos, self.n_batch_neg, self.N = self._batch_layout(
            self.neg_idx.size, batch_size, r)

    @staticmethod
    def _batch_layout(n_neg, batch_size, r):
        """Return ``(n_batch_pos, n_batch_neg, n_batches)`` for one epoch.

        n_batch = n_batch_neg + n_batch_pos
        r = n_batch_neg / n_batch_pos
        N = floor(n_neg / n_batch_neg) = floor(n_neg * (1 + r) / (n_batch * r))

        :raises ValueError: if ``r`` is not positive, if ``batch_size`` is too
            small to hold a single "pos" row, or if ``n_neg`` does not fill one
            batch (the generator would otherwise loop forever without yielding).
        """
        if r <= 0:
            raise ValueError("r must be positive, got {}".format(r))
        n_batch_pos = int(batch_size / (1 + r))
        if n_batch_pos < 1:
            raise ValueError(
                "batch_size={} is too small for r={}: no positive rows per batch".format(batch_size, r))
        n_batch_neg = batch_size - n_batch_pos
        n_batches = int(n_neg * (1 + r) / (batch_size * r))
        # Rounding n_batch_pos down makes n_batch_neg larger than the ideal
        # share, so the closed-form count may overshoot the available rows.
        if n_batches * n_batch_neg > n_neg:
            n_batches = int(n_neg / n_batch_neg)
        if n_batches < 1:
            raise ValueError(
                "only {} negative rows available, fewer than the {} needed for one batch".format(
                    n_neg, n_batch_neg))
        return n_batch_pos, n_batch_neg, n_batches

    def get_no_batches(self):
        """Return the number of batches that make up one epoch."""
        return self.N

    def generator(self):
        """Yield ``([num_batch, cat_batch], labels_batch)`` tuples forever."""
        # Label column before the per-batch shuffle: pos rows first, then neg rows.
        labels = np.vstack([np.zeros((self.n_batch_pos, 1)), np.ones((self.n_batch_neg, 1))])  # reverse zeros/ones
        # Sample with replacement only when there are fewer "pos" rows than one
        # batch needs; otherwise behave exactly as before (replace=False).
        replace_pos = self.pos_idx.size < self.n_batch_pos
        while True:
            np.random.shuffle(self.neg_idx)
            # Shuffle negative data at the beginning of each epoch. There should be self.N steps per epoch
            # (i.e., one complete for-loop).
            # No need to shuffle positive data, since we randomly sample from it for every batch

            for start_idx in range(0, self.N * self.n_batch_neg, self.n_batch_neg):
                batch_pos_idx = np.random.choice(self.pos_idx, self.n_batch_pos, replace=replace_pos)
                batch_neg_idx = self.neg_idx[start_idx: (start_idx + self.n_batch_neg)]
                batch = [np.vstack([self.data_train["num"].loc[batch_pos_idx].values,
                                    self.data_train["num"].loc[batch_neg_idx].values])] + \
                [np.vstack([self.data_train["cat"].loc[batch_pos_idx].values,
                            self.data_train["cat"].loc[batch_neg_idx].values])]
                batch.append(labels)
                # Same permutation applied to inputs and labels so rows stay aligned.
                batch = shuffle(*batch)

                yield (batch[0:-1], batch[-1])


In [11]:
# define a local train_and_evaluate_model function that uses the generator above
def train_and_evaluate_model(model, data_train, data_test, nb_epochs=100,
                             batch_size=32, cvscores=None, verbose=False, chkp_file=None,
                             earlystop_pat=10,
                             r_balanced_batch=1, oversample = True, apply_class_weights=False,
                             smooth_factor=0.1, history_file="f_history.pickle"):
    """Fit ``model`` and pickle its training history.

    Local variant of the training routine that feeds the model from the
    notebook's own ``OverSamplingBatchGenerator`` when ``oversample`` is True.

    :param model: Keras model taking ``[num, cat]`` inputs.
    :param data_train: container with "num", "cat" and "labels" frames used for fitting.
    :param data_test: same layout; passed to Keras as validation data.
    :param nb_epochs: maximum number of epochs.
    :param batch_size: rows per batch.
    :param cvscores: accepted for signature compatibility; not used in this function.
    :param verbose: forwarded to Keras and to the checkpoint callback.
    :param chkp_file: path for the best-``val_acc`` weights; when None no
        checkpoint callback is registered.
    :param earlystop_pat: early-stopping patience (epochs without ``val_loss`` improvement).
    :param r_balanced_batch: ``r`` passed to ``OverSamplingBatchGenerator``.
    :param oversample: if True train from the generator, else call ``model.fit`` on the full data.
    :param apply_class_weights: if True derive class weights from the training labels.
    :param smooth_factor: forwarded to ``get_class_weights``.
    :param history_file: where ``history.history`` is pickled.
    :return: None.
    """
    # Callbacks: optional checkpointing on val_acc, then early stopping on val_loss
    # (the two callbacks deliberately or accidentally monitor different metrics).
    callbacks = []
    if chkp_file is not None:
        callbacks.append(ModelCheckpoint(chkp_file, monitor='val_acc', verbose=verbose,
                                         save_best_only=True, mode='max'))
    callbacks.append(EarlyStopping(monitor='val_loss', patience=earlystop_pat, verbose=True, mode='auto'))

    class_weights = {0: 1.0, 1:1.0}
    if apply_class_weights:
        y = data_train["labels"].values
        # NOTE(review): get_class_weights is not defined anywhere in the visible
        # notebook; presumably it has to come from handset_model -- confirm
        # before running with apply_class_weights=True.
        class_weights = get_class_weights(y.reshape((1, y.shape[0]))[0], smooth_factor=smooth_factor)
    print("Using class_weights: ", class_weights)

    validation_data = ([data_test["num"].values] + [data_test["cat"].values],
                       data_test["labels"].values)

    # Fit the model
    if oversample:
        # fixed input and output shapes for generator
        print("Oversampling with ration neg/pos=", r_balanced_batch)
        gen = OverSamplingBatchGenerator(data_train, batch_size=batch_size, r=r_balanced_batch)

        history = model.fit_generator(gen.generator(),
                                      validation_data=validation_data,
                                      steps_per_epoch=gen.get_no_batches(),
                                      epochs=nb_epochs,
                                      verbose=verbose,
                                      max_q_size=10,
                                      workers=1,
                                      pickle_safe=True, class_weight=class_weights,
                                      callbacks=callbacks)

    else:
        history = model.fit([data_train["num"].values] +
                            [data_train["cat"].values],
                            data_train["labels"].values,
                            validation_data=validation_data,
                            shuffle=True,
                            epochs=nb_epochs, batch_size=batch_size, class_weight=class_weights,
                            verbose=verbose, callbacks=callbacks)

    # Context manager so the file is flushed and closed even if pickling fails.
    with open(history_file, "wb") as history_fh:
        pickle.dump(history.history, history_fh)


In [12]:
# Train the falsity model on the flipped labels with the notebook-local
# train_and_evaluate_model (the one driving the reversed batch generator).
# NOTE(review): args.ratio is not forwarded as r_balanced_batch, so the default
# of 1 is used (see the logged output below) -- confirm that is intended.
chkp_file = "f_handset_weights.best.hdf5"

# use local function
train_and_evaluate_model(
    fmodel, fdata_train, fdata_test,
    nb_epochs=args.epochs,
    batch_size=args.batch_size,
    earlystop_pat=args.earlystop,
    chkp_file=chkp_file,
    verbose=args.verbose,
    oversample=args.oversample,
    apply_class_weights=args.apply_class_weights,
    smooth_factor=args.smooth_factor,
)

Using class_weights:  {0: 1.0, 1: 1.0}
Oversampling with ration neg/pos= 1
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [19]:
# Raw outputs of both models on both splits. Each model receives the numerical
# and the categorical matrix as a two-element input list.
p_train = model.predict([data_train["num"].values, data_train["cat"].values])
fp_train = fmodel.predict([fdata_train["num"].values, fdata_train["cat"].values])

p_test = model.predict([data_test["num"].values, data_test["cat"].values])
fp_test = fmodel.predict([fdata_test["num"].values, fdata_test["cat"].values])

In [57]:
threshold = 0.5

# Hard 0/1 predictions from the falsity model. It was trained on flipped labels,
# so an output BELOW the threshold is counted as a positive (1).
# Vectorised: one comparison over the flattened outputs instead of a Python loop
# per row; each array is now sized by itself rather than by len(p_*).
# .tolist() keeps the result a plain list of Python ints, as before.
fpred_train = (np.ravel(fp_train) < threshold).astype(int).tolist()
fpred_test = (np.ravel(fp_test) < threshold).astype(int).tolist()

In [58]:
# Distinct values in the falsity-model test predictions and how often each occurs
np.unique(fpred_test, return_counts=True)

(array([0, 1]), array([95028, 21631]))

In [59]:
threshold2 = 0.5

# Hard 0/1 predictions from the truth model: an output ABOVE the threshold is
# counted as a positive (1).
# Vectorised: one comparison over the flattened outputs instead of a Python loop
# per row. .tolist() keeps the result a plain list of Python ints, as before.
tpred_train = (np.ravel(p_train) > threshold2).astype(int).tolist()
tpred_test = (np.ravel(p_test) > threshold2).astype(int).tolist()

In [60]:
# Distinct values in the truth-model test predictions and how often each occurs
np.unique(tpred_test, return_counts=True)

(array([0, 1]), array([95257, 21402]))

In [61]:
# Final prediction is 1 only where BOTH the truth model and the falsity model
# predicted 1 (element-wise bitwise AND of the two 0/1 lists).
train_preds = [truth_bit & falsity_bit for truth_bit, falsity_bit in zip(tpred_train, fpred_train)]

test_preds = [truth_bit & falsity_bit for truth_bit, falsity_bit in zip(tpred_test, fpred_test)]

In [62]:
# Ground-truth labels (original, un-flipped) for both splits.
y_train = data_train['labels']['TARGET_S_TO_S_APPLE']
y_test = data_test['labels']['TARGET_S_TO_S_APPLE']

# Rounded copies of the combined predictions, computed once and reused by every
# metric except ROC AUC (which receives the unrounded lists).
train_hard = np.rint(train_preds)
test_hard = np.rint(test_preds)

# --- train split ---
roc_auc_train = roc_auc_score(y_train.values, train_preds)
a_train = accuracy_score(y_train.values, train_hard)
prec_train = precision_score(y_train.values, train_hard)
r_train = recall_score(y_train.values, train_hard)
m_train = confusion_matrix(y_train.values, train_hard)
# second row of the confusion matrix = actual positives: [missed, found]
true_pos_rate_train = m_train[1, 1] / (m_train[1, 1] + m_train[1, 0])

# --- test split ---
roc_auc_test = roc_auc_score(y_test.values, test_preds)
a_test = accuracy_score(y_test.values, test_hard)
prec_test = precision_score(y_test.values, test_hard)
r_test = recall_score(y_test.values, test_hard)
m_test = confusion_matrix(y_test.values, test_hard)
true_pos_rate_test = m_test[1, 1] / (m_test[1, 1] + m_test[1, 0])

# Report, train next to eval for each metric.
print('train-auc: %f\teval-auc: %f' % (roc_auc_train, roc_auc_test))
print('train-accuracy: %f\teval-accuracy: %f' % (a_train, a_test))
print('train-precision: %f\teval-precision: %f' % (prec_train, prec_test))
print('train-recall: %f\teval-recall: %f' % (r_train, r_test))

print('train-confusion-matrix:\n', m_train)
print('test-confusion-matrix:\n', m_test)
print('train-true-pos-rate: %f\teval-true-pos-rate: %f' % (true_pos_rate_train, true_pos_rate_test))

train-auc: 0.919812	eval-auc: 0.708753
train-accuracy: 0.848510	eval-accuracy: 0.844710
train-precision: 0.031612	eval-precision: 0.018243
train-recall: 0.991831	eval-recall: 0.571429
train-confusion-matrix:
 [[393635  70671]
 [    19   2307]]
test-confusion-matrix:
 [[98211 17867]
 [  249   332]]
train-true-pos-rate: 0.991831	eval-true-pos-rate: 0.571429
