In [1]:
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns
import os
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, hstack, issparse
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, StandardScaler
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score
from sklearn.ensemble import RandomForestClassifier
from collections import Counter
import pickle
from sklearn.pipeline import Pipeline
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, GridSearchCV
# from tqdm import tqdm
import time
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin

In [2]:
from keras.models import Model, Sequential
from keras.layers import Input, Dense, Conv1D, MaxPooling1D, Softmax, Add, Flatten, Activation, LSTM, Bidirectional
from keras.layers import Dropout, BatchNormalization, TimeDistributed, GRU, Reshape
from keras import backend as K
from keras.optimizers import Adam
from keras.callbacks import LearningRateScheduler, ModelCheckpoint, EarlyStopping
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import to_categorical
from keras.regularizers import l2, l1

import functools

import types
import copy

from numpy.random import seed
from tensorflow import set_random_seed
import tensorflow as tf

K.clear_session()

session_conf = tf.ConfigProto(intra_op_parallelism_threads=1,
                              inter_op_parallelism_threads=1,
#                               device_count = {'GPU' : 0}
                             )

# The below tf.set_random_seed() will make random number generation
# in the TensorFlow backend have a well-defined initial state.
# For further details, see:
# https://www.tensorflow.org/api_docs/python/tf/set_random_seed

tf.set_random_seed(1234)

sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)
K.set_session(sess)

seed(17)
set_random_seed(17)

sns.set(style="darkgrid")

Using TensorFlow backend.


In [3]:
# os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"   # see issue #152
# os.environ["CUDA_VISIBLE_DEVICES"] = ""

In [4]:
# A helper function for writing predictions to a file
def write_to_submission_file(predicted_labels, out_file,
                             target='target', index_label="session_id"):
    predicted_df = pd.DataFrame(predicted_labels,
                                index = np.arange(1, predicted_labels.shape[0] + 1),
                                columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)


def add_time_features(df, X_sparse):
    hour = df['time1'].apply(lambda ts: ts.hour)
    morning = ((hour >= 8) & (hour <= 11)).astype('int')
    day = ((hour >= 12) & (hour <= 18)).astype('int')
    evening = ((hour >= 19) & (hour <= 23)).astype('int')
    night = ((hour >= 0) & (hour <= 7)).astype('int')

    weekday = df['time1'].apply(lambda ts: ts.weekday())
#    weekend = (weekday <= 4).astype('int')


    min_d = df[times].min(axis=1)
    max_d = df[times].max(axis=1)

    # Calculate sessions' duration in seconds
    seconds = (max_d - min_d) / np.timedelta64(1, 's')

    n_unique_sites = df[df[sites] != 0][sites].apply(
            lambda site: site[site != 0].nunique(),
            axis=1).astype('float64')

    X = hstack([
            X_sparse,
            morning.values.reshape(-1, 1),
            day.values.reshape(-1, 1),
            evening.values.reshape(-1, 1),
            night.values.reshape(-1, 1),
            seconds.values.reshape(-1, 1),
            n_unique_sites.values.reshape(-1, 1)
#            weekday.values.reshape(-1, 1)
#            weekend.values.reshape(-1, 1)
            ])
    return X


In [5]:
# def as_keras_metric(method):
#     @functools.wraps(method)
#     def wrapper(self, args, **kwargs):
#         """ Wrapper for turning tensorflow metrics into keras metrics """
#         value, update_op = method(self, args, **kwargs)
#         K.get_session().run(tf.local_variables_initializer())
#         with tf.control_dependencies([update_op]):
#             value = tf.identity(value)
#         return value
#     return wrapper
# auc_roc = as_keras_metric(tf.metrics.auc)

def auc_roc(y_true, y_pred):
    return tf.py_func(roc_auc_score, (y_true, y_pred), tf.double)

In [6]:
def grid_clf(estimator, param_grid, Xtrain, ytrain, cv, scoring=auc_roc):
    """
    CV using GridSearchCV for given estimator.
    """    
    grid_nn = GridSearchCV(estimator=estimator, scoring=scoring, param_grid=param_grid, cv=cv)
    grid_result_nn = grid_nn.fit(Xtrain, ytrain)
    
    # summarize results
    print("Best: %f using %s" % (grid_result_nn.best_score_, grid_result_nn.best_params_))
    means = grid_result_nn.cv_results_['mean_test_score']
    stds = grid_result_nn.cv_results_['std_test_score']
    params = grid_result_nn.cv_results_['params']
    for mean, stdev, param in zip(means, stds, params):
        print("%f (%f) with: %r" % (mean, stdev, param))
        
    return grid_result_nn
    
def grid_logit_model(
    Xtrain, 
    ytrain, 
#     scoring=auc_roc,
    cv=StratifiedKFold(n_splits=3),
    param_grid=None):
    """
    CV using GridSearchCV for LogisticRegression model.
    """    
    clf = LogisticRegression(
        multi_class='ovr', 
        solver='saga',
        random_state=17, 
        n_jobs=-1)

    if param_grid is None:
        Cs = [1, 0.01]
        param_grid = dict(
            C=Cs,
            multi_class=['ovr']
        )
    
    return grid_clf(clf, param_grid, Xtrain, ytrain, cv, scoring=scoring)

In [7]:
class KerasBatchClassifier(KerasClassifier):
    """
    Add fit_generator to KerasClassifier to convert sparse matrices to numpy arrays before fitting.
    """

    def fit(self, X, y, **kwargs):
        if not issparse(X):
            return super().fit(X, y, **kwargs)

        # taken from keras.wrappers.scikit_learn.KerasClassifier.fit ###################################################
        if self.build_fn is None:
            self.model = self.__call__(**self.filter_sk_params(self.__call__))
        elif not isinstance(self.build_fn, types.FunctionType) and not isinstance(self.build_fn, types.MethodType):
            self.model = self.build_fn(**self.filter_sk_params(self.build_fn.__call__))
        else:
            self.model = self.build_fn(**self.filter_sk_params(self.build_fn))

        loss_name = self.model.loss
        if hasattr(loss_name, '__name__'):
            loss_name = loss_name.__name__
        if loss_name == 'categorical_crossentropy' and len(y.shape) != 2:
            y = to_categorical(y)

        fit_args = copy.deepcopy(self.filter_sk_params(Sequential.fit_generator))
        fit_args.update(kwargs)
        ################################################################################################################

        early_stopping = EarlyStopping(monitor="val_loss", 
                                       patience=3, 
#                                        verbose=1, 
                                       mode="auto")
        model_checkpoint = ModelCheckpoint("results/best_weights.{epoch:02d}.hdf5", 
                                           monitor="val_loss", 
#                                            verbose=1, 
                                           save_best_only=True,
                                           mode="auto")
        
        callbacks = [early_stopping, model_checkpoint]
        fit_args.update({"callbacks": callbacks})

        self.__history = self.model.fit_generator(
            self.batch_generator(X, y, batch_size=self.sk_params["batch_size"]),
#             samples_per_epoch=X.shape[0],
            steps_per_epoch=X.shape[0] // self.sk_params["batch_size"],
            **fit_args)

        return self.__history

#     def score(self, X, y, **kwargs):
#         kwargs = self.filter_sk_params(Sequential.evaluate, kwargs)

#         # sparse to numpy array
#         X = KerasBatchClassifier.sparse_to_array(X)

#         loss_name = self.model.loss
#         if hasattr(loss_name, '__name__'):
#             loss_name = loss_name.__name__
#         if loss_name == 'categorical_crossentropy' and len(y.shape) != 2:
#             y = to_categorical(y)
#         outputs = self.model.evaluate(X, y, **kwargs)
#         if type(outputs) is not list:
#             outputs = [outputs]
#         for name, output in zip(self.model.metrics_names, outputs):
#             if name == 'acc':
#                 return output
#         raise Exception('The model is not configured to compute accuracy. '
#                         'You should pass `metrics=["accuracy"]` to '
#                         'the `model.compile()` method.')

    def predict(self, X, batch_size=128):
        return self.predict_proba(self, X, batch_size=batch_size)[:, 1] >= 0.5
#         pred_generator = KerasBatchClassifier.batch_generator
#         add_part = 1 if X.shape[0] % batch_size != 0 else 0
#         return self.model.predict_generator(
#             red_generator(X, None, batch_size), X.shape[0] // batch_size + add_part)[:, 1] >= 0.5

    def predict_proba(self, X, batch_size=128):
        pred_generator = KerasBatchClassifier.batch_generator
        add_part = 1 if X.shape[0] % batch_size != 0 else 0
        return self.model.predict_generator(
            pred_generator(X, None, batch_size=batch_size), X.shape[0] // batch_size + add_part)

    @staticmethod
    def batch_generator(x, y=None, batch_size=128, predict_mode = False):
        """ batch generator to enable sparse input """
        index = np.arange(x.shape[0])
        start = 0
        hasData = True
        while hasData:
#             if start == 0 and y is not None:
#                 np.random.shuffle(index)
            batch = index[start:start + batch_size]
            if y is not None:
                yield KerasBatchClassifier.sparse_to_array(x[batch]), y[batch]
            else:
                yield KerasBatchClassifier.sparse_to_array(x[batch])
                
            hasData = (start + batch_size) < x.shape[0]
            start += batch_size
            if not predict_mode:
                if start >= x.shape[0]:
                    start = 0
                    
                hasData = True

        if predict_mode:
            remain_part = start % x.shape[0]
            if (start - remain_part) < batch_size:
                start = start - batch_size
                remain_part = x.shape[0] - start
                print('yield last! start={} remain_part={}'.format(start, remain_part))
                batch = index[start:start + remain_part]
                yield KerasBatchClassifier.sparse_to_array(x[batch])

#         while True:
#             for Xy in zip(x, y):
#                 yield (KerasBatchClassifier.sparse_to_array(Xy[0]), Xy[1])

#     def on_epoch_end(self):
#         'Updates indexes after each epoch'
#         self.start = 0
#         if self.shuffle == True:
#             np.random.shuffle(self.indexes)
            
    @staticmethod
    def sparse_to_array(sparse_list):
#         print('sparse_list.shape={}'.format(sparse_list.shape))
        nn = np.expand_dims(sparse_list.toarray(), 2)
#         print('nn.shape={}'.format(nn.shape))
        return nn
#         array_list = []
#         for s in sparse_list:
#             array_list.append(s.toarray().astype(np.int8))
#         return np.expand_dims(np.array(array_list), 1)
#         return np.array(array_list)

    @property
    def history(self):
        return self.__history

In [8]:
# class DataGenerator(keras.utils.Sequence):
#     'Generates data for Keras'
#     def __init__(self, x, y=None, batch_size=32, shuffle=True, predict_mode = False):
#         'Initialization'
#         self.batch_size = batch_size
#         self.y = y
#         self.x = x
#         self.predict_mode = predict_mode
#         self.shuffle = shuffle
#         self.on_epoch_end()

#     def __len__(self):
#         'Denotes the number of batches per epoch'
#         return int(np.floor(len(self.list_IDs) / self.batch_size))

#     def __getitem__(self, index):
#         'Generate one batch of data'
#         # Generate indexes of the batch
#         indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]

#         # Find list of IDs
#         list_IDs_temp = [self.list_IDs[k] for k in indexes]

#         # Generate data
#         X, y = self.__data_generation(list_IDs_temp)

#         return X, y

#     def on_epoch_end(self):
#         'Updates indexes after each epoch'
#         self.indexes = np.arange(len(self.list_IDs))
#         if self.shuffle == True:
#             np.random.shuffle(self.indexes)

#     def __data_generation(self, list_IDs_temp):
#         'Generates data containing batch_size samples' # X : (n_samples, *dim, n_channels)
#         # Initialization
#         X = np.empty((self.batch_size, *self.dim, self.n_channels))
#         y = np.empty((self.batch_size), dtype=int)

#         # Generate data
#         for i, ID in enumerate(list_IDs_temp):
#             # Store sample
#             X[i,] = np.load('data/' + ID + '.npy')

#             # Store class
#             y[i] = self.labels[ID]

#         return X, keras.utils.to_categorical(y, num_classes=self.n_classes)

In [51]:
# def focal_loss(gamma=2., alpha=.25):

#     gamma = float(gamma)
#     alpha = float(alpha)

#     def focal_loss_fixed(y_true, y_pred):
#         """Focal loss for multi-classification
#         FL(p_t)=-alpha(1-p_t)^{gamma}ln(p_t)
#         Notice: y_pred is probability after softmax
#         gradient is d(Fl)/d(p_t) not d(Fl)/d(x) as described in paper
#         d(Fl)/d(p_t) * [p_t(1-p_t)] = d(Fl)/d(x)
#         Focal Loss for Dense Object Detection
#         https://arxiv.org/abs/1708.02002

#         Arguments:
#             y_true {tensor} -- ground truth labels, shape of [batch_size, num_cls]
#             y_pred {tensor} -- model's output, shape of [batch_size, num_cls]

#         Keyword Arguments:
#             gamma {float} -- (default: {2.0})
#             alpha {float} -- (default: {4.0})

#         Returns:
#             [tensor] -- loss.
#         """
#         epsilon = 1.e-5
#         print(y_pred.shape)
#         y_true = tf.convert_to_tensor(y_true, tf.float32)
#         y_pred = tf.convert_to_tensor(y_pred, tf.float32)

#         model_out = tf.add(y_pred, epsilon)
#         ce = tf.multiply(y_true, -tf.log(model_out))
#         weight = tf.multiply(y_true, tf.pow(tf.subtract(1., model_out), gamma))
#         fl = tf.multiply(alpha, tf.multiply(weight, ce))
#         reduced_fl = tf.reduce_max(fl, axis=1)
#         return tf.reduce_mean(reduced_fl)
#     return focal_loss_fixed

# def focal_loss(gamma=2., alpha=.25):
#     def focal_loss_fixed(y_true, y_pred):
#         pt_1 = tf.where(tf.equal(y_true, 1), y_pred, tf.ones_like(y_pred))
#         pt_0 = tf.where(tf.equal(y_true, 0), y_pred, tf.zeros_like(y_pred))
#         return -K.sum(alpha * K.pow(1. - pt_1, gamma) * K.log(pt_1))-K.sum((1-alpha) * K.pow( pt_0, gamma) * K.log(1. - pt_0))

#     return focal_loss_fixed

# def focal_loss(gamma=2, alpha=0.25):
#     def focal_loss_fixed(y_true, y_pred):#with tensorflow
#         eps = 1e-4
#         y_pred=K.clip(y_pred,eps,1.-eps)#improve the stability of the focal loss and see issues 1 for more information
#         pt_1 = tf.where(tf.equal(y_true, 1), y_pred, tf.ones_like(y_pred))
#         pt_0 = tf.where(tf.equal(y_true, 0), y_pred, tf.zeros_like(y_pred))
#         return -K.sum(alpha * K.pow(1. - pt_1, gamma) * K.log(pt_1))-K.sum((1-alpha) * K.pow( pt_0, gamma) * K.log(1. - pt_0))
#     return focal_loss_fixed

In [66]:
class KerasModelCreator():
    def __init__(
        self, 
        feature,
        depth,
        metrics=['accuracy'],
        filters=32, #32
        pool_size=5, # 5
        random_state=17):
        
        self.feature=feature
        self.depth=depth
        self.filters=filters
        self.pool_size=pool_size
        self.random_state=random_state
        self.metrics=metrics
    
    def __call__(
        self, 
        optimizer='adam', 
        init='glorot_uniform', 
        loss='categorical_crossentropy', #'sparse_categorical_crossentropy'
#         metrics=[auc_roc],
        **sk_params):
        
        seed(self.random_state)
        set_random_seed(self.random_state)

        filters=self.filters
        pool_size=self.pool_size
        kernel_size=2
        
        print('feature={} depth={} filters={}'.format(self.feature, self.depth, filters))
        inp = Input(shape=(self.feature, self.depth))
        
        hidden_size = 2
        dropout_rate = 0.1
        regul_coef_dense = 0.01
        
        C = Conv1D(filters=filters, kernel_size=kernel_size, strides=1, kernel_initializer=init,
                   kernel_regularizer=l2(regul_coef_dense),
                   input_shape=(self.feature, self.depth))(inp)
        C11 = Conv1D(filters=filters, kernel_size=kernel_size, strides=1, kernel_initializer=init, padding='same')(C)
        A11 = Activation("sigmoid")(C11)
        C12 = Conv1D(filters=filters, kernel_size=kernel_size, strides=1, kernel_initializer=init, padding='same')(A11)
        S11 = Add()([C12, C])
        A12 = Activation("sigmoid")(S11)
        M11 = MaxPooling1D(pool_size=pool_size, strides=2)(A12)
        
#         C21 = Conv1D(filters=filters, kernel_size=kernel_size, strides=1, kernel_initializer=init, padding='same')(M11)
#         A21 = Activation("relu")(C21)
#         C22 = Conv1D(filters=filters, kernel_size=kernel_size, strides=1, kernel_initializer=init, padding='same')(A21)
#         S21 = Add()([C22, M11])
#         A22 = Activation("relu")(S21)
#         M21 = MaxPooling1D(pool_size=pool_size, strides=2)(A22)
        
#         C31 = Conv1D(filters=filters, kernel_size=kernel_size, strides=1, kernel_initializer=init, padding='same')(M21)
#         A31 = Activation("relu")(C31)
#         C32 = Conv1D(filters=filters, kernel_size=kernel_size, strides=1, kernel_initializer=init, padding='same')(A31)
#         S31 = Add()([C32, M21])
#         A32 = Activation("relu")(S31)
#         M31 = MaxPooling1D(pool_size=pool_size, strides=2)(A32)

        output = Dropout(rate=dropout_rate)(M11)
        
#         output = Bidirectional(
#         output = \
#              LSTM(hidden_size, 
#                    activation='tanh',
#                    dropout=dropout_rate,
#                    return_sequences=False,
#              )(output)#(inp)
    
#         output = GRU(8, activation="relu")(output)
    
#         output = Dropout(rate=dropout_rate)(output)
#         output = Dense(pool_size, 
#                        activation=None,
#                        kernel_regularizer=l2(regul_coef_dense)
#                       )(output)
        output = BatchNormalization()(output)
        output = Activation('sigmoid')(output)
        output = Dropout(rate=dropout_rate)(output)
#         output = Dense(pool_size, 
#                        activation=None,
#                        kernel_regularizer=l2(regul_coef_dense))(output)
#         output = BatchNormalization()(output)
#         output = TimeDistributed(Dense(pool_size))(output)


#         F1 = output # Flatten()(output)
        F1 = Flatten()(output)

        D1 = Dense(filters, kernel_initializer=init)(F1)
        A6 = Activation("sigmoid")(D1)
        D2 = Dense(filters, kernel_initializer=init)(A6)
        D3 = Dense(pool_size, kernel_initializer=init)(D2)
#         D3 = Dense(kernel_initializer=init)(D2)
        A7 = Activation('sigmoid')(D3) # Softmax()(D3)
        act_output = A7 # Reshape((2, -1))(A7)
        model = Model(inputs=inp, outputs=act_output)
        
# #         C = inp  # Conv1D(filters=filters, kernel_size=kernel_size, strides=1, kernel_initializer=init)(inp)
#         C = Conv1D(filters=filters, kernel_size=kernel_size, strides=1, kernel_initializer=init,
#                    kernel_regularizer=regularizers.l2(0.1),
#                    input_shape=(self.feature, self.depth))(inp)

#         C11 = Conv1D(filters=filters, kernel_size=kernel_size, strides=1, kernel_initializer=init, padding='same')(C)
#         A11 = Activation("relu")(C11)
#         C12 = Conv1D(filters=filters, kernel_size=kernel_size, strides=1, kernel_initializer=init, padding='same')(A11)
#         S11 = Add()([C12, C])
#         A12 = Activation("relu")(S11)
#         M11 = MaxPooling1D(pool_size=pool_size, strides=2)(A12)


#         C21 = Conv1D(filters=filters, kernel_size=kernel_size, strides=1, kernel_initializer=init, padding='same')(M11)
#         A21 = Activation("relu")(C21)
#         C22 = Conv1D(filters=filters, kernel_size=kernel_size, strides=1, kernel_initializer=init, padding='same')(A21)
#         S21 = Add()([C22, M11])
#         A22 = Activation("relu")(S21)
#         M21 = MaxPooling1D(pool_size=pool_size, strides=2)(A22)


#         C31 = Conv1D(filters=filters, kernel_size=kernel_size, strides=1, kernel_initializer=init, padding='same')(M21)
#         A31 = Activation("relu")(C31)
#         C32 = Conv1D(filters=filters, kernel_size=kernel_size, strides=1, kernel_initializer=init, padding='same')(A31)
#         S31 = Add()([C32, M21])
#         A32 = Activation("relu")(S31)
#         M31 = MaxPooling1D(pool_size=pool_size, strides=2)(A32)


#         C41 = Conv1D(filters=filters, kernel_size=kernel_size, strides=1, kernel_initializer=init, padding='same')(M31)
#         A41 = Activation("relu")(C41)
#         C42 = Conv1D(filters=filters, kernel_size=kernel_size, strides=1, kernel_initializer=init, padding='same')(A41)
#         S41 = Add()([C42, M31])
#         A42 = Activation("relu")(S41)
#         M41 = MaxPooling1D(pool_size=pool_size, strides=2)(A42)


#         C51 = Conv1D(filters=filters, kernel_size=kernel_size, strides=1, kernel_initializer=init, padding='same')(M41)
#         A51 = Activation("relu")(C51)
#         C52 = Conv1D(filters=filters, kernel_size=kernel_size, strides=1, kernel_initializer=init, padding='same')(A51)
#         S51 = Add()([C52, M41])
#         A52 = Activation("relu")(S51)
#         M51 = MaxPooling1D(pool_size=pool_size, strides=2)(A52)

#         F1 = Flatten()(M21)#Flatten()(C)
# #         F1 = Flatten()(M51)

#         D1 = Dense(filters, kernel_initializer=init)(F1)
#         A6 = Activation("relu")(D1)
#         D2 = Dense(filters, kernel_initializer=init)(A6)
#         D3 = Dense(pool_size, kernel_initializer=init)(D2)
#         A7 = Softmax()(D3)

#         model = Model(inputs=inp, outputs=A7)


        model.compile(loss=loss, optimizer=optimizer, 
#                       metrics =[auc_roc],
                      metrics=self.metrics,
                      **sk_params)
        model.summary()
        return model

In [11]:
def show_clf_results(
    clf, 
    Xtest, 
    ytest, 
    ytrain,
    additional_title='',
    show_metrics=True,
    **kwargs):
    
    ypred = clf.predict_proba(Xtest, **kwargs)
    
    if show_metrics:
        print('roc_auc: {}'.format(roc_auc_score(y_score=ypred, y_true=ytest)))
        print('accuracy: {}'.format(accuracy_score(y_pred=ypred>=0.5, y_true=ytest)))
        print('precision: {}'.format(precision_score(y_pred=ypred>=0.5, y_true=ytest, average='macro')))    
        
    ypred

In [12]:
def nn_batch_generator(X_data, y_data, batch_size=100):
    samples_per_epoch = X_data.shape[0]
    number_of_batches = samples_per_epoch/batch_size
    counter=0
    index = np.arange(np.shape(y_data)[0])
    while 1:
        index_batch = index[batch_size*counter:batch_size*(counter+1)]
        X_batch = X_data[index_batch,:].todense()
        y_batch = y_data[index_batch]
        counter += 1
        yield np.array(X_batch),y_batch
        if (counter > number_of_batches):
            counter=0

In [13]:
def train_nn_model(
    Xtrain, 
    ytrain, 
    Xtest, 
    ytest,
    metrics=[auc_roc],
    batch_size=100,
    epochs=15,
    filters=32,
    pool_size=5,
    additional_title='[NN]',
    show_metrics=True,
    verbose=1
    ):
    
    seed(17)
    set_random_seed(17)
    
    n_obs, feature = Xtrain.shape
    depth = 1
    
    model_creator = KerasModelCreator(
        metrics=metrics,
        feature=feature, 
        depth=depth,
        filters=filters,
        pool_size=pool_size,
        random_state=17)
    
    model_nn1 = KerasBatchClassifier(
        build_fn=model_creator, 
        epochs=epochs, 
        batch_size=batch_size, 
        verbose=verbose)
    
    
    history = model_nn1.fit(
#         nn_batch_generator(Xtrain, ytrain, batch_size=batch_size),
#         samples_per_epoch=Xtrain.shape[0],
        Xtrain,
        ytrain, 
        epochs=epochs,
#         batch_size=batch_size, 
        verbose=verbose, 
    #                     validation_data=(X_test, y_test_nn), 
    #                     callbacks=[lrate]
                           )
    return model_nn1, history
#     return show_clf_results(
#             model_nn1, 
#             Xtest, 
#             ytest, 
#             ytrain, 
#             additional_title,
#             show_metrics=show_metrics
#         ), model_nn1, history

In [14]:
def grid_nn_model(   
    Xtrain, 
    ytrain, 
    scoring=auc_roc,
    batch_size=100,
    epochs=15,
    filters=32,
    pool_size=5,
    cv=StratifiedKFold(n_splits=3),
    param_grid=None):
    """
    CV using GridSearchCV for NN model.
    """ 
    n_obs, feature, depth = Xtrain.shape
    model_creator = KerasModelCreator(
        feature=feature, 
        depth=depth,
        filters=filters,
        pool_size=pool_size,
        random_state=17)

    clf = KerasBatchClassifier(build_fn=model_creator, 
                          epochs=2, 
                          batch_size=batch_size, 
                          verbose=1)
    if param_grid is None:
        optimizers = ['rmsprop', 'adam']
        init = ['glorot_uniform', 'normal', 'uniform']
        metrics=[[auc_roc]]#'['categorical_accuracy', 'accuracy']
        epochs = [1, 2]
        batches = [50, 100]
        param_grid = dict(
            optimizer=optimizers, 
            epochs=epochs, 
            batch_size=batches, 
            init=init,
#             metrics=metrics
        )
    
    return grid_clf(clf, param_grid, Xtrain, ytrain, cv, scoring=scoring)


In [15]:
train_df = pd.read_csv('../../data/websites_train_sessions.csv',
                       index_col='session_id')
test_df = pd.read_csv('../../data/websites_test_sessions.csv',
                      index_col='session_id')

# Convert time1, ..., time10 columns to datetime type
times = ['time%s' % i for i in range(1, 11)]
sites = ['site%s' % i for i in range(1, 11)]
train_df[times] = train_df[times].apply(pd.to_datetime)
test_df[times] = test_df[times].apply(pd.to_datetime)

# Sort the data by time
train_df = train_df.sort_values(by='time1')

# Look at the first rows of the training set
train_df.head()


Unnamed: 0_level_0,site1,time1,site2,time2,site3,time3,site4,time4,site5,time5,...,time6,site7,time7,site8,time8,site9,time9,site10,time10,target
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
21669,56,2013-01-12 08:05:57,55.0,2013-01-12 08:05:57,,NaT,,NaT,,NaT,...,NaT,,NaT,,NaT,,NaT,,NaT,0
54843,56,2013-01-12 08:37:23,55.0,2013-01-12 08:37:23,56.0,2013-01-12 09:07:07,55.0,2013-01-12 09:07:09,,NaT,...,NaT,,NaT,,NaT,,NaT,,NaT,0
77292,946,2013-01-12 08:50:13,946.0,2013-01-12 08:50:14,951.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:16,...,2013-01-12 08:50:16,948.0,2013-01-12 08:50:16,784.0,2013-01-12 08:50:16,949.0,2013-01-12 08:50:17,946.0,2013-01-12 08:50:17,0
114021,945,2013-01-12 08:50:17,948.0,2013-01-12 08:50:17,949.0,2013-01-12 08:50:18,948.0,2013-01-12 08:50:18,945.0,2013-01-12 08:50:18,...,2013-01-12 08:50:18,947.0,2013-01-12 08:50:19,945.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:20,0
146670,947,2013-01-12 08:50:20,950.0,2013-01-12 08:50:20,948.0,2013-01-12 08:50:20,947.0,2013-01-12 08:50:21,950.0,2013-01-12 08:50:21,...,2013-01-12 08:50:21,946.0,2013-01-12 08:50:21,951.0,2013-01-12 08:50:22,946.0,2013-01-12 08:50:22,947.0,2013-01-12 08:50:22,0


In [16]:
# Convert time1, ..., time10 columns to datetime type
times = ['time%s' % i for i in range(1, 11)]
sites = ['site%s' % i for i in range(1, 11)]
train_df[times] = train_df[times].apply(pd.to_datetime)
test_df[times] = test_df[times].apply(pd.to_datetime)

# Sort the data by time
train_df = train_df.sort_values(by='time1')

# Look at the first rows of the training set
train_df.head()

Unnamed: 0_level_0,site1,time1,site2,time2,site3,time3,site4,time4,site5,time5,...,time6,site7,time7,site8,time8,site9,time9,site10,time10,target
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
21669,56,2013-01-12 08:05:57,55.0,2013-01-12 08:05:57,,NaT,,NaT,,NaT,...,NaT,,NaT,,NaT,,NaT,,NaT,0
54843,56,2013-01-12 08:37:23,55.0,2013-01-12 08:37:23,56.0,2013-01-12 09:07:07,55.0,2013-01-12 09:07:09,,NaT,...,NaT,,NaT,,NaT,,NaT,,NaT,0
77292,946,2013-01-12 08:50:13,946.0,2013-01-12 08:50:14,951.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:16,...,2013-01-12 08:50:16,948.0,2013-01-12 08:50:16,784.0,2013-01-12 08:50:16,949.0,2013-01-12 08:50:17,946.0,2013-01-12 08:50:17,0
114021,945,2013-01-12 08:50:17,948.0,2013-01-12 08:50:17,949.0,2013-01-12 08:50:18,948.0,2013-01-12 08:50:18,945.0,2013-01-12 08:50:18,...,2013-01-12 08:50:18,947.0,2013-01-12 08:50:19,945.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:20,0
146670,947,2013-01-12 08:50:20,950.0,2013-01-12 08:50:20,948.0,2013-01-12 08:50:20,947.0,2013-01-12 08:50:21,950.0,2013-01-12 08:50:21,...,2013-01-12 08:50:21,946.0,2013-01-12 08:50:21,951.0,2013-01-12 08:50:22,946.0,2013-01-12 08:50:22,947.0,2013-01-12 08:50:22,0


In [17]:
with open('../../data/site_dic.pkl', "rb") as inp_file:
    site_dic = pickle.load(inp_file)

inv_site_dic = {v: k for k, v in site_dic.items()}
# inv_site_dic.update({0: ''})

In [18]:
train_df[sites] = train_df[sites].fillna(0)
test_df[sites] = test_df[sites].fillna(0)

In [19]:
def AAA():
    X_train = train_df[sites].apply(
            lambda x: " ".join(
                    [inv_site_dic[a] for a in x.values if a != 0]), axis=1)

    X_train = X_train.apply(lambda x: x.replace('.', ' '))

    X_test = test_df[sites].apply(
            lambda x: " ".join(
                    [inv_site_dic[a] for a in x.values if a != 0]), axis=1)

    X_test = X_test.apply(lambda x: x.replace('.', ' '))

    y_train = train_df['target'].astype('int')


    pipeline = Pipeline(
            [("vectorize",
              TfidfVectorizer(ngram_range=(1, 3),
                              max_features=100000)),  # 100000!!!!
            ("tfidf", TfidfTransformer())])

    pipeline.fit(X_train.ravel(), y_train)


    X_train = pipeline.transform(X_train.ravel())
    X_test = pipeline.transform(X_test.ravel())


    print(type(X_train))    # scipy.sparse.csr.csr_matrix

    print(X_train.shape)   # (253561, 250000)


    X_train = add_time_features(train_df, X_train)
    X_test = add_time_features(test_df, X_test)


    print(X_train.shape, X_test.shape)  #  ((253561, 250004), (82797, 250004))

    time_split = TimeSeriesSplit(n_splits=12)
    logit = LogisticRegression(C=1, random_state=17)

    # c_values = np.logspace(-2,2, 10)
    # c_values = np.arange(0,5.,step=0.5)
    # c_values = np.concatenate((np.arange(0.1,1,0.1), np.arange(1,5,0.5)))
    c_values = np.concatenate((np.arange(0.5, 2,step=0.5), np.arange(2, 3.6, 0.1)))

    logit_grid_searcher = GridSearchCV(
            estimator=logit,
            param_grid={'C': c_values},
            scoring='auc',
            n_jobs=-1,
            cv=time_split,
            verbose=10)


    logit_grid_searcher.fit(X_train, y_train)


    print(logit_grid_searcher.best_score_, logit_grid_searcher.best_params_)
    # (0.8884620990228279, {'C': 3.5000000000000013})
    # weekend+weekday 0.876261489918 {'C': 3.5000000000000013}
    # weekday  0.876293321626 {'C': 3.5000000000000013}
    # def : 100000   0.890742942071 {'C': 3.5000000000000013}
    #                0.890858325471 {'C': 4.200000000000002}
    #      +seconds+n_unique_sites  0.893673781638 {'C': 3.4000000000000012}

    logit_test_pred = logit_grid_searcher.predict_proba(X_test)[:, 1]
    write_to_submission_file(logit_test_pred, 'submit.csv')


In [20]:
# Special transformer to save output shape
class ShapeSaver(BaseEstimator, TransformerMixin):
    def transform(self, X):
        self.shape = X.shape
        return X

    def fit(self, X, y=None, **fit_params):
        return self


In [21]:
#####################################
## Helper functions that extract different data
#####################################

# Return sites columns as a single string
# This string can be supplied into CountVectorizer or TfidfVectorizer

def extract_sites_as_string(X):
    #return X[sites].astype('str').apply(' '.join, axis=1)
    return X['sites_str']



# Year-month feature from A4
def feature_year_month(X):
    return pd.DataFrame(X['time1'].dt.year * 100 + X['time1'].dt.month)

def feature_year_month_log1p(X):
    return pd.DataFrame(np.log1p(X['time1'].dt.year * 100 + X['time1'].dt.month))

# yearfeature from A4
def feature_year(X):
    return pd.DataFrame(X['time1'].dt.year)

# Hour feature from A4
def feature_hour(X):
    return pd.DataFrame(X['time1'].dt.hour)

# Hour feature from A4
def feature_hour_log(X):
    return np.log1p(pd.DataFrame(X['time1'].dt.hour))


# Month
def feature_month(X):
    return pd.DataFrame(X['time1'].dt.month)

# Weekday
def feature_weekday(X):
    return pd.DataFrame(X['time1'].dt.weekday)

# Is day feature from A4
def feature_is_daytime(X):
    return pd.DataFrame( (X['time1'].dt.hour >= 12) & (X['time1'].dt.hour <= 18))

# Is evening feature from A4
def feature_is_evening(X):
    return pd.DataFrame( (X['time1'].dt.hour >= 19) & (X['time1'].dt.hour <= 23))

# Is morning feature from A4
def feature_is_morning(X):
    return pd.DataFrame(X['time1'].dt.hour <= 11)

# Long Session length feature from A4
def feature_is_long_session(X):
    X['session_end_time'] = X[times].max(axis=1)
    session_duration = (X['session_end_time'] - X['time1']).astype('timedelta64[s]')
#    q = session_duration.quantile([0.1, 0.90]).values
    X['long_session_duration'] = 0
    X[session_duration < 10]['long_session_duration'] = 1
    X[session_duration < 20]['long_session_duration'] = 2
    X[session_duration < 100]['long_session_duration'] = 3
    X[session_duration < 500]['long_session_duration'] = 4
    X[session_duration < 1000]['long_session_duration'] = 5
#    X[(session_duration > q[1]) & (session_duration <= q[2])]['long_session_duration'] = 2
#    X[(session_duration > q[2]) & (session_duration <= q[3])]['long_session_duration'] = 3
#    X[(session_duration > q[3]) & (session_duration <= q[4])]['long_session_duration'] = 4
#    X[session_duration > q[1]]['long_session_duration'] = 2
    return X[['long_session_duration']]

# Session length feature from A4
def feature_session_len(X):
    X['session_end_time'] = X[times].max(axis=1)
    X['session_duration'] = (X['session_end_time'] - X['time1']).astype('timedelta64[s]')
    return X[['session_duration']]

# uniq sites per session
def feature_uniq_sites(X):
    X['n_unique_sites'] = X[X[sites] != 0][sites].apply(
            lambda site: site[site != 0].nunique(), axis=1).astype('float64')

    return X[['n_unique_sites']]


In [22]:
transform_pipeline = Pipeline([
    ('features', FeatureUnion([
        # List of features goes here:
#        ('year_month_val', Pipeline([
#            ('extract', FunctionTransformer(feature_year_month, validate=False)),
#            ('scale', StandardScaler()),
#            ('shape', ShapeSaver())
#        ])),
        ('session_len', Pipeline([
            ('extract', FunctionTransformer(feature_session_len, validate=False)),
            ('scale', StandardScaler()),
            ('shape', ShapeSaver())
        ])),
        ('weekday_cat', Pipeline([
            ('extract', FunctionTransformer(feature_weekday, validate=False)),
            ('ohe', OneHotEncoder()),
            ('shape', ShapeSaver())
        ])),
#        ('hour_val', Pipeline([
#            ('extract', FunctionTransformer(feature_hour, validate=False)),
##            ('scale', StandardScaler()),
#            ('ohe', OneHotEncoder()),
#            ('shape', ShapeSaver())
#         ])),
        ('hour_val_log1p', Pipeline([
            ('extract', FunctionTransformer(feature_hour_log, validate=False)),
            ('scale', StandardScaler()),
            ('shape', ShapeSaver())
         ])),
        ('hour_cat', Pipeline([
            ('extract', FunctionTransformer(feature_hour, validate=False)),
            ('ohe', OneHotEncoder()),
            ('shape', ShapeSaver())
         ])),
        ('month_cat', Pipeline([
            ('extract', FunctionTransformer(feature_month, validate=False)),
            ('ohe', OneHotEncoder()),
            ('shape', ShapeSaver())
         ])),
        ('is_morning', Pipeline([
            ('extract', FunctionTransformer(feature_is_morning, validate=False)),
            ('shape', ShapeSaver())
         ])),
        ('is_daytime', Pipeline([
            ('extract', FunctionTransformer(feature_is_daytime, validate=False)),
            ('shape', ShapeSaver())
         ])),
        ('is_evening', Pipeline([
            ('extract', FunctionTransformer(feature_is_evening, validate=False)),
            ('shape', ShapeSaver())
         ])),
        ('is_long_session', Pipeline([
            ('extract', FunctionTransformer(feature_is_long_session, validate=False)),
            ('ohe', OneHotEncoder()),
            ('shape', ShapeSaver())
         ])),
#        ('feature_uniq_sites', Pipeline([
#            ('extract', FunctionTransformer(feature_uniq_sites, validate=False)),
#            ('ohe', OneHotEncoder()),
#            ('shape', ShapeSaver())
#         ])),
        ('year', Pipeline([
            ('extract', FunctionTransformer(feature_year, validate=False)),
            ('ohe', OneHotEncoder()),
            ('shape', ShapeSaver())
         ])),
        ('sites_tfidf', Pipeline([
            ('extract', FunctionTransformer(extract_sites_as_string, validate=False)),
            ('count', TfidfVectorizer(token_pattern=r'(?u)\b\w+\b',
                                      ngram_range=(1, 3),
                                      max_features=50000)),  # 100000!
            ("tfidf", TfidfTransformer()),
            ('shape', ShapeSaver())
        ])),
        # Add more features here :)
        # ...
    ]))
])


In [23]:
# Run preprocessing on full data
x_train_new = train_df.iloc[:, :-1]
x_train_new['sites_str'] = train_df[sites].apply(
        lambda x: " ".join(
                [inv_site_dic[a] for a in x.values if a != 0]), axis=1)

x_train_new['sites_str'] = x_train_new['sites_str'].apply(lambda x: x.replace('.', ' '))

x_test_new = test_df.iloc[:, :]
x_test_new['sites_str'] = test_df[sites].apply(
        lambda x: " ".join(
                [inv_site_dic[a] for a in x.values if a != 0]), axis=1)

x_test_new['sites_str'] = x_test_new['sites_str'].apply(lambda x: x.replace('.', ' '))

transformed_train_df = transform_pipeline.fit_transform(x_train_new)
transformed_test_df = transform_pipeline.transform(x_test_new)

X_train_new = transformed_train_df
y_train_new = train_df['target']
X_test_new = transformed_test_df

print(transformed_train_df.shape, transformed_test_df.shape)


(253561, 50044) (82797, 50044)


## NN

In [24]:
# model.fit_generator(generator=batch_generator(X_train_sparse, Y_train, batch_size),
#                     nb_epoch=nb_epoch, 
#                     samples_per_epoch=X_train_sparse.shape[0])

In [68]:
EPOCHS = 1
BATCH_SIZE = 100
FILTERS = 20

In [26]:
transformed_train_df.shape, type(transformed_train_df)

((253561, 50044), scipy.sparse.csr.csr_matrix)

In [27]:
# max_train_data = 10000
X_train_nn1 = X_train_new#[:max_train_data]# np.expand_dims(transformed_train_df, 2)
X_test_nn1 = X_test_new# np.expand_dims(transformed_test_df, 2)
y_train_nn1 = y_train_new#[:max_train_data]

In [28]:
X_train_nn1.shape, y_train_nn1.shape, X_test_nn1.shape, type(X_test_nn1)

((253561, 50044), (253561,), (82797, 50044), scipy.sparse.csr.csr_matrix)

In [29]:
X_train_part, X_valid, y_train_part, y_valid = \
        train_test_split(
                X_train_nn1,
                y_train_nn1,
                test_size=0.20,
                random_state=17,
                stratify=y_train_nn1)

X_train_part.shape, X_valid.shape

((202848, 50044), (50713, 50044))

In [69]:
%%time
# with tf.device('/gpu:0'):
    # y_pred_nn1, 
model_nn1, history = train_nn_model(
    X_train_part,
    y_train_part,
    X_valid, 
    y_valid, 
    metrics=['accuracy'],
    additional_title='[NN]',
    pool_size=2,
    show_metrics=True, 
    filters=FILTERS,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    verbose=1)

# with open('no_ym_fg_nn_model.pkl', 'wb') as f:
#     pickle.dump(model_nn1, f)
#     pickle.dump(history, f)

feature=50044 depth=1 filters=20
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_14 (InputLayer)           (None, 50044, 1)     0                                            
__________________________________________________________________________________________________
conv1d_40 (Conv1D)              (None, 50043, 20)    60          input_14[0][0]                   
__________________________________________________________________________________________________
conv1d_41 (Conv1D)              (None, 50043, 20)    820         conv1d_40[0][0]                  
__________________________________________________________________________________________________
activation_57 (Activation)      (None, 50043, 20)    0           conv1d_41[0][0]                  
____________________________________________________________________________

ResourceExhaustedError: OOM when allocating tensor with shape[20] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[{{node batch_normalization_14/moments/mean}} = Mean[T=DT_FLOAT, Tidx=DT_INT32, _class=["loc:@train...ad/truediv"], keep_dims=true, _device="/job:localhost/replica:0/task:0/device:GPU:0"](dropout_27/cond/Merge, training_13/Adam/gradients/batch_normalization_14/moments/variance_grad/mod)]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.


In [38]:
with open('no_ym_fg_nn_model.pkl', 'wb') as f:
    pickle.dump(model_nn1, f)
    pickle.dump(history, f)

PicklingError: Can't pickle <function auc at 0x00000198C3556400>: it's not the same object as tensorflow.python.ops.metrics_impl.auc

In [60]:
# y_pred_nn__ = show_clf_results(
#             model_nn1, 
#             X_valid, 
#             y_valid, 
#             y_train_part, 
#             additional_title='[NN]',
#             show_metrics=True,
#             batch_size=BATCH_SIZE
#         )

BATCH_SIZE, FILTERS, EPOCHS

(100, 2, 1)

In [61]:
# y_pred_nn__ = show_clf_results(
#             model_nn1, 
#             X_valid, 
#             y_valid, 
#             y_train_part, 
#             additional_title='[NN]',
#             show_metrics=True,
#             batch_size=BATCH_SIZE
#         )
# y_pred = model_nn1.predict(X_valid)
# X_valid.shape, y_pred.shape

pred_generator = KerasBatchClassifier.batch_generator(X_valid, None, BATCH_SIZE, True)
add_part = 1 if X_valid.shape[0] % BATCH_SIZE != 0 else 0
y_pred_nn1 = model_nn1.model.predict_generator(pred_generator, X_valid.shape[0] // BATCH_SIZE + add_part)

In [62]:
X_valid.shape, y_pred_nn1.shape  # 1000 => 127909, 500 ==> 64000

((50713, 50044), (50713, 2))

In [65]:
(y_pred_nn1[:, 1] < 0.1).sum()

50713

In [64]:
print('roc_auc: {}'.format(roc_auc_score(y_score=y_pred_nn1[:, 1], y_true=y_valid)))
print('accuracy: {}'.format(accuracy_score(y_pred=y_pred_nn1[:, 1]>=0.5, y_true=y_valid)))
print('precision: {}'.format(precision_score(y_pred=y_pred_nn1[:, 1]>=0.5, y_true=y_valid, average='macro')))  

roc_auc: 0.5
accuracy: 0.9909490663143573
precision: 0.49547453315717865


  'precision', 'predicted', average, warn_for)


In [None]:
ypred_ = model_nn1.predict_proba(X_valid, BATCH_SIZE)
    

print('roc_auc: {}'.format(roc_auc_score(y_score=ypred_, y_true=y_valid)))
print('accuracy: {}'.format(accuracy_score(y_pred=ypred_, y_true=y_valid)))
# print('precision: {}'.format(precision_score(y_pred=ypred_, y_true=y_valid, average='macro')))    

ypred_
# print('roc_auc: {}'.format(roc_auc_score(y_score=y_pred_nn1[:,0]>=0.1, y_true=y_valid)))

## Logit

In [12]:
# time_split = TimeSeriesSplit(n_splits=12)

# logit = LogisticRegression(C=1, random_state=17)

# # c_values = np.logspace(-2,2, 10)
# # c_values = np.arange(0,5.,step=0.5)
# # c_values = np.concatenate((np.arange(0.1,1,0.1), np.arange(1,5,0.5)))
# c_values = np.concatenate((np.arange(0.5, 2,step=0.5), np.arange(2, 3.6, 0.1)))


# clf = LogisticRegression(C=1, random_state=17)  # RandomForestClassifier(random_state=17)

# tree_params = {
#         'max_depth': [2, 5, 10],
#         'max_features': [3, 5, 20]}

# #c_values = np.logspace(-4, 10, 40)
# logit_params = {
#         'C': [0.1, 0.05, 0.15, 2.0309, 3.5],
#         'solver': ['lbfgs']#, 'sag', 'saga'],
# #        'penalty' : ['l1', 'l2']
#         },

# clf_grid_searcher = GridSearchCV(
#         estimator=clf,
#         param_grid=logit_params,
#         scoring='roc_curve',
#         n_jobs=-1,
#         cv=time_split,
#         verbose=10)


# clf_grid_searcher.fit(X_train_new, y_train_new)
# print(clf_grid_searcher.score(X_train_new, y_train_new))
# print(clf_grid_searcher.best_score_, clf_grid_searcher.best_params_)


Fitting 12 folds for each of 5 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    8.0s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   18.0s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   25.8s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   35.2s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   51.0s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  1.8min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  1.8min finished


0.9931947218095609
0.9104206626981047 {'C': 2.0309, 'solver': 'lbfgs'}


In [15]:
%%time

# -- RandomForestClassifier :
#all : 0.868859388644 {'max_depth': 2, 'max_features': 3}
#- day_time - eve -long_sess: 0.866209364039 {'max_depth': 2, 'max_features': 5} \
#               --    -sess_len    0.85821693632 {'max_depth': 10, 'max_features': 5} \
# - year_month  0.881829021147 {'max_depth': 5, 'max_features': 10} / !!!!!!
# - week_day    0.863236779933 {'max_depth': 2, 'max_features': 5}  \
# - hour_Val    0.837083132261 {'max_depth': 10, 'max_features': 3}  \
# - hour_cat    0.859024053379 {'max_depth': 5, 'max_features': 3}  \
# - mon_cat     0.871291662775 {'max_depth': 10, 'max_features': 3}   / !!!!!!
# - is_morning  0.867803370795 {'max_depth': 10, 'max_features': 5} \
# - is_daytime  0.85062178331 {'max_depth': 5, 'max_features': 3}  \
# - is_evening  0.865505228744 {'max_depth': 10, 'max_features': 5}  \
# - is_long_ses 0.868926633714 {'max_depth': 10, 'max_features': 5}  / !!!!


# -- LogisticRegression :
#all : 0.840667160623 {'C': 1.0}
#- day_time - eve -long_sess:
#               --    -sess_len
# - year_month     0.856734589393 {'C': 1}  / !!!!!
# - week_day       0.838524546676 {'C': 0.15} \
# - hour_Val       0.840135383489 {'C': 1} \ ~=
# - hour_cat       0.78535407395 {'C': 0.15} \\\
# + hour_val(ohe): 0.842485256959 {'C': 1} / !!!!
# - hour_cat, + hour_val(ohe) 0.840135383489 {'C': 1} \
# - mon_cat        0.832168325842 {'C': 0.05}  \
# - is_morning     0.838739921881 {'C': 1}   \
# - is_daytime     0.84074001553 {'C': 1}  / !!!!
# - is_evening     0.840012789159 {'C': 1}  \
# - is_long_ses    0.835454317425 {'C': 1}  \
# all 5quantiles  : 0.840667160623 {'C': 1.0} -
# all 2quantiles  : 0.840667160623 {'C': 1} -
# all 2quantiles (0.1, 0.9)  : 0.840667160623 {'C': 1} -
# +uniq_sites(ohe):0.844042953365 {'C': 1}  / !!!!
# +uniq_sites(sc): 0.840681126851 {'C': 1} / !
# +uniq_sites + hour_val(ohe) (+hour_cat): 0.84578279885 {'C': 1} / !!!! ???????????
# -hour_cat +uniq_sites + hour_val(ohe):  0.843802942673 {'C': 1} / !!!
# +hour_cat +uniq_sites - hour_val(ohe):  0.843802942673 {'C': 1} / !!!
# -- +year :        0.845713208798 {'C': 1} / !!!!!
# -hour +log1p(hour)     0.845895327182 {'C': 1} / !!!!!
#   -- -hour_cat         0.786349908391 {'C': 0.15} \ ---
#   -- -year_mon         0.854384041716 {'C': 1} / !!!
#0.993002559728
#0.909865994457 {'C': 2.0309176209047348, 'random_state': 17, 'solver': 'lbfgs'}
# 0.910397293314 {'C': 2.0309, 'solver': 'lbfgs'}



# + Tfidf:          0.907521078033 {'C': 3.5}
# + Tfidf+ transformer:  0.90336907447 {'C': 3.5}
#                        0.903462223502 {'C': 4.6415888336127775}
#  -hour +log1p(hour) +year -year_month  0.908940245543 {'C': 3.5} (0.90840662684 {'C': 3.5})
#      --      n_iniq :                 -0.85375820649 {'C': 1}


# clf = LogisticRegression(
#         random_state=17,
#         **clf_grid_searcher.best_params_)
# clf.fit(X_train_new, y_train_new)
# clf.score(X_train_new, y_train_new)
# 0.99544488308533252

Wall time: 7.33 s


### Submission

In [40]:
#transform_pipeline.steps[0][1].transformer_list[2][1].steps[1][1]
#feature_names = [f[0] for f in transform_pipeline.steps[0][1].transformer_list]
#feat_importances = pd.Series(clf.feature_importances_, index=feature_names)
#feat_importances.nlargest(15).plot(kind='barh')


# (0.8884620990228279, {'C': 3.5000000000000013})
# weekend+weekday 0.876261489918 {'C': 3.5000000000000013}
# weekday  0.876293321626 {'C': 3.5000000000000013}
# def : 100000   0.890742942071 {'C': 3.5000000000000013}
#                0.890858325471 {'C': 4.200000000000002}
#      +seconds+n_unique_sites  0.893673781638 {'C': 3.4000000000000012}


# max_train_data = 10000

pred_test_generator = KerasBatchClassifier.batch_generator(X_test_nn1, None, BATCH_SIZE, True)
add_part = 1 if X_test_nn1.shape[0] % BATCH_SIZE != 0 else 0
nn_test_pred = model_nn1.model.predict_generator(
    pred_test_generator, 
    X_test_nn1.shape[0] // BATCH_SIZE + add_part)

In [41]:
nn_test_pred[:, 1]

array([3.71030625e-03, 1.86632933e-05, 1.02668775e-04, ...,
       4.98598616e-04, 8.60656837e-06, 2.77944288e-04], dtype=float32)

In [42]:
write_to_submission_file(nn_test_pred[:, 1], 'no_ym_fg_nn2.csv')
#  ==> 0.87874
# logit_test_pred = clf.predict_proba(transformed_test_df)[:, 1]
# write_to_submission_file(logit_test_pred, 'no_ym_fg_.csv')

In [None]:
# ==>  0.94952