In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm, tqdm_notebook
import time

import os
print(os.listdir("../input"))
import gc


# Any results you write to the current directory are saved as output.
import tensorflow as tf
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from keras import layers
from keras import backend as K
from keras import regularizers
from keras.constraints import max_norm
from keras.models import Sequential
from keras.callbacks import ModelCheckpoint, LearningRateScheduler, EarlyStopping, ReduceLROnPlateau
from keras.models import load_model
from keras.models import Model
from keras.initializers import glorot_uniform
from keras.layers import Input,Dense,Activation,ZeroPadding2D,BatchNormalization,Flatten,Conv2D,AveragePooling2D,MaxPooling2D,Dropout,concatenate
from sklearn import preprocessing

import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve
#from sklearn.metrics import auc
from sklearn.metrics import roc_auc_score
from keras.utils import Sequence,to_categorical

#GPU = 7
#os.environ['CUDA_VISIBLE_DEVICES'] = str(GPU)

import warnings
warnings.filterwarnings("ignore")

['train.csv', 'xx', 'sample_submission.csv', 'test.csv']


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [18]:
class DataGenerator(Sequence):
    """Keras ``Sequence`` that serves mini-batches for the two-input model.

    All settings are passed as keyword arguments and kept in ``self.params``:

    X          -- dict with the keys 'base' and 'noise1'; each value is
                  indexed along its first axis with an integer index array.
    y          -- label array; ``y.shape[0]`` is the number of samples.
    batch_size -- number of source rows per batch (read lazily, per call).
    shuffle    -- truthy to shuffle the sample order at construction time and
                  again after every epoch.
    aug        -- truthy to append column-shuffled copies to every batch
                  (see ``aug_``).

    A missing 'X', 'y', 'shuffle' or 'aug' raises ``KeyError`` in ``__init__``.
    """
    def __init__(self, **kwargs):
        """Store the settings and build the (optionally shuffled) sample order."""
        self.params = kwargs
        self.X = self.params['X']
        self.shuffle = self.params['shuffle']
        self.y = self.params['y']
        self.aug = self.params['aug']
        self.indexes = np.arange(self.y.shape[0])
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __len__(self):
        """Number of full batches per epoch; a trailing partial batch is dropped."""
        batch_size = self.params['batch_size']
        return int(np.floor(self.indexes.shape[0] / batch_size))

    def __getitem__(self, index):
        """Return batch number ``index`` as ``(inputs_dict, labels)``."""
        batch_size = self.params['batch_size']
        indexes = self.indexes[index*batch_size:(index+1)*batch_size]

        X, y = self.__data_generation(indexes)
        return X, y

    def on_epoch_end(self):
        """Reshuffle the sample order between epochs when shuffling is enabled."""
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __data_generation(self, indexes):
        """Slice both inputs and the labels at ``indexes``; augment if enabled."""
        X_base = self.X['base'][indexes]
        X_noise1 = self.X['noise1'][indexes]
        y = self.y[indexes]
        if self.aug:
            X_base,X_noise1,y = self.aug_(X_base,X_noise1,y)
        return {'base':X_base,'noise1':X_noise1}, y

    @staticmethod
    def _shuffled_copies(xb, xn1, mask, t):
        """Build ``t`` column-shuffled copies of the rows selected by ``mask``.

        For every column a new row permutation is drawn and applied to the
        same column of both arrays, so ``xb`` and ``xn1`` stay aligned with
        each other while values are mixed between rows of the same class.
        Returns two lists (one per input) with ``t`` arrays each.
        """
        xb_copies, xn1_copies = [], []
        for _ in range(t):
            # Boolean-mask indexing already returns a fresh array, so the
            # in-place column writes below never touch the caller's data.
            x1 = xb[mask]
            x2 = xn1[mask]
            ids = np.arange(x1.shape[0])
            for c in range(x1.shape[1]):
                np.random.shuffle(ids)
                # Index rows and column in one step: `x1[ids][:, c]` would
                # materialise a permuted copy of the whole array per column.
                x1[:, c] = x1[ids, c]
                x2[:, c] = x2[ids, c]
            xb_copies.append(x1)
            xn1_copies.append(x2)
        return xb_copies, xn1_copies

    def aug_(self,xb,xn1,y,t=2):
        """Append ``t`` shuffled copies of each class to the batch.

        Rows with ``y > 0`` and rows with ``y == 0`` are augmented separately
        so that a synthetic row only mixes values from rows of its own class.

        Returns ``(xb, xn1, y)`` where the original rows come first, followed
        by the positive copies (label 1) and then the negative copies
        (label 0). ``t=0`` returns the inputs with no rows added.
        """
        xb_pos, xn1_pos = self._shuffled_copies(xb, xn1, y > 0, t)
        xb_neg, xn1_neg = self._shuffled_copies(xb, xn1, y == 0, t)

        n_pos = sum(part.shape[0] for part in xb_pos)
        n_neg = sum(part.shape[0] for part in xb_neg)

        # Stacking the parts in a single call also works when t == 0, where
        # stacking an empty list of copies on its own would raise.
        xb = np.vstack([xb] + xb_pos + xb_neg)
        xn1 = np.vstack([xn1] + xn1_pos + xn1_neg)
        y = np.concatenate([y, np.ones(n_pos), np.zeros(n_neg)])
        return xb,xn1,y

In [3]:
# define helper functions. auc, plot_history
def auc(y_true, y_pred):
    """Return the ROC AUC of ``y_pred`` against ``y_true``.

    Both arguments must provide ``.ravel()``; they are flattened to 1-D
    before being handed to ``roc_auc_score``.
    """
    flat_pred = y_pred.ravel()
    flat_true = y_true.ravel()
    return roc_auc_score(flat_true, flat_pred)

def auc_2(y_true, y_pred):
    """ROC AUC as a graph op, for use in a Keras ``metrics=[...]`` list.

    Wraps ``roc_auc_score`` in ``tf.py_func`` with a ``tf.double`` output.
    """
    metric_inputs = (y_true, y_pred)
    return tf.py_func(func=roc_auc_score, inp=metric_inputs, Tout=tf.double)

def plot_history(histories, key='binary_crossentropy', ylim=(0, 0.4)):
    """Plot the training and validation curves of one metric for several runs.

    Parameters
    ----------
    histories : iterable of (name, history)
        ``history`` must expose ``.epoch`` (x values) and ``.history`` (dict of
        per-epoch metric lists), as the object returned by ``model.fit`` does.
    key : str
        Metric to plot; the validation curve is read from ``'val_' + key``.
    ylim : pair of float
        Limits of the y axis. The default keeps the previous fixed range.

    Each run gets a dashed validation curve and a solid training curve in the
    same colour.
    """
    plt.figure(figsize=(16,10))
    last_epoch = 0
    for name, history in histories:
        val = plt.plot(history.epoch, history.history['val_'+key], '--', label=name.title()+' Val')
        # The training curve belongs inside the loop: drawn after it, only the
        # last run would get one (and an empty `histories` would raise).
        plt.plot(history.epoch, history.history[key], color=val[0].get_color(), label=name.title()+' Train')
        if len(history.epoch):
            last_epoch = max(last_epoch, max(history.epoch))

    plt.xlabel('Epochs')
    plt.ylabel(key.replace('_',' ').title())
    plt.legend()

    # Span the longest run rather than whichever run happened to come last.
    if last_epoch > 0:
        plt.xlim([0, last_epoch])
    plt.ylim(list(ylim))
    plt.show()

In [4]:
def shuffle_col_vals_fix(x1, groups):
    """Shuffle every column of ``x1`` independently, in sync across groups.

    The columns of ``x1`` are read as ``groups`` consecutive blocks of equal
    width. One random row permutation is drawn per column position inside a
    block and the same permutation is applied to that position in every
    block, so column ``j`` of each block stays row-aligned with column ``j``
    of the others.

    Parameters
    ----------
    x1 : ndarray, at least 2-D, samples on axis 0 and features on axis 1
    groups : int
        Number of equal-width column blocks.

    Returns
    -------
    ndarray with the same shape as ``x1``.

    Raises
    ------
    ValueError
        If ``groups`` is smaller than 1 or does not divide the number of
        columns. Integer division used to drop the trailing columns silently
        in that case, returning an array narrower than the input.
    """
    n_rows, n_cols = x1.shape[0], x1.shape[1]
    if groups < 1 or n_cols % groups != 0:
        raise ValueError(
            'groups must be a positive divisor of the number of columns '
            '(got groups=%r for %d columns)' % (groups, n_cols)
        )
    group_size = n_cols // groups

    # One permutation of the rows per column position, shape (n_rows, group_size).
    # The explicit dtype/reshape keeps this a valid integer index array even
    # when there are no columns at all.
    row_perm = np.array(
        [np.random.choice(n_rows, size=n_rows, replace=False) for _ in range(group_size)],
        dtype=np.intp,
    ).reshape(group_size, n_rows).T
    col_idx = np.arange(group_size)  # broadcasts against row_perm

    shuffled_blocks = [
        x1[:, g * group_size:(g + 1) * group_size][row_perm, col_idx]
        for g in range(groups)
    ]
    return np.hstack(shuffled_blocks)

def augment_fix_fast(x,y,groups,t1=2, t0=2):
    """Append class-wise column-shuffled copies of the rows of ``x``.

    In order to make the synchronised shuffle work, the columns of ``x`` must
    be laid out as equally sized feature groups, e.g.
    var_1, var_2, var_3 | var_1_count, var_2_count, var_3_count | var_1_rolling, ...
    which is 3 groups of features, so ``groups = 3``.

    Parameters
    ----------
    x : 2-D ndarray of features
    y : 1-D ndarray of labels; rows with ``y > 0`` are positives and rows with
        ``y == 0`` are negatives
    groups : int, number of feature groups (forwarded to ``shuffle_col_vals_fix``)
    t1 : int, number of shuffled copies of the positive rows to add
    t0 : int, number of shuffled copies of the negative rows to add

    Returns
    -------
    (x_aug, y_aug) : the original rows first, then the positive copies
    (label 1), then the negative copies (label 0). ``t1=0`` and/or ``t0=0``
    is allowed and simply adds no rows for that class.
    """
    # The class subsets do not change between repetitions; select them once.
    # Boolean-mask indexing already yields a new array, so no extra copy is needed.
    x_pos = x[y > 0]
    x_neg = x[y == 0]

    pos_parts = [shuffle_col_vals_fix(x_pos, groups) for _ in range(t1)]
    neg_parts = [shuffle_col_vals_fix(x_neg, groups) for _ in range(t0)]

    n_pos = sum(part.shape[0] for part in pos_parts)
    n_neg = sum(part.shape[0] for part in neg_parts)

    # Stack everything in one call: stacking an empty list of copies on its
    # own (t1 == 0 or t0 == 0) would raise.
    x = np.vstack([x] + pos_parts + neg_parts)
    y = np.concatenate([y, np.ones(n_pos), np.zeros(n_neg)])
    return x,y

In [5]:
%%time
# load data 
# Read both competition CSVs; the raw model features are the columns whose
# name starts with 'var_'.
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ('../input/train.csv', "../input/test.csv")
)
base_features = [name for name in train_df.columns.values.tolist() if name.startswith('var_')]

CPU times: user 10.4 s, sys: 804 ms, total: 11.2 s
Wall time: 11.2 s


In [6]:
%%time
# mark real vs fake
# Every training row counts as real.
train_df['real'] = 1

# For each test row, look up how often each of its feature values occurs in
# the test set. A row is flagged real when at least one of its values is
# unique there (minimum count == 1); all other rows are flagged fake.
# The counts go into a separate frame so test_df keeps its raw values and
# does not have to be read from disk a second time.
test_value_counts = pd.DataFrame(
    {col: test_df[col].map(test_df[col].value_counts()) for col in base_features},
    index=test_df.index,
)
a = test_value_counts.min(axis=1)
test_df['real'] = (a == 1).astype('int')
del test_value_counts

# pd.concat replaces DataFrame.append, which newer pandas no longer provides.
# Test rows have no 'target' column, so they end up with NaN targets.
train = pd.concat([train_df, test_df], sort=False).reset_index(drop=True)
del test_df, train_df; gc.collect()

CPU times: user 14 s, sys: 17 s, total: 31 s
Wall time: 31 s


In [7]:
%%time
# count features
# For every raw feature, add '<col>size': how many rows flagged real carry the
# same value. Values that never occur in a real row map to NaN.
real_rows = train['real'] == 1  # same for every column, so build the mask once
for col in tqdm(base_features):
    train[col + 'size'] = train[col].map(train.loc[real_rows, col].value_counts())
cnt_features = [col + 'size' for col in base_features]

100%|██████████| 200/200 [00:09<00:00, 22.09it/s]

CPU times: user 8.16 s, sys: 932 ms, total: 9.09 s
Wall time: 9.05 s





In [8]:
%%time
# magic features 1
# '<col>no_noise' keeps the raw value only where it occurs more than once among
# the real rows ('<col>size' > 1) and is NaN everywhere else, including rows
# whose count is missing.
# Series.where builds the whole column in one pass; it replaces a masked
# .loc assignment that evaluated the mask twice per column. Rewriting the full
# column also makes the cell safe to re-run after the NaNs have been imputed.
for col in tqdm(base_features):
    train[col+'no_noise'] = train[col].where(train[col+'size'] > 1)
noise1_features = [col + 'no_noise' for col in base_features]

100%|██████████| 200/200 [03:50<00:00,  1.15s/it]

CPU times: user 1min, sys: 2min 50s, total: 3min 51s
Wall time: 3min 50s





In [9]:
%%time
# Impute the missing no-noise values with each column's mean (inspired by lightgbm).
# Note this is not a literal 0: the mean is taken over all rows of `train`, and
# it presumably only ends up close to 0 once the features are standardised in a
# later cell (the scaler there is fitted on the labelled rows only).
train[noise1_features] = train[noise1_features].fillna(train[noise1_features].mean())

CPU times: user 956 ms, sys: 860 ms, total: 1.82 s
Wall time: 1.81 s


In [10]:
%%time
# Rows with a target are the labelled training set; rows without one are the
# test set. The model consumes the raw and the no-noise features.
has_label = train['target'].notnull()
train_df = train[has_label]
test_df = train[~has_label]
all_features = base_features + noise1_features

CPU times: user 436 ms, sys: 764 ms, total: 1.2 s
Wall time: 1.2 s


In [11]:
%%time
# Standardise all features with statistics fitted on the labelled rows only,
# then apply the same transform to both splits. The resulting frames get a
# fresh 0..n-1 index, so they are addressed by position from here on.
scaler = preprocessing.StandardScaler()
scaler.fit(train_df[all_features].values)
df_trn, df_tst = (
    pd.DataFrame(scaler.transform(split[all_features].values), columns=all_features)
    for split in (train_df, test_df)
)
y = train_df['target'].values

CPU times: user 1.98 s, sys: 2.09 s, total: 4.06 s
Wall time: 4.06 s


In [12]:
def get_keras_data(dataset, cols_info):
    """Convert a feature frame into the dict of inputs the model expects.

    Parameters
    ----------
    dataset : DataFrame holding every column named in ``cols_info``
    cols_info : pair ``(base_feats, noise_feats)`` of column-name lists

    Returns
    -------
    dict with the keys 'base' and 'noise1'; each value is an array of shape
    ``(n_rows, n_columns_in_group, 1)``.
    """
    def _as_3d(columns):
        # (n_rows, n_columns) -> (n_rows, n_columns, 1)
        values = np.array(dataset[columns].values)
        return np.reshape(values, (-1, len(columns), 1))

    base_feats, noise_feats = cols_info
    return {'base': _as_3d(base_feats), 'noise1': _as_3d(noise_feats)}

In [13]:
%%time
# Column groups in the order get_keras_data and Convnet unpack them:
# [raw features, no-noise features].
cols_info = [base_features, noise1_features]
# The training inputs are not built here; the training loop builds them per
# fold after augmentation.
#X = get_keras_data(df_trn[all_features], cols_info)
# Test inputs are built once and reused for the prediction of every fold.
X_test = get_keras_data(df_tst[all_features], cols_info)

CPU times: user 272 ms, sys: 764 ms, total: 1.04 s
Wall time: 1.03 s


In [14]:
# define network structure -> two Dense branches merged into one sigmoid output
# (despite the function name, the model contains no convolution layers)
def Convnet(cols_info, classes=1):
    """Build the two-input Keras model.

    Parameters
    ----------
    cols_info : pair ``(base_feats, noise1_feats)``; only the lengths are
        used, to size the inputs 'base' and 'noise1' as ``(n_features, 1)``.
    classes : int, number of sigmoid output units.

    Each input goes through Dense(16) -> ReLU -> Flatten; the two flattened
    branches are concatenated and fed to the sigmoid output layer.
    Returns the uncompiled ``Model``.
    """
    def _branch(n_features, input_name, flatten_name):
        # Input -> Dense(16) -> ReLU -> Flatten for one feature group.
        branch_input = Input(shape=(n_features, 1), name=input_name)
        hidden = Dense(16)(branch_input)
        hidden = Activation('relu')(hidden)
        return branch_input, Flatten(name=flatten_name)(hidden)

    base_feats, noise1_feats = cols_info
    base_input, base_flat = _branch(len(base_feats), 'base', 'base_last')
    noise1_input, noise1_flat = _branch(len(noise1_feats), 'noise1', 'nose1_last')

    merged = concatenate([base_flat, noise1_flat])
    prediction = Dense(classes, activation='sigmoid')(merged)

    return Model(inputs=[base_input, noise1_input], outputs=prediction)
# Build the model once here only to print its layer summary; the training loop
# constructs a fresh Convnet for every fold.
model = Convnet(cols_info)
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
base (InputLayer)               (None, 200, 1)       0                                            
__________________________________________________________________________________________________
noise1 (InputLayer)             (None, 200, 1)       0                                            
__________________________________________________________________________________________________
dense_1 (Dense)                 (None, 200, 16)      32          base[0][0]                       
__________________________________________________________________________________________________
dense_2 (Dense)                 (None, 200, 16)      32          noise1[0][0]                     
__________________________________________________________________________________________________
activation

In [15]:
# The scaled test frame is no longer needed once X_test has been built; drop
# it to release memory. Only the "already deleted" case (cell re-run) is
# tolerated, so that any other error is not silently swallowed.
try:
    del df_tst
except NameError:
    pass
gc.collect()

22

In [16]:
# parameters
SEED = 2019  # random_state of the stratified fold split below
n_folds = 5  # number of splits StratifiedKFold produces
debug_flag = True  # NOTE(review): not read by any visible cell; confirm it is still needed
folds = 5  # the training loop stops after this many folds (can be < n_folds to run a subset)
skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=SEED)

In [None]:
%%time
# Cross-validated training: for every fold, augment the training split, fit a
# fresh model, restore its best checkpoint, then store the out-of-fold
# predictions and the test-set predictions of that fold.

# Checkpoints and per-fold training logs are written under nn/; create it up
# front so the first save does not fail in a fresh working directory.
os.makedirs('nn', exist_ok=True)

# Set to False to train on the in-memory arrays with model.fit instead of the
# DataGenerator (which augments each batch again on top of augment_fix_fast).
USE_GENERATOR = True

i = 0
result = pd.DataFrame({"ID_code": test_df.ID_code.values})
val_aucs = []
# Out-of-fold predictions, one row per labelled sample. An explicit copy with
# a float column lets the per-fold writes below go through a single .iloc
# assignment, instead of chained assignment on a slice of train_df.
valid_X = train_df[['target']].copy()
valid_X['predict'] = 0.0
predict_col = valid_X.columns.get_loc('predict')
for train_idx, val_idx in skf.split(df_trn, y):
    if i == folds:
        break
    i += 1
    X_train, y_train = df_trn.iloc[train_idx], y[train_idx]
    X_valid, y_valid = df_trn.iloc[val_idx], y[val_idx]

    # Offline augmentation. groups=2 because the columns are laid out as
    # [raw features | no-noise features], which must be shuffled in sync.
    X_train, y_train = augment_fix_fast(X_train.values, y_train, groups=2, t1=2, t0=2)
    X_train = pd.DataFrame(X_train, columns=all_features)

    X_train = get_keras_data(X_train, cols_info)
    X_valid = get_keras_data(X_valid, cols_info)

    model_name = 'nn/NN_fold{}.h5'.format(str(i))

    model = Convnet(cols_info)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', 'binary_crossentropy', auc_2])
    checkpoint = ModelCheckpoint(model_name, monitor='val_auc_2', verbose=1,
                                 save_best_only=True, mode='max', save_weights_only = True)
    # NOTE(review): `epsilon` looks like the older spelling of `min_delta` for
    # this callback; left unchanged to match the Keras version in use, confirm
    # before upgrading.
    reduceLROnPlat = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=4,
                                       verbose=1, mode='min', epsilon=0.0001)
    earlystop = EarlyStopping(monitor='val_auc_2', mode='max', patience=10, verbose=1)

    if USE_GENERATOR:
        training_generator = DataGenerator(X=X_train,y=y_train,aug=1,batch_size=1024*2,shuffle=True)
        validation_generator = DataGenerator(X=X_valid,y=y_valid,aug=0,batch_size=1024*2,shuffle=False)
        history = model.fit_generator(generator=training_generator,
                        validation_data=validation_generator,
                        epochs=300,
                        callbacks=[checkpoint, reduceLROnPlat, earlystop])
    else:
        history = model.fit(X_train, y_train,
                        epochs=300,
                        batch_size=1024 * 2,
                        validation_data=(X_valid, y_valid),
                        callbacks=[checkpoint, reduceLROnPlat, earlystop])
    train_history = pd.DataFrame(history.history)
    train_history.to_csv('nn/train_profile_fold{}.csv'.format(str(i)), index=None)

    # Restore the weights of the best epoch (highest val_auc_2) before predicting.
    model.load_weights(model_name)

    # Out-of-fold prediction for this fold's validation rows.
    y_pred_keras = model.predict(X_valid).ravel()
    valid_X.iloc[val_idx, predict_col] = y_pred_keras

    fpr_keras, tpr_keras, thresholds_keras = roc_curve(y_valid, y_pred_keras)
    auc_valid = roc_auc_score(y_valid, y_pred_keras)
    val_aucs.append(auc_valid)

    # Test prediction of this fold; flattened so the column holds a 1-D array.
    prediction = model.predict(X_test).ravel()
    result["fold{}".format(str(i))] = prediction

Epoch 1/300

Epoch 00001: val_auc_2 improved from -inf to 0.87458, saving model to nn/NN_fold1.h5
Epoch 2/300

Epoch 00002: val_auc_2 improved from 0.87458 to 0.87938, saving model to nn/NN_fold1.h5
Epoch 3/300

Epoch 00003: val_auc_2 improved from 0.87938 to 0.89289, saving model to nn/NN_fold1.h5
Epoch 4/300

Epoch 00004: val_auc_2 improved from 0.89289 to 0.90598, saving model to nn/NN_fold1.h5
Epoch 5/300

Epoch 00005: val_auc_2 improved from 0.90598 to 0.91255, saving model to nn/NN_fold1.h5
Epoch 6/300

Epoch 00006: val_auc_2 improved from 0.91255 to 0.91534, saving model to nn/NN_fold1.h5
Epoch 7/300

Epoch 00007: val_auc_2 improved from 0.91534 to 0.91742, saving model to nn/NN_fold1.h5
Epoch 8/300

Epoch 00008: val_auc_2 improved from 0.91742 to 0.91893, saving model to nn/NN_fold1.h5
Epoch 9/300

Epoch 00009: val_auc_2 improved from 0.91893 to 0.91986, saving model to nn/NN_fold1.h5
Epoch 10/300

Epoch 00010: val_auc_2 improved from 0.91986 to 0.92083, saving model to nn/NN_f


Epoch 00026: val_auc_2 improved from 0.92327 to 0.92348, saving model to nn/NN_fold1.h5
Epoch 27/300

Epoch 00027: val_auc_2 did not improve from 0.92348
Epoch 28/300

Epoch 00028: val_auc_2 improved from 0.92348 to 0.92365, saving model to nn/NN_fold1.h5
Epoch 29/300

Epoch 00029: val_auc_2 did not improve from 0.92365
Epoch 30/300

Epoch 00030: val_auc_2 did not improve from 0.92365
Epoch 31/300

Epoch 00031: val_auc_2 did not improve from 0.92365
Epoch 32/300

Epoch 00032: val_auc_2 did not improve from 0.92365

Epoch 00032: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.
Epoch 33/300

Epoch 00033: val_auc_2 did not improve from 0.92365
Epoch 34/300

Epoch 00034: val_auc_2 did not improve from 0.92365
Epoch 35/300

Epoch 00035: val_auc_2 did not improve from 0.92365
Epoch 36/300

Epoch 00036: val_auc_2 did not improve from 0.92365

Epoch 00036: ReduceLROnPlateau reducing learning rate to 0.0001250000059371814.
Epoch 37/300

Epoch 00037: val_auc_2 did not improve 


Epoch 00014: val_auc_2 improved from 0.92001 to 0.92030, saving model to nn/NN_fold2.h5
Epoch 15/300

Epoch 00015: val_auc_2 improved from 0.92030 to 0.92060, saving model to nn/NN_fold2.h5
Epoch 16/300

Epoch 00016: val_auc_2 improved from 0.92060 to 0.92078, saving model to nn/NN_fold2.h5
Epoch 17/300

Epoch 00017: val_auc_2 improved from 0.92078 to 0.92104, saving model to nn/NN_fold2.h5
Epoch 18/300

Epoch 00018: val_auc_2 did not improve from 0.92104
Epoch 19/300

Epoch 00019: val_auc_2 improved from 0.92104 to 0.92113, saving model to nn/NN_fold2.h5
Epoch 20/300

Epoch 00020: val_auc_2 did not improve from 0.92113
Epoch 21/300

Epoch 00021: val_auc_2 improved from 0.92113 to 0.92170, saving model to nn/NN_fold2.h5
Epoch 22/300

Epoch 00022: val_auc_2 did not improve from 0.92170
Epoch 23/300

Epoch 00023: val_auc_2 did not improve from 0.92170
Epoch 24/300

Epoch 00024: val_auc_2 did not improve from 0.92170
Epoch 25/300

Epoch 00025: val_auc_2 did not improve from 0.92170

Epoc

In [42]:
# Report the validation AUC of every fold, numbered from 1.
for i, fold_auc in enumerate(val_aucs):
    print('Fold_%d AUC: %.6f' % (i+1, fold_auc))

Fold_1 AUC: 0.922040
Fold_2 AUC: 0.921082
Fold_3 AUC: 0.924821
Fold_4 AUC: 0.921676
Fold_5 AUC: 0.921367


In [18]:
# Report the validation AUC of every fold, numbered from 1.
for i, fold_auc in enumerate(val_aucs):
    print('Fold_%d AUC: %.6f' % (i+1, fold_auc))

Fold_1 AUC: 0.921823
Fold_2 AUC: 0.920845
Fold_3 AUC: 0.924355
Fold_4 AUC: 0.921661
Fold_5 AUC: 0.921352


In [43]:
# summary on results
# Mean and spread of the per-fold AUCs, plus the AUC over all out-of-fold
# predictions taken together.
auc_mean, auc_std = np.mean(val_aucs), np.std(val_aucs)
auc_all = roc_auc_score(valid_X['target'], valid_X['predict'])
summary_values = (n_folds, auc_mean, auc_std, auc_all)
print('%d-fold auc mean: %.9f, std: %.9f. All auc: %6f.' % summary_values)

5-fold auc mean: 0.922197336, std: 0.001349837. All auc: 0.922151.


In [19]:
# summary on results
# Mean and spread of the per-fold AUCs, plus the AUC over all out-of-fold
# predictions taken together.
auc_mean, auc_std = np.mean(val_aucs), np.std(val_aucs)
auc_all = roc_auc_score(valid_X['target'], valid_X['predict'])
summary_values = (n_folds, auc_mean, auc_std, auc_all)
print('%d-fold auc mean: %.9f, std: %.9f. All auc: %6f.' % summary_values)

5-fold auc mean: 0.922007239, std: 0.001220462. All auc: 0.921973.


In [20]:
# Average the per-fold test predictions into the submission target.
# The fold columns are selected by name: slicing "everything after ID_code"
# would average an object array and, when this cell is re-run, would also fold
# the previously written 'target' column into the mean.
fold_columns = [col for col in result.columns if col.startswith('fold')]
y_all = result[fold_columns].values
result['target'] = np.mean(y_all, axis = 1)
to_submit = result[['ID_code', 'target']]
to_submit.to_csv('NN_submission.csv', index=None)
result.to_csv('nn/NN_all_prediction.csv', index=None)

# Out-of-fold predictions with their IDs (aligned on the shared row index).
# They are written from a separate frame: DataFrame.to_csv returns None when
# given a path, so assigning its result back to valid_X would discard the
# predictions and break any later cell that reads valid_X.
oof = valid_X[['target', 'predict']].assign(ID_code=train_df['ID_code'])
oof[['ID_code', 'target', 'predict']].to_csv('nn/NN_oof.csv', index=None)