In [1]:
# Switch IPython's tab-completer into its "greedy" mode for this kernel session.
%config IPCompleter.greedy=True


In [2]:
import pandas as pd

# Load the training label table; the rows displayed below carry an Id and a 0/1 Label.
df_train = pd.read_csv('train_kaggle.csv')
df_train

Unnamed: 0,Id,Label
0,0,1
1,1,0
2,2,0
3,3,1
4,4,0
5,5,1
6,6,0
7,7,1
8,8,0
9,9,0


In [3]:
import seaborn as sns

# Label is a 0/1 target, so show how many samples fall in each class instead of
# fitting a density estimate to it (sns.distplot is also deprecated in seaborn).
sns.countplot(x='Label', data=df_train);

In [4]:
import numpy as np

def load_dataframe(id):
    """Read one training sample from disk and wrap it in a DataFrame.

    Parameters
    ----------
    id : value substituted into the file name ``train/train/<id>.npy``
        (resolved relative to the current working directory).

    Returns
    -------
    pandas.DataFrame built from the array stored in that file.
    """
    npy_path = "train/train/{}.npy".format(id)
    return pd.DataFrame(data=np.load(npy_path))

In [5]:
def load_test_dataframe(id):
    """Read one test sample from disk and wrap it in a DataFrame.

    Parameters
    ----------
    id : value substituted into the file name ``test/test/<id>.npy``
        (resolved relative to the current working directory).

    Returns
    -------
    pandas.DataFrame built from the array stored in that file.
    """
    npy_path = "test/test/{}.npy".format(id)
    return pd.DataFrame(data=np.load(npy_path))

In [6]:
def get_missing_data(df):
    """Summarise the missing values of every column of ``df``.

    Returns a DataFrame indexed by the column labels of ``df`` with two columns:
    'Total'   -- number of null entries in that column;
    'Percent' -- null entries divided by the number of rows (a 0..1 fraction,
                 despite the column name).
    Each of the two series is sorted in descending order before being joined.
    """
    null_mask = df.isnull()
    null_counts = null_mask.sum()
    total = null_counts.sort_values(ascending=False)
    # count() on the boolean mask is the row count, since the mask itself has no nulls.
    percent = (null_counts / null_mask.count()).sort_values(ascending=False)
    return pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])

In [7]:
def clear_missing_data(df, missing_indices):
    """Drop the given columns and confirm that no missing values remain.

    Parameters
    ----------
    df : pandas.DataFrame to clean; it is not modified, a new frame is returned.
    missing_indices : column label(s) to remove from ``df``.

    Returns
    -------
    The frame without ``missing_indices`` when it contains no nulls. If any
    column still has nulls, the largest per-column null count is printed and an
    empty DataFrame is returned instead.
    """
    # The axis must be passed by keyword: pandas >= 2.0 rejects the positional form.
    df = df.drop(missing_indices, axis=1)
    # Largest per-column null count left after the drop; anything > 0 means the
    # caller's column list did not cover all of the missing data.
    count = df.isnull().sum().max()
    if count > 0:
        print(count)
        return pd.DataFrame()
    return df

In [8]:
import tensorflow as tf


In [9]:
import numpy as np

# Load the pre-built collection of training samples (one entry per sample, see the
# 1-D shape displayed below).
# NOTE(review): allow_pickle=True makes np.load unpickle the stored objects, which can
# execute code embedded in the file -- only use an allData.npy from a trusted source.
# NOTE(review): no visible cell writes allData.npy; it has to exist on disk already.
dataframes = np.load('allData.npy', allow_pickle = True)
dataframes.shape

(18662,)

In [10]:
# Default number of rows every sample is padded / truncated to.
PAD_SIZE = 500

def pad_data(dfs, pad_size=None):
    """Bring every 2-D sample to the same number of rows and stack them.

    Parameters
    ----------
    dfs : iterable of 2-D array-likes that all have the same number of columns.
    pad_size : target number of rows per sample; defaults to the module-level
        ``PAD_SIZE`` when omitted.

    Returns
    -------
    numpy.ndarray of shape ``(len(dfs), pad_size, n_columns)``. Samples shorter
    than ``pad_size`` are zero-padded at the end; longer ones keep only their
    first ``pad_size`` rows.

    Raises
    ------
    ValueError (from ``np.stack``) when ``dfs`` is empty.
    """
    if pad_size is None:
        pad_size = PAD_SIZE
    padded = []
    for sample in dfs:
        missing_rows = pad_size - sample.shape[0]
        if missing_rows > 0:
            # Append zero rows at the bottom only; columns are left untouched.
            sample = np.pad(sample, [(0, missing_rows), (0, 0)], 'constant')
        else:
            sample = sample[:pad_size]
        padded.append(sample)
    return np.stack(padded)

In [11]:
# Pad / truncate every training sample to PAD_SIZE rows and stack them into one 3-D array.
XTrain = pad_data(dataframes)
XTrain.shape

(18662, 500, 102)

In [12]:
# Persist the padded training tensor to disk (no visible cell reads this file back).
np.save('train_padded.npy', XTrain)

In [13]:
# Shape of a single padded sample: PAD_SIZE rows by the per-sample column count.
XTrain[0].shape

(500, 102)

In [14]:
# Training targets as a NumPy array, in df_train row order.
# NOTE(review): assumes row i of df_train labels dataframes[i] -- confirm against how
# allData.npy was built.
Y = df_train['Label'].values
Y.shape

(18662,)

In [15]:
# Load the sample submission; its Id column drives which test files are read and its
# Predicted column is overwritten with model output further down.
df_test = pd.read_csv('sample_solution.csv')
df_test

Unnamed: 0,Id,Predicted
0,0,0.417022
1,1,0.720324
2,2,0.000114
3,3,0.302333
4,4,0.146756
5,5,0.092339
6,6,0.186260
7,7,0.345561
8,8,0.396767
9,9,0.538817


In [16]:
# Load every test sample listed in the submission template, in template order.
# A comprehension keeps the loop variable local, so the builtin id() is no longer
# shadowed in the notebook namespace.
testdatas = [load_test_dataframe(sample_id).values for sample_id in df_test['Id']]

In [17]:
# Sanity check: one loaded array per Id in df_test.
print(len(testdatas))

6051


In [18]:
# Hand the list to pad_data directly: it only needs len() and indexing/iteration.
# Wrapping samples with differing row counts in np.array() builds a ragged array,
# which NumPy >= 1.24 rejects with a ValueError.
XTest = pad_data(testdatas)

In [19]:
# Persist the padded test tensor to disk (no visible cell reads this file back).
np.save('test_padded.npy', XTest)

In [20]:
# Shape of the padded test tensor: (samples, PAD_SIZE rows, columns).
XTest.shape

(6051, 500, 102)

In [21]:
# Column groups of the padded tensors, as half-open (start, stop) ranges over the last
# axis; select_features slices with X[:, :, start:stop]. Together the ranges cover
# columns 0..101 with no gaps or overlaps.
# NOTE(review): what each group represents is not visible in this notebook.
features = [
    (0, 8),
    (8, 12),
    (12, 28),
    (28, 44),
    (44, 52),
    (52, 64),
    (64, 80),
    (80, 92),
    (92, 102)
]
features

[(0, 8),
 (8, 12),
 (12, 28),
 (28, 44),
 (44, 52),
 (52, 64),
 (64, 80),
 (80, 92),
 (92, 102)]

In [22]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the training samples for validation; the fixed random_state makes the
# split repeatable. NOTE(review): the split is not stratified on Y.
X_train, X_val, y_train, y_val = train_test_split(XTrain, Y, test_size=0.1, random_state=42)

In [40]:
from tensorflow import keras

def select_model(trainShape):
    """Build and compile the 1-D convolutional classifier.

    Parameters
    ----------
    trainShape : shape of the training tensor; only ``trainShape[1]`` and
        ``trainShape[2]`` are used, as the model's per-sample input shape.

    Returns
    -------
    A compiled ``keras.Model``: two Conv1D(kernel 5, 'same', relu) ->
    MaxPooling1D(2) -> Dropout(0.4) stages with 6 and 16 filters, then Flatten,
    Dense(120) and Dense(84) with relu, and a 2-unit softmax output. Compiled
    with categorical cross-entropy, the Adam optimizer, and the metrics
    'accuracy' (BinaryAccuracy) and 'auc' (AUC).
    """
    tracked_metrics = [
        keras.metrics.BinaryAccuracy(name='accuracy'),
        keras.metrics.AUC(name='auc'),
    ]
    layers = keras.layers

    net_input = keras.Input(shape=(trainShape[1], trainShape[2]))
    tensor = net_input
    # Two identical conv stages that differ only in their filter count.
    for n_filters in (6, 16):
        tensor = layers.Conv1D(filters=n_filters, kernel_size=5,
                               padding='same', activation='relu')(tensor)
        tensor = layers.MaxPooling1D(pool_size=2)(tensor)
        tensor = layers.Dropout(0.4)(tensor)

    tensor = layers.Flatten()(tensor)
    for n_units in (120, 84):
        tensor = layers.Dense(n_units, activation='relu')(tensor)
    class_probs = layers.Dense(2, activation='softmax')(tensor)

    model = keras.Model(inputs=net_input, outputs=class_probs)
    model.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=tracked_metrics,
    )
    return model

In [41]:
def select_es():
    """Create a fresh early-stopping callback for one training run.

    The callback watches 'val_auc' (higher is better), allows 10 epochs without
    improvement, restores the best weights seen, and logs when it stops.
    """
    return tf.keras.callbacks.EarlyStopping(
        monitor='val_auc',
        mode='max',
        patience=10,
        restore_best_weights=True,
        verbose=1,
    )

In [53]:
from tensorflow.keras.utils import to_categorical

# Epoch budget for the baseline fit. The value is the one the fit call actually used
# (the previous constant of 100 was declared but never passed to fit).
EPOCHS = 2
# Baseline model on all columns; only dims 1 and 2 of the shape are used by select_model.
firstModel = select_model(XTrain.shape)
# Labels are one-hot encoded to match the 2-unit softmax output of the model.
baseline_history = firstModel.fit(
    X_train,
    to_categorical(y_train),
    epochs=EPOCHS,
    callbacks = [select_es()],
    validation_data=(X_val, to_categorical(y_val)))

Train on 16795 samples, validate on 1867 samples
Epoch 1/2
Epoch 2/2


In [54]:
# Best validation AUC reached during the baseline fit.
np.max(baseline_history.history['val_auc'])

0.9499407

In [57]:
# Package the baseline result in the same (val_auc, selected-mask, model) layout that
# select_features returns; the mask slot is None because firstModel was fit on the
# full X_train.
btHist = (np.max(baseline_history.history['val_auc']), None, firstModel)
btHist

(0.9499407,
 None,
 <tensorflow.python.keras.engine.training.Model at 0x25ae5741908>)

In [None]:
def _gather_feature_groups(data, features, selected):
    """Return the last-axis columns of ``data`` covered by the selected groups, or None if none is selected."""
    parts = [data[:, :, start:stop]
             for (start, stop), keep in zip(features, selected) if keep]
    if not parts:
        return None
    return np.concatenate(parts, axis=2)


def select_features(features, selected, n):
    """Exhaustively search column-group subsets for the best validation AUC.

    Recursively decides, for every group from position ``n`` onwards, whether it
    is included; once all groups are decided, a fresh model is trained on the
    chosen columns of the global ``X_train`` / ``X_val``. Every non-empty subset
    is trained once, i.e. up to ``2 ** len(features) - 1`` fits for ``n == 0``.

    Parameters
    ----------
    features : sequence of half-open ``(start, stop)`` column ranges.
    selected : mutable boolean mask with one flag per entry of ``features``.
        It is used as scratch space: positions ``n`` and above are overwritten
        and are left set to False when the call returns.
    n : index of the first group that is still undecided (0 for a full search).

    Returns
    -------
    ``(val_auc, mask, model)`` for the best subset found, where ``mask`` is an
    independent copy of the winning selection. ``(0, None, None)`` is returned
    when no group is selected.
    """
    if n >= len(features):
        x_trainp = _gather_feature_groups(X_train, features, selected)
        if x_trainp is None:
            # Empty selection: nothing to train on.
            return (0, None, None)
        x_valp = _gather_feature_groups(X_val, features, selected)

        EPOCHS = 35
        print(selected)
        model = select_model(x_trainp.shape)
        baseline_history = model.fit(
            x_trainp,
            to_categorical(y_train),
            epochs=EPOCHS,
            callbacks = [select_es()],
            validation_data=(x_valp, to_categorical(y_val)))
        val_auc = np.max(baseline_history.history['val_auc'])
        print(val_auc)
        # Return a snapshot of the mask: `selected` keeps being mutated by the
        # callers further up the recursion, so returning it directly would make
        # every result report the final state of the search.
        return (val_auc, np.array(selected, dtype=bool), model)

    selected[n] = True
    with_group = select_features(features, selected, n + 1)
    selected[n] = False
    without_group = select_features(features, selected, n + 1)
    # Ties go to the subset without group n, as in the comparison `>` below.
    if with_group[0] > without_group[0]:
        return with_group
    return without_group

# Keep the winning (val_auc, mask, model) tuple; the following cells read it.
bestHistory = select_features(features, np.ones(len(features), dtype=bool), 0)

[ True  True  True  True  True  True  True  True  True]
Train on 16795 samples, validate on 1867 samples
Epoch 1/35
Epoch 2/35
Epoch 3/35
Epoch 4/35
Epoch 5/35
Epoch 6/35
Epoch 7/35
Epoch 8/35
Epoch 9/35
Epoch 10/35
Epoch 11/35
Epoch 12/35
Epoch 13/35
Epoch 14/35
Epoch 15/35
Epoch 16/35
Epoch 17/35
Epoch 18/35
Epoch 19/35
Epoch 20/35
Epoch 21/35
Epoch 22/35
Epoch 23/35
Epoch 24/35
Epoch 25/35
Epoch 26/35
Epoch 27/35
Epoch 28/35
Epoch 29/35
Epoch 00029: early stopping
0.98676157
[ True  True  True  True  True  True  True  True False]
Train on 16795 samples, validate on 1867 samples
Epoch 1/35
Epoch 2/35
Epoch 3/35
Epoch 4/35
Epoch 5/35
Epoch 6/35
Epoch 7/35
Epoch 8/35
Epoch 9/35
Epoch 10/35
  960/16795 [>.............................] - ETA: 50s - loss: 0.1446 - accuracy: 0.9396 - auc: 0.9873

In [None]:
# Report the best validation AUC and the column-group mask that achieved it.
# NOTE(review): bestHistory must hold a (val_auc, mask, model) tuple in the layout
# returned by select_features before this cell runs.
print(bestHistory[0])
print(bestHistory[1])

In [None]:
# The best model may have been trained on a subset of the column groups, so the test
# tensor has to be reduced to the same columns before predicting.
best_mask = bestHistory[1]
if best_mask is None or not np.any(best_mask):
    # No usable mask (a model is never trained on an empty selection): feed all columns.
    XTest_selected = XTest
else:
    XTest_selected = np.concatenate(
        [XTest[:, :, start:stop]
         for (start, stop), keep in zip(features, best_mask) if keep],
        axis=2)
YTest = bestHistory[2].predict(XTest_selected)
YTest

In [None]:
# The model was trained on to_categorical(labels), so output column 0 is P(Label == 0)
# and column 1 is P(Label == 1). Submit the positive-class probability.
df_test['Predicted'] = YTest[:, 1]
df_test

In [None]:
# Write the submission file without the DataFrame index.
df_test.to_csv('test.csv', index=False)
