In [2]:
# Turn on IPython's greedy tab completion for this session.
%config IPCompleter.greedy=True


In [3]:
import pandas as pd

# Training labels: one row per sample with its Id and Label (0/1 in the rows shown below).
df_train = pd.read_csv('train_kaggle.csv')
df_train

Unnamed: 0,Id,Label
0,0,1
1,1,0
2,2,0
3,3,1
4,4,0
5,5,1
6,6,0
7,7,1
8,8,0
9,9,0


In [4]:
import seaborn as sns

# Class balance of the training labels. `sns.distplot` is deprecated and has been
# removed from recent seaborn releases; `histplot` is its replacement, and
# `discrete=True` centres one bar on each label value.
sns.histplot(df_train['Label'], discrete=True);

In [5]:
import numpy as np

def load_dataframe(id, data_dir="train/train"):
    """Load one training sample from ``<data_dir>/<id>.npy`` as a DataFrame.

    Parameters
    ----------
    id : int or str
        Sample identifier; it is interpolated into the file name.
    data_dir : str, optional
        Directory containing the ``.npy`` files. Defaults to ``"train/train"``,
        the location that used to be hard-coded.

    Returns
    -------
    pandas.DataFrame
        The array stored in the file, wrapped in a DataFrame with default
        row/column labels.
    """
    # `id` shadows the builtin, but the name is kept so keyword callers keep working.
    train_data = np.load("{}/{}.npy".format(data_dir, id))
    return pd.DataFrame(data=train_data)

In [6]:
def load_test_dataframe(id, data_dir="test/test"):
    """Load one test sample from ``<data_dir>/<id>.npy`` as a DataFrame.

    Parameters
    ----------
    id : int or str
        Sample identifier; it is interpolated into the file name.
    data_dir : str, optional
        Directory containing the ``.npy`` files. Defaults to ``"test/test"``,
        the location that used to be hard-coded.

    Returns
    -------
    pandas.DataFrame
        The array stored in the file, wrapped in a DataFrame with default
        row/column labels.
    """
    # `id` shadows the builtin, but the name is kept so keyword callers keep working.
    test_data = np.load("{}/{}.npy".format(data_dir, id))
    return pd.DataFrame(data=test_data)

In [7]:
def get_missing_data(df):
    """Summarise missing values per column of ``df``.

    Returns a DataFrame indexed by column label with two columns:
    'Total' (number of null entries) and 'Percent' (null entries divided by
    the number of rows, i.e. a fraction rather than a 0-100 percentage),
    each sorted in descending order before being joined.
    """
    null_mask = df.isnull()
    null_counts = null_mask.sum()
    total = null_counts.sort_values(ascending=False)
    # The mask itself has no nulls, so count() is the row count per column.
    percent = (null_counts / null_mask.count()).sort_values(ascending=False)
    return pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])

In [8]:
def clear_missing_data(df, missing_indices):
    """Drop the given columns and verify that no missing values remain.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    missing_indices : label or list of labels
        Column label(s) to remove.

    Returns
    -------
    pandas.DataFrame
        ``df`` without the listed columns. If any remaining column still
        contains nulls, the largest per-column null count is printed and an
        empty DataFrame is returned instead.
    """
    # Keyword form is required: passing the axis positionally (`df.drop(labels, 1)`)
    # was deprecated in pandas 1.3 and is rejected by pandas 2.x.
    df = df.drop(columns=missing_indices)
    # Largest number of nulls left in any single column.
    count = df.isnull().sum().max()
    if count > 0:
        print(count)
        return pd.DataFrame()
    return df

In [9]:
import tensorflow as tf


In [10]:
import numpy as np

# allow_pickle=True lets NumPy load object arrays, but unpickling can execute
# arbitrary code: only load files that come from a trusted source.
# presumably each element is one training sample's 2-D array — TODO confirm.
dataframes = np.load('allData.npy', allow_pickle = True)
dataframes.shape

(18662,)

In [11]:
PAD_SIZE = 500

def pad_data(dfs, pad_size=None):
    """Pad or truncate every 2-D sample to a fixed number of rows and stack them.

    Parameters
    ----------
    dfs : sequence of 2-D array-likes
        Samples that share the same number of columns. A plain list works;
        it does not need to be wrapped in an ndarray first.
    pad_size : int, optional
        Target number of rows per sample. Defaults to the module-level
        ``PAD_SIZE`` (looked up at call time).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(dfs), pad_size, n_columns)``. Short samples are
        zero-padded at the end; long samples keep only their first
        ``pad_size`` rows.

    Raises
    ------
    ValueError
        From ``np.stack`` if ``dfs`` is empty or the column counts differ.
    """
    if pad_size is None:
        pad_size = PAD_SIZE
    data = []
    for df in dfs:
        diff = pad_size - df.shape[0]
        if diff > 0:
            # Append `diff` zero rows; leave the column axis untouched.
            df = np.pad(df, [(0, diff), (0, 0)], 'constant')
        else:
            df = df[:pad_size]
        data.append(df)
    return np.stack(data)

In [12]:
# Bring every training sample to PAD_SIZE rows so they stack into a single 3-D array.
XTrain = pad_data(dataframes)
XTrain.shape

(18662, 500, 102)

In [13]:
# Write the padded training array to disk.
np.save('train_padded.npy', XTrain)

In [14]:
# Sanity check: shape of a single padded sample.
XTrain[0].shape

(500, 102)

In [15]:
# Target vector taken from the label file.
# NOTE(review): assumes the rows of df_train are in the same order as the samples
# in `dataframes` / XTrain — confirm against how allData.npy was built.
Y = df_train['Label'].values
Y.shape

(18662,)

In [16]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the samples for validation; random_state fixes the shuffle.
# NOTE(review): the split is not stratified — consider stratify=Y if the classes are imbalanced.
X_train, X_val, y_train, y_val = train_test_split(XTrain, Y, test_size=0.1, random_state=42)

In [18]:
# Submission template; its Id column is used below to decide which test files to load.
df_test = pd.read_csv('sample_solution.csv')
df_test

Unnamed: 0,Id,Predicted
0,0,0.417022
1,1,0.720324
2,2,0.000114
3,3,0.302333
4,4,0.146756
5,5,0.092339
6,6,0.186260
7,7,0.345561
8,8,0.396767
9,9,0.538817


In [19]:
# Load every test sample listed in the submission template as a raw array.
# A comprehension keeps the loop variable local, so the builtin `id` is no
# longer rebound at notebook scope for the rest of the session.
testdatas = [load_test_dataframe(sample_id).values for sample_id in df_test['Id']]

In [20]:
# Number of test samples loaded.
print(len(testdatas))

6051


In [21]:
# Pass the list directly: pad_data only iterates over the samples, and wrapping a
# list of differently sized arrays in np.array(...) raises ValueError on NumPy >= 1.24.
XTest = pad_data(testdatas)

In [22]:
# Write the padded test array to disk.
np.save('test_padded.npy', XTest)

In [23]:
# Sanity check: the last two dimensions must match XTrain for the model input.
XTest.shape

(6051, 500, 102)

In [24]:
from tensorflow import keras

# Metrics reported during training and validation. The `name` values become the
# log keys, so the validation AUC is reported as 'val_auc'.
METRICS = [
      keras.metrics.BinaryAccuracy(name='accuracy'),
      keras.metrics.AUC(name='auc'),
]

# 1-D CNN over the padded sequences: one sample has shape
# (XTrain.shape[1], XTrain.shape[2]).
inputs = keras.Input(shape=(XTrain.shape[1], XTrain.shape[2])) 
# Two Conv1D -> MaxPooling1D -> Dropout stages; each pooling step (pool_size=2)
# halves the sequence length.
x = keras.layers.Conv1D(filters=6, kernel_size=5, padding='same', activation='relu')(inputs)
x = keras.layers.MaxPooling1D(pool_size=2)(x)
x = keras.layers.Dropout(0.4)(x)
x = keras.layers.Conv1D(filters=16, kernel_size=5, padding='same', activation='relu')(x)
x = keras.layers.MaxPooling1D(pool_size=2)(x)
x = keras.layers.Dropout(0.4)(x)
# Flatten the feature maps and classify with fully connected layers.
x = keras.layers.Flatten()(x)
x = keras.layers.Dense(120, activation='relu')(x)
x = keras.layers.Dense(84, activation='relu')(x)
# Two-unit softmax: one probability per class, in the column order of the
# one-hot targets.
x = keras.layers.Dense(2, activation='softmax')(x)

model = keras.Model(inputs=inputs, outputs=x)

# categorical_crossentropy expects one-hot targets with one column per output unit.
# NOTE(review): BinaryAccuracy and AUC are evaluated on the two-column one-hot
# targets/outputs here — confirm this is the intended way to score the model.
model.compile(loss='categorical_crossentropy',
             optimizer='adam',
             metrics=METRICS)

# Take a look at the model summary
model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 500, 102)]        0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 500, 6)            3066      
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 250, 6)            0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 250, 6)            0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 250, 16)           496       
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, 125, 16)           0         
_________________________________________________________________
dropout_3 (Dropout)          (None, 125, 16)           0   

In [25]:
# Stop training once validation AUC has not improved for 10 consecutive epochs
# (mode='max': higher is better) and restore the weights of the best epoch.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_auc', 
    verbose=1,
    patience=10,
    mode='max',
    restore_best_weights=True)

In [None]:
from tensorflow.keras.utils import to_categorical

# Upper bound on epochs; the early-stopping callback may end training sooner.
EPOCHS = 100
# Labels are one-hot encoded to match the model's two-unit softmax output.
baseline_history = model.fit(
    X_train,
    to_categorical(y_train),
    epochs=EPOCHS,
    callbacks = [early_stopping],
    validation_data=(X_val, to_categorical(y_val)))

Train on 16795 samples, validate on 1867 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100

In [None]:
# Softmax outputs for the test set: one row per sample, one column per class.
YTest = model.predict(XTest)
YTest

In [None]:
# The model was trained on to_categorical(...) targets, which put class k in
# column k. The probability of Label == 1 is therefore column 1, not column 0
# (column 0 is P(Label == 0) and would invert the ranking of the submission).
df_test['Predicted'] = YTest[:, 1]
df_test

In [None]:
# Write the predictions to disk without the DataFrame index.
df_test.to_csv('test.csv', index=False)
