In [18]:
pip install datatable

Note: you may need to restart the kernel to use updated packages.


In [19]:
import os
from warnings import filterwarnings

# TensorFlow's native logger picks up this variable the first time it logs, which can
# already happen during `import tensorflow` -- so it has to be set before the import.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datatable as dt
import tensorflow as tf
from tensorflow import keras

# silence Python-level warnings to keep the notebook output readable
filterwarnings('ignore')

In [20]:
### set seeds
# single seed value shared by the random generators used in this notebook
my_seed = 1

# seed NumPy's and TensorFlow's global random generators with the same value
for seed_rng in (np.random.seed, tf.random.set_seed):
    seed_rng(my_seed)

## <span style="background:#818181;padding:0.3em;width:100%;display:block;border-radius:0.1em;color:white;font-family:Monospace">Introduction</span>

<div style="font-size: 1em; font-family: Verdana">
    <b>Hi,</b><br><br>
    I just wanted to share my baseline-model with you guys.<br>
    I've only recently started getting into deep learning and have been reading up on the basics.<br>
    That is why I decided to use this month's competition to get some practice with neural networks.<br><br>
    Also make sure to check out my EDA for TPS November 2021 <a href="https://www.kaggle.com/mlanhenke/tps-11-simple-basic-eda">here</a>. <br><br>
    <em>If you like this notebook or copy any part of it, please make sure to leave an upvote...</em><br><br>
    <em><b>Thanks for taking some time to stop by and read my notebook!</b></em>
</div>



## <span style="background:#818181;padding:0.3em;width:100%;display:block;border-radius:0.1em;color:white;font-family:Monospace">Import Data & Pre-Processing</span>

In [21]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

### load dataframes
# datatable reads the CSVs; each frame is converted to pandas right away
df_train, df_test = (
    dt.fread(csv_name).to_pandas() for csv_name in ('train.csv', 'test.csv')
)

sample_submission = pd.read_csv('sample_submission.csv')

### split into X, y
# 'id' carries no signal and 'target' is the label, so neither is a feature
X = df_train.drop(columns=['id', 'target']).copy()
y = df_train['target'].copy()

X_test = df_test.drop(columns='id').copy()

### standardize data
# MinMaxScaler() / RobustScaler() are imported above as alternatives to try
scaler = StandardScaler()

# fit the scaler on the training features, then reuse it for the test features
feature_names = X.columns
X = pd.DataFrame(scaler.fit_transform(X), columns=feature_names)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

## <span style="background:#818181;padding:0.3em;width:100%;display:block;border-radius:0.1em;color:white;font-family:Monospace">Modeling</span>

In [22]:
### check gpu before training
# list the GPUs TensorFlow can see and report how many there are
gpu_devices = tf.config.list_physical_devices('GPU')
print("Tensorflow Number of GPUs Available: ", len(gpu_devices))

Tensorflow Number of GPUs Available:  0


In [23]:
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

### define callbacks
# both callbacks watch the validation loss, where lower is better
monitored = dict(monitor='val_loss', mode='min')

# stop after 20 epochs without improvement and restore the best weights seen
early_stopping = EarlyStopping(
    **monitored,
    min_delta=0,
    patience=20,
    verbose=0,
    baseline=None,
    restore_best_weights=True,
)

# scale the learning rate by 0.2 after 5 epochs without improvement
reduce_lr = ReduceLROnPlateau(**monitored, factor=0.2, patience=5)

In [24]:
from tensorflow.keras import Sequential, Model
from tensorflow.keras.regularizers import l2
from tensorflow.keras.layers import Dense, Flatten, InputLayer, Dropout, Input

### create baseline-model
def get_model(n_features=None):
    """Build and compile the baseline feed-forward binary classifier.

    Architecture: three swish-activated Dense layers (128 -> 64 -> 32 units), each
    followed by Dropout(0.25), and a single sigmoid output unit.

    Parameters
    ----------
    n_features : int, optional
        Number of input features. Defaults to the column count of the
        notebook-level training frame ``X``.

    Returns
    -------
    tensorflow.keras.Model
        Model compiled with Adam (learning rate 1e-3), binary cross-entropy with
        label smoothing 1e-3, and an AUC metric explicitly named ``auc`` so the
        training history exposes it as ``auc`` / ``val_auc``.
    """
    if n_features is None:
        n_features = X.shape[1]

    # pass the shape as a tuple; a bare int is not accepted by newer Keras versions
    inp = Input(shape=(n_features,), name='input')
    h = Dense(128, activation='swish')(inp)
    h = Dropout(0.25)(h)
    h = Dense(64, activation='swish')(h)
    h = Dropout(0.25)(h)
    h = Dense(32, activation='swish')(h)
    h = Dropout(0.25)(h)
    h = Dense(1, activation='sigmoid')(h)

    model = Model(inputs=inp, outputs=h)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
        loss=tf.keras.losses.BinaryCrossentropy(label_smoothing=1e-3),
        # The string shortcut 'AUC' creates an unnamed metric that Keras auto-names
        # ('auc', 'auc_1', ...) for every new model; pinning the name keeps the
        # history keys stable when the function is called once per fold.
        metrics=[tf.keras.metrics.AUC(name='auc')]
    )
    return model

In [25]:
# display the standardized training features as a rich DataFrame preview
X

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f90,f91,f92,f93,f94,f95,f96,f97,f98,f99
0,-0.382553,0.705772,-0.315075,0.347277,-0.229657,-0.875660,0.660314,-0.197064,-0.286162,-0.289270,...,-0.537157,-0.872508,-0.258806,-0.595537,-0.199502,-0.196145,1.067358,-0.400887,-0.167145,0.443374
1,-0.347377,-0.530387,-0.417061,0.472862,-0.187909,1.623543,-0.910506,-1.963980,1.309644,-0.229122,...,0.573313,0.658473,-0.252018,0.548089,0.019765,2.392938,-1.806811,-0.008064,-0.412110,-0.371198
2,-0.517136,-0.643571,-0.132486,-0.293650,-0.361533,0.364863,1.507175,0.824771,-0.480372,-0.183401,...,0.408845,1.580886,-0.127714,-0.226174,-0.062423,-0.516946,-1.968603,-0.294434,-0.078904,0.094984
3,-0.613619,-1.448884,0.857867,-0.490286,-0.409357,-0.656445,-0.001055,1.255833,1.281843,-0.025780,...,-0.768719,0.667692,-0.314304,-0.565262,-0.179472,-0.311897,-1.306572,-0.407556,-0.272505,-0.295118
4,-0.592913,0.783666,-0.272802,-0.323841,-0.382205,-0.293270,0.930480,-1.684456,0.543499,-0.237512,...,-0.510155,-0.346112,-0.361423,0.386926,-0.042986,-0.329511,-0.965117,-0.993613,-0.393636,-0.189697
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
599995,-0.504355,-0.551500,1.937940,-1.010581,1.585028,-0.552446,0.731375,-0.944482,-0.673772,-0.371060,...,-0.219391,-0.578403,-0.136659,-0.349704,0.173589,-0.158944,1.211901,-0.557888,-0.251825,-0.216101
599996,-0.324356,1.498871,-0.736379,-1.076453,-0.335670,0.477988,-0.145691,2.175439,-1.084892,-0.341576,...,0.337680,-0.712645,-0.114522,0.188558,0.192870,-0.310986,0.850947,-0.472142,0.007868,-0.141134
599997,1.932651,-1.284110,-0.435725,-0.268614,-0.282193,0.778789,-0.028331,0.377122,-0.967558,-0.258913,...,0.407256,-0.170709,-0.223473,0.420855,-0.254249,-0.212734,-1.880861,-1.093736,-0.169690,-0.265813
599998,0.393168,0.855770,0.854070,-1.529262,0.819921,0.131849,-0.113459,-0.652033,-0.369953,0.776531,...,0.010571,-1.347007,-0.153752,0.100857,-0.396098,-0.323202,-1.713743,0.339410,-0.223636,-0.448741


In [None]:
from sklearn.model_selection import StratifiedKFold

EPOCHS = 100
BATCH_SIZE = 1024
VERBOSE = 1
N_SPLITS = 10

### cross-validation 
cv = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=my_seed)

# per-fold Keras history dicts and per-fold test-set predictions
scores = {fold:None for fold in range(cv.n_splits)}
predictions = []

for fold, (idx_train, idx_valid) in enumerate(cv.split(X, y)):
    X_train, y_train = X.iloc[idx_train], y.iloc[idx_train]
    X_valid, y_valid = X.iloc[idx_valid], y.iloc[idx_valid]

    # Reset Keras' global state before building a new model: this releases the
    # previous fold's graph and restarts the automatic layer/metric name counters,
    # so the history keys ('auc', 'val_auc', ...) are the same in every fold.
    tf.keras.backend.clear_session()
    model = get_model()

    print('**'*20)
    print(f"Fold {fold+1} || Training")
    print('**'*20)
    
    history = model.fit(
        X_train, y_train,
        validation_data=(X_valid, y_valid),
        batch_size=BATCH_SIZE,
        epochs=EPOCHS,
        verbose=VERBOSE,
        callbacks=[
            early_stopping,
            reduce_lr
        ]
    )
    
    scores[fold] = history.history
    
    # NOTE: this is the best AUC over all epochs; the weights used for prediction
    # are the ones early_stopping restores (best val_loss epoch).
    print(f"Fold {fold+1} || Max Validation AUC: {np.max(scores[fold]['val_auc'])}")
    
    # flatten the (n_samples, 1) model output into a 1-D vector of probabilities
    prediction = model.predict(X_test, batch_size=BATCH_SIZE).ravel()
    predictions.append(prediction)

print('**'*20)
print('Finished Training')
print('**'*20)

# average the per-fold best validation AUCs
overall_auc = [np.max(scores[fold]['val_auc']) for fold in range(cv.n_splits)]
print('Overall Mean AUC: ', np.mean(overall_auc))

****************************************
Fold 1 || Training
****************************************
Epoch 1/100
Epoch 2/100

## <span style="background:#818181;padding:0.3em;width:100%;display:block;border-radius:0.1em;color:white;font-family:Monospace">Evaluation</span>

In [None]:
### plot train and valid loss over number of epochs
# Derive the panel grid from the number of folds (at most 5 panels per row) instead
# of hard-coding 2 x 5; with 10 folds this yields the same 2 x 5, 20 x 5 inch figure.
n_folds = cv.n_splits
n_cols = min(n_folds, 5)
n_rows = -(-n_folds // n_cols)  # ceiling division

fig, ax = plt.subplots(
    n_rows, n_cols,
    tight_layout=True,
    figsize=(4 * n_cols, 2.5 * n_rows),
    squeeze=False,  # always return a 2-D array so flatten() works for any grid
)
ax = ax.flatten()

for fold in range(n_folds):
    df_eval = pd.DataFrame({'train_loss': scores[fold]['loss'], 'valid_loss': scores[fold]['val_loss']})

    # best (lowest) losses of the fold and the gap between them
    min_train = np.round(np.min(df_eval['train_loss']),5)
    min_valid = np.round(np.min(df_eval['valid_loss']),5)
    delta = np.round(min_valid - min_train,5)
    
    # draw both loss curves on the fold's panel; x is the epoch index
    for loss_column in ('train_loss', 'valid_loss'):
        sns.lineplot(
            x=df_eval.index,
            y=df_eval[loss_column],
            label=loss_column,
            ax = ax[fold]
        )
    
    ax[fold].set_ylabel('')
    ax[fold].set_xlabel(f"Fold {fold+1}\nmin_train: {min_train}\nmin_valid: {min_valid}\ndelta: {delta}", fontstyle='italic')

# hide panels that have no fold to show (only when n_folds is not a multiple of n_cols)
for unused_ax in ax[n_folds:]:
    unused_ax.set_visible(False)

sns.despine()

## <span style="background:#818181;padding:0.3em;width:100%;display:block;border-radius:0.1em;color:white;font-family:Monospace">Submission</span>

In [None]:
### average predictions over each fold and create submission file
# one column per fold, one row per test sample
fold_matrix = np.column_stack(predictions)

# the row-wise mean across folds is the final predicted probability
sample_submission['target'] = fold_matrix.mean(axis=1)
sample_submission.to_csv('./nn_baseline.csv', index=False)