In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
#from pathlib import Path
#Path("../input/jane-street-market-prediction/js-model-v1").mkdir(parents=True, exist_ok=True)

In [None]:
# Show the kernel's working directory so the relative '../input/...' paths
# used further down can be checked against it.
import os
os.getcwd()

# Load In Data
* [Did Jane Street modify their trading model around day 85?](https://www.kaggle.com/c/jane-street-market-prediction/discussion/201930)

In [None]:
import pandas as pd

# Read the competition training set from a path relative to the working directory.
TRAIN_CSV = '../input/jane-street-market-prediction/train.csv'
train = pd.read_csv(TRAIN_CSV)
print('Load in data successful!')

In [None]:
import numpy as np

# Seed NumPy's global random state so repeated runs draw the same numbers.
SEED = 8
np.random.seed(SEED)

# Stage switches. MANUAL_VALIDATING and TRAINING gate the training cells below;
# TRAINING_PGTS is not read anywhere else in the visible notebook.
TRAINING_PGTS, MANUAL_VALIDATING, TRAINING = False, False, False

# Echo the switches so the run's configuration is visible in the output.
print(
    f'TRAINING_PGTS = {TRAINING_PGTS} \n'
    f'MANUAL_VALIDATING = {MANUAL_VALIDATING} \n'
    f'TRAINING = {TRAINING}'
)

In [None]:
# Switches selecting which previously saved weights are loaded by the cells
# below. LOAD_PGTS is not read anywhere else in the visible notebook.
LOAD_PGTS = False
LOAD_MANUAL_VALIDATING = True
LOAD_TRAINING = False

# Report the LOAD_* switches themselves; the labels and the interpolated
# values must refer to the same variables.
print(f'LOAD_PGTS = {LOAD_PGTS} \n\
LOAD_MANUAL_VALIDATING = {LOAD_MANUAL_VALIDATING} \n\
LOAD_TRAINING = {LOAD_TRAINING}')

# Preprocessing

In [None]:
# Keep only rows with date > 85, renumber them from zero, then drop every row
# whose weight is exactly zero (the index is not renumbered a second time).
train = (
    train.loc[train['date'] > 85]
    .reset_index(drop=True)
    .loc[lambda frame: frame['weight'] != 0]
)

#train['action'] = ((train['resp'].values) > 0).astype(int)

In [None]:
# Model inputs: every column whose name contains the substring 'feature'.
features = list(filter(lambda name: 'feature' in name, train.columns))

In [None]:
# Percentage of missing values per feature; print only those above 10%.
is_null = train[features].isna().mean().mul(100)
print(is_null.loc[is_null > 10])
# Author's note: the missing-value share of every feature was judged acceptable.

In [None]:
# Per-feature means, taken while the NaNs are still present (they are ignored).
# f_mean is a plain array that skips the first feature column; features_mean
# is a labelled Series covering every feature column.
f_mean = np.nanmean(train[features[1:]].to_numpy(), axis=0)
features_mean = train[features].mean()

In [None]:
# Impute every missing value with the mean of its own column.
train = train.fillna(train.mean())

In [None]:
# 5th and 95th percentile of each feature and the spread between them.
# (The names say Q1/Q3/IQR, but these are not the quartiles.)
Q1, Q3 = (train[features].quantile(level) for level in (0.05, 0.95))
IQR = Q3 - Q1

In [None]:
# Drop any row in which at least one feature lies more than 1 * IQR below Q1
# or more than 1 * IQR above Q3.
train = train[
    ~(
        train[features].lt(Q1 - 1 * IQR)
        | train[features].gt(Q3 + 1 * IQR)
    ).any(axis=1)
]

In [None]:
# Target columns; alternatives tried earlier were ['resp_3', 'resp_4', 'resp']
# and ['resp'] alone.
resp_cols = ['resp_1', 'resp_2', 'resp_3', 'resp_4', 'resp']

# One binary target per entry of resp_cols: 1 where the value is positive.
X_train = train[features]
y_train = train[resp_cols].gt(0).astype('int').to_numpy()
print(X_train.shape, y_train.shape)

In [None]:
if MANUAL_VALIDATING or TRAINING:
    # Chronological hold-out: rows up to day 416 train, later rows validate.
    # 416 = (499 - 85) * 0.8 + 85, i.e. an 80% / 20% split of the kept days.
    def _split_xy(rows):
        """Return (feature matrix, binary target matrix) for the given rows."""
        targets = np.stack([(rows[c] > 0).astype('int') for c in resp_cols]).T
        return rows.loc[:, features].values, targets

    X_train_v, y_train_v = _split_xy(train[train['date'] <= 416])
    X_test_v, y_test_v = _split_xy(train[train['date'] > 416])

    print(X_test_v.shape, y_test_v.shape)

# del train
print('Done data preprocessing!')

# Create Model

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Input, BatchNormalization, Dropout, Dense, Activation
# !pip install tensorflow_addons
import tensorflow_addons as tfa


def create_mlp(num_columns, num_labels, hidden_units,
               dropout_rates, label_smoothing, learning_rate):
    """Build and compile a multi-label MLP classifier.

    Architecture: input -> BatchNormalization -> Dropout, then for every entry
    of ``hidden_units`` a Dense -> BatchNormalization -> swish -> Dropout
    stack, and finally a Dense layer with ``num_labels`` sigmoid outputs.

    Args:
        num_columns: number of input features.
        num_labels: number of sigmoid outputs.
        hidden_units: sequence with the width of each hidden Dense layer.
        dropout_rates: sequence of dropout rates; entry 0 follows the input
            normalisation and entry ``i + 1`` follows hidden layer ``i``, so
            at least ``len(hidden_units) + 1`` entries are required.
        label_smoothing: smoothing passed to the binary cross-entropy loss.
        learning_rate: learning rate of the RectifiedAdam optimizer.

    Returns:
        The compiled Keras model (binary cross-entropy loss, AUC metric
        reported under the name ``'AUC'``).

    Raises:
        ValueError: if ``dropout_rates`` is too short for ``hidden_units``.
    """
    # Fail before any layer is created instead of with an IndexError half-way
    # through building the graph.
    required_rates = len(hidden_units) + 1
    if len(dropout_rates) < required_rates:
        raise ValueError(
            f'dropout_rates needs at least {required_rates} entries '
            f'(one for the input plus one per hidden layer), '
            f'got {len(dropout_rates)}'
        )

    inp = Input(shape=(num_columns,))
    x = BatchNormalization()(inp)
    x = Dropout(dropout_rates[0])(x)

    for units, rate in zip(hidden_units, dropout_rates[1:]):
        x = Dense(units)(x)
        x = BatchNormalization()(x)
        x = Activation(tf.keras.activations.swish)(x)
        x = Dropout(rate)(x)

    x = Dense(num_labels)(x)
    out = Activation('sigmoid')(x)

    model = tf.keras.models.Model(inputs=inp, outputs=out)
    model.compile(
        optimizer=tfa.optimizers.RectifiedAdam(learning_rate=learning_rate),
        loss=tf.keras.losses.BinaryCrossentropy(label_smoothing=label_smoothing),
        metrics=[tf.keras.metrics.AUC(name='AUC')],
    )

    return model

# Parameters Setting And Model Visualization

In [None]:
from keras.utils import plot_model

# Hyper-parameters shared by every model built in the cells below.
epochs = [50, 40] # PGTSCV folds all stopped bf. 40
batch_size = 2048  # 4096 and 8192 were also tried
hidden_units = [160, 240, 320, 320, 240]
# One rate for the input dropout plus one per hidden layer.
dropout_rates = [0.2, 0.2, 0.2, 0.2, 0.2, 0.2]
label_smoothing = 1e-2
learning_rate = 5e-3

# Render the architecture. The number of outputs follows resp_cols so the
# model always matches the width of the target matrix.
display(plot_model(create_mlp(
    len(features), len(resp_cols), hidden_units,
    dropout_rates, label_smoothing, learning_rate
)))

# Manually Validate / Model Load In

* [tf.keras.callbacks.EarlyStopping](https://www.tensorflow.org/api_docs/python/tf/keras/callbacks/EarlyStopping)

* [What is ROC_AUC Curve](https://medium.com/analytics-vidhya/what-is-roc-auc-curve-52d71b93fa2f) AUC (Area Under the Curve) is the area under the ROC plot. <br>Why AUC matters: when comparing the performance of two ML models, the one with the higher AUC is performing better. <br>AUC increases only when the ROC curve has points toward the top left, which means that for some threshold the model achieves a high TPR together with a low FPR.

* [Lookahead bug when loading saved model #1373](https://github.com/tensorflow/addons/issues/1373)

In [None]:
# Every model that is trained or loaded is collected in this list; the
# submission loop averages the predictions of all entries.
models = []


def _plot_history(history, train_key, val_key, title, ylabel):
    """Plot one training curve and its validation counterpart per epoch."""
    import matplotlib.pyplot as plt

    plt.plot(history.history[train_key])
    plt.plot(history.history[val_key])
    plt.title(title)
    plt.ylabel(ylabel)
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Test'], loc='upper right')
    plt.show()


if MANUAL_VALIDATING:

    # All three callbacks monitor val_loss, which only exists because
    # validation_data is passed to fit() below.
    er = tf.keras.callbacks.EarlyStopping(patience = 25,
                                          restore_best_weights = True,
                                          monitor = 'val_loss')
    ReduceLR = tf.keras.callbacks.ReduceLROnPlateau(monitor = 'val_loss',
                                                    factor = 0.1,
                                                    patience = 20,
                                                    verbose = 1,
                                                    mode = 'auto',
                                                    min_lr = 0.001)
    # Keep only the weights of the epoch with the lowest val_loss on disk.
    model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
        filepath = 'js_model_v1_weights.h5',
        save_weights_only = True,
        verbose = 0,
        monitor = 'val_loss',
        save_best_only = True)

    nn_callbacks = [er, ReduceLR, model_checkpoint_callback]

    clf = create_mlp(len(features), len(resp_cols), hidden_units,
                     dropout_rates, label_smoothing, learning_rate)
    history = clf.fit(
        X_train_v, y_train_v,
        epochs = epochs[0],
        batch_size = batch_size,
        verbose = 1,
        validation_data = (X_test_v, y_test_v),
        callbacks = nn_callbacks
    )

    # Learning curves: train vs. validation AUC, then train vs. validation loss.
    _plot_history(history, 'AUC', 'val_AUC', 'Model AUC', 'AUC')
    _plot_history(history, 'loss', 'val_loss', 'Model loss', 'Loss')

    clf.summary()
    models.append(clf)

if LOAD_MANUAL_VALIDATING:
    # Rebuild the same architecture and restore previously saved weights.
    clf = create_mlp(len(features), len(resp_cols), hidden_units,
                     dropout_rates, label_smoothing, learning_rate)
    clf.load_weights('../input/jsnnmodelsv2/js_model_v1_weights.h5')

    print('Loading succeeded!\n')

    clf.summary()

    models.append(clf)



In [None]:
if TRAINING:
    # Start from a clean Keras state and seed TensorFlow for repeatability.
    tf.keras.backend.clear_session()
    tf.random.set_seed(SEED)
    clf = create_mlp(len(features), len(resp_cols), hidden_units,
                     dropout_rates, label_smoothing, learning_rate)
    # Runs for the full epochs[0] epochs: no callbacks are passed, so there is
    # no early stopping or learning-rate schedule in this branch.
    clf.fit(X_train_v, y_train_v, epochs=epochs[0],
            batch_size=batch_size, verbose=1,
            validation_data=(X_test_v, y_test_v))

    clf.save_weights('js_model_v2_weights.h5')
    print('Training succeeded!\n')
    clf.summary()
    models.append(clf)

if LOAD_TRAINING:
    # Rebuild the same architecture and restore previously saved weights.
    clf = create_mlp(len(features), len(resp_cols), hidden_units,
                     dropout_rates, label_smoothing, learning_rate)
    clf.load_weights('../input/jsnnmodelsv2/js_model_v2_weights.h5')

    print('Loading succeeded!\n')
    clf.summary()
    models.append(clf)
  



# Submit

* [20210204 | Speed up your prediction](https://www.kaggle.com/tocha4/20210204-speed-up-your-prediction): the approach from this notebook is adopted below to make prediction faster.

In [None]:
# Decision threshold: the submission loop sets action to 1 when the
# aggregated prediction is >= th.
th = 0.5 # 0.502
# Reducer applied to the averaged model output in the submission loop
# (median here; np.amax is a commented-out alternative).
f = np.median
#f = np.amax
#models = models[-1] # :, -4 (timeout error)
#models = models[0:4]

# make_env() returns the environment whose iter_test() / predict() methods
# drive the submission loop in the next cell.
import janestreet
env = janestreet.make_env()

In [None]:
# tqdm wraps the test iterator below but was never imported in this notebook.
from tqdm.auto import tqdm

# Assumed column order of the frames yielded by env.iter_test(); it is used to
# translate feature names into positional indices once, outside the loop.
test_df_columns = ['weight'] + [f'feature_{i}' for i in range(130)] + ['date']
index_features = [n for n, col in enumerate(test_df_columns) if col in features]


for (test_df, pred_df) in tqdm(env.iter_test()):
    if test_df['weight'].values[0] > 0:
        # Positional lookup on the raw row instead of test_df.loc[:, features].
        x_tt = test_df.values[0][index_features].reshape(1, -1)

        # Replace NaNs with the training means. Column 0 is skipped because
        # f_mean was computed over features[1:].
        if np.isnan(x_tt[:, 1:].sum()):
            x_tt[:, 1:] = np.nan_to_num(x_tt[:, 1:]) + np.isnan(x_tt[:, 1:]) * f_mean

        # Average the models' outputs, reduce them to one score with f, then
        # threshold that score into a 0/1 action.
        pred = np.mean([model(x_tt, training=False).numpy() for model in models], axis=0)
        pred = f(pred)

        pred_df['action'].values[0] = int(pred >= th)
    else:
        # Rows with non-positive weight are not sent through the models.
        pred_df['action'].values[0] = 0

    env.predict(pred_df)

# References

* [OWN Jane Street with Keras NN](https://www.kaggle.com/tarlannazarov/own-jane-street-with-keras-nn)

* [Jane Street with Keras NN overfit](https://www.kaggle.com/code1110/jane-street-with-keras-nn-overfit)

* [【中文思路】Try to use NN baseline](https://www.kaggle.com/chixujohnny/try-to-use-nn-baseline)

* [Purged Time Series CV, XGBoost, Optuna 🔪📆](https://www.kaggle.com/marketneutral/purged-time-series-cv-xgboost-optuna#Time-Series-Cross-Validation)

* [[JaneStreet\] MLP inference (stage3)](https://www.kaggle.com/code1110/janestreet-mlp-inference-stage3)

* [Pytorch Resnet Starter[Training]🔥🔥🔥](https://www.kaggle.com/a763337092/pytorch-resnet-starter-training/comments)