### Prepare input data

In [None]:
%%bash
mkdir -p ../input
mkdir -p ../output
cd ../input

export KAGGLE_USERNAME="fess38"
export KAGGLE_KEY="071966146ec1ebef62023a5efa0574b1"
kaggle competitions download -c jane-street-market-prediction

unzip jane-street-market-prediction.zip
rm jane-street-market-prediction.zip

### Imports

In [2]:
import warnings
warnings.filterwarnings("ignore")

import datetime
import json
import os
import pickle
import random
import sys

In [74]:
import numpy as np
import pandas as pd

from catboost import sum_models, CatBoostClassifier, CatBoostRegressor, Pool

from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split, GroupShuffleSplit, TimeSeriesSplit
from sklearn.preprocessing import StandardScaler

import tensorflow as tf
import tensorflow.keras.backend as K
import tensorflow.keras.layers as L
from tensorflow.keras.callbacks import Callback, ReduceLROnPlateau, ModelCheckpoint, EarlyStopping

In [4]:
%matplotlib inline

import matplotlib as mpl
import matplotlib.dates as mdates
import matplotlib.pyplot as plt

plt.style.use("seaborn")
mpl.rcParams["figure.figsize"] = (11, 5)
mpl.rcParams["figure.dpi"]= 100
mpl.rcParams["lines.linewidth"] = 0.75

### Init

In [5]:
input_data_path = "../input/"
output_data_path = "../output/"
features = ["feature_" + str(i) for i in range(130)]

In [6]:
def seed_all(seed):
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    pass

random_state = 42
seed_all(random_state)

### Tools

In [90]:
from numba import njit

@njit(fastmath=True)
def utility_score(date, weight, resp, action):
    pi = np.bincount(date, weight * resp * action)
    t = np.sum(pi) / np.sqrt(np.sum(pi**2)) * np.sqrt(250 / len(pi))
    return min(max(t, 0), 6) * np.sum(pi)

def apply_tf_model(df, model):
    return np.stack(model.predict(df), axis=1)[0]

def estimate_model(df, model, features=features, threshold=0):
    expected_score = utility_score(
        df["date"].values,
        df["weight"].values,
        df["resp"].values,
        df["action"].values
    )
    actual_score = utility_score(
        df["date"].values,
        df["weight"].values,
        df["resp"].values,
        apply_tf_model
        (apply_tf_model(df[features], model) > threshold).astype(int)
        if "tensorflow" in str(type(model))
        else (model.predict(df[features], prediction_type="RawFormulaVal") > threshold).astype(int)
    )
    print(int(expected_score), int(actual_score), round(actual_score / expected_score, 2))
    
def feature_importances(model, top_n=20):
    values = sorted(list(zip(model.feature_names_, model.feature_importances_)), key=lambda x: -x[1])
    for value in values[:top_n]:
        print(value[0], ": ", str(round(value[1], 2)))

### Read data

In [8]:
df = pd.read_csv(input_data_path + "train.csv")
df = df.astype({c: np.float32 for c in df.select_dtypes(include="float64").columns})
df["action"] = (df["resp"] > 0).astype(int)

features_info = pd.read_csv(input_data_path + "features.csv")
features_info.set_index(keys=["feature"], inplace=True)

#### Fill nan

In [9]:
def fillna_mean(df):
    features_mean = df[features].mean()
    df[features] = df[features].fillna(features_mean)
    with open(prepared_data_path + "features_mean.pkl", "wb") as f:
        pickle.dump(features_mean, f)

In [10]:
def fillna_ffill(df):
    df[features] = df[features].fillna(method = "ffill").fillna(0)

In [11]:
def fillna_mean_by_feature_0(df):
    features_mean = df[features].groupby("feature_0").mean()
    features_mean["feature_0"] = features_mean.index
    df.sort_values(by="feature_0", inplace=True)
    df[features] = pd.concat([
        df[df["feature_0"] == -1][features].fillna(features_mean.loc[-1]),
        df[df["feature_0"] == 1][features].fillna(features_mean.loc[1])
    ])
    df = df.sample(frac=1).reset_index(drop=True)
    with open(output_data_path + "features_mean.pkl", "wb") as f:
        pickle.dump(features_mean, f)

In [12]:
fillna_mean_by_feature_0(df)

## Train

In [13]:
catboost_models = {}
tf_models = {}

#### Split to 2-stage train and validation

In [14]:
date_splits = {
    "train_1": [0, 224],
    "train_2": [225, 449],
    "val": [450, 499]
}
for name, interval in date_splits.items():
    df["is_" + name] = df["date"].apply(lambda x: x >= interval[0] and x <= interval[1])    

#### Normalize data

In [15]:
scaler = StandardScaler()
scaler.fit(df[(df["is_train_1"])|(df["is_train_2"])][features])
df[features] = scaler.transform(df[features])

In [106]:
with open(output_data_path + "scaler.pkl", "wb") as f:
        pickle.dump(scaler, f)

#### Shuffle by dates

In [16]:
date_to_index = {}
dates = list(set(df["date"].values))
np.random.shuffle(dates)
for i, date in enumerate(dates):
    date_to_index[date] = i
df["order_id"] = df["date"].apply(lambda x: date_to_index[x])
df.sort_values(by=["order_id", "ts_id"], inplace=True)
df.reset_index(drop=True, inplace=True)

#### Catboost with random train/test split

In [17]:
train, test = train_test_split(df[df["is_train_1"]], test_size=0.2, random_state=random_state)

model = CatBoostClassifier(
    loss_function="Logloss",
    custom_metric=["Precision", "Recall", "F1"],
    iterations=1000,
    learning_rate=None,
    random_seed=random_state,
    l2_leaf_reg=3,
    use_best_model=True,
    depth=8,
    auto_class_weights="Balanced",
    od_type="Iter",
    od_wait=100,
    task_type="GPU" if tf.config.list_physical_devices("GPU") else "CPU",
    metric_period=250,
    verbose=True
)

model.fit(
    X=Pool(
        data=train[features],
        label=train["action"],
        weight=train["weight"]
    ),
    eval_set=Pool(
        data=test[features],
        label=test["action"],
        weight=test["weight"]
    )
)
estimate_model(df[df["is_train_1"]], model)
estimate_model(df[df["is_val"]], model)
estimate_model(df, model)
feature_importances(model, 5)
catboost_models["random split"] = model
del train, test

0:	learn: 0.6928620	test: 0.6930616	best: 0.6930616 (0)	total: 122ms	remaining: 2m 1s
250:	learn: 0.6451014	test: 0.6793427	best: 0.6793406 (249)	total: 3.6s	remaining: 10.7s
500:	learn: 0.6173234	test: 0.6731434	best: 0.6731434 (500)	total: 7.01s	remaining: 6.98s
750:	learn: 0.5944547	test: 0.6679814	best: 0.6679814 (750)	total: 10.4s	remaining: 3.46s
999:	learn: 0.5756203	test: 0.6635319	best: 0.6635319 (999)	total: 13.9s	remaining: 0us
bestTest = 0.6635319251
bestIteration = 999
95950 40883 0.43
20083 398 0.02
224162 44106 0.2
feature_51 :  10.11
feature_41 :  5.79
feature_43 :  3.13
feature_44 :  2.89
feature_45 :  2.84


#### Catboost with date train/test split

In [18]:
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=random_state)
for train_idx, test_idx in gss.split(X=df[df["is_train_1"]].values, groups=df[df["is_train_1"]]["order_id"].values):
    pass

model = CatBoostClassifier(
    loss_function="Logloss",
    custom_metric=["Precision", "Recall", "F1"],
    iterations=1000,
    learning_rate=None,
    random_seed=random_state,
    l2_leaf_reg=3,
    use_best_model=True,
    depth=8,
    auto_class_weights="Balanced",
    od_type="Iter",
    od_wait=100,
    task_type="GPU" if tf.config.list_physical_devices("GPU") else "CPU",
    metric_period=250,
    verbose=True
)

model.fit(
    X=Pool(
        data=df[df["is_train_1"]].iloc[train_idx][features],
        label=df[df["is_train_1"]].iloc[train_idx]["action"],
        weight=df[df["is_train_1"]].iloc[train_idx]["weight"],
        group_id=df[df["is_train_1"]].iloc[train_idx]["date"]
    ),
    eval_set=Pool(
        data=df[df["is_train_1"]].iloc[test_idx][features],
        label=df[df["is_train_1"]].iloc[test_idx]["action"],
        weight=df[df["is_train_1"]].iloc[test_idx]["weight"],
        group_id=df[df["is_train_1"]].iloc[test_idx]["date"]
    )
)
estimate_model(df[df["is_train_1"]], model)
estimate_model(df[df["is_val"]], model)
estimate_model(df, model)
feature_importances(model, 5)
catboost_models["group by date split"] = model
del train_idx, test_idx

0:	learn: 0.6928000	test: 0.6931739	best: 0.6931739 (0)	total: 18.4ms	remaining: 18.4s
bestTest = 0.6923080877
bestIteration = 32
Shrink model to first 33 iterations.
95950 14778 0.15
20083 278 0.01
224162 10909 0.05
feature_51 :  10.94
feature_41 :  8.54
feature_5 :  6.9
feature_42 :  5.37
feature_39 :  4.59


### MLP

In [65]:
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=random_state)
for train_idx, test_idx in gss.split(X=df[df["is_train_1"]].values, groups=df[df["is_train_1"]]["order_id"].values):
    pass

inp = L.Input(shape = (len(features),))
#x = L.BatchNormalization()(inp)
#x = L.Dropout(0.2)(x)
x = L.Dense(64)(inp)
x = L.Dropout(0.2)(x)
x = L.Dense(32)(x)
x = L.Dense(1)(x)
out = L.Activation("sigmoid")(x)

model = tf.keras.models.Model(inputs = inp, outputs = out)
model.compile(
    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-2),
    loss = tf.keras.losses.BinaryCrossentropy(), 
    metrics = tf.keras.metrics.AUC(name = "AUC")
)

model.fit(
    df[df["is_train_1"]].iloc[train_idx][features],
    df[df["is_train_1"]].iloc[train_idx]["action"],
    validation_data=(
        df[df["is_train_1"]].iloc[test_idx][features],
        df[df["is_train_1"]].iloc[test_idx]["action"]
    ),
    epochs=25, 
    batch_size=8*1024,
    callbacks=[],
    verbose=1
)

estimate_model(df[df["is_val"]], model)
tf_models["mlp"] = model
K.clear_session()
del train_idx, test_idx

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
20083 1 0.0


### Resulting model

In [104]:
extended_features = features[:]
counter = 1
for name, model in catboost_models.items():
    extended_features.append(name)
    df[name] = model.predict(df[features])
    model.save_model(output_data_path + "catboost_model_" + str(counter) + ".cbm")
    counter += 1
for name, model in tf_models.items():
    extended_features.append(name)
    df[name] = apply_tf_model(df[features], model)
    model.save(output_data_path + "tf_model_" + str(counter) + ".h5")
    counter += 1

In [105]:
model = CatBoostClassifier(
    loss_function="Logloss",
    custom_metric=["Precision", "Recall", "F1"],
    iterations=2000,
    learning_rate=None,
    random_seed=random_state,
    l2_leaf_reg=3,
    use_best_model=False,
    depth=8,
    auto_class_weights="Balanced",
    od_type="Iter",
    od_wait=100,
    task_type="GPU" if tf.config.list_physical_devices("GPU") else "CPU",
    metric_period=250,
    verbose=True
)

model.fit(
    X=Pool(
        data=df[df["is_train_2"]][extended_features],
        label=df[df["is_train_2"]]["action"],
        weight=df[df["is_train_2"]]["weight"],
        group_id=df[df["is_train_2"]]["date"]
    )
)
estimate_model(df[df["is_train_2"]], model, extended_features)
estimate_model(df[df["is_train_1"]], model, extended_features)
estimate_model(df[df["is_val"]], model, extended_features)
estimate_model(df, model, extended_features)
feature_importances(model, 5)
model.save_model(output_data_path + "model.cbm")