### Prepare input data

In [None]:
%%bash
mkdir -p ../input
mkdir -p ../output
cd ../input

export KAGGLE_USERNAME="fess38"
export KAGGLE_KEY="071966146ec1ebef62023a5efa0574b1"
kaggle competitions download -c jane-street-market-prediction

unzip jane-street-market-prediction.zip
rm jane-street-market-prediction.zip

### Imports

In [1]:
import warnings
warnings.filterwarnings("ignore")

import datetime
import json
import os
import pickle
import random
import sys
import time

In [119]:
import numpy as np
import pandas as pd

from catboost import sum_models, CatBoostClassifier, CatBoostRegressor, Pool
from catboost.utils import get_gpu_device_count

from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split, ParameterGrid, ShuffleSplit
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.utils import class_weight

import tensorflow as tf
import tensorflow.keras as K
import tensorflow.keras.layers as L
from tensorflow.keras.models import Model

import kerastuner as kt

import datatable as dt
from tqdm import tqdm

In [3]:
%matplotlib inline

import matplotlib as mpl
import matplotlib.dates as mdates
import matplotlib.pyplot as plt

plt.style.use("seaborn")
mpl.rcParams["figure.figsize"] = (11, 5)
mpl.rcParams["figure.dpi"]= 100
mpl.rcParams["lines.linewidth"] = 0.75

### Init

In [4]:
input_data_path = "../input/"
output_data_path = "../output/"
features = ["feature_" + str(i) for i in range(130)]

In [5]:
np.set_printoptions(precision=4)
np.set_printoptions(suppress=True)

random_state = 42
random.seed(random_state)
np.random.seed(random_state)
tf.random.set_seed(random_state)
os.environ["PYTHONHASHSEED"] = str(random_state)

### Tools

In [6]:
from numba import njit

@njit(fastmath=True)
def utility_score(date, weight, resp, action):
    pi = np.bincount(date, weight * resp * action)
    t = np.sum(pi) / np.sqrt(np.sum(pi**2)) * np.sqrt(250 / len(pi))
    return int(min(max(t, 0), 6) * np.sum(pi))

def split_df(df, date_splits):
    for name, interval in date_splits.items():
        df["is_" + name] = df["date"].apply(lambda x: x >= interval[0] and x <= interval[1])

#### Catboost

In [77]:
def feature_importances(model, top_n=20):
    values = sorted(list(zip(model.feature_names_, model.feature_importances_)), key=lambda x: -x[1])
    for value in values[:top_n]:
        print(value[0], ": ", str(round(value[1], 2)))

def estimate_model(df, model, features=features, threshold=0, print_result=True):
    expected_score = utility_score(
        df["date"].values,
        df["weight"].values,
        df["resp"].values,
        df["action"].values
    )
    actual_score = utility_score(
        df["date"].values,
        df["weight"].values,
        df["resp"].values,
        (
            (
                np.stack(model(df[features].values, training=False).numpy(), axis=1)[0]
                if "tensorflow" in str(type(model))
                else model.predict(df[features], prediction_type="RawFormulaVal")
            ) > threshold
        ).astype(int)
        
    )
    share = round(actual_score / expected_score, 2)
    if print_result:
        print(expected_score, actual_score, share)
    return actual_score

### Read data

In [8]:
%%time
df = dt.fread(input_data_path + "train.csv").to_pandas()
df = df.astype({c: np.float32 for c in df.select_dtypes(include="float64").columns})
df["action"] = (df["resp"] > 0).astype(int)
df = df.sample(frac=1).reset_index(drop=True)

features_info = pd.read_csv(input_data_path + "features.csv")
features_info.set_index(keys=["feature"], inplace=True)

CPU times: user 34.3 s, sys: 18 s, total: 52.4 s
Wall time: 3min 4s


#### Fill nan

In [9]:
def fillna_mean(df):
    features_mean = df[features].mean()
    df[features] = df[features].fillna(features_mean)
    with open(output_data_path + "features_mean.pkl", "wb") as f:
        pickle.dump(features_mean, f)

In [10]:
def fillna_ffill(df):
    df[features] = df[features].fillna(method = "ffill").fillna(0)

In [11]:
def fillna_mean_by_feature_0(df):
    features_mean = df[features].groupby("feature_0").mean()
    features_mean["feature_0"] = features_mean.index
    df.sort_values(by="feature_0", inplace=True)
    df[features] = pd.concat([
        df[df["feature_0"] == -1][features].fillna(features_mean.loc[-1]),
        df[df["feature_0"] == 1][features].fillna(features_mean.loc[1])
    ])
    df = df.sample(frac=1).reset_index(drop=True)
    with open(output_data_path + "features_mean.pkl", "wb") as f:
        pickle.dump(features_mean, f)

In [12]:
fillna_mean_by_feature_0(df)

## Train

#### Split

In [88]:
dates = list(set(df["date"].values))
split = ShuffleSplit(n_splits=5, train_size=0.6, test_size=0.2, random_state=random_state)
counter = 1
for train_dates, test_dates in split.split(dates):
    train_dates = set(train_dates)
    test_dates = set(test_dates)
    df["is_train_" + str(counter)] = df["date"].apply(lambda x: x in train_dates)
    df["is_test_" + str(counter)] = df["date"].apply(lambda x: x in test_dates)
    df["is_val_" + str(counter)] = df["date"].apply(lambda x: x not in train_dates and x not in test_dates)
    counter += 1

### End2End Catboost model

In [None]:
date_splits = {
    "train": [0, 449],
    "val": [450, 499]
}
split_df(df, date_splits)

#### Grid search

In [None]:
params_grid = ParameterGrid({
    "iterations": [2000],
    "learning_rate": [0.001],
    "l2_leaf_reg": [3],
    "depth": [16],
    "random_strength": [1],
    "bagging_temperature": [1],
    "border_count": [128],
    "grow_policy": ["SymmetricTree", "Depthwise", "Lossguide"],
    "use_weight": [0],
    "use_group_id": [0]
})
params_grid = sorted(list(params_grid), key=lambda x: x["use_group_id"])

grid_search_result = []
dates = list(set(df[df["is_train"]]["date"].values))
sorted_by_dates, sorted_randomly = False, False

for params in tqdm(params_grid, desc="Params Tuning"):
    scores = []
    if params["use_group_id"] and not sorted_by_dates:
        df.sort_values(by=["order_id", "rnd"], inplace=True)
        df.reset_index(drop=True, inplace=True)
        sorted_by_dates = True
        sorted_randomly = False
    if not params["use_group_id"] and not sorted_randomly:
        df.sort_values(by=["rnd"], inplace=True)
        df.reset_index(drop=True, inplace=True)
        sorted_by_dates = False
        sorted_randomly = True

    for i in range(3):
        train_dates, test_dates = train_test_split(dates, test_size=0.2, random_state=random_state+i)    
        model = CatBoostClassifier(
            loss_function="Logloss",
            iterations=params["iterations"],
            learning_rate=params["learning_rate"],
            random_seed=random_state,
            l2_leaf_reg=params["l2_leaf_reg"],
            use_best_model=True,
            depth=params["depth"],
            random_strength=params["random_strength"],
            bagging_temperature=params["bagging_temperature"],
            border_count=params["border_count"],
            grow_policy=params["grow_policy"],
            auto_class_weights="Balanced",
            early_stopping_rounds=100,
            task_type="GPU" if get_gpu_device_count() else "CPU",
            verbose=False
        )
        
        model.fit(
            X=Pool(
                data=df[(df["is_train"]) & (df["date"].isin(train_dates))][features],
                label=df[(df["is_train"]) & (df["date"].isin(train_dates))]["action"],
                weight=
                    df[(df["is_train"]) & (df["date"].isin(train_dates))]["weight"]
                    if params["use_weight"] else None,
                group_id=
                    df[(df["is_train"]) & (df["date"].isin(train_dates))]["date"]
                    if params["use_group_id"] else None
            ),
            eval_set=Pool(
                data=df[(df["is_train"]) & (df["date"].isin(test_dates))][features],
                label=df[(df["is_train"]) & (df["date"].isin(test_dates))]["action"],
                weight=
                    df[(df["is_train"]) & (df["date"].isin(test_dates))]["weight"]
                    if params["use_weight"] else None,
                group_id=
                    df[(df["is_train"]) & (df["date"].isin(test_dates))]["date"]
                    if params["use_group_id"] else None
            )
        )
        scores.append(estimate_model(df[df["is_val"]], model, print_result=False))
        pass
    grid_search_result.append({
        "params": params,
        "score": sum(scores) / len(scores),
        "best_iteration": model.best_iteration_,
        "best_score": model.best_score_
    })
    grid_search_result = sorted(grid_search_result, key=lambda x: -x["score"])
    with open(output_data_path + "grid_search_result.json", "w") as f:
        f.write(json.dumps(grid_search_result, indent=2))
    pass

#### Use best params

In [None]:
params = {
    "iterations": 60,
    "learning_rate": 0.03,
    "l2_leaf_reg": 3,
    "depth": 12,
    "random_strength": 1,
    "bagging_temperature": 1,
    "border_count": 128,
    "grow_policy": "SymmetricTree",
    "use_weight": 0,
    "use_group_id": 1
}

df.sort_values(
    by=["order_id", "rnd"] if params["use_group_id"] else ["rnd"],
    inplace=True
)
df.reset_index(drop=True, inplace=True)

model = CatBoostClassifier(
    loss_function="Logloss",
    iterations=params["iterations"],
    learning_rate=params["learning_rate"],
    random_seed=random_state,
    l2_leaf_reg=params["l2_leaf_reg"],
    depth=params["depth"],
    random_strength=params["random_strength"],
    bagging_temperature=params["bagging_temperature"],
    border_count=params["border_count"],
    grow_policy=params["grow_policy"],
    auto_class_weights="Balanced",
    task_type="GPU" if get_gpu_device_count() else "CPU",
    verbose=False
)

model.fit(
    X=Pool(
        data=df[features],
        label=df["action"],
        weight=df["weight"] if params["use_weight"] else None,
        group_id=df["date"] if params["use_group_id"] else None
    )
)

estimate_model(df[df["is_train"]], model)
estimate_model(df[df["is_val"]], model)
estimate_model(df, model)
model.save_model(output_data_path + "model.cbm")

#### Params analysis

In [None]:
with open(output_data_path + "grid_search_result.json", "r") as f:
    grid_search_results = json.loads(f.read())
data = {}
param = "use_group_id"
for grid_search_result in grid_search_results:
    params = grid_search_result["params"]
    data[params[param]] = data.get(params[param], []) + [grid_search_result["score"]]
for key, values in data.items():
    values = np.array(values)
    print(key, int(np.mean(values)), int(np.median(values)), int(max(values)))

### Keras autoencoder

In [None]:
scaler = StandardScaler()
scaler.fit(df[features])
df[features] = scaler.transform(df[features])

In [125]:
def create_autoencoder(encoding_dim):    
    def apply_bn_and_dropout(x):
        return L.Dropout(0.2)(L.BatchNormalization()(x))
    
    inp = L.Input(len(features))
    x = L.BatchNormalization()(inp)
    x = L.GaussianNoise(0.1)(x)
    x = L.Dense(encoding_dim, activation="relu")(x)
    x = apply_bn_and_dropout(x)
    x = L.Dense(encoding_dim, activation="relu")(x)
    x = apply_bn_and_dropout(x)
    encoded = L.Dense(encoding_dim, activation="relu")(x)
    
    input_encoded = L.Input(encoding_dim)
    x = L.Dense(encoding_dim, activation="relu")(input_encoded)
    x = apply_bn_and_dropout(x)
    x = L.Dense(encoding_dim, activation="relu")(x)
    x = apply_bn_and_dropout(x)
    decoded = L.Dense(len(features), activation="linear")(x)

    encoder = Model(inp, encoded, name="encoder_" + str(encoding_dim))
    decoder = Model(input_encoded, decoded, name="decoder")
    autoencoder = Model(inp, decoder(encoder(inp)), name="autoencoder")
    return encoder, autoencoder

In [126]:
encoders = []
K.backend.clear_session()
for encode_dim in [96, 64, 32]: 
    encoder, autoencoder = create_autoencoder(encode_dim)
    autoencoder.compile(optimizer=K.optimizers.RMSprop(1e-4), loss="mse")
    
    autoencoder.fit(
        df[features],
        df[features],
        batch_size=1024*4,
        epochs=100,
        verbose=0,
        callbacks=[K.callbacks.EarlyStopping(patience=10, restore_best_weights=True)],
        validation_split=0.2,
        shuffle=True
    )
    encoder.save(output_data_path + "encoder_{}.h5".format(encode_dim))
    encoders.append(encoder)

In [None]:
new_features = encoder(df[features].values, training=False).numpy()
size = len(new_features[0])
new_columns = ["enc_features_{}_{}".format(size, i) for i in range(size)]
df[new_columns] = pd.DataFrame(new_features, index=df.index)
extended_features = features[:] + new_columns
del new_features, new_columns

In [132]:
encoders = [
    K.models.load_model(output_data_path + "encoder_96.h5"),
    K.models.load_model(output_data_path + "encoder_64.h5"),
    K.models.load_model(output_data_path + "encoder_32.h5")
]
for encoder in encoders:
    encoder.trainable = False

In [135]:
def mlp(encoders):
    def apply_bn_and_dropout(x):
        return L.Dropout(0.2)(L.BatchNormalization()(x))
    
    inp = L.Input(len(features))
    x = L.Concatenate()( [inp] + [encoder(inp) for encoder in encoders])
    x = L.BatchNormalization()(x)
    x = L.Dense(128, activation="relu")(x)
    x = apply_bn_and_dropout(x)
    x = L.Dense(128, activation="relu")(x)
    x = apply_bn_and_dropout(x)
    x = L.Dense(128, activation="relu")(x)
    x = apply_bn_and_dropout(x)
    x = L.Dense(1)(x)
    output = L.Activation("sigmoid")(x)

    return Model(inputs=inp, outputs=output, name="Model")

In [136]:
K.backend.clear_session()
model = mlp(encoders)
model.compile(
    optimizer=K.optimizers.Adam(1e-4),
    loss="binary_crossentropy", 
    metrics=[
        tf.keras.metrics.Precision(),
        tf.keras.metrics.Recall()
    ]
)
model.summary()

Model: "Model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 130)]        0                                            
__________________________________________________________________________________________________
encoder_96 (Functional)         (None, 96)           32488       input_1[0][0]                    
__________________________________________________________________________________________________
encoder_64 (Functional)         (None, 64)           17736       input_1[0][0]                    
__________________________________________________________________________________________________
encoder_32 (Functional)         (None, 32)           7080        input_1[0][0]                    
______________________________________________________________________________________________

In [137]:
weights = class_weight.compute_class_weight(
    "balanced",
    np.unique(df[df["is_train_1"]]["action"]),
    df[df["is_train_1"]]["action"]
)
model.fit(
    df[df["is_train_1"]][features],
    df[df["is_train_1"]]["action"],
    batch_size=1024*8,
    epochs=100,
    callbacks=[K.callbacks.EarlyStopping(patience=10, restore_best_weights=True)],
    validation_data=(
        df[df["is_test_1"]][features],
        df[df["is_test_1"]]["action"]
    ),
    class_weight={
        0: weights[0],
        1: weights[1]
    }
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100


<tensorflow.python.keras.callbacks.History at 0x7fe2b3eea1d0>

In [138]:
estimate_model(df[df["is_val_1"]], model, threshold=0.50)
estimate_model(df[df["is_val_2"]], model, threshold=0.50)
estimate_model(df[df["is_val_3"]], model, threshold=0.50)
estimate_model(df[df["is_val_4"]], model, threshold=0.50)
estimate_model(df[df["is_val_5"]], model, threshold=0.50)
estimate_model(df, model, threshold=0.50)

46303 2096 0.05
47788 1535 0.03
40814 1142 0.03
42194 1776 0.04
45483 903 0.02
224162 13248 0.06


13248

In [139]:
model.save(output_data_path + "model.h5")

### Other

In [None]:
train, test = train_test_split(df[df["is_train"]], test_size=0.2, random_state=random_state)

model = CatBoostClassifier(
    loss_function="Logloss",
    custom_metric=["Precision", "Recall", "F1"],
    iterations=1000,
    learning_rate=None,
    random_seed=random_state,
    l2_leaf_reg=3,
    use_best_model=True,
    depth=8,
    auto_class_weights="Balanced",
    od_type="Iter",
    od_wait=100,
    task_type="GPU" if get_gpu_device_count() else "CPU",
    metric_period=250,
    verbose=True
)

model.fit(
    X=Pool(
        data=pd.concat([train[features].reset_index(drop=True), pd.DataFrame(encoder(train[features].values, training=False).numpy()).reset_index(drop=True)], axis=1),
        label=train["action"].values,
        weight=train["weight"].values
    ),
    eval_set=Pool(
        data=pd.concat([test[features].reset_index(drop=True), pd.DataFrame(encoder(test[features].values, training=False).numpy()).reset_index(drop=True)], axis=1),
        label=test["action"].values,
        weight=test["weight"].values
    )
)
estimate_model(df[df["is_train"]], model)
estimate_model(df[df["is_val"]], model)
estimate_model(df, model)
del train, test

In [None]:
utility_score(
    df[df["is_val"]]["date"].values,
    df[df["is_val"]]["weight"].values,
    df[df["is_val"]]["resp"].values,
    (model.predict(
        pd.concat([df[df["is_val"]][features].reset_index(drop=True), pd.DataFrame(encoder(df[df["is_val"]][features].values, training=False).numpy()).reset_index(drop=True)], axis=1),
        prediction_type="RawFormulaVal") > 0).astype(int)
)

In [None]:
i = L.Input(130)
encoded = L.BatchNormalization()(i)
encoded = L.GaussianNoise(0.1)(encoded)
encoded = L.Dense(64,activation='relu')(encoded)
decoded = L.Dropout(0.2)(encoded)
decoded = L.Dense(130, name='decoded')(decoded)
x = L.Dense(64,activation='relu')(decoded)
x = L.BatchNormalization()(x)
x = L.Dropout(0.2)(x)
x = L.Dense(64,activation='relu')(x)
x = L.BatchNormalization()(x)
x = L.Dropout(0.2)(x)    
x = L.Dense(1, activation='sigmoid', name='label_output')(x)

encoder = tf.keras.models.Model(inputs=i,outputs=encoded)
autoencoder = tf.keras.models.Model(inputs=i,outputs=[decoded,x])

autoencoder.compile(optimizer=tf.keras.optimizers.Adam(0.0001),loss={'decoded':'mse', 'label_output':'binary_crossentropy'})

In [None]:
autoencoder.fit(
    df[df["is_train"]][features],
    (df[df["is_train"]][features], df[df["is_train"]]["action"]),
    epochs=25,
    batch_size=4096, 
    validation_split=0.1,
    callbacks=[EarlyStopping('val_loss', patience=10,restore_best_weights=True)],
    verbose=1
)

In [None]:
precision_score(
    (model.predict(df[df["is_val"]][features], prediction_type="RawFormulaVal") > -0.3).astype(int),
    df[df["is_val"]]["action"]
)

In [None]:
recall_score(
    (model.predict(df[df["is_val"]][features], prediction_type="RawFormulaVal") > -0.3).astype(int),
    df[df["is_val"]]["action"]
)

In [None]:
utility_score(
    df[df["is_val"]]["date"].values,
    df[df["is_val"]]["weight"].values,
    df[df["is_val"]]["resp"].values,
    #df[df["is_val"]]["action"].values
    (model.predict(df[df["is_val"]][features], prediction_type="RawFormulaVal") > -0.0).astype(int)
)

In [None]:
len(df[df["is_val"]].query("weight > 3"))

### 2-stage model

#### Split to 2-stage train and validation

In [None]:
date_splits = {
    "train_1": [0, 224],
    "train_2": [225, 449],
    "val": [450, 499]
}
split_df(df, date_splits)

#### Normalize data

In [None]:
scaler = StandardScaler()
scaler.fit(df[(df["is_train_1"])|(df["is_train_2"])][features])
df[features] = scaler.transform(df[features])

In [None]:
with open(output_data_path + "scaler.pkl", "wb") as f:
        pickle.dump(scaler, f)

#### Catboost with random train/test split

In [None]:
train, test = train_test_split(df[df["is_train_1"]], test_size=0.2, random_state=random_state)

model = CatBoostClassifier(
    loss_function="Logloss",
    custom_metric=["Precision", "Recall", "F1"],
    iterations=1000,
    learning_rate=None,
    random_seed=random_state,
    l2_leaf_reg=3,
    use_best_model=True,
    depth=8,
    auto_class_weights="Balanced",
    od_type="Iter",
    od_wait=100,
    task_type="GPU" if get_gpu_device_count() else "CPU",
    metric_period=250,
    verbose=True
)

model.fit(
    X=Pool(
        data=train[features],
        label=train["action"],
        weight=train["weight"]
    ),
    eval_set=Pool(
        data=test[features],
        label=test["action"],
        weight=test["weight"]
    )
)
estimate_model(df[df["is_train_1"]], model)
estimate_model(df[df["is_val"]], model)
estimate_model(df, model)
feature_importances(model, 5)
catboost_models["random split"] = model
del train, test

#### Catboost with date train/test split

In [None]:
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=random_state)
for train_idx, test_idx in gss.split(X=df[df["is_train_1"]].values, groups=df[df["is_train_1"]]["order_id"].values):
    pass

model = CatBoostClassifier(
    loss_function="Logloss",
    custom_metric=["Precision", "Recall", "F1"],
    iterations=1000,
    learning_rate=None,
    random_seed=random_state,
    l2_leaf_reg=3,
    use_best_model=True,
    depth=8,
    auto_class_weights="Balanced",
    od_type="Iter",
    od_wait=100,
    task_type="GPU" if get_gpu_device_count() else "CPU",
    metric_period=250,
    verbose=True
)

model.fit(
    X=Pool(
        data=df[df["is_train_1"]].iloc[train_idx][features],
        label=df[df["is_train_1"]].iloc[train_idx]["action"],
        weight=df[df["is_train_1"]].iloc[train_idx]["weight"],
        group_id=df[df["is_train_1"]].iloc[train_idx]["date"]
    ),
    eval_set=Pool(
        data=df[df["is_train_1"]].iloc[test_idx][features],
        label=df[df["is_train_1"]].iloc[test_idx]["action"],
        weight=df[df["is_train_1"]].iloc[test_idx]["weight"],
        group_id=df[df["is_train_1"]].iloc[test_idx]["date"]
    )
)
estimate_model(df[df["is_train_1"]], model)
estimate_model(df[df["is_val"]], model)
estimate_model(df, model)
feature_importances(model, 5)
catboost_models["group by date split"] = model
del train_idx, test_idx

### MLP

In [None]:
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=random_state)
for train_idx, test_idx in gss.split(X=df[df["is_train_1"]].values, groups=df[df["is_train_1"]]["order_id"].values):
    pass

inp = L.Input(shape = (len(features),))
#x = L.BatchNormalization()(inp)
#x = L.Dropout(0.2)(x)
x = L.Dense(64)(inp)
x = L.Dropout(0.2)(x)
x = L.Dense(32)(x)
x = L.Dense(1)(x)
out = L.Activation("sigmoid")(x)

model = tf.keras.models.Model(inputs = inp, outputs = out)
model.compile(
    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-2),
    loss = tf.keras.losses.BinaryCrossentropy(), 
    metrics = tf.keras.metrics.AUC(name = "AUC")
)

model.fit(
    df[df["is_train_1"]].iloc[train_idx][features],
    df[df["is_train_1"]].iloc[train_idx]["action"],
    validation_data=(
        df[df["is_train_1"]].iloc[test_idx][features],
        df[df["is_train_1"]].iloc[test_idx]["action"]
    ),
    epochs=1000, 
    batch_size=8*1024,
    callbacks=[],
    verbose=1
)

estimate_model(df[df["is_val"]], model)
tf_models["mlp"] = model
K.backend.clear_session()
del train_idx, test_idx

### Resulting model

In [None]:
extended_features = features[:]
counter = 1
for name, model in catboost_models.items():
    extended_features.append(name)
    df[name] = model.predict(df[features])
    model.save_model(output_data_path + "catboost_model_" + str(counter) + ".cbm")
    counter += 1
for name, model in tf_models.items():
    extended_features.append(name)
    df[name] = apply_tf_model(df[features], model)
    model.save(output_data_path + "tf_model_" + str(counter) + ".h5")
    counter += 1

In [None]:
model = CatBoostClassifier(
    loss_function="Logloss",
    custom_metric=["Precision", "Recall", "F1"],
    iterations=2000,
    learning_rate=None,
    random_seed=random_state,
    l2_leaf_reg=3,
    use_best_model=False,
    depth=8,
    auto_class_weights="Balanced",
    od_type="Iter",
    od_wait=100,
    task_type="GPU" if get_gpu_device_count() else "CPU",
    metric_period=250,
    verbose=True
)

model.fit(
    X=Pool(
        data=df[df["is_train_2"]][extended_features],
        label=df[df["is_train_2"]]["action"],
        weight=df[df["is_train_2"]]["weight"],
        group_id=df[df["is_train_2"]]["date"]
    )
)
estimate_model(df[df["is_train_2"]], model, extended_features)
estimate_model(df[df["is_train_1"]], model, extended_features)
estimate_model(df[df["is_val"]], model, extended_features)
estimate_model(df, model, extended_features)
feature_importances(model, 5)
model.save_model(output_data_path + "model.cbm")