### Prepare input data

In [None]:
%%capture
%%bash
# Create ../input and ../output, then download and unpack the competition archive into ../input.
# Credentials are intentionally NOT stored in the notebook: the kaggle CLI picks up
# KAGGLE_USERNAME / KAGGLE_KEY from the kernel's environment, or ~/.kaggle/kaggle.json.
# NOTE(review): the API key that used to be hard-coded in this cell should be treated as
# leaked -- revoke it and generate a new one.
set -e  # stop at the first failing step instead of carrying on without the data
mkdir -p ../input
mkdir -p ../output
cd ../input

kaggle competitions download -c jane-street-market-prediction

unzip jane-street-market-prediction.zip
rm jane-street-market-prediction.zip

### Imports

In [None]:
import warnings
warnings.filterwarnings("ignore")

import datetime
import json
import os
import pickle
import random
import sys

In [None]:
import numpy as np
import pandas as pd
from catboost import sum_models, CatBoostClassifier, CatBoostRegressor, Pool
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, GroupShuffleSplit, TimeSeriesSplit
import tensorflow as tf

In [None]:
%matplotlib inline

import matplotlib as mpl
import matplotlib.dates as mdates
import matplotlib.pyplot as plt

# The bundled seaborn style sheets were renamed to "seaborn-v0_8*" in matplotlib 3.6 and the
# old names were later removed, so prefer the new name and fall back for older installs.
plt.style.use("seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn")
# Default figure size (inches), resolution and line width for every plot in the notebook.
mpl.rcParams["figure.figsize"] = (11, 5)
mpl.rcParams["figure.dpi"] = 100
mpl.rcParams["lines.linewidth"] = 0.75

### Init

In [None]:
# Directory the competition files are read from.
input_data_path = "../input/"
# Directory that generated artifacts are written to.
output_data_path = "../output/"
# Feature column names: feature_0 ... feature_129.
features = [f"feature_{i}" for i in range(130)]

In [None]:
def seed_all(seed):
    """Seed Python's `random` module and NumPy's global RNG, and export PYTHONHASHSEED.

    TensorFlow is not seeded here (the `tf.random.set_seed` call is disabled).
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    #tf.random.set_seed(seed)


# Single seed reused for the global RNGs, the data splits and the models.
random_state = 42
seed_all(random_state)

### Tools

In [None]:
from numba import njit

@njit(fastmath=True)
def utility_score(date, weight, resp, action):
    """Return the utility score of a set of trade decisions.

    `weight * resp * action` is summed per value of `date` with `np.bincount`, so
    `date` must hold non-negative integers and the number of bins is `max(date) + 1`
    (dates with no rows still count as zero-valued bins).

    The score is `clip(t, 0, 6) * sum(p)`, where `p` are the per-date sums and
    `t = sum(p) / sqrt(sum(p ** 2)) * sqrt(250 / len(p))`.
    """
    daily_pnl = np.bincount(date, weight * resp * action)
    total_pnl = np.sum(daily_pnl)
    t = total_pnl / np.sqrt(np.sum(daily_pnl ** 2)) * np.sqrt(250 / len(daily_pnl))
    return min(max(t, 0), 6) * total_pnl

def estimate_model(df, model):
    """Print how `model` scores on `df` relative to the ideal actions.

    Three numbers are printed: the utility score of the `action` column already in
    `df`, the utility score of the model's actions (1 where the raw prediction is
    positive, else 0), and the ratio of the second to the first rounded to 2 places.
    """
    dates = df["date"].values
    weights = df["weight"].values
    responses = df["resp"].values

    expected_score = utility_score(dates, weights, responses, df["action"].values)

    raw_prediction = model.predict(df[features], prediction_type="RawFormulaVal")
    predicted_action = (raw_prediction > 0).astype(int)
    actual_score = utility_score(dates, weights, responses, predicted_action)

    print(int(expected_score), int(actual_score), round(actual_score / expected_score, 2))
    
def feature_importances(model, top_n=20):
    """Print the `top_n` largest feature importances of `model`, one feature per line.

    Each line has the form `<name> :  <importance rounded to 2 decimals>`; features are
    listed from most to least important.
    """
    ranked = sorted(
        zip(model.feature_names_, model.feature_importances_),
        key=lambda pair: -pair[1],
    )
    for name, importance in ranked[:top_n]:
        print(name, ": ", str(round(importance, 2)))

### Read data

In [None]:
# Load the training table and store every float64 column as float32.
df = pd.read_csv(input_data_path + "train.csv")
float64_columns = df.select_dtypes(include="float64").columns
df = df.astype(dict.fromkeys(float64_columns, np.float32))
# Binary target: 1 where resp is positive, 0 otherwise.
df["action"] = (df["resp"] > 0).astype(int)

# Feature metadata table, indexed by its "feature" column.
features_info = pd.read_csv(input_data_path + "features.csv").set_index(keys=["feature"])

#### Fill NaN

In [None]:
def fillna_mean(df):
    """Fill missing feature values with each column's mean, modifying `df` in place.

    The per-column means of `df[features]` are also pickled to `features_mean.pkl`
    under `output_data_path`. Returns None.
    """
    features_mean = df[features].mean()
    df[features] = df[features].fillna(features_mean)
    # Previously this used `prepared_data_path`, a name the notebook never defines
    # (NameError); `output_data_path` is where fillna_mean_by_feature_0 writes the same file.
    with open(output_data_path + "features_mean.pkl", "wb") as f:
        pickle.dump(features_mean, f)

In [None]:
def fillna_ffill(df):
    """Forward-fill missing feature values in row order, modifying `df` in place.

    Values that are still missing afterwards (nothing earlier in the column to carry
    forward) are set to 0. Returns None.
    """
    # `.ffill()` replaces the deprecated `fillna(method="ffill")` spelling.
    df[features] = df[features].ffill().fillna(0)

In [None]:
def fillna_mean_by_feature_0(df):
    """Fill missing feature values with the mean of the row's `feature_0` group, in place.

    For every distinct value of `feature_0`, the per-column means over the rows with
    that value are computed and used to fill the missing feature values of those rows.
    Row order and the index of `df` are left untouched. The table of group means (one
    row per `feature_0` value, one column per feature) is pickled to
    `features_mean.pkl` under `output_data_path`. Returns None.
    """
    features_mean = df[features].groupby("feature_0").mean()
    # groupby moves the key into the index; keep it as a column as well so the pickled
    # table has an entry for every feature.
    features_mean["feature_0"] = features_mean.index

    # Fill group by group through boolean masks. This handles whatever feature_0 values
    # are present (the old code hard-coded -1 and 1), and needs neither the in-place sort
    # nor the follow-up shuffle, which only rebound a local name and never reached the caller.
    for group_value, group_mean in features_mean.iterrows():
        in_group = df["feature_0"] == group_value
        df.loc[in_group, features] = df.loc[in_group, features].fillna(group_mean)

    with open(output_data_path + "features_mean.pkl", "wb") as f:
        pickle.dump(features_mean, f)

In [None]:
# Impute missing features with per-feature_0-group means; this modifies df in place and
# writes features_mean.pkl under output_data_path.
fillna_mean_by_feature_0(df)

### Train

In [None]:
# Give every distinct date a random position (driven by NumPy's global RNG), store it as
# order_id, then order the rows by that position and by ts_id within each date.
dates = list(set(df["date"].values))
np.random.shuffle(dates)
date_to_index = {date: position for position, date in enumerate(dates)}
df["order_id"] = df["date"].map(date_to_index)
df.sort_values(by=["order_id", "ts_id"], inplace=True)
df.reset_index(drop=True, inplace=True)

In [None]:
# Hold out by date: dates below 400 are used for training, dates 400 and above for validation.
df_train = df.loc[df["date"].lt(400)]
df_val = df.loc[df["date"].ge(400)]

### CatBoost with random train/test split

In [None]:
def catboost_model(**overrides):
    """Build a CatBoostClassifier with the notebook's standard configuration.

    Keyword arguments replace the matching defaults below, e.g.
    `catboost_model(use_best_model=False)` for a fit that has no eval set.

    NOTE(review): later cells call `catboost_model(...)` but no definition is visible
    in the notebook; the defaults here are the settings this cell used inline --
    confirm they are the intended shared configuration.
    """
    params = dict(
        loss_function="Logloss",
        custom_metric=["Precision", "Recall", "F1"],
        iterations=1000,
        learning_rate=None,
        random_seed=random_state,
        l2_leaf_reg=3,
        use_best_model=True,
        depth=4,
        auto_class_weights="Balanced",
        od_type="Iter",
        od_wait=100,
        # tf.test.is_gpu_available() is deprecated; listing the physical GPU devices is
        # the documented replacement.
        task_type="GPU" if tf.config.list_physical_devices("GPU") else "CPU",
        metric_period=250,
        verbose=True,
    )
    params.update(overrides)
    return CatBoostClassifier(**params)


# NOTE(review): this is a random row-level split, so rows from the same date can land in
# both the training and the evaluation part.
train, test = train_test_split(df_train, test_size=0.2, random_state=random_state)
model = catboost_model()

# Sample weights are the row weight plus one.
model.fit(
    X=Pool(
        data=train[features],
        label=train["action"],
        weight=train["weight"] + 1
    ),
    eval_set=Pool(
        data=test[features],
        label=test["action"],
        weight=test["weight"] + 1
    )
)
# Score on the training dates, the held-out dates, and the full frame.
estimate_model(df_train, model)
estimate_model(df_val, model)
estimate_model(df, model)

In [None]:
# Persist the trained model next to the other generated artifacts. This used
# `prepared_data_path`, which the notebook never defines (NameError).
model.save_model(output_data_path + "model.cbm")

In [None]:
models = []

# Split by order_id so that all rows sharing an order_id fall on the same side of the split.
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=random_state)
order_ids = df_train["order_id"].values
for train_idx, test_idx in gss.split(X=order_ids, groups=order_ids):
    fold_train = df_train.iloc[train_idx]
    fold_test = df_train.iloc[test_idx]

    model = catboost_model()
    # No sample weights or group ids are passed to these pools.
    model.fit(
        X=Pool(data=fold_train[features], label=fold_train["action"]),
        eval_set=Pool(data=fold_test[features], label=fold_test["action"]),
    )
    estimate_model(fold_test, model)
    models.append(model)

# Combine the per-split models into one and score it on the held-out dates.
model = sum_models(models)
estimate_model(df_val, model)

In [None]:
# Fit on all of df_train with no eval set (hence use_best_model=False), weighting rows by
# weight + 1 and grouping them by date, then score on the held-out dates.
model = catboost_model(use_best_model=False)
full_train_pool = Pool(
    data=df_train[features],
    label=df_train["action"],
    weight=df_train["weight"] + 1,
    group_id=df_train["date"],
)
model.fit(X=full_train_pool)
estimate_model(df_val, model)