# Introduction

# Setup

In [None]:
%pip install pandas numpy scikit-learn xgboost matplotlib seaborn

In [None]:
# Import libraries
import gc
import jo_wilder

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import KFold, GroupKFold
from xgboost import XGBClassifier
from sklearn.metrics import f1_score

In [None]:
# Setup matplotlib
%matplotlib inline

# Data loading

In [None]:
# Locations of the competition CSV files, all relative to one data directory.
data_dir = "./data"
test_csv_path = f"{data_dir}/test.csv"
train_csv_path = f"{data_dir}/train.csv"
target_labels_csv = f"{data_dir}/train_labels.csv"

In [None]:
# Ordered categorical dtype for the `level_group` column; the order follows game progression.
_level_groups = ["0-4", "5-12", "13-22"]
level_group_cat_type = pd.CategoricalDtype(categories=_level_groups, ordered=True)

In [None]:
# Explicit column dtypes: compact numeric types and categoricals keep the
# event-level training frame small in memory.
train_dtypes = {
    "session_id": "int64",
    "elapsed_time": "int32",
    "event_name": "category",
    "name": "category",
    "level": "int8",
    "page": "Int8",  # nullable integer dtype
    "room_coor_x": "float32",
    "room_coor_y": "float32",
    "screen_coor_x": "float32",
    "screen_coor_y": "float32",
    "hover_duration": "float32",
    "text": "str",
    "fqid": "category",
    "room_fqid": "category",
    "text_fqid": "category",
    "fullscreen": "int8",
    "hq": "int8",
    "music": "int8",
    "level_group": level_group_cat_type,
}

# Read the training events, using the CSV's "index" column as the frame index.
train_df = pd.read_csv(train_csv_path, index_col="index", dtype=train_dtypes)

In [None]:
# Preview the first rows of the loaded event data.
train_df.head()

In [None]:
# Load the labels. The cells below read its `session_id` column (an
# underscore-separated string id) and its `correct` column.
target_df = pd.read_csv(target_labels_csv)

In [None]:
# The text before the first "_" of the label id is the numeric session id.
target_df["session"] = target_df["session_id"].str.split("_").str[0].astype("int64")

In [None]:
# The last "_"-separated token of the label id is a one-character prefix
# followed by the question number; strip the prefix and keep the number.
target_df["q"] = target_df["session_id"].str.split("_").str[-1].str[1:].astype("int64")

In [None]:
# Both columns hold small integers, so int8 is enough.
target_df = target_df.astype({"correct": "int8", "q": "int8"})

In [None]:
# Preview the labels together with the derived `session` and `q` columns.
target_df.head()

# Feature engineering

In [None]:
# Columns summarised by their number of distinct values in `feature_engineer`.
categorical_cols = ["event_name", "fqid", "room_fqid", "text"]

# Columns summarised by mean and standard deviation in `feature_engineer`.
_coordinate_cols = [
    f"{space}_coor_{axis}" for space in ("room", "screen") for axis in ("x", "y")
]
numerical_cols = ["elapsed_time", "level", "page", *_coordinate_cols, "hover_duration"]

In [None]:
# Distinct event names seen in the training data, in order of first appearance.
# `feature_engineer` builds one count feature per entry.
events = train_df["event_name"].unique().tolist()
events

In [None]:
def feature_engineer(train_df):
    """Aggregate event rows into one feature row per (session_id, level_group).

    For every session / level-group pair present in the data, the result holds,
    in this column order:

    * ``<col>_nunique`` for each column in ``categorical_cols``
    * ``<col>_mean`` for each column in ``numerical_cols``
    * ``<col>_std`` for each column in ``numerical_cols``
    * ``<event>_sum`` for each name in ``events``: the number of rows whose
      ``event_name`` equals that name
    * ``elapsed_time_sum``

    Aggregates that come out missing are replaced with ``-1``.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Event-level data with ``session_id``, ``level_group``, ``event_name``
        and every column listed in ``categorical_cols`` / ``numerical_cols``.
        The frame is left unmodified.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``session_id``, with a ``level_group`` column followed by
        the feature columns described above.
    """
    # observed=True: when `level_group` is categorical, only emit the
    # (session, level_group) pairs that actually occur. Without it pandas adds
    # a row for every unobserved category, which then gets filled with -1/0.
    grouped = train_df.groupby(["session_id", "level_group"], observed=True)

    parts = []
    aggregations = (
        ("nunique", categorical_cols),
        ("mean", numerical_cols),
        ("std", numerical_cols),
    )
    for agg_name, cols in aggregations:
        for c in cols:
            parts.append(grouped[c].agg(agg_name).rename(f"{c}_{agg_name}"))

    # Count events per group from a temporary indicator Series instead of
    # writing indicator columns into the caller's DataFrame.
    keys = [train_df["session_id"], train_df["level_group"]]
    for event in events:
        is_event = (train_df["event_name"] == event).astype("int8")
        counts = is_event.groupby(keys, observed=True).sum()
        parts.append(counts.rename(f"{event}_sum"))

    parts.append(grouped["elapsed_time"].agg("sum").rename("elapsed_time_sum"))

    df = pd.concat(parts, axis=1).fillna(-1)
    # Keep `level_group` as a regular column and index the rows by session only.
    df = df.reset_index().set_index("session_id")

    return df

In [None]:
# Build the per-session, per-level-group feature table from the raw events.
df = feature_engineer(train_df)

In [None]:
# Drop the raw event frame and trigger garbage collection to release memory;
# the aggregated features in `df` are what the following cells use.
del train_df
gc.collect()

In [None]:
# Preview the engineered feature table.
df.head()

# Train model

In [None]:
# Distinct session ids (the index of `df` repeats once per level group).
users = df.index.unique()
# Every column except the level-group marker is a model input.
features = [col for col in df.columns if col != "level_group"]

In [None]:
# Splitter for 5-fold cross-validation with group-aware folds.
gkf = GroupKFold(n_splits=5)

# Fitted classifiers, keyed by "<level_group>_<question>".
models = {}

# Out-of-fold predicted probabilities: one row per session, one column per
# question (18 questions, column k holds question k + 1).
oof = pd.DataFrame(np.zeros((len(users), 18)), index=users)

In [None]:
# Hyper-parameters shared by every per-question model. They do not depend on
# the fold, so they are defined once. `use_label_encoder` is omitted: it is a
# deprecated no-op in the XGBoost versions that accept `early_stopping_rounds`
# in the constructor.
xgb_params = {
    "objective": "binary:logistic",
    "eval_metric": "logloss",
    "learning_rate": 0.05,
    "max_depth": 4,
    "n_estimators": 1000,
    "early_stopping_rounds": 50,
    "tree_method": "hist",
    "subsample": 0.8,
    "colsample_bytree": 0.4,
}

# Labels of each question indexed by session, built once instead of twice per
# question in every fold.
labels_by_question = {
    t: target_df.loc[target_df.q == t].set_index("session") for t in range(1, 19)
}

for i, (train_index, test_index) in enumerate(gkf.split(X=df, groups=df.index)):
    print("#" * 25)
    print("### Fold", i + 1)
    print("#" * 25)

    fold_train = df.iloc[train_index]
    fold_valid = df.iloc[test_index]

    for t in range(1, 19):
        # Questions 1-3 use the features of level group "0-4", questions 4-13
        # those of "5-12" and questions 14-18 those of "13-22".
        if t <= 3:
            grp = "0-4"
        elif t <= 13:
            grp = "5-12"
        else:
            grp = "13-22"

        # TRAIN DATA
        train_x = fold_train.loc[fold_train.level_group == grp]
        train_users = train_x.index.values
        train_y = labels_by_question[t].loc[train_users]

        # VALID DATA
        valid_x = fold_valid.loc[fold_valid.level_group == grp]
        valid_users = valid_x.index.values
        valid_y = labels_by_question[t].loc[valid_users]

        # TRAIN MODEL (early stopping is monitored on the validation fold)
        clf = XGBClassifier(**xgb_params)
        clf.fit(
            train_x[features].astype("float32"),
            train_y["correct"],
            eval_set=[(valid_x[features].astype("float32"), valid_y["correct"])],
            verbose=0,
        )
        # `best_ntree_limit` was removed in XGBoost 2.0; `best_iteration` is
        # zero-based, so +1 reports the same tree count.
        print(f"{t}({clf.best_iteration + 1}), ", end="")

        # SAVE MODEL, PREDICT VALID OOF
        # NOTE: the key has no fold index, so each fold overwrites the previous
        # one and `models` ends up holding the classifiers of the last fold.
        models[f"{grp}_{t}"] = clf
        oof.loc[valid_users, t - 1] = clf.predict_proba(
            valid_x[features].astype("float32")
        )[:, 1]

    print()

# CV score

In [None]:
# Ground-truth frame with the same shape, index and columns as `oof`.
true = oof.copy()
for col in range(18):
    # Column `col` corresponds to question `col + 1`.
    question_labels = target_df[target_df["q"] == col + 1].set_index("session")
    true[col] = question_labels.loc[users, "correct"].values

In [None]:
# Scan candidate decision thresholds and keep the one with the highest
# macro F1 over all (session, question) pairs.
thresholds = []
scores = []

best_score = 0
best_threshold = 0

# Flatten once; the arrays do not change between thresholds.
flat_true = true.values.reshape((-1))
flat_oof = oof.values.reshape((-1))

for threshold in np.arange(0.4, 0.81, 0.01):
    print(f"{threshold:.02f}, ", end="")
    score = f1_score(flat_true, (flat_oof > threshold).astype("int"), average="macro")
    thresholds.append(threshold)
    scores.append(score)
    # Strict comparison: on ties the lowest threshold wins.
    if score > best_score:
        best_score, best_threshold = score, threshold

In [None]:
# Plot F1 against threshold and highlight the best point.
fig, ax = plt.subplots(figsize=(20, 5))
ax.plot(thresholds, scores, "-o", color="blue")
ax.scatter([best_threshold], [best_score], color="blue", s=300, alpha=1)
ax.set_xlabel("Threshold", size=14)
ax.set_ylabel("Validation F1 Score", size=14)
ax.set_title(
    f"Threshold vs. F1_Score with Best F1_Score = {best_score:.3f} at Best Threshold = {best_threshold:.3}",
    size=18,
)
plt.show()

In [None]:
print("When using optimal threshold...")
for k in range(18):
    # COMPUTE F1 SCORE PER QUESTION
    m = f1_score(
        true[k].values, (oof[k].values > best_threshold).astype("int"), average="macro"
    )
    # Column k holds question k + 1, so report the 1-based question number.
    print(f"Q{k + 1}: F1 =", m)

# COMPUTE F1 SCORE OVERALL (all sessions and questions pooled)
m = f1_score(
    true.values.reshape((-1)),
    (oof.values.reshape((-1)) > best_threshold).astype("int"),
    average="macro",
)
print("==> Overall F1 =", m)

# Infer test data

In [None]:
# Create environment
env = jo_wilder.make_env()
iter_test = env.iter_test()

In [None]:
# Clear memory
del target_df, df, oof, true
gc.collect()

In [None]:
# Half-open question ranges answered after each level group:
# "0-4" -> questions 1-3, "5-12" -> questions 4-13, "13-22" -> questions 14-18.
limits = {"0-4": (1, 4), "5-12": (4, 14), "13-22": (14, 19)}

for test, sample_submission in iter_test:
    # FEATURE ENGINEER TEST DATA
    df = feature_engineer(test)
    # Cast once per batch; the same feature row is reused for every question.
    test_x = df[features].astype("float32")

    # INFER TEST DATA
    # The level group is read from the first row of the batch.
    grp = test.level_group.values[0]
    a, b = limits[grp]
    for t in range(a, b):
        clf = models[f"{grp}_{t}"]
        # Probability of class 1 ("correct") for the first feature row.
        p = clf.predict_proba(test_x)[0, 1]
        # Match the exact "_q<t>" suffix: a substring test for "q1" would also
        # select q10..q18. Assumes ids end in "_q<t>" like the training labels
        # - TODO confirm against the sample submission.
        mask = sample_submission.session_id.str.endswith(f"_q{t}")
        sample_submission.loc[mask, "correct"] = int(p > best_threshold)

    env.predict(sample_submission)

# Submission

In [None]:
# Load the submission file for a quick sanity check (presumably written by
# `env.predict` above - verify against the competition API).
df = pd.read_csv("submission.csv")
df.head()

In [None]:
# Mean of the `correct` column, i.e. the share of rows predicted as 1 when the
# column holds the 0/1 values written by the inference loop.
df.correct.mean()