# Introduction

This notebook contains the code for the feature engineering, modeling and predictions. Much of the code is based on [this notebook by Chris Deotte](https://www.kaggle.com/code/cdeotte/xgboost-baseline-0-680), which loads the dataset in chunks to avoid the memory issues caused by the size of the main dataset; that approach had not occurred to me at first.
For the model, we use the [XGBoost](https://xgboost.readthedocs.io/en/latest/) library.

# Setup


In [None]:
%pip install numpy pandas scikit-learn xgboost matplotlib seaborn

In [None]:
# Import libraries
import gc

# import jo_wilder

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import GroupKFold
from xgboost import XGBClassifier
from sklearn.metrics import f1_score

In [None]:
# Setup matplotlib
%matplotlib inline

# Data loading


In [None]:
# Paths to the input CSV files, relative to the notebook's working directory.
# NOTE(review): test_csv_path is not referenced by any cell visible here.
test_csv_path = "./data/test.csv"
train_csv_path = "./data/train.csv"  # raw event rows, read piece by piece below
target_labels_csv = "./data/train_labels.csv"  # per-question labels

In [None]:
# Read only the first CSV column (session_id) and count the rows that belong
# to each session; the result is indexed by session_id.
tmp = (
    pd.read_csv(train_csv_path, usecols=[0])
    .groupby("session_id")["session_id"]
    .count()
)

In [None]:
# The training sessions are split into `pieces` consecutive groups.
# Despite its name, `chunks` is the number of sessions per group (rounded up),
# not the number of groups; `tmp` holds one entry per session.
pieces = 20
chunks = int(np.ceil(len(tmp) / pieces))

In [None]:
# reads[k]: number of CSV data rows covered by the sessions of piece k.
# skips[k]: number of data rows that precede piece k (running total of reads).
reads = []
skips = [0]

for k in range(pieces):
    first_session = k * chunks
    # A slice that runs past the end is clipped by iloc, so the last piece
    # simply covers the remaining sessions.
    n_rows = tmp.iloc[first_session : first_session + chunks].sum()
    reads.append(n_rows)
    skips.append(skips[-1] + n_rows)

print(f"pieces: {pieces} of sizes: {reads}")

In [None]:
# Read only the first piece of the training data: the first reads[0] data rows
train_df = pd.read_csv(train_csv_path, nrows=reads[0])
train_df.head()

In [None]:
# Load the label table; its session_id and correct columns are used below
target_df = pd.read_csv(target_labels_csv)

In [None]:
# Numeric session: the part of session_id before the first underscore
target_df["session"] = target_df["session_id"].str.split("_").str[0].astype("int64")

In [None]:
# Question number: the last underscore-separated token with its first
# character (the "q" prefix) removed
target_df["q"] = (
    target_df["session_id"].str.split("_").str[-1].str[1:].astype("int64")
)

In [None]:
# Store the label and the question number as int8
target_df = target_df.astype({"correct": "int8", "q": "int8"})

In [None]:
# Preview the labels together with the parsed session and q columns
target_df.head()

# Feature engineering


In [None]:
# Columns for which feature_engineer counts the distinct values per group
categorical_cols = [
    "event_name",
    "fqid",
    "room_fqid",
    "text",
    "text_fqid",
]

# Columns for which feature_engineer computes mean/std/sum/max/min per group
numerical_cols = [
    "elapsed_time",
    "level",
    "page",
    "room_coor_x",
    "room_coor_y",
    "screen_coor_x",
    "screen_coor_y",
    "hover_duration",
]

In [None]:
# Event types found in the first piece of the training data; feature_engineer
# builds one indicator column per entry.
# NOTE(review): values that occur only in later pieces are not in this list.
event_list = train_df["event_name"].unique().tolist()
event_list

In [None]:
# Distinct values of the "text" column in the first piece (despite the name).
# NOTE(review): name_list is not used by any cell visible here.
name_list = train_df["text"].unique().tolist()
name_list

In [None]:
# Distinct fqid values in the first piece of the training data.
# NOTE(review): fqid_list is not used by any cell visible here.
fqid_list = train_df["fqid"].unique().tolist()
fqid_list

In [None]:
# Rooms found in the first piece of the training data; feature_engineer
# builds one indicator column per entry.
# NOTE(review): rooms that occur only in later pieces are not in this list.
room_list = train_df["room_fqid"].unique().tolist()
room_list

In [None]:
# Every feature is aggregated per (session_id, level_group) pair
groupby_cols = ["session_id", "level_group"]

In [None]:
def feature_engineer(train_df):
    """Aggregate raw event rows into one feature row per (session_id, level_group).

    Relies on the notebook-level ``numerical_cols``, ``categorical_cols``,
    ``event_list``, ``room_list`` and ``groupby_cols``.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Raw event rows. It is not modified; all work happens on a copy.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``session_id``, with a ``level_group`` column followed by
        the feature columns. Missing values are replaced by -1. The set and
        order of columns depend only on the notebook-level lists, never on
        which events happen to occur in ``train_df``.

    Notes
    -----
    The frequency encodings of ``fqid`` and ``text`` are counted on the frame
    that is passed in, so their values depend on the size and content of the
    batch.
    """
    # Helper columns are added and "fqid"/"text" are dropped further down;
    # do that on a private copy so the caller's frame stays intact.
    train_df = train_df.copy()

    dfs = []

    # Basic statistics of every numerical column.
    stat_names = ["mean", "std", "sum", "max", "min"]
    for c in numerical_cols:
        tmp = train_df.groupby(groupby_cols)[c].agg(stat_names)
        tmp.columns = [f"{c}_{agg_name}" for agg_name in stat_names]
        dfs.append(tmp)

    # Number of distinct values of every categorical column.
    for c in categorical_cols:
        tmp = train_df.groupby(groupby_cols)[c].agg("nunique")
        tmp.name = f"{tmp.name}_nunique"
        dfs.append(tmp)

    # Per event type: number of occurrences and elapsed_time summed over the
    # rows of that event only. Summing the unmasked elapsed_time column would
    # yield the same group-wide total for every event type.
    for c in event_list:
        is_event = train_df["event_name"] == c
        elapsed_col = f"{c}_elapsed_time_sum"
        train_df[c] = is_event.astype(np.int8)
        train_df[elapsed_col] = train_df["elapsed_time"].where(is_event, 0)

        tmp = train_df.groupby(groupby_cols)[[c, elapsed_col]].sum()
        tmp = tmp.rename(columns={c: f"{c}_sum"})
        dfs.append(tmp)

    # Per room: number of rows recorded in that room.
    for c in room_list:
        train_df[c] = (train_df["room_fqid"] == c).astype(np.int8)

    for c in room_list:
        tmp = train_df.groupby(groupby_cols)[c].agg("sum")
        tmp.name = f"{tmp.name}_sum"
        dfs.append(tmp)

    # Frequency encoding of fqid and text: every value is replaced by its
    # number of occurrences in this frame, then summarised per group.
    freq_stats = ["mean", "sum", "max", "min"]
    for c in ["fqid", "text"]:
        encoded = f"{c}_freq_encoded"
        train_df[encoded] = train_df[c].map(train_df[c].value_counts())

        tmp = train_df.groupby(groupby_cols)[encoded].agg(freq_stats)
        tmp.columns = [f"{encoded}_{agg_name}" for agg_name in freq_stats]
        dfs.append(tmp)

        # Neither column is needed below; release the memory early.
        train_df.drop(columns=[c, encoded], inplace=True)

    # Event frequency. Reindexing to event_list pins the columns, so a batch
    # in which some event type never occurs still yields that column (as 0).
    event_freq = (
        train_df.groupby(groupby_cols)["event_name"]
        .value_counts()
        .unstack(fill_value=0)
        .reindex(columns=event_list, fill_value=0)
    )
    event_freq.columns = [f"{c}_freq" for c in event_freq.columns]
    dfs.append(event_freq)

    # Session duration: span of elapsed_time inside the group.
    elapsed = train_df.groupby(groupby_cols)["elapsed_time"]
    session_duration = elapsed.max() - elapsed.min()
    session_duration.name = "session_duration"
    dfs.append(session_duration)

    # Event duration: span of elapsed_time per event type, with the same
    # fixed column set as the event frequency.
    elapsed_by_event = train_df.groupby(groupby_cols + ["event_name"])["elapsed_time"]
    event_duration = (
        (elapsed_by_event.max() - elapsed_by_event.min())
        .unstack(fill_value=0)
        .reindex(columns=event_list, fill_value=0)
    )
    event_duration.columns = [f"{c}_duration" for c in event_duration.columns]
    dfs.append(event_duration)

    # Event interval: mean difference between consecutive elapsed_time values.
    train_df["event_interval"] = train_df.groupby(groupby_cols)["elapsed_time"].diff()
    event_interval = train_df.groupby(groupby_cols)["event_interval"].mean()
    event_interval.name = "event_interval"
    dfs.append(event_interval)

    df = pd.concat(dfs, axis=1).fillna(-1)
    df = df.reset_index().set_index("session_id")

    _ = gc.collect()
    return df

In [None]:
# Build the feature table piece by piece so that only one piece of the raw
# training data is held in memory at a time.
all_chunks = []
for k in range(pieces):
    if reads[k] == 0:
        # More pieces than sessions: nothing left to read for this piece.
        continue

    # Every piece, the first included, is read from disk here, so the cell
    # does not depend on whatever train_df currently holds and can be re-run.
    # Row 0 is the header; the data rows of the earlier pieces are skipped.
    train_df = pd.read_csv(
        train_csv_path,
        skiprows=range(1, skips[k] + 1) if k > 0 else None,
        nrows=reads[k],
    )

    df = feature_engineer(train_df)
    all_chunks.append(df)

In [None]:
# Drop the last raw piece still bound to train_df and reclaim its memory.
# Assigning to _ keeps the collector's return value out of the cell output.
del train_df
_ = gc.collect()

In [None]:
# Stack the per-piece feature tables row-wise into a single frame
df = pd.concat(all_chunks, axis=0)

In [None]:
# (rows, columns) of the combined feature table
df.shape

In [None]:
# First rows of the combined feature table
df.head()

In [None]:
# Names of all columns: level_group plus the generated features
df.columns

# Train model


In [None]:
# Every column except level_group is a model input
features = [c for c in df.columns if c != "level_group"]
# Distinct session ids; the index holds a session once per level_group row
users = df.index.unique()

In [None]:
# 7-fold cross-validation; the session index is passed as the group in the
# training loop.
gkf = GroupKFold(n_splits=7)
# Out-of-fold predictions: one row per session, one column per question
# (column j holds question j + 1 of the 18 questions).
oof = pd.DataFrame(
    data=np.zeros((len(users), 18)),
    index=users,
)
# Fitted classifiers keyed by "<level_group>_<question>"
models = {}

In [None]:
# Hyper-parameters shared by every fold and every question.
# "use_label_encoder" is no longer passed: the constructor-level
# "early_stopping_rounds" used here needs XGBoost >= 1.6, where the label
# encoder is already gone, and XGBoost >= 2.0 warns that the key is unused.
xgb_params = {
    "objective": "binary:logistic",
    "eval_metric": "logloss",
    "learning_rate": 0.05,
    "max_depth": 4,
    "n_estimators": 1000,
    "early_stopping_rounds": 50,
    "tree_method": "hist",
    "subsample": 0.8,
    "colsample_bytree": 0.4,
}

# Labels of every question indexed by session, built once instead of twice
# per (fold, question).
labels_by_question = {
    t: target_df.loc[target_df.q == t].set_index("session") for t in range(1, 19)
}

for i, (train_index, test_index) in enumerate(gkf.split(X=df, groups=df.index)):
    print(f"Fold {i + 1} => ", end="")

    fold_train = df.iloc[train_index]
    fold_valid = df.iloc[test_index]

    for t in range(1, 19):
        # Questions 1-3 use the "0-4" rows, 4-13 the "5-12" rows and
        # 14-18 the "13-22" rows.
        if t <= 3:
            grp = "0-4"
        elif t <= 13:
            grp = "5-12"
        else:
            grp = "13-22"

        labels = labels_by_question[t]

        # Train data
        train_x = fold_train.loc[fold_train.level_group == grp]
        train_users = train_x.index.values
        train_y = labels.loc[train_users]

        # Valid data
        valid_x = fold_valid.loc[fold_valid.level_group == grp]
        valid_users = valid_x.index.values
        valid_y = labels.loc[valid_users]
        valid_features = valid_x[features].astype("float32")

        # Train model; early stopping is monitored on the validation fold
        clf = XGBClassifier(**xgb_params)
        clf.fit(
            train_x[features].astype("float32"),
            train_y["correct"],
            eval_set=[(valid_features, valid_y["correct"])],
            verbose=0,
        )
        # best_ntree_limit was removed in XGBoost 2.0; best_iteration is
        # 0-based, so + 1 gives the number of boosting rounds kept.
        print(f"{t}({clf.best_iteration + 1}), ", end="")

        # Save model and predict valid oof. The same key is written in every
        # fold, so models ends up holding the classifiers of the last fold.
        models[f"{grp}_{t}"] = clf
        oof.loc[valid_users, t - 1] = clf.predict_proba(valid_features)[:, 1]

    print()

# CV score


In [None]:
# Ground-truth matrix with the same index and columns as oof
true = oof.copy()
for k in range(18):
    # Column k holds the labels of question k + 1, ordered like `users`
    question_rows = target_df[target_df["q"] == k + 1]
    true[k] = question_rows.set_index("session").loc[users, "correct"].to_numpy()

In [None]:
# Scan the decision threshold and keep the one with the highest macro F1
# over all (session, question) pairs; ties keep the lowest threshold.
scores, thresholds = [], []
best_score, best_threshold = 0, 0

for threshold in np.arange(0.4, 0.81, 0.01):
    print(f"{threshold:.02f}, ", end="")
    preds = (oof.values.ravel() > threshold).astype("int")
    m = f1_score(true.values.ravel(), preds, average="macro")
    thresholds.append(threshold)
    scores.append(m)
    if m > best_score:
        best_score, best_threshold = m, threshold

In [None]:
# F1 score for every scanned threshold, with the best one marked by a large dot
fig, ax = plt.subplots(figsize=(20, 5))
ax.plot(thresholds, scores, "-o", color="blue")
ax.scatter([best_threshold], [best_score], color="blue", s=300, alpha=1)
ax.set_xlabel("Threshold", size=14)
ax.set_ylabel("Validation F1 Score", size=14)
ax.set_title(
    f"Threshold vs. F1_Score with Best F1_Score = {best_score:.3f} at Best Threshold = {best_threshold:.3}",
    size=18,
)
plt.show()

In [None]:
print("When using optimal threshold...")
for k in range(18):
    # Column k of oof/true holds question k + 1: questions are numbered from
    # 1, so the printed label is shifted accordingly.
    m = f1_score(
        true[k].values, (oof[k].values > best_threshold).astype("int"), average="macro"
    )
    print(f"Q{k + 1}: F1 =", m)

# Compute overall F1 score over all (session, question) pairs
m = f1_score(
    true.values.reshape((-1)),
    (oof.values.reshape((-1)) > best_threshold).astype("int"),
    average="macro",
)
print("==> Overall F1 =", m)

# Infer test data


In [None]:
# Deliberate stop: this always raises, so "Run All" halts here. The cells
# below call jo_wilder, whose import is commented out in the import cell.
assert False

In [None]:
# Create environment
# The import lives in this cell because it is commented out in the import
# cell; without it the call below raises NameError.
# NOTE(review): presumably jo_wilder exists only in the competition
# environment, which would explain the commented-out import. Confirm.
import jo_wilder

env = jo_wilder.make_env()
iter_test = env.iter_test()

In [None]:
# Clear memory: drop the training-only objects. models, features and
# best_threshold are kept because the inference loop below reads them.
del target_df, df, oof, true
gc.collect()

In [None]:
# Half-open range of question numbers answered after each level group
limits = {"0-4": (1, 4), "5-12": (4, 14), "13-22": (14, 19)}

for test, sample_submission in iter_test:
    # Feature engineering
    df = feature_engineer(test)

    # Align the batch with the columns the models were trained on. Only the
    # event frequency/duration columns depend on which events occur in a
    # batch, and feature_engineer fills those with 0 when an event is absent
    # from a group, so 0 is used for absent columns here as well.
    X = df.reindex(columns=features, fill_value=0).astype("float32")

    # Inference
    grp = test.level_group.values[0]
    a, b = limits[grp]
    for t in range(a, b):
        clf = models[f"{grp}_{t}"]
        # First row only: the batch is expected to hold a single
        # (session, level_group) pair.
        p = clf.predict_proba(X)[0, 1]
        # Match the full "_q<t>" suffix; a substring test for "q1" would
        # also hit q10..q18.
        mask = sample_submission.session_id.str.endswith(f"_q{t}")
        sample_submission.loc[mask, "correct"] = int(p > best_threshold)

    env.predict(sample_submission)

# Submission


In [None]:
# Load the submission file for a sanity check.
# NOTE(review): presumably written by env.predict in the inference loop; confirm.
df = pd.read_csv("submission.csv")
df.head()

In [None]:
# Mean of the submitted "correct" column (the inference loop writes 0/1 values)
df.correct.mean()