In [None]:
import os
from collections import Counter, defaultdict
from pathlib import Path

from tqdm.notebook import tqdm
import json
import numpy as np
import pandas as pd
from fastai.tabular import * 

# Notebook display settings: wide cell contents (200 chars), no column limit,
# and both display.min_rows and display.max_rows set to 100.
pd.set_option('display.max_colwidth', 200)
pd.set_option('display.max_columns', None)
pd.set_option('display.min_rows', 100)
pd.set_option('display.max_rows', 100)
# Folder every competition CSV below is read from (Kaggle input mount).
home = Path("/kaggle/input/data-science-bowl-2019/")

My approach to 2019 DS bowl with fastai v1

I used these awesome notebooks:
    
    https://www.kaggle.com/robikscube/2019-data-science-bowl-an-introduction
    https://www.kaggle.com/amanooo/dsb-2019-regressors-and-optimal-rounding
    https://www.kaggle.com/tarandro/regression-with-less-overfitting

* [Looking at data](#Looking-at-data)
* [Preparing data](#Preparing-data)
* [Approach](#How-to-approach)
* [Train](#Train)
* [Submission](#Submission)

# Looking at data

In [None]:
# Load specs.csv and preview its first rows.
specs = pd.read_csv(home / "specs.csv")
specs.head()

In [None]:
# Load train_labels.csv (the provided accuracy groups) and preview five rows.
train_labels = pd.read_csv(home / "train_labels.csv")
train_labels.head(5)

In [None]:
# Peek at the expected submission layout; the frame is only displayed, not kept.
pd.read_csv(home/"sample_submission.csv").head()

In [None]:
%%time
# Explicit narrow integer dtypes for the numeric event columns (also reused when reading test.csv).
types = {"event_code": np.int16, "event_count": np.int16, "game_time": np.int32}
raw_train = pd.read_csv(home/"train.csv", dtype=types)
# Parse timestamps into datetimes; the trailing len() is the value this cell displays.
raw_train["timestamp"] = pd.to_datetime(raw_train["timestamp"]); len(raw_train)

In [None]:
# Read the test events with the same dtypes as train and parse timestamps the same way.
raw_test = pd.read_csv(home/"test.csv", dtype=types)
raw_test["timestamp"] = pd.to_datetime(raw_test["timestamp"])
raw_test.head(5)

# Preparing data

In [None]:
# Five random raw training events (unseeded, so the rows differ between runs).
raw_train.sample(5)

It seems `game_time` is not captured correctly - there's a huge window in between some events in a given session.

In [None]:
# raw_train[raw_train["game_session"] == "969a6c0d56aa4683"].tail()

The target.

First we will look at the target we intend to predict.

We are told: The intent of the competition is to use the gameplay data to forecast how many attempts a child will take to pass a given assessment (an incorrect answer is counted as an attempt). The outcomes in this competition are grouped into 4 groups (labeled `accuracy_group` in the data):

    3: the assessment was solved on the first attempt
    2: the assessment was solved on the second attempt
    1: the assessment was solved after 3 or more attempts
    0: the assessment was never solved

For each installation_id represented in the test set, you must predict the accuracy_group **of the last assessment** for that installation_id

We are told in the data description that:

* The file train_labels.csv has been provided to show how these groups would be computed on the assessments in the training set.
* Assessment attempts are captured in event_code 4100 for all assessments except for Bird Measurer, which uses event_code 4110.
* If the attempt was correct, it contains "correct":true.

We also know:

* The intent of the competition is to use the gameplay data to forecast how many attempts a child will take to pass a given assessment (an incorrect answer is counted as an attempt).
* Each application install is represented by an installation_id. This will typically correspond to one child, but you should expect noise from issues such as shared devices.
* In the training set, you are provided **the full history of gameplay data.**
* In the test set, **we have truncated the history after the start event of a single assessment, chosen randomly, for which you must predict the number of attempts.**
* Note that the training set contains many installation_ids which never took assessments, whereas every installation_id in the test set made an attempt on at least one assessment.


# How to approach

For the test set we predict the group of a **single** assessment per installation_id. But the train/test datasets contain **repeated** installation ids across different game sessions.
Hence it makes sense to predict the group for each assessment using the history that precedes it.

The questions:
* Does the randomly truncated history in test conflict with the above?
In the test dataset, the data of the last assessment has been removed, so it looks like a session with only 1 event.
To have a good validation dataset, however, it should look the same as test - https://www.kaggle.com/tarandro/regression-with-less-overfitting
* Is more than 1 correct submission impossible per session? Does it mean noise - two devices with the same id sharing the same session?

Remove every `installation_id` without any assessments

In [None]:
# TODO keep them and see how it affects score
# Installations that have at least one event of type "Assessment".
ids_with_subms = raw_train.loc[raw_train["type"] == "Assessment", ["installation_id"]].drop_duplicates()
# Keep only the events of those installations, preserving row order and renumbering the index.
has_assessment = raw_train["installation_id"].isin(ids_with_subms["installation_id"])
raw_train = raw_train[has_assessment].reset_index(drop=True); len(raw_train)

In [None]:
def get_accuracy(correct_data):
    """Summarise the outcomes of the attempts made in one assessment.

    Parameters
    ----------
    correct_data : pd.Series
        One entry per attempt: truthy for a correct answer, falsy for an incorrect one.
        Missing entries (None/NaN) are ignored, so an object-dtype series is accepted too.

    Returns
    -------
    tuple
        ``(accuracy, correct, wrong)`` where ``correct`` and ``wrong`` are plain ints and
        ``accuracy`` is ``correct / (correct + wrong)``, or 0 when there are no attempts.
    """
    # Rounding correct > 1 to 1 lowers the score. Why?
    # Count explicitly rather than indexing with the series as a boolean mask:
    # masks (and `~`) misbehave on object-dtype data, e.g. when a value is None.
    outcomes = correct_data.dropna().astype(bool)
    correct = int(outcomes.sum())
    wrong = len(outcomes) - correct
    accuracy = correct/(correct + wrong) if correct + wrong else 0
    return accuracy, correct, wrong

def get_group(accuracy):
    """Map an assessment accuracy to its accuracy group (0-3).

    A falsy accuracy gives 0, exactly 1 gives 3, anything from 0.5 up to
    (but excluding) 1 gives 2, and every other value gives 1.
    """
    if not accuracy:
        return 0
    if accuracy == 1:
        return 3
    return 2 if accuracy >= 0.5 else 1

In [None]:
# I prefer this over calculating average
def lin_comb(v1, v2, beta):
    """Weighted blend of two values: `beta` weights `v1`, `1 - beta` weights `v2`."""
    first_part = beta*v1
    second_part = (1-beta)*v2
    return first_part + second_part

In [None]:
# TRY
# do not remove assessments without attempts in train
# add number of passed assessments
# add time spent in each activity
# remove sessions with 1 row
# clear installation_id not found in train_labels
# event_data

def prepare(data: pd.DataFrame, one_hot: List[str], test=False) -> pd.DataFrame:
    """Build the feature table for every installation_id in `data`.

    Each installation's events go through `process_id`, which returns a list of
    feature rows (dicts). With `test=True`, the last row of each installation is
    flagged with ``is_test = 1``. Installations that yield no rows are skipped.

    Returns one frame of all rows, with missing values filled with 0 and columns
    sorted by name.
    """
    empty_counts = defaultdict(int)
    rows = []

    per_installation = data.groupby("installation_id", sort=False)
    for _, history in tqdm(per_installation):
        # every installation starts from its own fresh copy of the counters
        id_rows = process_id(history, one_hot, empty_counts.copy(), test)
        if id_rows:
            if test:
                id_rows[-1]["is_test"] = 1
            rows.extend(id_rows)

    feature_frame = pd.DataFrame(rows)
    return feature_frame.fillna(0).sort_index(axis=1)

In [None]:
def process_id(id_data: pd.DataFrame, one_hot_cols, one_hot_dict, test: bool) -> list:
    """Turn the event history of one installation_id into per-assessment feature rows.

    Game sessions are visited in order of first appearance. Every visited session
    updates the running state (cumulative value counts of `one_hot_cols` and a smoothed
    session duration). A session whose first event has type "Assessment" then emits one
    row: the running state, the statistics of the assessments emitted *before* it, and
    its own `accuracy_group` as the target.

    A session containing a pause longer than 1800 seconds is split at its longest pause
    and the two parts are processed as independent sessions.

    Parameters
    ----------
    id_data : pd.DataFrame
        Events of a single installation_id. Reads the columns `game_session`,
        `timestamp`, `type`, `title`, `event_code`, `event_data`, `installation_id`
        and every column named in `one_hot_cols`.
    one_hot_cols : iterable of str
        Columns whose values are counted cumulatively as "<column>_<value>" features.
    one_hot_dict : dict
        Mutable mapping "<column>_<value>" -> count; updated in place.
    test : bool
        When True, assessments without any attempt are kept (they get group 0);
        otherwise they are skipped.

    Returns
    -------
    list of dict
        One feature row per emitted assessment, in visiting order.
    """
    max_gap_seconds = 1800  # a longer pause inside a session splits it in two
    a_accuracy, a_group, a_correct, a_wrong, counter, accumulated_duration_mean = 0, 0, 0, 0, 0, 0
    a_groups = {"0":0, "1":0, "2":0, "3":0}
    features = []

    def update_counter(session: pd.DataFrame, column: str) -> None:
        # Count the values of `column` in *this* session part only. Previously the
        # enclosing loop variable was counted, so both halves of a split session
        # added the counts of the whole session.
        for value, occurrences in Counter(session[column]).items():
            key = f"{column}_{value}"
            one_hot_dict[key] = one_hot_dict.get(key, 0) + occurrences

    def process_session(session: pd.DataFrame):
        # share state with parent process_id()
        nonlocal a_accuracy, a_group, a_correct, a_wrong, counter, accumulated_duration_mean

        # NOTE(review): counters and duration are updated with the current session
        # *before* its row is emitted, so a training row contains events of the very
        # assessment it predicts, while the truncated test assessment has (per the
        # notebook text) a single event. Looks like target leakage - confirm.
        # increment one hot columns for session, e.g. Bird Measurer: 50
        for c in one_hot_cols:
            update_counter(session, c)

        # an accumulated session duration mean; total_seconds() keeps whole days,
        # whereas Timedelta.seconds silently drops them
        duration = (session["timestamp"].iloc[-1] - session["timestamp"].iloc[0]).total_seconds()
        accumulated_duration_mean = lin_comb(accumulated_duration_mean or duration, duration, beta=0.9)
        if session["type"].iloc[0] != "Assessment":
            return None

        # attempts are event_code 4110 for titles starting with "Bird", 4100 otherwise
        is_bird = session["title"].str.startswith("Bird")
        guess_mask = (session["event_data"].str.contains("correct")
                      & (((session["event_code"] == 4100) & ~is_bird)
                         | ((session["event_code"] == 4110) & is_bird)))
        answers = session.loc[guess_mask, "event_data"].apply(lambda x: json.loads(x).get("correct"))

        # skip assessments without attempts in train
        if answers.empty and not test:
            return None
        accuracy, correct, wrong = get_accuracy(answers)
        assert accuracy <= 1
        group = get_group(accuracy)
        processed = {"installation_id": id_data.installation_id.iloc[0],
                     "title": session.title.iloc[0],
                     "last_timestamp": session.timestamp.iloc[-1],
                     "accumulated_duration_mean": accumulated_duration_mean,
                     "accumulated_correct": a_correct, "accumulated_incorrect": a_wrong,
                     "accumulated_accuracy_mean": a_accuracy/counter if counter > 0 else 0,
                     "accumulated_accuracy_group_mean": a_group/counter if counter > 0 else 0,
                     "accuracy_group": group}
        # history features are snapshotted first, then updated with this assessment
        processed.update(a_groups)
        counter += 1
        a_accuracy += accuracy
        a_correct += correct
        a_wrong += wrong
        a_group += group
        a_groups[str(group)] += 1
        processed.update(one_hot_dict)
        return processed

    for _, gs in id_data.groupby("game_session", sort=False):
        # work on a renumbered copy so positions equal labels and the group slice is not mutated
        gs = gs.reset_index(drop=True)
        parts = [gs]

        session_span = (gs["timestamp"].iloc[-1] - gs["timestamp"].iloc[0]).total_seconds()
        if session_span > max_gap_seconds:
            gaps = gs["timestamp"].diff().dt.total_seconds()
            if gaps.max() > max_gap_seconds:
                # split right before the event that follows the longest pause
                cut = gaps.idxmax()
                parts = [gs.iloc[:cut], gs.iloc[cut:]]

        for part in parts:
            session_features = process_session(part)
            if session_features:
                features.append(session_features)

    return features

In [None]:
# import gc; gc.collect()

In [None]:
# Columns whose values are counted cumulatively per installation (see process_id).
one_hot_counters=["title", "type", "event_code", "event_id"]
# Build one feature row per training assessment that has at least one attempt.
train = prepare(raw_train, one_hot=one_hot_counters)
# Quick-iteration variant on the first million events:
# train = prepare(raw_train.iloc[:1_000_000], one_hot=one_hot_counters)

In [None]:
# fastai helper: derive "last_"-prefixed date/time features from `last_timestamp`, modifying `train` in place.
# NOTE(review): presumably removes `last_timestamp` afterwards, so re-running this cell alone would fail - confirm.
add_datepart(train, "last_timestamp", prefix="last_", time=True)
train.tail()

In [None]:
# test=True keeps assessments without attempts and flags each installation's last row with is_test = 1.
test = prepare(raw_test, one_hot=one_hot_counters, test=True)

In [None]:
# Derive the same "last_"-prefixed date/time features for the test rows as was done for train.
add_datepart(test, "last_timestamp", prefix="last_", time=True);
# Disabled: for the case when one hot encoded columns don't match between datasets,
# add every train-only column to test filled with zeros.
# diff = train.columns.difference(test.columns)
# display(f"Test doesn't contain {diff}")
# for c in diff:
#     test[c] = 0

In [None]:
# why discard good data from test, let's use all the taken assessments for train!
# Rows with is_test == 0 are the earlier assessments of test installations; they are appended
# to train and columns missing on either side are filled with 0.
# NOTE(review): prepare(..., test=True) also keeps assessments with no attempts (group 0),
# which are skipped for train data, so these extra rows may carry such labels - confirm intended.
# NOTE(review): re-running this cell appends the same rows again.
train = (pd.concat([train, test[test["is_test"] == 0].drop(columns=["is_test"])],
                   ignore_index=True, sort=False)).fillna(0)
train.tail()

In [None]:
# Keep only the row to predict for each test installation (flagged is_test == 1),
# drop the placeholder target and the flag, and align the columns with train's
# feature columns: a column that exists only in train (e.g. a one-hot count never
# seen in test) is added filled with 0, so later steps can select the same
# column names from both frames. Chaining also avoids an in-place drop on a slice.
feature_columns = [c for c in train.columns if c != "accuracy_group"]
test = (test.loc[test["is_test"] == 1]
        .drop(columns=["accuracy_group", "is_test"])
        .reindex(columns=feature_columns, fill_value=0))
test.head()

In [None]:
# Drop near-constant columns: those whose most frequent value covers at least 99% of the train rows.
n_train_rows = train.shape[0]
del_cols = [col for col in train.columns.values
            if (train[col].value_counts().iloc[0] / n_train_rows) >= 0.99]
train = train.drop(columns=del_cols)
# test may not contain every dropped column, hence errors="ignore"
test = test.drop(columns=del_cols, errors="ignore")
display(f"Dropped {del_cols}")

# Train

In [None]:
# Preprocessing steps handed to the fastai TabularList for both the train and test frames.
procs = [FillMissing, Categorify, Normalize]

In [None]:
# Seeds NumPy's global generator only.
# NOTE(review): other random sources (e.g. torch) are not seeded here - confirm whether training should be reproducible.
np.random.seed(42)

### Proper validation dataset

Let's assume the second hidden test is the same as this one. I.e. we predict the last assessment.

In [None]:
# grab the last assessments per id (index label of each installation's final row, in group order)
valid_idx = [rows.index[-1] for _, rows in train.groupby("installation_id", sort=False)]

In [None]:
# Partition the candidate validation rows by their target class.
last_groups = train.loc[valid_idx, "accuracy_group"]
threes = last_groups.index[last_groups == 3]
zeroes = last_groups.index[last_groups == 0]
others = last_groups.index[(last_groups == 1) | (last_groups == 2)]

In [None]:
# Rebalance validation: all rows of groups 1 and 2, plus as many 3s and as many 0s as half their number.
half_of_others = len(others) // 2
sampled_threes = pd.Series(threes).sample(half_of_others, random_state=42).to_list()
sampled_zeroes = pd.Series(zeroes).sample(half_of_others, random_state=42).to_list()
valid_idx = sorted(sampled_threes + sampled_zeroes + others.to_list())
len(valid_idx)

In [None]:
# let's randomly leave some data for the train
# valid_idx = np.random.choice(valid_idx, int(len(valid_idx) * 0.5), replace=False)
# valid_idx = pd.Series(valid_idx).sample(int(len(valid_idx) * 0.5), random_state=42).sort_values().values
# len(valid_idx)

In [None]:
# ids = train["installation_id"].unique()
# sampled_ids = np.random.choice(ids, int(len(ids) * 0.2))
# valid_idx = train[train["installation_id"].isin(sampled_ids)].drop_duplicates(["installation_id"],
#                                                                               keep="last").index
# len(valid_idx)

In [None]:
# Share of each accuracy group over all rows of train.
train.accuracy_group.value_counts(normalize=True)

In [None]:
# Share of each accuracy group among the selected validation rows, for comparison with the cell above.
train.loc[valid_idx].accuracy_group.value_counts(normalize=True)

In [None]:
# Columns whose name starts with "last" (the date parts derived from last_timestamp).
date_cols = [col for col in train.columns if isinstance(col, str) and col.startswith("last")]

In [None]:
dep_var = "accuracy_group"
# Categorical: every date-part column except "last_Elapsed", plus the assessment title.
cat_names = [col for col in date_cols if col not in (dep_var, "last_Elapsed")] + ["title"]
# Continuous: everything that is neither an identifier, the target, nor categorical.
not_continuous = {"installation_id", "game_session", dep_var, *cat_names}
cont_names = [col for col in train.columns.to_list() if col not in not_continuous]

In [None]:
# Assemble the fastai DataBunch: train frame split by the precomputed validation index,
# `accuracy_group` as a categorical (classification) label, and the test frame attached
# with the same categorical/continuous column names and preprocessing steps.
# NOTE(review): path=home points at the input folder - confirm it is writable if anything is saved under it.
data = (TabularList.from_df(train, path=home, cat_names=cat_names, cont_names=cont_names, procs=procs)
        .split_by_idx(valid_idx=valid_idx)
        .label_from_df(cols=dep_var, label_cls=CategoryList)
        .add_test(TabularList.from_df(test, path=home, cat_names=cat_names, cont_names=cont_names, procs=procs))
        .databunch()
)

In [None]:
# data.show_batch()

In [None]:
def reduce_loss(loss, reduction='mean'):
    """Reduce a loss tensor: its mean for 'mean', its sum for 'sum', otherwise return it untouched."""
    if reduction == 'mean':
        return loss.mean()
    if reduction == 'sum':
        return loss.sum()
    return loss

class LabelSmoothingCrossEntropy(nn.Module):
    """Cross-entropy mixed with a uniform-over-classes term (label smoothing).

    `ε` is the weight of the uniform term; `reduction` is applied to both terms.
    """
    def __init__(self, ε:float=0.1, reduction='mean'):
        super().__init__()
        self.ε = ε
        self.reduction = reduction

    def forward(self, output, target):
        n_classes = output.size()[-1]
        log_probs = F.log_softmax(output, dim=-1)
        # uniform part: negative log-probabilities summed over the classes, then reduced
        uniform_term = reduce_loss(-log_probs.sum(dim=-1), self.reduction)
        # hard-label part: the ordinary negative log-likelihood
        hard_term = F.nll_loss(log_probs, target, reduction=self.reduction)
        return lin_comb(uniform_term/n_classes, hard_term, self.ε)

In [None]:
## TODO - update kappa to regression
# Classification learner: two hidden layers (1000 and 500 units), the label-smoothing
# loss defined above, and quadratic-weighted kappa as the reported metric.
# The commented-out arguments are the regression / regularisation variants that were tried.
learn = tabular_learner(data, layers=[1000,500],
#                         metrics=[mean_absolute_error, explained_variance],
#                         y_range=[0, 3],
                        metrics=[KappaScore("quadratic")],
                        loss_func=LabelSmoothingCrossEntropy(),
#                         emb_drop=0.04,
#                         use_bn=False,
                       )

In [None]:
# learn.model_dir = "/kaggle/working"
# learn.lr_find()
# learn.recorder.plot()

In [None]:
# Train for 5 epochs with the one-cycle schedule at a maximum learning rate of 1e-3.
learn.fit_one_cycle(5, 1e-03)

In [None]:
# for lr in [1e-03, 5e-03, 1e-02, 3e-02]:
#     learn = tabular_learner(data, layers=[200,100],
#                             metrics=[KappaScore("quadratic")],
#                             loss_func=LabelSmoothingCrossEntropy(),
#                            )
#     display(lr)
#     learn.fit_one_cycle(20, lr)

## Submission

In [None]:
# preds, y = learn.get_preds(ds_type=DatasetType.Test)
# labels = preds.flatten()
# display(labels[:10])
# pd.Series(labels.tolist()).hist(bins=100); 

In [None]:
# def rounder(preds):
#     y = preds.clone()
#     y[y < 0.7] = 0
#     y[(y >= 0.7) & (y < 1.4)] = 1
#     y[y >= 1.8] = 3
#     y[(y >= 1.4) & (y < 1.8)] = 2
#     return y.type(torch.IntTensor)
# labels = rounder(labels)
# # labels = labels.round().type(torch.IntTensor)
# pd.Series(labels.tolist()).hist(bins=4);

In [None]:
# pd.Series(preds.type(torch.IntTensor).flatten()).value_counts()

In [None]:
# Class scores for the attached test rows (one row per test installation_id).
preds, y = learn.get_preds(ds_type=DatasetType.Test)
# Index of the highest-scoring class, converted explicitly to a NumPy array.
# NOTE(review): assumes the class index equals the accuracy group, i.e. the learner's
# classes are 0, 1, 2, 3 in that order - confirm.
labels = preds.argmax(dim=1).numpy()
# Predictions are positional, so fail loudly if they do not line up with the test rows.
assert len(labels) == len(test), "number of predictions differs from number of test rows"
submission = pd.DataFrame({"installation_id": test["installation_id"].values, "accuracy_group": labels})
submission.to_csv("submission.csv", index=False)
len(submission), submission.accuracy_group.value_counts(normalize=True)