# Submission Kernel

This notebook requires three Kaggle datasets:
- riiid-nn-data (contains npy files with parts and tags)
- nn-data-no-lec (contains the .h5 file with all data organized by user/dataset)
- riiid-nn-model (contains the model.ckpt file to load)
    

In [None]:
import math

import h5py
import numpy as np
import pandas as pd
import pytorch_lightning as pl
import torch
from pytorch_lightning import seed_everything
from pytorch_lightning.metrics.functional import accuracy
from torch import nn as nn
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset, random_split
from tqdm.auto import tqdm

# Let pandas display up to 100 rows when a DataFrame is rendered.
pd.set_option("display.max_rows", 100)


# Fixed seed handed to pytorch-lightning's `seed_everything` helper.
SEED = 69
seed_everything(SEED)

In [None]:
# Cell output only: shows whether PyTorch can see a CUDA device (the result is not stored).
torch.cuda.is_available()

### Loading tags and lectures

In [None]:
%%time

# Folder of the `riiid-nn-data` dataset that holds the precomputed .npy arrays.
folder_path = "../input/riiid-nn-data"

# Competition metadata tables.
questions_df = pd.read_csv("../input/riiid-test-answer-prediction/questions.csv")
lectures_df = pd.read_csv("../input/riiid-test-answer-prediction/lectures.csv")

print("Loading lectures arrays")
lectures_ids = np.load(f"{folder_path}/lecture_ids.npy")
lectures_parts = np.load(f"{folder_path}/lectures_parts.npy")
# `lectures_tags` is intentionally not assigned here: it is built as a
# DataFrame further down in this cell, which used to overwrite a raw
# `lectures_df.tag.values` array before anything could read it.

print("Loading questions arrays")
questions_parts = np.load(f"{folder_path}/questions_parts.npy")
# Single "part" lookup array: question parts first, lecture parts appended.
questions_lectures_parts = np.concatenate([questions_parts, lectures_parts])


# process tags
def split_tags(t):
    """Parse a whitespace-separated tag string into a list of ints.

    Args:
        t: the raw ``tags`` cell; a string such as ``"51 131 162"``, or a
            non-string placeholder (NaN / None) when the cell is empty.

    Returns:
        The tags as a list of ints; an empty list when ``t`` is not a string
        or contains no tags.

    Raises:
        ValueError: if a token of the string is not an integer.
    """
    # Missing cells come through as float NaN (or None), not as strings.
    if not isinstance(t, str):
        return []
    # split() without an argument tolerates empty strings and repeated spaces,
    # which split(" ") would turn into un-parseable "" tokens.
    return [int(tag) for tag in t.split()]


# Turn the tags into a 2D table of shape (Q, T): one row per question and one
# column per tag slot (the original author notes T = 6). Questions with fewer
# tags get NaN in the unused slots.
questions_df["tags"] = questions_df.tags.apply(split_tags)
questions_tags = pd.DataFrame(questions_df["tags"].tolist(), index=questions_df.index)

# Map every raw lecture id to a new id placed after the question ids
# (row i of lectures_df -> 13523 + i). The mapping has to be built BEFORE the
# lecture_id column is overwritten with the new ids.
lectures_mapping = dict(
    zip(lectures_df.lecture_id.values, (lectures_df.index + 13523).values)
)
lectures_df.lecture_id = lectures_df.index + 13523
# One tag per lecture, indexed by the new id; it lands in column 0 of the
# combined table below.
lectures_tags = pd.DataFrame(
    lectures_df.tag.values, index=lectures_df.lecture_id.values
)

questions_lectures_tags = pd.concat([questions_tags, lectures_tags])
# Pad the empty slots with (max tag + 1) and convert to a plain integer array.
# `np.int` was only an alias of the builtin int and has been removed from
# NumPy (1.24), so an explicit 64-bit integer dtype is used instead.
questions_lectures_tags = (
    questions_lectures_tags.fillna(questions_lectures_tags.max().max() + 1)
    .astype(np.int64)
    .values
)

### Preprocessing code

In [None]:
def preprocess_df(df):
    """Normalise a raw test dataframe in place and return it.

    * casts ``content_type_id`` to bool (True marks a lecture row),
    * fills missing prior-question fields and scales the elapsed time to 0-1,
    * rewrites lecture ``content_id`` values through ``lectures_mapping``,
    * adds an ``answered_correctly`` column when the frame has none.
    """
    df.content_type_id = df.content_type_id.astype(bool)

    # prior-question features: missing values count as 0
    had_explanation = df.prior_question_had_explanation.fillna(0)
    df.prior_question_had_explanation = had_explanation.astype(np.uint8)

    max_elapsed = 300000  # cap, then divide by the cap to land in [0, 1]
    capped_elapsed = df.prior_question_elapsed_time.fillna(0).clip(upper=max_elapsed)
    df.prior_question_elapsed_time = capped_elapsed / max_elapsed

    # lecture rows get their content_id translated to the remapped lecture ids
    is_lecture = df.content_type_id
    remapped_ids = df[is_lecture].content_id.map(lectures_mapping)
    df.loc[is_lecture, "content_id"] = remapped_ids

    # without ground truth, questions are marked 3 (padding) and lectures 4
    has_answers = "answered_correctly" in df.columns
    if not has_answers:
        placeholder = {False: 3, True: 4}
        df["answered_correctly"] = df.content_type_id.map(placeholder)

    return df


# Small offset added before the logarithm so that a zero gap stays finite.
eps = 1e-7


def get_time_elapsed_from_timestamp(arr):
    """Convert a sequence of timestamps into scaled log time gaps.

    Each entry is differenced against the previous one (the first against 0),
    divided by 1000, passed through ``log(x + eps)``, cast to float32 and
    finally shifted by 3.5 and divided by 20.
    """
    gaps = np.diff(arr, prepend=0) / 1000
    log_gaps = np.log(gaps + eps).astype(np.float32)
    return (log_gaps - 3.5) / 20

### Pytorch Model

In [None]:
from pytorch_lightning.core.decorators import auto_move_data
from pytorch_lightning.metrics.functional.classification import auroc


def init_weights(m):
    """Xavier-initialise the weight of a plain ``nn.Linear`` and zero its bias.

    Modules of any other type (including subclasses of ``nn.Linear``) are left
    untouched, as are bias-free linear layers' missing biases.

    NOTE: this function is defined a second time later in this cell; the later
    definition is the one in effect at runtime.
    """
    # Exact type match on purpose: subclasses keep their own initialisation.
    if type(m) is nn.Linear:
        torch.nn.init.xavier_uniform_(m.weight)
        # nn.Linear(bias=False) has bias None, which zeros_ cannot handle.
        if m.bias is not None:
            torch.nn.init.zeros_(m.bias)


class PositionalEncoding(nn.Module):
    """Fixed sinusoidal position table.

    Even embedding columns hold ``sin(pos * freq)`` and odd columns hold
    ``cos(pos * freq)``; the table is stored as the non-trainable buffer
    ``pe`` of shape ``(max_len, 1, d_model)``.

    NOTE: this class is defined a second time later in this cell; the later
    definition is the one in effect at runtime.

    Args:
        d_model: embedding dimension (odd values are supported).
        max_len: number of positions to precompute.
    """

    def __init__(self, d_model, max_len=1000):
        super().__init__()

        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)
        )
        angles = position * div_term  # (max_len, ceil(d_model / 2))

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one cosine column fewer than sine
        # columns, so the angle table is trimmed to the number of odd columns.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        # (max_len, d_model) -> (max_len, 1, d_model) so it broadcasts over
        # the batch dimension of sequence-first tensors.
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer("pe", pe)

    def forward(self, sequence_length):
        """Return the first ``sequence_length`` rows: (sequence_length, 1, d_model)."""
        return self.pe[:sequence_length, :]


def init_weights(m):
    """Xavier-initialise the weight of a plain ``nn.Linear`` and zero its bias.

    Modules of any other type (including subclasses of ``nn.Linear``) are left
    untouched, as are bias-free linear layers' missing biases.

    NOTE: this re-defines the ``init_weights`` declared earlier in this cell;
    being the later definition, it is the one in effect at runtime.
    """
    # Exact type match on purpose: subclasses keep their own initialisation.
    if type(m) is nn.Linear:
        torch.nn.init.xavier_uniform_(m.weight)
        # nn.Linear(bias=False) has bias None, which zeros_ cannot handle.
        if m.bias is not None:
            torch.nn.init.zeros_(m.bias)


class PositionalEncoding(nn.Module):
    """Fixed sinusoidal position table.

    Even embedding columns hold ``sin(pos * freq)`` and odd columns hold
    ``cos(pos * freq)``; the table is stored as the non-trainable buffer
    ``pe`` of shape ``(max_len, 1, d_model)``.

    NOTE: this re-defines the ``PositionalEncoding`` declared earlier in this
    cell; being the later definition, it is the one in effect at runtime.

    Args:
        d_model: embedding dimension (odd values are supported).
        max_len: number of positions to precompute.
    """

    def __init__(self, d_model, max_len=1000):
        super().__init__()

        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)
        )
        angles = position * div_term  # (max_len, ceil(d_model / 2))

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one cosine column fewer than sine
        # columns, so the angle table is trimmed to the number of odd columns.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        # (max_len, d_model) -> (max_len, 1, d_model) so it broadcasts over
        # the batch dimension of sequence-first tensors.
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer("pe", pe)

    def forward(self, sequence_length):
        """Return the first ``sequence_length`` rows: (sequence_length, 1, d_model)."""
        return self.pe[:sequence_length, :]


class RIIDDTransformerModel(pl.LightningModule):
    """Encoder-decoder transformer that scores the probability of a correct answer.

    The encoder receives the exercise sequence (weighted sum of content-id,
    part and tag embeddings); the decoder receives the response sequence
    (weighted sum of previous-answer and elapsed-time embeddings, optionally
    the prior-question time). Both sides use a causal mask so position ``t``
    cannot attend to later positions. All sequence tensors are sequence-first:
    ``(sequence_length, n_users, ...)``.

    NOTE(review): the following look unintended but are left untouched here,
    because changing them alters either the forward pass of an already
    trained checkpoint or the initialisation of a newly trained model:
      * ``init_weights(self)`` is called on the module itself instead of via
        ``self.apply(init_weights)``, so it never matches an ``nn.Linear``;
      * ``response_weights`` has shape ``(1, k)`` and is soft-maxed over
        ``dim=0`` (size 1), so every response component gets weight 1.0;
      * ``use_prior_q_explanation=True`` appends a response weight, but
        ``forward`` never appends the matching component
        (``embed_prior_q_explanation`` is never used).
    """

    def __init__(
        self,
        learning_rate=0.001,
        n_content_id=13943,  # number of different contents = 13942 + 1 (for padding)
        n_part=8,  # number of different parts = 7 + 1 (for padding)
        n_tags=189,  # number of different tags = 188 + 1 (for padding)
        n_correct=5,  # 0,1 (false, true), 2 (start token), 3 (padding), 4 (lecture)
        emb_dim=64,  # embedding dimension
        dropout=0.1,
        n_heads: int = 1,
        n_encoder_layers: int = 2,
        n_decoder_layers: int = 2,
        dim_feedforward: int = 256,
        activation: str = "relu",
        max_window_size=100,
        use_prior_q_times=False,
        use_prior_q_explanation=False,
    ):
        super().__init__()
        self.model_type = "RiiidTransformer"
        self.learning_rate = learning_rate
        self.max_window_size = max_window_size

        self.use_prior_q_times = use_prior_q_times
        self.use_prior_q_explanation = use_prior_q_explanation

        # save params of models to yml
        self.save_hyperparameters()

        #### EXERCISE SEQUENCE
        self.embed_content_id = nn.Embedding(n_content_id, emb_dim, padding_idx=13942)
        self.embed_parts = nn.Embedding(n_part, emb_dim, padding_idx=0)
        self.embed_tags = nn.Embedding(n_tags, emb_dim, padding_idx=188)

        # learnable mixing weights for the (content, part, tags) embeddings
        self.exercise_weights = torch.nn.Parameter(torch.tensor([0.35, 0.55, 0.1]))

        ### RESPONSE SEQUENCE (1st time stamp of sequence is useless)
        self.embed_answered_correctly = nn.Embedding(
            n_correct, emb_dim, padding_idx=3
        )  # 0/1 answers + start token + padding + lecture

        self.embed_timestamps = nn.Linear(1, emb_dim)

        # embed prior q time and q explanation
        self.embed_prior_q_time = nn.Linear(1, emb_dim)
        self.embed_prior_q_explanation = nn.Embedding(2, emb_dim)

        # learnable mixing weights for the response embeddings: one entry per
        # component (answer, timestamp, plus the optional ones)
        w = [0.5, 0.5]
        if use_prior_q_times:
            w.append(0.5)
        if use_prior_q_explanation:
            w.append(0.5)

        # NOTE(review): wrapping `w` in a list makes this a (1, k) tensor.
        self.response_weights = torch.nn.Parameter(torch.tensor([w]))

        self.pos_encoder = PositionalEncoding(emb_dim)

        self.transformer = nn.Transformer(
            d_model=emb_dim,
            nhead=n_heads,
            num_encoder_layers=n_encoder_layers,
            num_decoder_layers=n_decoder_layers,
            dropout=dropout,
            dim_feedforward=dim_feedforward,
            activation=activation,
        )

        self.out_linear = nn.Linear(emb_dim, 2)
        # NOTE(review): called on the module itself, not through self.apply().
        init_weights(self)

    def generate_square_subsequent_mask(self, sz):
        """Return an (sz, sz) float mask: 0.0 on/below the diagonal, -inf above."""
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = (
            mask.float()
            .masked_fill(mask == 0, float("-inf"))
            .masked_fill(mask == 1, float(0.0))
        )
        return mask

    def forward(
        self, content_ids, parts, answers, tags, timestamps, prior_q_times,
    ):
        """Score every position of every sequence.

        Args:
            content_ids, parts, answers: ``(S, B)`` index tensors, or flat
                ``(S,)`` tensors for a single sequence.
            tags: ``(S, B, T)`` index tensor (``(S, T)`` when flat).
            timestamps, prior_q_times: ``(S, B)`` float tensors (``(S,)`` when
                flat).

        Returns:
            ``(S, B)`` tensor with the soft-max probability of class 1 at each
            position.
        """
        # if data is flat then expand to get Batch dim
        if len(content_ids.shape) == 1:
            content_ids = content_ids.unsqueeze(1)
            parts = parts.unsqueeze(1)
            answers = answers.unsqueeze(1)
            tags = tags.unsqueeze(1)
            timestamps = timestamps.unsqueeze(1)
            prior_q_times = prior_q_times.unsqueeze(1)

        sequence_length = content_ids.shape[0]

        # sequence that will go into encoder
        embeded_content = self.embed_content_id(content_ids)
        embeded_parts = self.embed_parts(parts)
        # the tag embeddings of one exercise are summed into a single vector
        embeded_tags = self.embed_tags(tags).sum(dim=2)
        e_w = F.softmax(self.exercise_weights, dim=0)

        embeded_exercise_sequence = (
            torch.stack([embeded_content, embeded_parts, embeded_tags], dim=3) * e_w
        ).sum(dim=3)

        # sequence that will go into decoder
        embeded_answered_correctly = self.embed_answered_correctly(answers)
        embeded_timestamps = self.embed_timestamps(timestamps.unsqueeze(2))
        # NOTE(review): response_weights is (1, k), so this soft-max over
        # dim=0 evaluates to all ones.
        r_w = F.softmax(self.response_weights, dim=0)

        exercise_sequence_components = [embeded_answered_correctly, embeded_timestamps]
        if self.use_prior_q_times:
            embeded_q_times = self.embed_prior_q_time(prior_q_times.unsqueeze(2))
            # zero embedding - if start token
            embeded_q_times[0, torch.where(answers[0, :] == 2)[0], :] = 0
            exercise_sequence_components.append(embeded_q_times)

        embeded_responses = (
            torch.stack(exercise_sequence_components, dim=3) * r_w
        ).sum(dim=3)

        # one extra position so both sequences can take shifted slices of it
        embedded_positions = self.pos_encoder(sequence_length + 1)

        # add shifted position embedding ( start token is first position)
        embeded_responses = embeded_responses + embedded_positions[:-1, :, :]

        embeded_exercise_sequence = (
            embeded_exercise_sequence + embedded_positions[1:, :, :]
        )

        # mask of shape S x S -> prevents attention looking forward
        top_right_attention_mask = self.generate_square_subsequent_mask(
            sequence_length
        ).type_as(embeded_exercise_sequence)

        output = self.transformer(
            embeded_exercise_sequence,
            embeded_responses,
            tgt_mask=top_right_attention_mask,  # (T,T)
            src_mask=top_right_attention_mask,  # (S,S)
        )

        output = self.out_linear(output)
        return F.softmax(output, dim=2)[:, :, 1]

    def process_batch_step(self, batch):
        """Run ``forward`` on a collated batch dict and return its predictions."""
        return self(
            batch["content_ids"],
            batch["parts"],
            batch["answers"],
            batch["tags"],
            batch["timestamps"],
            batch["prior_q_times"],
        )

    @auto_move_data
    def predict_n_steps(self, batch, steps, return_all_preds=False):
        """Predict the last ``steps[u]`` positions of every user ``u`` in the batch.

        The model is re-run once per step; after each run the thresholded
        prediction is written into ``batch["answers"]`` (in place) at the next
        position so that the following run can condition on it. Lecture
        entries (value 4) are never overwritten.

        Args:
            batch: collated batch dict with at least ``content_ids`` (S, B),
                ``answers``, ``length`` (B,) and ``row_ids``.
            steps: tensor of length B; number of trailing positions to predict
                for each user.
            return_all_preds: when True, return the full (S, B) prediction
                tensor of the last run instead of the selected entries.

        Returns:
            ``(predictions, row_ids)`` — two flat tensors aligned entry by
            entry — or the full prediction tensor when ``return_all_preds``.

        Raises:
            ValueError: if no user has at least one step to predict.
        """
        seq_length, n_users = batch["content_ids"].shape
        lengths = batch["length"]

        # Created on the batch's device so that the boolean masks derived from
        # `steps` can index it without mixing devices.
        users = torch.arange(n_users, device=lengths.device)

        user_indexes = []
        sequence_indexes = []

        max_steps = int(steps.max())
        if max_steps < 1:
            # without a single run there would be no predictions to return
            raise ValueError("predict_n_steps needs at least one user with steps >= 1")

        for i in range(max_steps, 0, -1):
            preds = self.process_batch_step(batch)

            # users that still have at least i positions left to predict, and
            # the position predicted for each of them in this run
            sequence_indexes_at_i = lengths[steps >= i] - i
            user_indexes_at_i = users[steps >= i]

            # get index for which to update the answers
            # since answers is shifted we want to map preds 0..98 -> answers 1:99
            answers_idx = torch.where(sequence_indexes_at_i + 1 != seq_length)
            a_seq_idx = sequence_indexes_at_i[answers_idx] + 1
            u_seq_idx = user_indexes_at_i[answers_idx]

            # set answer to either 0 or 1 if not lecture
            batch["answers"][a_seq_idx, u_seq_idx] = torch.where(
                batch["answers"][a_seq_idx, u_seq_idx] != 4,
                (preds[sequence_indexes_at_i[answers_idx], u_seq_idx] > 0.5).long(),
                batch["answers"][a_seq_idx, u_seq_idx],
            )

            user_indexes.append(user_indexes_at_i)
            sequence_indexes.append(sequence_indexes_at_i)

        if return_all_preds:
            return preds

        user_indexes = torch.cat(user_indexes)
        sequence_indexes = torch.cat(sequence_indexes)

        return (
            preds[sequence_indexes, user_indexes],
            batch["row_ids"][sequence_indexes, user_indexes],
        )

    @auto_move_data
    def predict_fast_single_user(
        self, content_ids, parts, answers, tags, timestamps, prior_q_times, n=1
    ):
        """Predict the last ``n`` positions of a single, flat (un-batched) sequence.

        The model is run ``n`` times; after each run the thresholded
        prediction is written into ``answers`` (in place) at the next position
        so the following run can condition on it. Lecture entries (value 4)
        are never overwritten.

        Returns:
            Tensor of length ``n`` with the predictions in sequence order.
        """
        length = len(content_ids)
        out_predictions = torch.zeros(n, device=self.device)
        for i in range(n, 0, -1):
            preds = self(content_ids, parts, answers, tags, timestamps, prior_q_times)
            out_predictions[n - i] = preds[length - i, 0]

            # answers are shifted (start token) so need + 1
            answer_idx = length - i + 1
            # Skip only when the slot lies past the end of `answers` (which
            # happens after the final prediction) or holds a lecture marker.
            # The previous test `answer_idx + 1 < len(answers)` also skipped
            # the last valid slot, so the final step never saw the prediction
            # made just before it (unlike predict_n_steps).
            if answer_idx < len(answers) and answers[answer_idx] != 4:
                answers[answer_idx] = (preds[length - i, 0] > 0.5).long()

        return out_predictions

In [None]:
# Restore the trained model from the `riiid-nn-model` dataset.
model = RIIDDTransformerModel.load_from_checkpoint("../input/riiid-nn-model/model.ckpt")
# Inference only: pytorch-lightning's freeze() is applied before any prediction.
model.freeze()
# Use the GPU when one is visible; otherwise stay on the CPU instead of
# crashing the way an unconditional model.cuda() does on CPU-only sessions.
model.to("cuda" if torch.cuda.is_available() else "cpu")

### Pytorch Dataset and Collate Function

In [None]:
class InferenceRIIDDataset(Dataset):
    """Rolling per-user interaction history used at inference time.

    Histories are read lazily from an HDF5 file the first time a user is
    seen and are then kept in ``self.cache``, where new rows are appended and
    ground-truth answers are patched in as they become available. Items are
    looked up by ``user_id`` (not by position).
    """

    def __init__(
        self, hdf5_file="feats.h5", window_size=100,
    ):
        """
        Args:
            hdf5_file (string): location of the HDF5 feature file
            window_size (int): maximum number of interactions returned per user
        """
        self.hdf5_file = hdf5_file
        self.max_window_size = window_size
        # user_id -> dict of per-interaction arrays plus the pending "steps"
        self.cache = {}

    def open_hdf5(self):
        """Open the HDF5 file read-only and keep the handle on ``self.f``."""
        self.f = h5py.File(self.hdf5_file, "r")

    def __len__(self):
        """Number of top-level keys in the HDF5 file."""
        if not hasattr(self, "f"):
            self.open_hdf5()
        return len(self.f.keys())

    def load_user_into_cache(self, user_id):
        """Create the cache entry of ``user_id``.

        The entry is filled from the HDF5 file when the user is stored there
        and starts empty otherwise.
        """
        if not hasattr(self, "f"):
            self.open_hdf5()

        if f"{user_id}" in self.f:
            content_ids = np.array(self.f[f"{user_id}/content_ids"], dtype="int64")
            answers = np.array(self.f[f"{user_id}/answered_correctly"], dtype="int64")
            timestamps = np.array(self.f[f"{user_id}/timestamps"], dtype="float32")
            prior_q_times = np.array(
                self.f[f"{user_id}/prior_question_elapsed_time"], dtype="float32"
            )
            # Stored history has no row ids; use integer zeros so the dtype
            # matches the int64 array used for users without stored history
            # (np.zeros defaults to float64).
            row_ids = np.zeros(len(timestamps), dtype="int64")
        else:
            content_ids = np.array([], dtype="int64")
            answers = np.array([], dtype="int64")
            timestamps = np.array([], dtype="float32")
            prior_q_times = np.array([], dtype="float32")
            row_ids = np.array([], dtype="int64")

        self.cache[user_id] = {
            "content_ids": content_ids,
            "answers": answers,
            "timestamps": timestamps,
            "prior_q_times": prior_q_times,
            "row_ids": row_ids,
            "steps": 0,
        }

    def update_user_rows(self, user_rows):
        """Append the new rows of one user to that user's cached history.

        Args:
            user_rows: DataFrame holding rows of a single user with the
                columns ``user_id``, ``content_id``, ``timestamp``,
                ``answered_correctly``, ``prior_question_elapsed_time`` and
                ``row_id``.

        Every cached array is truncated to the last ``max_window_size + 1``
        entries, and ``steps`` is set to the number of rows just added.
        """
        user_id = user_rows.user_id.values[0]
        new_values = {
            "content_ids": user_rows.content_id.values,
            "timestamps": user_rows.timestamp.values,
            # expected to be 3 for every question and 4 for lectures
            "answers": user_rows.answered_correctly.values,
            "prior_q_times": user_rows.prior_question_elapsed_time.values,
            "row_ids": user_rows.row_id.values,
        }

        # opens the HDF5 file on demand, so no explicit open is needed here
        if user_id not in self.cache:
            self.load_user_into_cache(user_id)

        entry = self.cache[user_id]
        # one entry more than the window: __getitem__ reads the answer just
        # before the window as the first element of the shifted answers
        keep = self.max_window_size + 1
        for key, values in new_values.items():
            entry[key] = np.concatenate([entry[key], values])[-keep:]

        entry["steps"] = len(new_values["row_ids"])

    def update_answered_correctly(self, answered_correctly_rows):
        """Write the ground-truth answers of the pending rows into the cache.

        Args:
            answered_correctly_rows: Series of answers whose ``name`` is the
                user id; it must hold one value per pending non-lecture row.

        Lecture markers (4) among the pending rows are kept; ``steps`` is
        reset to 0 afterwards.
        """
        user_id = answered_correctly_rows.name
        new_answered_correctly = answered_correctly_rows.values

        entry = self.cache[user_id]
        answers = entry["answers"]
        # The pending rows are the last `steps` entries. An explicit start
        # index is used because `answers[-0:]` would select the whole history
        # when nothing is pending.
        first_pending = max(len(answers) - entry["steps"], 0)
        idxs = np.where(answers[first_pending:] != 4)[0] + first_pending

        answers[idxs] = new_answered_correctly
        entry["steps"] = 0

    def __getitem__(self, user_id):
        """Return the model inputs for the latest window of ``user_id``.

        Returns:
            dict of tensors (``parts``, ``tags``, ``content_ids``,
            ``answers``, ``prior_q_times``, ``timestamps``, ``row_ids``) plus
            the ints ``length`` (window size) and ``steps`` (pending rows).
            ``answers`` is shifted right by one position.
        """
        if user_id not in self.cache:
            self.load_user_into_cache(user_id)

        length = len(self.cache[user_id]["content_ids"])
        window_size = min(self.max_window_size, length)

        content_ids = self.cache[user_id]["content_ids"][-window_size:]
        answers = self.cache[user_id]["answers"][-window_size:]
        timestamps = self.cache[user_id]["timestamps"][-window_size:]
        prior_q_times = self.cache[user_id]["prior_q_times"][-window_size:]
        row_ids = self.cache[user_id]["row_ids"][-window_size:]

        # convert timestamps to time elapsed
        time_elapsed_timestamps = get_time_elapsed_from_timestamp(timestamps)

        # get tags
        tags = questions_lectures_tags[content_ids, :].astype(np.int64)

        # get parts
        parts = questions_lectures_parts[content_ids].astype(np.int64)

        # shift by one the answered_correctly sequence (np.roll returns a new
        # array, so the cached answers are not modified below)
        answers = np.roll(answers, 1)

        if length > self.max_window_size:
            # first slot = the real answer that sits just before the window
            answers[0] = self.cache[user_id]["answers"][-window_size - 1]
        else:
            # first slot = start token
            answers[0] = 2

        return {
            "parts": torch.from_numpy(parts).long(),
            "tags": torch.from_numpy(tags).long(),
            "content_ids": torch.from_numpy(content_ids).long(),
            "answers": torch.from_numpy(answers).long(),
            "prior_q_times": torch.from_numpy(prior_q_times).float(),
            "timestamps": torch.from_numpy(time_elapsed_timestamps).float(),
            "length": window_size,
            "row_ids": torch.from_numpy(row_ids).int(),
            "steps": self.cache[user_id]["steps"],
        }

In [None]:
from torch.nn.utils.rnn import pad_sequence


# The collate function is used to merge individual data samples into a batch
# It handles the padding aspect
def inference_collate_fn(batch):
    """Merge per-user samples into one sequence-first batch.

    Args:
        batch: non-empty list of sample dicts holding the ints ``length`` and
            ``steps`` plus one tensor per sequence key, each with the sequence
            as its first dimension.

    Returns:
        dict with ``length`` and ``steps`` as 1D tensors and every sequence
        key stacked along a new second dimension, i.e. ``(max_length, B, ...)``.
        Shorter sequences are padded at the end with the key's padding value.
    """
    # collate lengths and pending steps into 1D tensors
    items = {
        "length": torch.tensor([batch_item["length"] for batch_item in batch]),
        "steps": torch.tensor([batch_item["steps"] for batch_item in batch]),
    }

    # padding value used for each sequence key
    padding_values = {
        "parts": 0,
        "content_ids": 13942,
        "answers": 3,
        "timestamps": 0.0,  # note timestamps isnt an embedding
        "prior_q_times": 0,
        "tags": 188,
        "row_ids": 0,
    }

    if len(batch) > 1:
        for key, padding in padding_values.items():
            items[key] = pad_sequence(
                [batch_item[key] for batch_item in batch],
                batch_first=False,
                padding_value=padding,
            )
    else:
        # a single sample needs no padding, only the batch dimension
        for key in padding_values:
            items[key] = batch[0][key].unsqueeze(1)
    return items

In [None]:
# Maximum number of interactions per user handed to the model.
WINDOW_SIZE = 100
# Per-user history store backed by the HDF5 file of the `nn-data-no-lec` dataset.
inference_dataset = InferenceRIIDDataset(
    hdf5_file="../input/nn-data-no-lec/feats.h5", window_size=WINDOW_SIZE
)

### Predict function

In [None]:
def predict_for_df(df):
    """Fill ``answered_correctly`` with model predictions and return the question rows.

    The column is first set to 0.5 on ``df`` itself. With exactly one user
    having questions the single-user prediction path is used; with several
    users a padded batch is built and predictions are matched by ``row_id``.
    A frame without any question row is returned without predictions.
    """
    question_users = df[~df.content_type_id].user_id.unique()
    df["answered_correctly"] = 0.5  # placeholder value for the column

    n_users = len(question_users)
    if n_users == 1:
        # single user: run the un-batched fast path
        only_user = question_users[0]
        sample = inference_dataset[only_user].copy()
        user_predictions = model.predict_fast_single_user(
            sample["content_ids"],
            sample["parts"],
            sample["answers"],
            sample["tags"],
            sample["timestamps"],
            sample["prior_q_times"],
            n=sample["steps"],
        ).cpu()
        user_rows = df.user_id == only_user
        df.loc[user_rows, "answered_correctly"] = user_predictions.numpy()
    elif n_users > 1:
        # several users: collate into one padded batch
        samples = [inference_dataset[u].copy() for u in question_users]
        batch = inference_collate_fn(samples)
        predictions, row_ids = model.predict_n_steps(batch, batch["steps"])
        prediction_by_row = dict(zip(row_ids.cpu().numpy(), predictions.cpu().numpy()))
        df["answered_correctly"] = df["row_id"].map(prediction_by_row)

    # n_users == 0 (lectures only) falls through: nothing to predict
    return df[~df.content_type_id]


### Iterate over test

In [None]:
# Competition API: `iter_test` is iterated by the inference loop, which unpacks
# each item into (test dataframe, sample prediction dataframe); predictions
# are handed back through `set_predict`.
import riiideducation
env = riiideducation.make_env()
iter_test = env.iter_test()
set_predict = env.predict

In [None]:
%%time
import ast

# Columns of the current group that are appended to the per-user cache.
cache_columns = [
    "row_id",
    "user_id",
    "content_id",
    "timestamp",
    "answered_correctly",
    "prior_question_elapsed_time",
]

previous_test_df = None
for (current_test, sample_prediction_df) in iter_test:
    if previous_test_df is not None:
        # The answers of the previous group arrive as the text of a list in
        # the first row of the current group. The text comes from outside the
        # notebook, so it is parsed with ast.literal_eval (literals only)
        # instead of being executed with eval().
        previous_test_df["answered_correctly"] = ast.literal_eval(
            current_test["prior_group_answers_correct"].iloc[0]
        )
        previous_questions = previous_test_df[~previous_test_df.content_type_id]
        # Explicit loop instead of groupby.apply: the call is made purely for
        # its side effect. update_answered_correctly reads the user id from
        # the Series name, so each group is renamed to its key.
        for user_id, user_answers in previous_questions.groupby(
            "user_id"
        ).answered_correctly:
            inference_dataset.update_answered_correctly(user_answers.rename(user_id))

    current_test = preprocess_df(current_test)
    # keep the full group (lectures included) so it lines up with the answer
    # list delivered together with the next group
    previous_test_df = current_test.copy()
    # questions only; copied because predict_for_df writes a column into it
    current_test = current_test[~current_test.content_type_id].copy()

    # add current rows to the per-user cache (one call per user)
    for _, user_rows in current_test[cache_columns].groupby("user_id"):
        inference_dataset.update_user_rows(user_rows)

    # predict and submit
    current_test = predict_for_df(current_test)
    set_predict(current_test[["row_id", "answered_correctly"]])