In [8]:
import pandas as pd
import numpy as np
import torch
from torch import nn, optim
from torch.utils.data import Dataset
from torch.nn import functional as F
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from os.path import exists
from os import mkdir

In [9]:
# Data location.
# Alternative input directory (commented out):
# dir = '../input/riiid-test-answer-prediction/'
# NOTE: the name `dir` shadows Python's built-in dir(); it is kept unchanged
# because the data-loading cells build their file paths from it.
dir = './'
FEATURE_FOLDER_PATH = f'{dir}riiid_features/'

# Hyper parameters.
BATCH_SIZE = 32
LEARNING_RATE = 1e-3
WEIGHT_DECAY = 0

In [10]:

# Read large datasets: https://www.kaggle.com/rohanrao/tutorial-on-reading-large-datasets
# Column name -> dtype string, handed to pd.read_csv(dtype=...) when loading
# train.csv. "boolean" is pandas' nullable boolean extension dtype.
train_dtypes = dict(
    row_id="int64",
    timestamp="int64",
    user_id="int32",
    content_id="int16",
    content_type_id="boolean",
    task_container_id="int16",
    user_answer="int8",
    answered_correctly="int8",
    prior_question_elapsed_time="float32",
    prior_question_had_explanation="boolean",
)

# Why some columns are left out of req_cols:
#   row_id: redundant
#   task_container_id: tells you what container this question is in;
#       the max container size is 5, so it is not really significant
#   user_answer: doesn't really affect whether the answer is correct
#   prior_question_had_explanation: majority of learning (we assume) will be done from lectures, not answer explanations
#       NOTE: this column is still listed in req_cols, so it is currently loaded anyway
#

# Columns of train.csv that are actually read (passed to read_csv as usecols).
req_cols = [
    'timestamp',
    'user_id',
    'content_id',
    'content_type_id',
    'answered_correctly',
    'prior_question_elapsed_time',
    'prior_question_had_explanation',
]

# function to convert the milliseconds to seconds at load time
# messes with the dtypes above and doesn't really save time so it is commented (see converters)
def mil_to_sec(val):
    """Convert a raw CSV cell holding milliseconds into whole seconds.

    Intended for use as a ``pd.read_csv`` converter, which passes each cell
    as a string.

    Parameters
    ----------
    val : str
        Cell content. May be an integer literal (``"56943"``), a
        float-formatted literal (``"24000.0"``), or empty / whitespace-only
        for a missing value.

    Returns
    -------
    int or float
        The value divided by 1000 and rounded with the built-in ``round``
        (an ``int``), or ``np.nan`` when the cell is missing or NaN.

    Raises
    ------
    ValueError
        If the cell is non-empty and not parseable as a number.
    """
    # Missing cell -> NaN. `np.nan` is used because the `np.NaN` alias was
    # removed in NumPy 2.0.
    if val is None or str(val).strip() == '':
        return np.nan
    try:
        # Preferred path: exact integer parsing, as the original did.
        millis = int(val)
    except ValueError:
        # Float-formatted cells such as "24000.0" are not accepted by int().
        millis = float(val)
    # NaN is the only value not equal to itself; round() would raise on it.
    if millis != millis:
        return np.nan
    return round(millis / 1000)

# Read a single row to obtain the column labels pandas returns for req_cols,
# then map each column name to its positional index.
train_columns = pd.read_csv(dir + 'train.csv', usecols=req_cols, nrows=1).columns
t_index = dict(zip(train_columns, range(len(train_columns))))

# Load the first 10,000,000 rows of train.csv with the explicit dtypes.
data_df = pd.read_csv(
    dir + 'train.csv',
    usecols=req_cols,
    dtype=train_dtypes,
    # converters={'timestamp': mil_to_sec,
    #             'prior_question_elapsed_time': mil_to_sec},
    nrows=10_000_000,
)
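# NOTE: splitting the full frame by content_type_id like this is kinda cheating,
# since we don't actually have all the data at once.
# (`raw_df` is not defined in the cells shown here; the loaded frame is `data_df`.)
# train_df = raw_df[raw_df['content_type_id'] == 0]
# lecture_events_df = raw_df[raw_df['content_type_id'] == 1]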

# Load the three auxiliary CSV files from the data directory, in the same
# order as before: questions, lectures, example test set.
questions_df, lectures_df, example_test_df = (
    pd.read_csv(dir + csv_name)
    for csv_name in ('questions.csv', 'lectures.csv', 'example_test.csv')
)

In [11]:
# Split the loaded events by content_type_id: rows equal to 0 go to the
# questions frame, rows equal to 1 go to the lectures frame.
student_questions_df = data_df.loc[data_df['content_type_id'].eq(0)]
student_lectures_df = data_df.loc[data_df['content_type_id'].eq(1)]

In [19]:
# Attach lecture metadata to each lecture event, then collapse to one row per
# user whose cells are Python lists of that user's values.
df1 = (
    student_lectures_df
    .merge(lectures_df, left_on='content_id', right_on='lecture_id')
    .groupby('user_id')
    .agg(
        tags=('tag', list),
        ltimestamp=('timestamp', list),
        type_of=('type_of', list),
    )
)

In [20]:
# Attach question metadata to each question event, then collapse to one row
# per user whose cells are Python lists of that user's values.
df2 = (
    student_questions_df
    .merge(questions_df, left_on='content_id', right_on='question_id')
    .groupby('user_id')
    .agg(
        answered_correctly=('answered_correctly', list),
        qtimestamp=('timestamp', list),
    )
)

# Left join on the user_id index: every user in df2 is kept, and the lecture
# columns are NaN for users that have no row in df1.
time_features = df2.join(df1, how='left')

In [26]:
# Display the column labels of the joined per-user frame.
time_features.columns

Index(['answered_correctly', 'qtimestamp', 'tags', 'ltimestamp', 'type_of'], dtype='object')

In [24]:
students = {}
for row in time_features.to_numpy():

array([[list([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1]),
        list([0, 56943, 118363, 131167, 137965, 157063, 176092, 194190, 212463, 230983, 255381, 280033, 302994, 328686, 352686, 376162, 398020, 418008, 437272, 468511, 490100, 510583, 534187, 557677, 575289, 597863, 621464, 645415, 670520, 692971, 710402, 732421, 1219624, 1252621, 1284094, 1320874, 1359412, 1415188, 1468285, 667861680, 667971812, 667971812, 667971812, 668090043, 668090043, 668090043]),
        nan, nan, nan],
       [list([1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]),
        list([0, 32683, 62000, 83632, 189483, 189483, 189483, 258793, 258793, 258793, 330528, 330528, 330528, 382790, 382790, 382790, 419266, 445527, 475421, 492328, 513206, 523644, 554504, 554504, 554504, 554504, 571323, 571323, 571323, 571323]),
        nan, nan, nan],
       [list([0, 0, 0, 1, 0, 1, 1, 1, 