In [68]:

import pandas as pd
import numpy as np
import torch
from torch import nn, optim
from torch.nn import functional as F
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [69]:
# dir = '../input/riiid-test-answer-prediction/'
dir = './'
# Read large datasets: https://www.kaggle.com/rohanrao/tutorial-on-reading-large-datasets
train_dtypes = {
    "row_id": "int64",
    "timestamp": "int64",
    "user_id": "int32",
    "content_id": "int16",
    "content_type_id": "boolean",
    "task_container_id": "int16",
    "user_answer": "int8",
    "answered_correctly": "int8",
    "prior_question_elapsed_time": "float32",
    "prior_question_had_explanation": "boolean"
}

# why we removed columns:
#   row_id: redundant
#   task_container_id: tells you what container this question is in
#       the max container size is 5 so not really significant
#   user_answer: doesnt really affect if the answer is correct
#   prior_question_had_explanation: majority of learning (we assume) will be done from lectures, not answer explanations
#

req_cols = ['timestamp', 'user_id', 'content_id', 'content_type_id', 'answered_correctly']
            # 'prior_question_elapsed_time']

# function to convert the milliseconds to seconds at load time
# messes with the dtypes above and doesn't really save time so it is commented (see converters)
def mil_to_sec(val):
    if val == '':
        return np.NaN
    return round(int(val)/1000)

train_columns = pd.read_csv(dir + 'train.csv', usecols=req_cols, nrows=1).columns
train = pd.read_csv(dir + 'train.csv', usecols=req_cols, dtype=train_dtypes,
                       # converters={'timestamp': mil_to_sec,
                       #             'prior_question_elapsed_time': mil_to_sec},
                       nrows=1000).to_numpy()
# the following lines are kinda cheating since we don't actually have all the data at once
# train_df = raw_df[raw_df['content_type_id'] == 0]
# lecture_events_df = raw_df[raw_df['content_type_id'] == 1]

questions_df = pd.read_csv(dir + 'questions.csv')
lectures_df = pd.read_csv(dir + 'lectures.csv')
example_test_df = pd.read_csv(dir + 'example_test.csv')

# train_df = train_df.drop(['content_id', 'content_type_id'], axis=1) # TODO: replace

# we need a way to dummify the user id
# pd.get_dummies(train_df, columns=['user_id']) # DO NOT TRY THIS IT WILL OVERLOAD RAM

In [96]:
class StudentKnowledge:
    def __init__(self, num_tags):
        self.lectures = []
        self.lecture_timestamp = []
        self.questions = []
        self.question_timestamp = []
        self.num_tags = num_tags

    def add_lecture(self, lecture_tag, lecture_timestamp):
        self.lectures.append(lecture_tag)
        self.lecture_timestamp.append(lecture_timestamp)

    def add_question(self, question_answered_right, question_timestamp):
        self.questions.append(question_answered_right)
        self.question_timestamp.append(question_timestamp)

    def get_features(self, timestamp):
        features = np.zeros(self.num_tags + 1)
        before_ts_lecture = [lt >= timestamp for lt in self.lecture_timestamp]
        valid_tags_lecture = [self.lectures[i] for i in range(len(self.lectures)) if before_ts_lecture[i]]
        before_ts_question = [qt >= timestamp for qt in self.question_timestamp]
        question_avg = sum([self.questions[i] for i in range(len(self.questions)) if before_ts_question[i]]) / sum(before_ts_question)
        features[valid_tags_lecture] = 1
        features[-1] = question_avg
        return features

class StudentDataset:
    def __init__(self, lectures, questions, train_columns):
        self.students = {}
        self.lecture_tags = {row['lecture_id']: row['tag'] for _, row in lectures.iterrows()}
        self.question_tags = {row['question_id']: [int(x) for x in str(row['tags']).split() if x != 'nan'] for _, row in questions.iterrows()}
        self.t_index = {col: i for i, col in enumerate(train_columns)}

        a = questions['tags'].apply(lambda r: [int(x) for x in str(r).split() if x != 'nan']).to_numpy()
        self.num_tags = len(np.unique([x for b in a for x in b]))

    def read(self, row):
        student_id = row[self.t_index['user_id']]
        content_type_id = row[self.t_index['content_type_id']]
        content_id = row[self.t_index['content_id']]
        timestamp = row[self.t_index['timestamp']]
        answered_right = row[self.t_index['answered_correctly']]
        self.read_info(student_id, content_type_id, content_id, timestamp, answered_right)

    def read_info(self, student_id, content_type_id, content_id, timestamp, answered_right):
        if student_id not in self.students:
            self.students[student_id] = StudentKnowledge(self.num_tags)
        if content_type_id == 0:
            self.students[student_id].add_question(answered_right, timestamp)
        else:
            self.students[student_id].add_lecture(self.lecture_tags[content_id], timestamp)

    # Must receive a question vector, lectures not valid
    def get_features(self, rows):
        user_ids = rows[:,self.t_index['user_id']]
        timestamps = rows[:,self.t_index['timestamp']]
        question_ids = rows[:, self.t_index['content_id']]

        question_features = np.zeros((len(rows), self.num_tags))
        for i, question_id in enumerate(question_ids):
            question_features[i,self.question_tags[question_id]] = 1

        student_features = np.array([self.students[user_id].get_features(timestamp)
                                     for user_id, timestamp in zip(user_ids, timestamps)])

        return np.hstack([question_features, student_features])

student_dataset = StudentDataset(lectures_df, questions_df, train_columns)

In [100]:
for row in tqdm(train):
    student_dataset.read(row)

100%|██████████| 1000/1000 [00:00<00:00, 404153.40it/s]


In [102]:
# Example features with train
student_dataset.get_features(train[0:10])

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.69565217],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.68888889],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.68181818],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.64102564],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.63157895],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.62162162]])