In [1]:
import pandas as pd
import numpy as np
import torch
from torch import nn, optim
from torch.utils.data import Dataset
from torch.nn import functional as F
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from os.path import exists
from os import mkdir

In [2]:
# File
# dir = '../input/riiid-test-answer-prediction/'
dir = './'
FEATURE_FOLDER_PATH = dir + 'riiid_features/'

# Hyper parameters
BATCH_SIZE = 32
LEARNING_RATE = 0.001
WEIGHT_DECAY = 0

In [22]:

# Read large datasets: https://www.kaggle.com/rohanrao/tutorial-on-reading-large-datasets
train_dtypes = {
    "row_id": "int64",
    "timestamp": "int64",
    "user_id": "int32",
    "content_id": "int16",
    "content_type_id": "boolean",
    "task_container_id": "int16",
    "user_answer": "int8",
    "answered_correctly": "int8",
    "prior_question_elapsed_time": "float32",
    "prior_question_had_explanation": "boolean"
}

# why we removed columns:
#   row_id: redundant
#   task_container_id: tells you what container this question is in
#       the max container size is 5 so not really significant
#   user_answer: doesnt really affect if the answer is correct
#   prior_question_had_explanation: majority of learning (we assume) will be done from lectures, not answer explanations
#

req_cols = ['timestamp', 'user_id', 'content_id', 'content_type_id', 'answered_correctly', 'prior_question_elapsed_time',
            'prior_question_had_explanation']

# function to convert the milliseconds to seconds at load time
# messes with the dtypes above and doesn't really save time so it is commented (see converters)
def mil_to_sec(val):
    if val == '':
        return np.NaN
    return round(int(val)/1000)

train_columns = pd.read_csv(dir + 'train.csv', usecols=req_cols, nrows=1).columns
t_index = {col: i for (i, col) in enumerate(train_columns)}
train = pd.read_csv(dir + 'train.csv', usecols=req_cols, dtype=train_dtypes,
                       # converters={'timestamp': mil_to_sec,
                       #             'prior_question_elapsed_time': mil_to_sec},
                       nrows=10000000).to_numpy()
# the following lines are kinda cheating since we don't actually have all the data at once
# train_df = raw_df[raw_df['content_type_id'] == 0]
# lecture_events_df = raw_df[raw_df['content_type_id'] == 1]

questions_df = pd.read_csv(dir + 'questions.csv')
lectures_df = pd.read_csv(dir + 'lectures.csv')
example_test_df = pd.read_csv(dir + 'example_test.csv')

# train_df = train_df.drop(['content_id', 'content_type_id'], axis=1) # TODO: replace

# we need a way to dummify the user id
# pd.get_dummies(train_df, columns=['user_id']) # DO NOT TRY THIS IT WILL OVERLOAD RAM

In [23]:
prior_col = train[:,t_index['prior_question_elapsed_time']]
train[prior_col == np.nan]

array([], shape=(0, 7), dtype=object)

In [24]:
def jagged_to_fixed_matrix(x, n , fill_value):
    a = np.full((len(x), n), fill_value)
    for i,xi in enumerate(x):
        a[i][0:min(n,len(xi))] = np.array(xi)[0:min(n, len(xi))]
    return a

def create_batch_iter(n, skip):
    a = list(range(0, n, skip)) + [len(train)]
    return [(a[i], a[i + 1]) for i in range(len(a) - 1)]

In [32]:
class StudentKnowledge:
    def __init__(self, num_tags, max_tags=30, question_avg_default=0.67):
        self.lectures = []
        self.lecture_timestamp = []
        self.questions = []
        self.question_timestamp = []
        self.num_tags = num_tags

        self.question_avg_default = question_avg_default
        self.max_tags = max_tags

    def add_lecture(self, lecture_tag, lecture_timestamp):
        self.lectures.append(lecture_tag)
        self.lecture_timestamp.append(lecture_timestamp)

    def add_question(self, question_answered_right, question_timestamp):
        self.questions.append(question_answered_right)
        self.question_timestamp.append(question_timestamp)

    def get_features(self, row):
        timestamp = row[t_index['timestamp']]
        prior_question_elapsed = row[t_index['prior_question_elapsed_time']]
        prior_question_explain = row[t_index['prior_question_had_explanation']]
        # Lecture Tag Indices
        before_ts_lecture = [lt > timestamp for lt in self.lecture_timestamp]
        valid_lectures = [self.lectures[i] for i in range(len(self.lectures)) if before_ts_lecture[i]]

        # Additional Features
        before_ts_question = [qt > timestamp for qt in self.question_timestamp]
        num_questions_answered = sum(before_ts_question)
        if num_questions_answered == 0:
            question_avg = self.question_avg_default
        else:
            question_avg = sum([self.questions[i] for i in range(len(self.questions)) if before_ts_question[i]]) / num_questions_answered
        prior_question_elapsed = 0 if str(prior_question_elapsed) == 'nan' else prior_question_elapsed
        prior_question_explain = False if str(prior_question_explain) == '<NA>' else prior_question_explain

        return valid_lectures, [question_avg, prior_question_elapsed, prior_question_explain]



class StudentDataset:
    def __init__(self, lectures, questions, train_columns, tags_len=30):
        self.t_index = {col: i for i, col in enumerate(train_columns)}

        self.students = {}
        self.lecture_tags = {row['lecture_id']: row['tag'] for _, row in lectures.iterrows()}
        self.question_tags = {row['question_id']: [int(x) for x in str(row['tags']).split() if x != 'nan'] \
                              for _, row in questions.iterrows()}

        a = questions['tags'].apply(lambda r: [int(x) for x in str(r).split() if x != 'nan']).to_numpy()
        self.num_tags = len(np.unique([x for b in a for x in b]))
        self.tags_len = tags_len

    def read(self, row):
        student_id = row[self.t_index['user_id']]
        content_type_id = row[self.t_index['content_type_id']]
        content_id = row[self.t_index['content_id']]
        timestamp = row[self.t_index['timestamp']]
        answered_right = row[self.t_index['answered_correctly']]

        self.read_info(student_id, content_type_id, content_id, timestamp, answered_right)

    def read_info(self, student_id, content_type_id, content_id, timestamp, answered_right):
        if student_id not in self.students:
            self.students[student_id] = StudentKnowledge(self.num_tags)
        if content_type_id == 0:
            self.students[student_id].add_question(answered_right, timestamp)
        else:
            self.students[student_id].add_lecture(self.lecture_tags[content_id], timestamp)

    # Must receive question rows, lectures not valid
    def get_features(self, rows):
        user_ids = rows[:,self.t_index['user_id']]
        question_ids = rows[:, self.t_index['content_id']]

        question_tags = [self.question_tags[question_id] for question_id in question_ids]
        question_tags = jagged_to_fixed_matrix(question_tags, self.tags_len, self.num_tags)

        student_tags, additional_features = zip(
            *[self.students[user_id].get_features(rows[i]) for i, user_id in enumerate(user_ids)]
        )
        student_tags = jagged_to_fixed_matrix(list(student_tags), self.tags_len, self.num_tags)
        return question_tags, student_tags, list(additional_features)

class Network(nn.Module):
  def __init__(self, num_embeddings, extra_input_dim, hidden_1, hidden_2, embedding_dim=20):
    super().__init__()
    self.embedding = nn.Embedding(num_embeddings=num_embeddings, embedding_dim=embedding_dim)

    self.layers = nn.ModuleList([
      nn.Linear(extra_input_dim + 2 * embedding_dim, hidden_1),
      nn.ReLU(),
      nn.Linear(hidden_1, hidden_2),
      nn.ReLU(),
      nn.Linear(hidden_2, 1)
    ])

  def forward(self, student_indices, question_indices, features):
    y1 = torch.cat(self.embedding(student_indices), dim=1)
    y2 = torch.cat(self.embedding(question_indices), dim=1)

    y = torch.cat((y1, y2, features), dim=1)
    for layer in self.layers:
        y = layer(y)
    return y

In [26]:
student_dataset = StudentDataset(lectures_df, questions_df, train_columns)

for row in tqdm(train):
    student_dataset.read(row)

100%|██████████| 10000000/10000000 [00:11<00:00, 875314.10it/s]


In [27]:
# Select only the questions
train = train[train[:,t_index['content_type_id']]==0][0:10000]

In [28]:
# if not exists(FEATURE_FOLDER_PATH):
#     mkdir(FEATURE_FOLDER_PATH)
#     chunksize = 10000
#     for i,j in tqdm(create_batch_iter(len(train), chunksize)):
#         question_tags, student_tags, f = student_dataset.get_features(train[i:j])
#         labels = train[i:j]['']
#         pd.DataFrame(question_tags).to_csv(FEATURE_FOLDER_PATH + 'questions.csv', mode='a', index=False)
#         pd.DataFrame(student_tags).to_csv(FEATURE_FOLDER_PATH + 'student.csv', mode='a', index=False)
#         pd.DataFrame(f).to_csv(FEATURE_FOLDER_PATH + 'extra.csv', mode='a', index=False)

In [29]:
# Compute input sizes
_, _, f = student_dataset.get_features(np.array([train[0]]))
extra_input_dim = len(f[0])
num_embeddings = student_dataset.num_tags + 1

In [34]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Network(student_dataset.num_tags + 1, extra_input_dim, 64, 32, embedding_dim=5).to(device)
loss_func = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)

for epoch in range(20):
    tq = tqdm(create_batch_iter(len(train), BATCH_SIZE))

    # Average Loss Variables
    total_loss = 0
    current_len = 0
    total_correct = 0
    total_len = 0

    for i,j in tq:
        question_tags, student_tags, f = student_dataset.get_features(train[i:j])
        question_tags = torch.tensor(question_tags).long().to(device)
        student_tags = torch.tensor(student_tags).long().to(device)
        n = len(f)

        f = torch.tensor(f).view(n, len(f[0])).float().to(device)

        labels = torch.tensor(
            train[i:j, t_index['answered_correctly']].astype(np.bool)
        ).view(n, 1).float().to(device)

        predictions = model(question_tags, student_tags, f)

        loss = loss_func(predictions, labels.float())
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        current_len += n
        total_loss += loss.item() * n

        total_correct += np.sum(np.round(torch.sigmoid(predictions).cpu().view(n).detach().numpy()) == labels.cpu().view(n).detach().numpy())
        tq.set_description('Avg Loss: ' + str(total_loss / current_len) + ', Train Accuracy: '  + \
                           str(np.round(total_correct / current_len * 100, 2)) + '%')



Avg Loss: 32.91097988204956, Train Accuracy: 56.68%: 100%|██████████| 313/313 [00:06<00:00, 48.37it/s] 
Avg Loss: 15.887437721252441, Train Accuracy: 56.96%: 100%|██████████| 313/313 [00:06<00:00, 50.38it/s]
Avg Loss: 16.589845665359498, Train Accuracy: 57.19%: 100%|██████████| 313/313 [00:06<00:00, 49.54it/s]
Avg Loss: 11.129831821918488, Train Accuracy: 56.73%: 100%|██████████| 313/313 [00:06<00:00, 49.52it/s]
Avg Loss: 9.877737319374084, Train Accuracy: 56.81%: 100%|██████████| 313/313 [00:06<00:00, 50.12it/s] 
Avg Loss: 8.285740426445008, Train Accuracy: 56.62%: 100%|██████████| 313/313 [00:06<00:00, 48.89it/s] 
Avg Loss: 6.825373198986053, Train Accuracy: 56.39%: 100%|██████████| 313/313 [00:06<00:00, 51.86it/s] 
Avg Loss: 6.252027510356903, Train Accuracy: 56.42%: 100%|██████████| 313/313 [00:06<00:00, 50.31it/s] 
Avg Loss: 7.602202873420715, Train Accuracy: 57.05%: 100%|██████████| 313/313 [00:06<00:00, 49.21it/s] 
Avg Loss: 7.484163282926266, Train Accuracy: 58.15%:  42%|████▏ 

KeyboardInterrupt: 