In [34]:

import pandas as pd
import numpy as np
import torch
from torch import nn, optim
from torch.nn import functional as F
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [35]:
BATCH_SIZE = 512
LEARNING_RATE = 0.001
WEIGHT_DECAY = 5e-5


In [36]:
# dir = '../input/riiid-test-answer-prediction/'
dir = './'
# Read large datasets: https://www.kaggle.com/rohanrao/tutorial-on-reading-large-datasets
train_dtypes = {
    "row_id": "int64",
    "timestamp": "int64",
    "user_id": "int32",
    "content_id": "int16",
    "content_type_id": "boolean",
    "task_container_id": "int16",
    "user_answer": "int8",
    "answered_correctly": "int8",
    "prior_question_elapsed_time": "float32",
    "prior_question_had_explanation": "boolean"
}

# why we removed columns:
#   row_id: redundant
#   task_container_id: tells you what container this question is in
#       the max container size is 5 so not really significant
#   user_answer: doesnt really affect if the answer is correct
#   prior_question_had_explanation: majority of learning (we assume) will be done from lectures, not answer explanations
#

req_cols = ['timestamp', 'user_id', 'content_id', 'content_type_id', 'answered_correctly']
            # 'prior_question_elapsed_time']

# function to convert the milliseconds to seconds at load time
# messes with the dtypes above and doesn't really save time so it is commented (see converters)
def mil_to_sec(val):
    if val == '':
        return np.NaN
    return round(int(val)/1000)

train_columns = pd.read_csv(dir + 'train.csv', usecols=req_cols, nrows=1).columns
t_index = {col: i for (i, col) in enumerate(train_columns)}
train = pd.read_csv(dir + 'train.csv', usecols=req_cols, dtype=train_dtypes,
                       # converters={'timestamp': mil_to_sec,
                       #             'prior_question_elapsed_time': mil_to_sec},
                       nrows=100000).to_numpy()
# the following lines are kinda cheating since we don't actually have all the data at once
# train_df = raw_df[raw_df['content_type_id'] == 0]
# lecture_events_df = raw_df[raw_df['content_type_id'] == 1]

questions_df = pd.read_csv(dir + 'questions.csv')
lectures_df = pd.read_csv(dir + 'lectures.csv')
example_test_df = pd.read_csv(dir + 'example_test.csv')

# train_df = train_df.drop(['content_id', 'content_type_id'], axis=1) # TODO: replace

# we need a way to dummify the user id
# pd.get_dummies(train_df, columns=['user_id']) # DO NOT TRY THIS IT WILL OVERLOAD RAM

In [37]:
class StudentKnowledge:
    def __init__(self, num_tags, question_avg_default=0.67):
        self.lectures = []
        self.lecture_timestamp = []
        self.questions = []
        self.question_timestamp = []
        self.num_tags = num_tags
        self.question_avg_default = question_avg_default

    def add_lecture(self, lecture_tag, lecture_timestamp):
        self.lectures.append(lecture_tag)
        self.lecture_timestamp.append(lecture_timestamp)

    def add_question(self, question_answered_right, question_timestamp):
        self.questions.append(question_answered_right)
        self.question_timestamp.append(question_timestamp)

    def get_features(self, timestamp):
        features = np.zeros(self.num_tags + 1)

        before_ts_lecture = [lt > timestamp for lt in self.lecture_timestamp]
        valid_tags_lecture = [self.lectures[i] for i in range(len(self.lectures)) if before_ts_lecture[i]]

        before_ts_question = [qt > timestamp for qt in self.question_timestamp]
        num_questions_answered = sum(before_ts_question)
        if num_questions_answered == 0:
            question_avg = self.question_avg_default
        else:
            question_avg = sum([self.questions[i] for i in range(len(self.questions)) if before_ts_question[i]]) / num_questions_answered
        features[valid_tags_lecture] = 1
        features[-1] = question_avg
        return features

class StudentDataset:
    def __init__(self, lectures, questions, train_columns):
        self.students = {}
        self.lecture_tags = {row['lecture_id']: row['tag'] for _, row in lectures.iterrows()}
        self.question_tags = {row['question_id']: [int(x) for x in str(row['tags']).split() if x != 'nan'] for _, row in questions.iterrows()}
        self.t_index = {col: i for i, col in enumerate(train_columns)}

        a = questions['tags'].apply(lambda r: [int(x) for x in str(r).split() if x != 'nan']).to_numpy()
        self.num_tags = len(np.unique([x for b in a for x in b]))

    def read(self, row):
        student_id = row[self.t_index['user_id']]
        content_type_id = row[self.t_index['content_type_id']]
        content_id = row[self.t_index['content_id']]
        timestamp = row[self.t_index['timestamp']]
        answered_right = row[self.t_index['answered_correctly']]
        self.read_info(student_id, content_type_id, content_id, timestamp, answered_right)

    def read_info(self, student_id, content_type_id, content_id, timestamp, answered_right):
        if student_id not in self.students:
            self.students[student_id] = StudentKnowledge(self.num_tags)
        if content_type_id == 0:
            self.students[student_id].add_question(answered_right, timestamp)
        else:
            self.students[student_id].add_lecture(self.lecture_tags[content_id], timestamp)

    # Must receive a question vector, lectures not valid
    def get_features(self, rows):
        user_ids = rows[:,self.t_index['user_id']]
        timestamps = rows[:,self.t_index['timestamp']]
        question_ids = rows[:, self.t_index['content_id']]

        question_features = np.zeros((len(rows), self.num_tags))
        for i, question_id in enumerate(question_ids):
            question_features[i,self.question_tags[question_id]] = 1

        student_features = np.array([self.students[user_id].get_features(timestamp)
                                     for user_id, timestamp in zip(user_ids, timestamps)])

        return np.hstack([question_features, student_features])

student_dataset = StudentDataset(lectures_df, questions_df, train_columns)

In [38]:
for row in tqdm(train):
    student_dataset.read(row)

100%|██████████| 100000/100000 [00:00<00:00, 831873.07it/s]


In [39]:
class Network(nn.Module):
  def __init__(self, inputs, hidden, lr=0.001):
    super().__init__()
    self.main = nn.Sequential(
      nn.Linear(inputs, hidden),
      nn.ReLU(),
      nn.Linear(hidden, 1),
    )

  def forward(self, x):
    return self.main(x)

In [40]:
# Select only the questions
train = train[train[:,t_index['content_type_id']]==0]


In [49]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = Network(len(student_dataset.get_features(train[0:10])[0]), 16).to(device)
loss_func = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)

for epoch in range(20):
    batch_iter = list(range(0, len(train), BATCH_SIZE)) + [len(train)]
    tq = tqdm(range(len(batch_iter) - 1))

    # Average Loss Variables
    total_loss = 0
    current_len = 0

    for i in tq:
        batch = torch.tensor(
            student_dataset.get_features(np.array(train[batch_iter[i] : batch_iter[i+1]]))
        ).float().to(device)

        labels = torch.tensor(
            train[batch_iter[i] : batch_iter[i + 1], t_index['answered_correctly']].astype(np.bool)
        ).view(len(batch), 1).float().to(device)

        predictions = model(batch)

        loss = loss_func(predictions, labels.float())
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        current_len += len(batch)
        total_loss += loss.item() * len(batch)

        tq.set_description('Avg Loss: ' + str(total_loss / current_len))

Avg Loss: 0.6310918636181275: 100%|██████████| 192/192 [00:21<00:00,  8.75it/s]
Avg Loss: 0.6108763433634835: 100%|██████████| 192/192 [00:21<00:00,  8.90it/s]
Avg Loss: 0.6054975811616462: 100%|██████████| 192/192 [00:22<00:00,  8.59it/s]
Avg Loss: 0.6007638281858066: 100%|██████████| 192/192 [00:22<00:00,  8.67it/s]
Avg Loss: 0.5980724128813595: 100%|██████████| 192/192 [00:22<00:00,  8.68it/s]
Avg Loss: 0.5960870792887074: 100%|██████████| 192/192 [00:21<00:00,  8.78it/s]
Avg Loss: 0.5949339064802666: 100%|██████████| 192/192 [00:21<00:00,  8.94it/s]
Avg Loss: 0.5943859391270646: 100%|██████████| 192/192 [00:21<00:00,  9.05it/s]
Avg Loss: 0.5932434733518965: 100%|██████████| 192/192 [00:21<00:00,  8.78it/s]
Avg Loss: 0.5922841803260835: 100%|██████████| 192/192 [00:22<00:00,  8.63it/s]
Avg Loss: 0.5900551537679559: 100%|██████████| 192/192 [00:23<00:00,  8.12it/s]
Avg Loss: 0.5891324962548479: 100%|██████████| 192/192 [00:23<00:00,  8.23it/s]
Avg Loss: 0.5888445949064406: 100%|█████

In [55]:
with torch.no_grad():
    total_correct = 0
    for row in train: # replace with test/val set
        prediction = model(torch.tensor(student_dataset.get_features(np.array([row[:-1]]))).float().to(device))
        total_correct += round(torch.sigmoid(prediction).item()) == row[-1]
    print(total_correct/len(train))

0.7025625878470595


In [90]:
with torch.no_grad():
    total_correct = 0
    for row in train: # replace with test/val set
        prediction = model(torch.tensor(student_dataset.get_features(np.array([row[:-1]]))).float())
        total_correct += round(torch.sigmoid(prediction).item()) == row[-1]
    print(total_correct/len(train))

0.47721028327941895: 100%|██████████| 982/982 [00:01<00:00, 646.16it/s]
0.5117090940475464: 100%|██████████| 982/982 [00:01<00:00, 694.01it/s] 
0.5589438676834106: 100%|██████████| 982/982 [00:01<00:00, 691.92it/s] 
0.6039080619812012: 100%|██████████| 982/982 [00:01<00:00, 714.20it/s] 
0.6484382748603821: 100%|██████████| 982/982 [00:01<00:00, 710.09it/s] 
0.675808846950531: 100%|██████████| 982/982 [00:01<00:00, 686.56it/s]   
0.71211838722229: 100%|██████████| 982/982 [00:01<00:00, 673.43it/s]    
0.754368782043457: 100%|██████████| 982/982 [00:01<00:00, 646.40it/s]   
0.7781763672828674: 100%|██████████| 982/982 [00:01<00:00, 652.69it/s]  
0.801904022693634: 100%|██████████| 982/982 [00:01<00:00, 652.96it/s]   
0.8250249028205872: 100%|██████████| 982/982 [00:01<00:00, 656.88it/s]  
0.8431645035743713: 100%|██████████| 982/982 [00:01<00:00, 577.36it/s]   
0.8558878302574158: 100%|██████████| 982/982 [00:01<00:00, 657.00it/s]   
0.8667155504226685: 100%|██████████| 982/982 [00:01<00

mean loss: 0.6595275169295352
mean loss: 0.6426682004708865
mean loss: 0.6291753450473069
mean loss: 0.6133536751364982
mean loss: 0.5950161170578597
mean loss: 0.5767507815767694
mean loss: 0.558864320958386
mean loss: 0.5422472487528127
mean loss: 0.5268002794804184
mean loss: 0.5129485195836948
mean loss: 0.49938805383335383
mean loss: 0.4870936342406371
mean loss: 0.4750490730202394
mean loss: 0.4639553778206947
mean loss: 0.45314748302022134
mean loss: 0.44310402308372693
mean loss: 0.43323565440822187
mean loss: 0.4242707047260686
mean loss: 0.41552238765069466
mean loss: 0.40741920772696494


In [95]:
with torch.no_grad():
    total_correct = 0
    for row in train: # replace with test/val set
        prediction = model(torch.tensor(student_dataset.get_features(np.array([row[:-1]]))).float())
        total_correct += round(torch.sigmoid(prediction).item()) == row[-1]
    print(total_correct/len(train))

0.8319755600814664
