In [2]:

import pandas as pd
import numpy as np
import torch
from torch import nn, optim
from torch.nn import functional as F
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [3]:
# dir = '../input/riiid-test-answer-prediction/'
dir = './'
# Read large datasets: https://www.kaggle.com/rohanrao/tutorial-on-reading-large-datasets
train_dtypes = {
    "row_id": "int64",
    "timestamp": "int64",
    "user_id": "int32",
    "content_id": "int16",
    "content_type_id": "boolean",
    "task_container_id": "int16",
    "user_answer": "int8",
    "answered_correctly": "int8",
    "prior_question_elapsed_time": "float32",
    "prior_question_had_explanation": "boolean"
}

# why we removed columns:
#   row_id: redundant
#   task_container_id: tells you what container this question is in
#       the max container size is 5 so not really significant
#   user_answer: doesnt really affect if the answer is correct
#   prior_question_had_explanation: majority of learning (we assume) will be done from lectures, not answer explanations
#

req_cols = ['timestamp', 'user_id', 'content_id', 'content_type_id', 'answered_correctly']
            # 'prior_question_elapsed_time']

# function to convert the milliseconds to seconds at load time
# messes with the dtypes above and doesn't really save time so it is commented (see converters)
def mil_to_sec(val):
    if val == '':
        return np.NaN
    return round(int(val)/1000)

train_columns = pd.read_csv(dir + 'train.csv', usecols=req_cols, nrows=1).columns
train = pd.read_csv(dir + 'train.csv', usecols=req_cols, dtype=train_dtypes,
                       # converters={'timestamp': mil_to_sec,
                       #             'prior_question_elapsed_time': mil_to_sec},
                       nrows=1000).to_numpy()
# the following lines are kinda cheating since we don't actually have all the data at once
# train_df = raw_df[raw_df['content_type_id'] == 0]
# lecture_events_df = raw_df[raw_df['content_type_id'] == 1]

questions_df = pd.read_csv(dir + 'questions.csv')
lectures_df = pd.read_csv(dir + 'lectures.csv')
example_test_df = pd.read_csv(dir + 'example_test.csv')

# train_df = train_df.drop(['content_id', 'content_type_id'], axis=1) # TODO: replace

# we need a way to dummify the user id
# pd.get_dummies(train_df, columns=['user_id']) # DO NOT TRY THIS IT WILL OVERLOAD RAM

In [4]:
class StudentKnowledge:
    def __init__(self, num_tags):
        self.lectures = []
        self.lecture_timestamp = []
        self.questions = []
        self.question_timestamp = []
        self.num_tags = num_tags

    def add_lecture(self, lecture_tag, lecture_timestamp):
        self.lectures.append(lecture_tag)
        self.lecture_timestamp.append(lecture_timestamp)

    def add_question(self, question_answered_right, question_timestamp):
        self.questions.append(question_answered_right)
        self.question_timestamp.append(question_timestamp)

    def get_features(self, timestamp):
        features = np.zeros(self.num_tags + 1)
        before_ts_lecture = [lt > timestamp for lt in self.lecture_timestamp]
        valid_tags_lecture = [self.lectures[i] for i in range(len(self.lectures)) if before_ts_lecture[i]]
        before_ts_question = [qt > timestamp for qt in self.question_timestamp]
        question_avg = sum([self.questions[i] for i in range(len(self.questions)) if before_ts_question[i]]) / sum(before_ts_question)
        features[valid_tags_lecture] = 1
        features[-1] = question_avg
        return features

class StudentDataset:
    def __init__(self, lectures, questions, train_columns):
        self.students = {}
        self.lecture_tags = {row['lecture_id']: row['tag'] for _, row in lectures.iterrows()}
        self.question_tags = {row['question_id']: [int(x) for x in str(row['tags']).split() if x != 'nan'] for _, row in questions.iterrows()}
        self.t_index = {col: i for i, col in enumerate(train_columns)}

        a = questions['tags'].apply(lambda r: [int(x) for x in str(r).split() if x != 'nan']).to_numpy()
        self.num_tags = len(np.unique([x for b in a for x in b]))

    def read(self, row):
        student_id = row[self.t_index['user_id']]
        content_type_id = row[self.t_index['content_type_id']]
        content_id = row[self.t_index['content_id']]
        timestamp = row[self.t_index['timestamp']]
        answered_right = row[self.t_index['answered_correctly']]
        self.read_info(student_id, content_type_id, content_id, timestamp, answered_right)

    def read_info(self, student_id, content_type_id, content_id, timestamp, answered_right):
        if student_id not in self.students:
            self.students[student_id] = StudentKnowledge(self.num_tags)
        if content_type_id == 0:
            self.students[student_id].add_question(answered_right, timestamp)
        else:
            self.students[student_id].add_lecture(self.lecture_tags[content_id], timestamp)

    # Must receive a question vector, lectures not valid
    def get_features(self, rows):
        user_ids = rows[:,self.t_index['user_id']]
        timestamps = rows[:,self.t_index['timestamp']]
        question_ids = rows[:, self.t_index['content_id']]

        question_features = np.zeros((len(rows), self.num_tags))
        for i, question_id in enumerate(question_ids):
            question_features[i,self.question_tags[question_id]] = 1

        student_features = np.array([self.students[user_id].get_features(timestamp)
                                     for user_id, timestamp in zip(user_ids, timestamps)])

        return np.hstack([question_features, student_features])

student_dataset = StudentDataset(lectures_df, questions_df, train_columns)

In [5]:
for row in tqdm(train):
    student_dataset.read(row)

100%|██████████| 1000/1000 [00:00<00:00, 499619.30it/s]


In [8]:
# Example features with train
print(train[0:10])
student_dataset.get_features(train[0:10])

[[0 115 5692 False 1]
 [56943 115 5716 False 1]
 [118363 115 128 False 1]
 [131167 115 7860 False 1]
 [137965 115 7922 False 1]
 [157063 115 156 False 1]
 [176092 115 51 False 1]
 [194190 115 50 False 1]
 [212463 115 7896 False 1]
 [230983 115 7863 False 1]]


array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.69565217],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.68888889],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.68181818],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.64102564],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.63157895],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.62162162]])

In [12]:
class Network(nn.Module):
  def __init__(self, inputs, hidden, lr=0.001):
    super().__init__()
    self.main = nn.Sequential(
      nn.Linear(inputs, hidden),
      nn.ReLU(),
      nn.Linear(hidden, 1),
    )
    self.optimizer = optim.Adam(self.parameters(), lr=lr)

  def forward(self, x):
    return self.main(x)

In [90]:
train = train[train[:,3]==0]
model = Network(len(student_dataset.get_features(train[0:10])[0]), 16)
loss_func = nn.BCEWithLogitsLoss()
for epoch in range(20):
    tq = tqdm(train)
    losses = []
    for row in tq:
        prediction = model(torch.tensor(student_dataset.get_features(np.array([row[:-1]]))).float())
        loss = loss_func(prediction, torch.tensor(row[-1]).view(1,1).float())
        model.optimizer.zero_grad()
        loss.backward()
        model.optimizer.step()
        tq.set_description(str(loss.item()))
        losses += [loss.item()]
    print('mean loss:', sum(losses)/len(losses))

0.47721028327941895: 100%|██████████| 982/982 [00:01<00:00, 646.16it/s]
0.5117090940475464: 100%|██████████| 982/982 [00:01<00:00, 694.01it/s] 
0.5589438676834106: 100%|██████████| 982/982 [00:01<00:00, 691.92it/s] 
0.6039080619812012: 100%|██████████| 982/982 [00:01<00:00, 714.20it/s] 
0.6484382748603821: 100%|██████████| 982/982 [00:01<00:00, 710.09it/s] 
0.675808846950531: 100%|██████████| 982/982 [00:01<00:00, 686.56it/s]   
0.71211838722229: 100%|██████████| 982/982 [00:01<00:00, 673.43it/s]    
0.754368782043457: 100%|██████████| 982/982 [00:01<00:00, 646.40it/s]   
0.7781763672828674: 100%|██████████| 982/982 [00:01<00:00, 652.69it/s]  
0.801904022693634: 100%|██████████| 982/982 [00:01<00:00, 652.96it/s]   
0.8250249028205872: 100%|██████████| 982/982 [00:01<00:00, 656.88it/s]  
0.8431645035743713: 100%|██████████| 982/982 [00:01<00:00, 577.36it/s]   
0.8558878302574158: 100%|██████████| 982/982 [00:01<00:00, 657.00it/s]   
0.8667155504226685: 100%|██████████| 982/982 [00:01<00

mean loss: 0.6595275169295352
mean loss: 0.6426682004708865
mean loss: 0.6291753450473069
mean loss: 0.6133536751364982
mean loss: 0.5950161170578597
mean loss: 0.5767507815767694
mean loss: 0.558864320958386
mean loss: 0.5422472487528127
mean loss: 0.5268002794804184
mean loss: 0.5129485195836948
mean loss: 0.49938805383335383
mean loss: 0.4870936342406371
mean loss: 0.4750490730202394
mean loss: 0.4639553778206947
mean loss: 0.45314748302022134
mean loss: 0.44310402308372693
mean loss: 0.43323565440822187
mean loss: 0.4242707047260686
mean loss: 0.41552238765069466
mean loss: 0.40741920772696494


In [95]:
with torch.no_grad():
    total_correct = 0
    for row in train: # replace with test/val set
        prediction = model(torch.tensor(student_dataset.get_features(np.array([row[:-1]]))).float())
        total_correct += round(torch.sigmoid(prediction).item()) == row[-1]
    print(total_correct/len(train))

0.8319755600814664
