In [1]:
import torch

import pandas as pd

from datetime import datetime

In [2]:
class IRT(torch.nn.Module):
    """Three-parameter logistic (3PL) item response model over one response table.

    Each row of the training frame is one (question, user) response.  The
    model holds an ability ``theta`` per user and a discrimination ``a``,
    difficulty ``b`` and lower asymptote ``c`` per question; the modelled
    probability for a row is ``c + (1 - c) * sigmoid(a * (theta - b))``.

    Attributes set by the constructor:
        df: the training frame sorted by ``[q_col, u_col]``.
        q, u: distinct question / user ids in order of first appearance in ``df``.
        q_to_index, u_to_index: id -> position in the parameter vectors.
        num_q, num_u: number of distinct questions / users.
        matrix_q, matrix_u: frozen sparse one-hot matrices (rows x ids) that
            select each row's question / user parameters via a mat-vec product.
        s: frozen float64 vector with the ``s_col`` values, row-aligned with
            ``df`` (not read by ``forward`` or ``predict``).
        theta, a, b, c: trainable float64 parameters.

    NOTE: attribute names are deliberately unchanged and ``predict`` only
    reads attributes listed above, because whole instances of this class are
    restored from pickles with ``torch.load`` elsewhere in this notebook.
    """

    def __init__(self, df, q_col, u_col, s_col):
        """Index the response table and randomly initialise the parameters.

        Args:
            df: pandas DataFrame with one row per response.
            q_col: name of the column holding question ids.
            u_col: name of the column holding user ids.
            s_col: name of the numeric column stored in ``self.s``.
        """
        super().__init__()

        self.df = df.sort_values([q_col, u_col])

        self.q = list(self.df[q_col].unique())
        self.q_to_index = {q: index for index, q in enumerate(self.q)}
        self.num_q = len(self.q)
        self.matrix_q = torch.nn.Parameter(
            self._indicator_matrix(
                self._positions(self.df[q_col], self.q_to_index, q_col), self.num_q
            ),
            requires_grad=False,
        )

        self.u = list(self.df[u_col].unique())
        self.u_to_index = {u: index for index, u in enumerate(self.u)}
        self.num_u = len(self.u)
        self.matrix_u = torch.nn.Parameter(
            self._indicator_matrix(
                self._positions(self.df[u_col], self.u_to_index, u_col), self.num_u
            ),
            requires_grad=False,
        )

        # Go through NumPy so the values are taken positionally as float64,
        # independent of the (re-ordered) index labels of the sorted frame.
        self.s = torch.nn.Parameter(
            torch.tensor(self.df[s_col].to_numpy(dtype='float64'), dtype=torch.float64),
            requires_grad=False,
        )

        # Random initialisation.  The draw order (theta, a, b, c) and the
        # float32-then-double sampling are kept as-is so seeded runs reproduce.
        self.theta = torch.nn.Parameter(torch.normal(torch.zeros(self.num_u), torch.ones(self.num_u)).double())
        self.a = torch.nn.Parameter(1 + (torch.rand(self.num_q).double() - 0.5) * 0.5)
        self.b = torch.nn.Parameter(torch.normal(torch.zeros(self.num_q), torch.ones(self.num_q)).double())
        self.c = torch.nn.Parameter(torch.rand(self.num_q).double() / 4)

    @staticmethod
    def _positions(ids, id_to_index, column):
        """Translate ids to parameter positions; name the offending id on failure."""
        try:
            return [id_to_index[value] for value in ids]
        except KeyError as err:
            raise KeyError(
                f"column {column!r} contains id {err.args[0]!r}, "
                "for which the model has no parameters"
            ) from err

    @staticmethod
    def _indicator_matrix(positions, num_cols):
        """Build a sparse float64 one-hot matrix with a 1 at (row i, positions[i])."""
        num_rows = len(positions)
        indices = torch.stack([
            torch.arange(num_rows, dtype=torch.long),
            torch.tensor(positions, dtype=torch.long),
        ])
        values = torch.ones(num_rows, dtype=torch.float64)
        return torch.sparse_coo_tensor(indices, values, (num_rows, num_cols))

    def _response_probability(self, matrix_q, matrix_u):
        """Evaluate the 3PL curve for the rows described by the two one-hot matrices."""
        theta = torch.matmul(matrix_u, self.theta)
        a = torch.matmul(matrix_q, self.a)
        b = torch.matmul(matrix_q, self.b)
        c = torch.matmul(matrix_q, self.c)
        # torch.sigmoid is the overflow-safe form of 1 / (1 + exp(-x)); the
        # explicit exp() yields NaN gradients once exp() overflows to inf.
        return c + (1 - c) * torch.sigmoid(a * (theta - b))

    def forward(self):
        """Return the modelled probability for every training row, in ``self.df`` order."""
        return self._response_probability(self.matrix_q, self.matrix_u)

    def predict(self, df, q_col, u_col):
        """Return the modelled probability for each row of ``df``.

        Args:
            df: pandas DataFrame with one row per (question, user) pair to score.
            q_col: name of the column holding question ids.
            u_col: name of the column holding user ids.

        Returns:
            1-D float64 tensor, row-aligned with ``df``, on the same device as
            the model parameters.  It is still attached to the autograd graph.

        Raises:
            KeyError: if ``df`` contains a question or user id the model has
                no parameters for.
        """
        # Follow the parameters' device instead of assuming the default CUDA device.
        device = self.theta.device
        matrix_q = self._indicator_matrix(
            self._positions(df[q_col], self.q_to_index, q_col), self.num_q
        ).to(device)
        matrix_u = self._indicator_matrix(
            self._positions(df[u_col], self.u_to_index, u_col), self.num_u
        ).to(device)
        return self._response_probability(matrix_q, matrix_u)

In [3]:
# Load the per-question response export and keep only the fa22/sp23 semesters
# and assessments numbered 1-4, with a fresh 0..n-1 index.
df = (
    pd.read_csv('instance_questions_cs105.zip', compression='zip')
    .loc[lambda frame: frame.semester.isin(['fa22', 'sp23'])]
    .loc[lambda frame: frame.assessment_number.isin([1, 2, 3, 4])]
    .reset_index(drop=True)
    .copy()
)

In [4]:
# Column dtypes of the filtered response table; columns reported as `object` hold strings.
df.dtypes

semester                           object
user_id                             int64
assessment_set_name                object
assessment_number                   int64
question_name                      object
question_type                      object
assessment_question_max_points      int64
instance_question_points          float64
instance_question_score_perc      float64
dtype: object

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,user_id,assessment_number,assessment_question_max_points,instance_question_points,instance_question_score_perc
count,240100.0,240100.0,240100.0,240100.0,240100.0
mean,4177602.0,2.537693,5.339396,4.356815,82.389007
std,689470.5,1.154946,1.820471,2.413431,35.270243
min,129964.0,1.0,3.0,0.0,0.0
25%,4316444.0,1.0,3.0,3.0,100.0
50%,4319895.0,3.0,6.0,5.0,100.0
75%,4369411.0,4.0,6.0,6.0,100.0
max,4422763.0,4.0,9.0,9.0,100.0


In [6]:
# Map every question name to its question type.  When a name appears on several
# rows, the type from the last such row is the one kept.
question_name_to_question_type = dict(zip(df.question_name, df.question_type))

In [7]:
# Build, column by column, one prediction row for every (user who appears on an
# assessment in a given semester, question that appears on that assessment).
semesters, user_ids, assessment_set_names = [], [], []
assessment_numbers, question_names, question_types = [], [], []

for assessment_name in ('Exam', 'Quiz'):
    for assessment_number in (1, 2, 3, 4):
        is_current_assessment = (
            (df.assessment_set_name == assessment_name)
            & (df.assessment_number == assessment_number)
        )
        assessment_df = df[is_current_assessment]

        # Questions seen on this assessment (across all semesters in df) and their types.
        curr_exam_question_names = assessment_df.question_name.unique().tolist()
        curr_exam_question_types = [
            question_name_to_question_type[name] for name in curr_exam_question_names
        ]
        questions_per_user = len(curr_exam_question_names)

        for semester in ('fa22', 'sp23'):
            semester_user_ids = assessment_df.loc[
                assessment_df.semester == semester, 'user_id'
            ].unique()
            for user_id in semester_user_ids:
                # Every user gets the full question list of the assessment.
                semesters += [semester] * questions_per_user
                user_ids += [user_id] * questions_per_user
                assessment_set_names += [assessment_name] * questions_per_user
                assessment_numbers += [assessment_number] * questions_per_user
                question_names += curr_exam_question_names
                question_types += curr_exam_question_types

In [8]:
# Assemble the prediction frame from the parallel lists, one column per list.
predict_df = pd.DataFrame({
    'semester': semesters,
    'user_id': user_ids,
    'assessment_set_name': assessment_set_names,
    'assessment_number': assessment_numbers,
    'question_name': question_names,
    'question_type': question_types,
})

# Ids passed to the model: `q` is the question name, `u` is the composite key
# "<semester>_<user_id>_<assessment_set_name>_<assessment_number>".
predict_df['q'] = predict_df['question_name']
predict_df['u'] = predict_df['semester'].str.cat(
    [
        predict_df['user_id'].astype(str),
        predict_df['assessment_set_name'],
        predict_df['assessment_number'].astype(str),
    ],
    sep='_',
)

In [9]:
# Restore the complete fitted IRT module (a whole pickled object, not a
# state_dict).  The pickle refers to the `IRT` class by name, so the cell that
# defines the class must have been run first.
# weights_only=False is needed for that: recent PyTorch releases default to
# weights_only=True, which refuses to unpickle custom classes.
# SECURITY: full unpickling can execute arbitrary code contained in the file --
# only load checkpoints you created yourself or otherwise trust.
# map_location falls back to the CPU when no GPU is available, so a model that
# was saved from a CUDA device can still be loaded.
irt = torch.load(
    'irt_models_l@s/irt_model_final.pkl',
    map_location=None if torch.cuda.is_available() else 'cpu',
    weights_only=False,
)

In [10]:
# Score every row of predict_df with the loaded model and store the
# probabilities as plain Python floats.
predict_df['prediction'] = (
    irt.predict(predict_df, 'q', 'u')
    .detach()  # cut the result loose from the autograd graph
    .cpu()     # bring it to host memory if the model lives on a GPU
    .tolist()
)

In [11]:
# Persist the predictions as a zip-compressed CSV, leaving out the row index.
predict_df.to_csv(
    path_or_buf='irt_prediction_l@s.zip',
    compression='zip',
    index=False,
)