# CSC 311 Project Part b
Yulong Wu, Sunyi Liu, Kaiyao Duan, Aiwei Yin

This is the code that contains data manipulation, method definition, and training for our model in part b.

## Definition of functions and preparation to train our model

In [None]:
# Mount Google Drive inside the Colab VM at /content/drive; the project
# paths used below all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


We uploaded the project starter code to Google Drive so that we can reuse its data-loading functions.

In [None]:
# Locations of the project folder, the starter code and the data on Drive.
base_dir = '/content/drive/MyDrive/UT/CSC311/csc311-project/'
starter_code_dir = base_dir + 'part_a/'
data_dir = base_dir + 'data/'
import os
# Listing each directory raises immediately if a path is wrong; only the
# result of the last call is displayed by the notebook.
os.listdir(base_dir)
os.listdir(starter_code_dir)
os.listdir(data_dir)

['test_data.csv',
 'subject_meta.csv',
 'train_data.csv',
 'question_meta.csv',
 'student_meta.csv',
 'valid_data.csv',
 'train_sparse.npz']

In [None]:
import sys
# Make modules in the project root and the data folder importable.
# Presumably utils.py (imported in the next cell) lives in base_dir — verify.
sys.path.append(base_dir)
sys.path.append(data_dir)

In [None]:
from utils import *
from torch.autograd import Variable

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data

import numpy as np
import torch

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [None]:
import matplotlib.pyplot as plt

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix

Our slightly modified `train` function, adapted so that training can run on Colab's GPU.

In [None]:
def train(model, lr, train_data, zero_train_data, valid_data, num_epoch):
    """ Train the neural network with per-student SGD steps on a squared
    error that only counts the observed entries.

    Reads the module-level ``student_metadata`` tensor (indexed by user id)
    and calls ``evaluate`` after every epoch to report validation accuracy.

    :param model: Module, called as ``model(inputs, metadata)``
    :param lr: float, SGD learning rate
    :param train_data: 2D FloatTensor, NaN where an entry is unobserved
    :param zero_train_data: 2D FloatTensor, used as the model input and as
        the reconstruction target for observed entries
    :param valid_data: Dict, forwarded to ``evaluate``
    :param num_epoch: int
    :return: None
    """
    # Define optimizers and loss function.
    optimizer = optim.SGD(model.parameters(), lr=lr)
    num_student = train_data.shape[0]

    for epoch in range(num_epoch):
        # evaluate() switches the model to eval mode at the end of every
        # epoch, so training mode (e.g. active dropout) has to be restored
        # here rather than once before the loop.
        model.train()
        train_loss = 0.

        for user_id in range(num_student):
            inputs = zero_train_data[user_id].unsqueeze(0)
            metadata = student_metadata[user_id].unsqueeze(0)

            optimizer.zero_grad()
            output = model(inputs, metadata)

            # Only observed (non-NaN) entries contribute to the loss and
            # therefore to the gradient.
            observed = ~torch.isnan(train_data[user_id].unsqueeze(0))
            loss = torch.sum((output[observed] - inputs[observed]) ** 2.)
            loss.backward()

            train_loss += loss.item()
            optimizer.step()

        valid_acc = evaluate(model, zero_train_data, valid_data)
        print("Epoch: {} \tTraining Cost: {:.6f}\t "
              "Valid Acc: {}".format(epoch, train_loss, valid_acc))

This function is taken from the starter code; it also passes each student's metadata to the model.

In [None]:
def evaluate(model, train_data, valid_data):
    """ Evaluate the valid_data on the current model.

    Reads the module-level ``student_metadata`` tensor (indexed by user id).
    The model is left in eval mode when this function returns.

    :param model: Module, called as ``model(inputs, metadata)``
    :param train_data: 2D FloatTensor
    :param valid_data: A dictionary {user_id: list,
    question_id: list, is_correct: list}
    :return: float, fraction of entries whose thresholded prediction
        (>= 0.5) equals ``is_correct``
    """
    # Tell PyTorch you are evaluating the model.
    model.eval()

    total = 0
    correct = 0

    # No gradients are needed for scoring; this avoids building an autograd
    # graph for every forward pass (same as evaluate_detailed).
    with torch.no_grad():
        for i, u in enumerate(valid_data["user_id"]):
            inputs = train_data[u].unsqueeze(0)
            metadata = student_metadata[u].unsqueeze(0)
            output = model(inputs, metadata)

            guess = output[0][valid_data["question_id"][i]].item() >= 0.5
            if guess == valid_data["is_correct"][i]:
                correct += 1
            total += 1
    return correct / float(total)

We also wrote a slightly modified version of the `evaluate` function that reports more detailed metrics about the model during testing.

In [None]:
def evaluate_detailed(model, train_data, test_data):
    """
    Evaluate the model on the provided test data and compute detailed classification metrics.

    Reads the module-level ``student_metadata`` tensor (indexed by user id).
    A prediction counts as positive when the model output is >= 0.5.

    :param model: The PyTorch model to evaluate, called as ``model(inputs, metadata)``.
    :param train_data: 2D FloatTensor of training data.
    :param test_data: A dictionary containing 'user_id', 'question_id', and 'is_correct' lists.
    :return: A dictionary with accuracy, precision, recall, F1 score, false positives, false negatives, true positives, true negatives, total positives, and total negatives.
    """
    model.eval()

    predictions = []
    true_labels = []

    with torch.no_grad():
        for i, u in enumerate(test_data["user_id"]):
            inputs = train_data[u].unsqueeze(0)
            metadata = student_metadata[u].unsqueeze(0)
            output = model(inputs, metadata)

            guess = output[0][test_data["question_id"][i]].item() >= 0.5
            predictions.append(int(guess))
            true_labels.append(test_data["is_correct"][i])

    predictions = torch.tensor(predictions)
    true_labels = torch.tensor(true_labels)

    accuracy = (predictions == true_labels).float().mean().item()
    precision = precision_score(true_labels, predictions, average='binary')
    recall = recall_score(true_labels, predictions, average='binary')
    f1 = f1_score(true_labels, predictions, average='binary')

    # Pin the label set so the matrix is always 2x2; without it, data in
    # which only one class occurs yields a 1x1 matrix and the four-way
    # unpacking below fails.
    tn, fp, fn, tp = confusion_matrix(
        true_labels, predictions, labels=[0, 1]
    ).ravel()
    total_positives = tp + fn
    total_negatives = tn + fp

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'F1_score': f1,
        'false_positives': fp,
        'false_negatives': fn,
        'true_positives': tp,
        'true_negatives': tn,
        'total_positives': total_positives,
        'total_negatives': total_negatives
    }

### Load original data
The following function is copied from the starter code.

In [None]:
def load_data(base_path="../data"):
    """ Read the training matrix and the validation/test sets.

    :param base_path: str, directory handed to the starter-code loaders
    :return: (zero_train_matrix, train_matrix, valid_data, test_data)
        WHERE:
        zero_train_matrix: 2D FloatTensor where missing (NaN) entries are
        replaced by 0.
        train_matrix: 2D FloatTensor with the missing entries kept as NaN
        valid_data: result of load_valid_csv (presumably a dictionary
        {user_id: list, question_id: list, is_correct: list})
        test_data: result of load_public_test_csv (presumably the same
        layout as valid_data)
    """
    raw_matrix = load_train_sparse(base_path).toarray()
    validation = load_valid_csv(base_path)
    public_test = load_public_test_csv(base_path)

    # Build a second array with every NaN swapped for 0; raw_matrix itself
    # keeps its NaN markers.
    filled_matrix = np.where(np.isnan(raw_matrix), 0, raw_matrix)

    # Both matrices are handed back as Float Tensors for PyTorch.
    return (
        torch.FloatTensor(filled_matrix),
        torch.FloatTensor(raw_matrix),
        validation,
        public_test,
    )

In [None]:
# Load both versions of the training matrix plus the validation/test sets,
# then move the two training tensors to the selected device.
zero_train_matrix, train_matrix, valid_data, test_data = load_data(data_dir)
train_matrix = train_matrix.to(device)
zero_train_matrix = zero_train_matrix.to(device)

### Load question metadata

In [None]:
import pandas as pd
import json

In [None]:
question_df = pd.read_csv(data_dir + 'question_meta.csv')
subject_df = pd.read_csv(data_dir + 'subject_meta.csv')
num_questions = question_df.shape[0]
num_subjects = subject_df.shape[0]
# Multi-hot table: entry (q, s) is 1 when subject s is listed for question q.
question_matrix = np.zeros((num_questions, num_subjects), dtype=int)
for question_id, raw_subjects in zip(question_df['question_id'],
                                     question_df['subject_id']):
    # subject_id is stored as a JSON list of subject indices; set all of
    # them for this question in one indexed assignment.
    question_matrix[question_id, json.loads(raw_subjects)] = 1

In [None]:
question_matrix.shape

(1774, 388)

Clean the data further: remove every column of `question_matrix` that consists only of 1's or only of 0's.

In [None]:
# A column is kept when it is neither all ones nor all zeros, i.e. it holds
# at least one entry != 1 and at least one entry != 0.
informative = (question_matrix != 1).any(axis=0) & (question_matrix != 0).any(axis=0)
question_matrix = question_matrix[:, informative]
question_matrix.shape

(1774, 288)

In [None]:
# Number of subject columns left in question_matrix at this point.
num_subject = question_matrix.shape[1]
num_subject

288

In [None]:
# (row indices, column indices) of every non-zero entry, i.e. the
# (question, subject) pairs that are set in question_matrix.
non_zero = question_matrix.nonzero()
non_zero

(array([   0,    0,    0, ..., 1773, 1773, 1773]),
 array([ 37,  41, 112, ...,   0,   1,  89]))

In [None]:
non_zero_row, non_zero_col = non_zero
# One LongTensor of shape (1, k) per question, holding the column indices
# of the k subjects set for that question.
subjects = [
    torch.LongTensor(non_zero_col[non_zero_row == question]).unsqueeze(0)
    for question in range(num_questions)
]
subjects[:5]

[tensor([[ 37,  41, 112]]),
 tensor([[ 16,  22, 197]]),
 tensor([[  0,  89, 137]]),
 tensor([[  0,   7, 154]]),
 tensor([[16, 31, 93]])]

In [None]:
# Move every per-question index tensor to the training device.
subjects = [s.to(device) for s in subjects]
subjects[:5]

[tensor([[ 37,  41, 112]], device='cuda:0'),
 tensor([[ 16,  22, 197]], device='cuda:0'),
 tensor([[  0,  89, 137]], device='cuda:0'),
 tensor([[  0,   7, 154]], device='cuda:0'),
 tensor([[16, 31, 93]], device='cuda:0')]

### Student Metadata

In [None]:
# Index the student table by user_id and order the rows by it. Later cells
# pick a student's features by row position (student_metadata[user_id]),
# which assumes the ids are exactly 0..N-1 with no gaps — TODO confirm.
student_df = pd.read_csv(data_dir + 'student_meta.csv', index_col='user_id')
student_df.sort_index(inplace=True)

In [None]:
# One-hot encode gender by using each code as a row index into a 3x3
# identity matrix; this expects integer codes within 0..2.
genders = student_df['gender'].to_numpy()
genders_onehot = np.eye(3)[genders]
genders_onehot

array([[0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.],
       ...,
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.]])

In [None]:
from datetime import datetime

Calculate each person's age from their date of birth, use 0 when the date of birth is missing, and then normalize the ages.

In [None]:
# Parse the birth dates; values that cannot be parsed become NaT
# (errors='coerce').
student_df['dates'] = pd.to_datetime(student_df['data_of_birth'], errors='coerce')
# NOTE(review): datetime.now() makes the ages depend on the day the notebook
# is run, so the resulting features are not reproducible across runs; a
# fixed reference date would remove that dependence.
reference_date = datetime.now()
# Age in years; stays NaN wherever the birth date is missing.
student_df['relative_age_years'] = (reference_date - student_df['dates']).dt.days / 365.25

In [None]:
# Missing ages are set to 0 *before* standardising, so they take part in
# the mean/std and all map to one shared below-average value.
# NOTE(review): standardising the known ages only (and filling afterwards)
# may be preferable — confirm the intended encoding.
ages = student_df['relative_age_years'].fillna(0).to_numpy()
ages_norm = (ages - ages.mean())/ages.std()
# Sanity check: the mean should be ~0 and the variance ~1.
print(ages_norm.mean())
print(ages_norm.var())
# Add a trailing axis so the ages form an (N, 1) column.
ages_norm = ages_norm[..., np.newaxis]
ages_norm

6.55482228560978e-17
0.9999999999999998


array([[ 0.86596527],
       [ 1.23186314],
       [-1.38958222],
       [ 0.70615491],
       [ 0.89404007],
       [-1.38958222],
       [ 0.96006211],
       [-1.38958222],
       [-1.38958222],
       [ 0.67808011],
       [-1.38958222],
       [-1.38958222],
       [ 1.10105311],
       [ 0.44330079],
       [ 0.86596527],
       [ 0.6126751 ],
       [ 0.5281422 ],
       [ 0.66851617],
       [ 0.60311115],
       [ 0.79994323],
       [ 3.02525633],
       [ 0.74410216],
       [-1.38958222],
       [-1.38958222],
       [ 0.42478993],
       [ 0.74410216],
       [ 0.63149446],
       [ 0.36802332],
       [ 0.49050346],
       [ 0.32082064],
       [-1.38958222],
       [ 0.70615491],
       [ 0.35876789],
       [-1.38958222],
       [-1.38958222],
       [-1.38958222],
       [ 0.56547242],
       [ 0.4716841 ],
       [ 0.92242338],
       [ 0.88447613],
       [-1.38958222],
       [-1.38958222],
       [ 0.63149446],
       [ 0.69689948],
       [ 0.84714591],
       [-1

In [None]:
# Treat a missing premium_pupil value as a third category (2), cast to int,
# then one-hot encode via rows of a 3x3 identity matrix.
student_df['premium_pupil'] = student_df['premium_pupil'].fillna(2)
student_df['premium_pupil'] = student_df['premium_pupil'].astype(int)
is_premium = np.eye(3)[student_df['premium_pupil']]
is_premium

array([[0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       ...,
       [0., 0., 1.],
       [1., 0., 0.],
       [1., 0., 0.]])

In [None]:
# check if shape matches
print(genders_onehot.shape)
print(is_premium.shape)
print(ages_norm.shape)

(542, 3)
(542, 3)
(542, 1)


In [None]:
# Per-student feature rows, concatenated column-wise in this order:
# premium one-hot, gender one-hot, normalised age. Stored as a float tensor
# on the training device and read by train/evaluate as a global.
student_metadata = torch.FloatTensor(np.concatenate((is_premium, genders_onehot, ages_norm), axis=1)).to(device)
student_metadata

tensor([[ 0.0000,  0.0000,  1.0000,  ...,  0.0000,  1.0000,  0.8660],
        [ 0.0000,  0.0000,  1.0000,  ...,  1.0000,  0.0000,  1.2319],
        [ 0.0000,  0.0000,  1.0000,  ...,  0.0000,  0.0000, -1.3896],
        ...,
        [ 0.0000,  0.0000,  1.0000,  ...,  1.0000,  0.0000, -1.3896],
        [ 1.0000,  0.0000,  0.0000,  ...,  1.0000,  0.0000,  0.6969],
        [ 1.0000,  0.0000,  0.0000,  ...,  1.0000,  0.0000,  0.1240]],
       device='cuda:0')

## Model

### Embedder

We first pad each question's list of subject indices to a common length, so that the lists can be stacked into a single tensor.

In [None]:
# Drop the leading batch axis of every per-question index tensor.
subjects_flat = [s.flatten() for s in subjects]
# Longest subject list; every shorter list is right-padded up to it.
max_length = max(s.shape[0] for s in subjects_flat)
# Padding value: num_subject, i.e. one past the last real subject column.
constant = num_subject
padded_subjects = [
    F.pad(s, (0, max_length - s.shape[0]), mode="constant", value=constant)
    for s in subjects_flat
]
combined_subjects = torch.stack(padded_subjects, dim=0)

In [None]:
combined_subjects.shape

torch.Size([1774, 8])

We run k-means clustering to assign each question to a cluster, and train an embedder on those cluster labels so that each question is embedded in a lower-dimensional space before it is fed to the network.

In [None]:
from sklearn.cluster import KMeans

# Number of question clusters; also the width of the one-hot labels.
num_clusters = 20
# A fixed seed makes the cluster labels (and everything trained on them)
# reproducible across runs; n_init is spelled out because its default
# differs between scikit-learn versions.
km = KMeans(n_clusters=num_clusters, n_init=10, random_state=0)
# fit_predict is the documented shortcut for fit(X) followed by predict(X).
labels = km.fit_predict(question_matrix)
labels_onehot = np.eye(num_clusters)[labels]
labels_onehot.shape



(1774, 20)

In [None]:
# Convert the one-hot cluster labels to a tensor on the training device.
# NOTE(review): np.eye yields float64, so this tensor is presumably float64
# as well — confirm that matches the dtype expected by the loss.
labels_onehot = torch.tensor(labels_onehot).to(device)

In [None]:
class EmbeddingTrainer(nn.Module):
    """Bag-of-subjects embedder followed by a linear classification head.

    ``forward`` returns raw, unnormalised class scores (logits).
    ``nn.CrossEntropyLoss`` applies log-softmax itself, so applying a
    softmax here as well would squash the scores twice and flatten the
    gradients.

    :param embed_dim: int, size of each subject embedding
    :param num_classes: int, number of output classes (cluster labels)
    :param num_embeddings: int or None, number of rows in the embedding
        table; None uses the module-level ``num_subject`` plus one extra
        row for the padding index
    """

    def __init__(self, embed_dim, num_classes=20, num_embeddings=None):
        super().__init__()
        if num_embeddings is None:
            # Padded index lists use ``num_subject`` as filler, hence +1.
            num_embeddings = num_subject + 1
        self.embedder = nn.EmbeddingBag(num_embeddings, embed_dim)
        self.linear = nn.Linear(embed_dim, num_classes)

    def forward(self, input):
        """Return class logits of shape (batch, num_classes).

        :param input: 2D LongTensor of subject indices, one bag per row
        """
        x = self.embedder(input)
        return self.linear(x)

We want the embedder to "overfit" the questions as much as possible

In [None]:
def train_embedder(model, x, t, num_epoch, lr):
    """Fit ``model`` to targets ``t`` with full-batch SGD.

    Every epoch performs exactly one optimizer step on the whole of ``x``;
    the current loss is printed every 10000 epochs (starting at epoch 0).

    :param model: Module, called as ``model(x)``
    :param x: model input for the whole batch
    :param t: targets handed to ``nn.CrossEntropyLoss``
    :param num_epoch: int, number of optimizer steps
    :param lr: float, SGD learning rate
    :return: None
    """
    criterion = nn.CrossEntropyLoss()
    sgd = optim.SGD(model.parameters(), lr=lr)
    model.train()

    for step in range(num_epoch):
        sgd.zero_grad()
        batch_loss = criterion(model(x), t)
        batch_loss.backward()
        sgd.step()

        if step % 10000 != 0:
            continue
        print("epoch: ", step, "loss: ", batch_loss.item())

In [None]:
# Train a 30-dimensional subject embedder against the one-hot cluster
# labels (100000 full-batch steps, learning rate 0.1).
trainer = EmbeddingTrainer(30).to(device)
train_embedder(trainer, combined_subjects, labels_onehot, 100000, 0.1)

  x = F.softmax(x)


epoch:  0 loss:  2.9979003849266075
epoch:  10000 loss:  2.704224347396257
epoch:  20000 loss:  2.7015790557646295
epoch:  30000 loss:  2.6009708841865735
epoch:  40000 loss:  2.537744714146671
epoch:  50000 loss:  2.480692201429451
epoch:  60000 loss:  2.4777258168212994
epoch:  70000 loss:  2.4293268091111466
epoch:  80000 loss:  2.3750566405400617
epoch:  90000 loss:  2.3711999584883956


In [None]:
# Keep a handle on the trained EmbeddingBag; only this part of the trainer
# is used in the following cells.
embedder = trainer.embedder
# Clear the gradients left over from training.
embedder.zero_grad()

In [None]:
# Embed every question from its padded subject list, cut the result off
# from autograd, and flatten it into a single 1-D vector.
embeddings = embedder(combined_subjects)
embeddings = embeddings.detach()
embeddings = embeddings.flatten()
embeddings.shape

torch.Size([53220])

### AutoEncoder

In [None]:
class AutoEncoder(nn.Module):
    """Autoencoder over a student's answers, weighted by question embeddings.

    The input row is expanded so that every question's answer multiplies
    that question's fixed embedding; the product is encoded down to 200
    units, concatenated with the student metadata, and decoded into one
    sigmoid output per question.

    :param num_questions: int, number of questions (input/output width)
    :param embeddings: Tensor with ``embed_dim * num_questions`` elements,
        the question embeddings laid out question by question; kept as a
        fixed (non-trainable) buffer
    :param student_metadata_size: int, width of the metadata row
    :param embed_dim: int, embedding size per question
    :param p: float, dropout probability
    :raises ValueError: if ``embeddings`` does not hold exactly
        ``embed_dim * num_questions`` elements
    """

    # Number of leading entries of ``self.layers`` that run before the
    # student metadata is concatenated (4 Linear, 4 ReLU, 3 Dropout).
    _NUM_ENCODER_LAYERS = 11

    def __init__(self, num_questions, embeddings, student_metadata_size, embed_dim=20, p=0.5):
        super().__init__()

        # Fail early with a clear message instead of a shape error deep
        # inside forward().
        if embeddings.numel() != embed_dim * num_questions:
            raise ValueError(
                "embeddings has {} elements, expected embed_dim * num_questions"
                " = {}".format(embeddings.numel(), embed_dim * num_questions)
            )

        self.num_questions = num_questions
        self.embed_dim = embed_dim
        self.p = p
        self.student_metadata_size = student_metadata_size
        # Registered as a buffer so that model.to(device) moves it together
        # with the weights; non-persistent keeps the state_dict keys
        # limited to the layer parameters, as before.
        self.register_buffer(
            "embeddings",
            embeddings.detach().clone().flatten().unsqueeze(0),
            persistent=False,
        )

        self.layers = nn.ModuleList([
            nn.Linear(embed_dim * num_questions, 10000),
            nn.ReLU(),
            nn.Dropout(p=p),
            nn.Linear(10000, 1000),
            nn.ReLU(),
            nn.Dropout(p=p),
            nn.Linear(1000, 500),
            nn.ReLU(),
            nn.Dropout(p=p),
            nn.Linear(500, 200),
            nn.ReLU(),
            nn.Linear(200 + self.student_metadata_size, 1000),
            nn.ReLU(),
            nn.Dropout(p=p),
            nn.Linear(1000, num_questions),
            nn.Sigmoid(),
        ])

    def forward(self, input, metadata):
        """Return per-question outputs in [0, 1], shape (batch, num_questions).

        :param input: 2D FloatTensor (batch, num_questions)
        :param metadata: 2D FloatTensor (batch, student_metadata_size)
        """
        # Repeat every answer embed_dim times so it lines up with the
        # flattened embedding of its question.
        x = self.embeddings * input.repeat_interleave(self.embed_dim, dim=1)
        for layer in self.layers[:self._NUM_ENCODER_LAYERS]:
            x = layer(x)
        x = torch.cat((x, metadata), dim=1)
        for layer in self.layers[self._NUM_ENCODER_LAYERS:]:
            x = layer(x)
        return x

In [None]:
# Build the autoencoder; embed_dim=30 matches the EmbeddingTrainer(30) used
# to produce `embeddings`.
model = AutoEncoder(train_matrix.shape[1], embeddings, student_metadata.shape[1], embed_dim=30, p=0.5).to(device)

# Training hyperparameters.
lr=0.001
num_epoch=43
train(model, lr=lr, train_data=train_matrix, zero_train_data=zero_train_matrix, valid_data=valid_data,
      num_epoch=num_epoch)

Epoch: 0 	Training Cost: 14163.802794	 Valid Acc: 0.5433248659328253
Epoch: 1 	Training Cost: 14100.699588	 Valid Acc: 0.5671747106971493
Epoch: 2 	Training Cost: 14039.211732	 Valid Acc: 0.5839683883714366
Epoch: 3 	Training Cost: 13969.321591	 Valid Acc: 0.594411515664691
Epoch: 4 	Training Cost: 13883.965002	 Valid Acc: 0.6045723962743438
Epoch: 5 	Training Cost: 13764.786181	 Valid Acc: 0.609793959920971
Epoch: 6 	Training Cost: 13395.386335	 Valid Acc: 0.6162856336438047
Epoch: 7 	Training Cost: 12567.112990	 Valid Acc: 0.6234829240756421
Epoch: 8 	Training Cost: 12227.344996	 Valid Acc: 0.6277166243296641
Epoch: 9 	Training Cost: 11945.083048	 Valid Acc: 0.6298334744566751
Epoch: 10 	Training Cost: 11696.098851	 Valid Acc: 0.6308213378492803
Epoch: 11 	Training Cost: 11496.293815	 Valid Acc: 0.630962461191081
Epoch: 12 	Training Cost: 11367.244875	 Valid Acc: 0.6371718882303133
Epoch: 13 	Training Cost: 11280.802003	 Valid Acc: 0.6398532317245272
Epoch: 14 	Training Cost: 11254.1

In [None]:
# Report the detailed metrics on the test data, using the zero-filled
# training matrix as the model input.
evaluate_detailed(model, zero_train_matrix, test_data)

{'accuracy': 0.696302592754364,
 'precision': 0.7383694150161216,
 'recall': 0.7593557555660825,
 'F1_score': 0.7487155534796823,
 'false_positives': 568,
 'false_negatives': 508,
 'true_positives': 1603,
 'true_negatives': 864,
 'total_positives': 2111,
 'total_negatives': 1432}