In [1]:
# Mount Google Drive inside the Colab runtime so files stored there are reachable.
from google.colab import drive
drive.mount('/content/drive/')

# Switch the working directory to the project folder; the relative paths used
# later in this notebook ("./data/assist2009/", "./ckpts/") resolve against it.
%cd drive/MyDrive/Colab Notebooks/knowledge-tracing

Mounted at /content/drive/
/content/drive/MyDrive/Colab Notebooks/knowledge-tracing


In [2]:
import os
import argparse
import json
import pickle
import easydict

import numpy as np
import pandas as pd
import torch

from torch.utils.data import DataLoader, random_split
from torch.optim import SGD, Adam
from torch.utils.data import Dataset

from torch.nn import Module, Embedding, LSTM, Linear, Dropout
from torch.nn.functional import one_hot, binary_cross_entropy
from sklearn import metrics
from torch.nn.utils.rnn import pad_sequence

In [3]:
# Probe CUDA once and derive the torch device from that single answer.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda") if use_cuda else torch.device("cpu")
(device, use_cuda)

(device(type='cpu'), False)

In [4]:
# Default directory for the ASSIST2009 dataset class: it reads
# "skill_builder_data.csv" from here and keeps its pickle caches alongside it.
DATASET_DIR = "./data/assist2009/"

In [5]:
def collate_fn(batch, pad_val=-1):
    '''
        Collate function for torch.utils.data.DataLoader.

        Every (question sequence, response sequence) pair in the batch is
        split into an input view (all steps but the last) and a target view
        (all steps but the first); the views are then padded to the longest
        one in the batch.

        Args:
            batch: an iterable of (q_seq, r_seq) pairs
            pad_val: the value used to pad short sequences; it is also the
                value treated as "padding" when the mask is built

        Returns:
            q_seqs: the question(KC) sequences with the size of \
                [batch_size, maximum_sequence_length_in_the_batch]
            r_seqs: the response sequences with the size of \
                [batch_size, maximum_sequence_length_in_the_batch]
            qshft_seqs: the question(KC) sequences shifted one step \
                ahead, with the size of \
                [batch_size, maximum_sequence_length_in_the_batch]
            rshft_seqs: the response sequences shifted one step \
                ahead, with the size of \
                [batch_size, maximum_sequence_length_in_the_batch]
            mask_seqs: the boolean mask that is True only where both the \
                input step and its shifted target step are real \
                (non-padded) entries, with the size of \
                [batch_size, maximum_sequence_length_in_the_batch]
    '''
    inputs_q, inputs_r, targets_q, targets_r = [], [], [], []

    # For a sequence of length n the inputs cover steps 0..n-2 and the
    # targets (the shape the RNN output is compared against) cover 1..n-1.
    for questions, responses in batch:
        inputs_q.append(FloatTensor(questions[:-1]))
        inputs_r.append(FloatTensor(responses[:-1]))
        targets_q.append(FloatTensor(questions[1:]))
        targets_r.append(FloatTensor(responses[1:]))

    # Pad every group up to the longest sequence in the batch.
    q_seqs, r_seqs, qshft_seqs, rshft_seqs = [
        pad_sequence(group, batch_first=True, padding_value=pad_val)
        for group in (inputs_q, inputs_r, targets_q, targets_r)
    ]

    mask_seqs = (q_seqs != pad_val) * (qshft_seqs != pad_val)

    # Multiplying by the boolean mask turns every masked-out entry into 0.
    return (
        q_seqs * mask_seqs,
        r_seqs * mask_seqs,
        qshft_seqs * mask_seqs,
        rshft_seqs * mask_seqs,
        mask_seqs,
    )

In [6]:
def match_seq_len(q_seqs, r_seqs, seq_len, pad_val=-1):
    '''
        Cut every sequence into consecutive, non-overlapping rows of
        seq_len + 1 steps and pad the final (possibly shorter) row.

        Args:
            q_seqs: the question(KC) sequences with the size of \
                [batch_size, some_sequence_length]
            r_seqs: the response sequences with the size of \
                [batch_size, some_sequence_length]
            Note that the "some_sequence_length" is not uniform over \
                the whole batch of q_seqs and r_seqs
            seq_len: the sequence length to match the q_seqs, r_seqs \
                to same length
            pad_val: the padding value appended to a row that is \
                shorter than seq_len + 1
        Returns:
            proc_q_seqs: the processed q_seqs with the size of \
                [batch_size, seq_len + 1]
            proc_r_seqs: the processed r_seqs with the size of \
                [batch_size, seq_len + 1]
    '''
    # One extra step per row: the last index of a row only serves as a label.
    row_len = seq_len + 1

    proc_q_seqs, proc_r_seqs = [], []

    # The window jumps by a whole row each time, so [0, 1, ..., 9] becomes
    # [0, 1, 2, 3, 4], [5, 6, 7, 8, 9] -- not overlapping windows such as
    # [0, 1, 2, 3, 4], [1, 2, 3, 4, 5].
    for questions, responses in zip(q_seqs, r_seqs):
        total = len(questions)
        start = 0

        while start + row_len < total:
            stop = start + row_len
            proc_q_seqs.append(questions[start:stop])
            proc_r_seqs.append(responses[start:stop])
            start = stop

        # Whatever is left (at most row_len steps) is padded up to row_len.
        padding = np.array([pad_val] * (start + row_len - total))
        proc_q_seqs.append(np.concatenate([questions[start:], padding]))
        proc_r_seqs.append(np.concatenate([responses[start:], padding]))

    return proc_q_seqs, proc_r_seqs

In [7]:
class ASSIST2009(Dataset):
    '''
        Per-user question(KC)/response sequences built from
        "skill_builder_data.csv".

        Each item is a (question_sequence, response_sequence) pair. The
        preprocessed sequences and lookup tables are cached as pickle files
        inside dataset_dir and reused on later constructions.

        Args:
            seq_len: when truthy, the per-user sequences are re-cut by
                match_seq_len into rows of seq_len + 1 steps; when falsy,
                the raw per-user sequences are kept as they are
            dataset_dir: the directory that holds the CSV file and the
                pickle caches
    '''

    # Cache file stems, in the same order preprocess() returns its values.
    _CACHE_NAMES = ("q_seqs", "r_seqs", "q_list", "u_list", "q2idx", "u2idx")

    def __init__(self, seq_len, dataset_dir=DATASET_DIR) -> None:
        super().__init__()

        self.dataset_dir = dataset_dir
        self.dataset_path = os.path.join(
            self.dataset_dir, "skill_builder_data.csv"
        )

        # Use the cache only when every file is present; with a partial cache
        # (e.g. an interrupted earlier run) everything is rebuilt instead of
        # failing on the first missing file.
        cache_complete = all(
            os.path.exists(self._cache_path(name))
            for name in self._CACHE_NAMES
        )

        if cache_complete:
            cached = []
            for name in self._CACHE_NAMES:
                # NOTE: pickle.load can execute arbitrary code stored in the
                # file -- only use a dataset_dir whose caches you created.
                with open(self._cache_path(name), "rb") as f:
                    cached.append(pickle.load(f))
            self.q_seqs, self.r_seqs, self.q_list, self.u_list, self.q2idx, \
                self.u2idx = cached
        else:
            self.q_seqs, self.r_seqs, self.q_list, self.u_list, self.q2idx, \
                self.u2idx = self.preprocess()

        self.num_u = self.u_list.shape[0]
        self.num_q = self.q_list.shape[0]

        if seq_len:
            self.q_seqs, self.r_seqs = \
                match_seq_len(self.q_seqs, self.r_seqs, seq_len)

        self.len = len(self.q_seqs)

    def __getitem__(self, index):
        '''Return the (question sequence, response sequence) pair at index.'''
        return self.q_seqs[index], self.r_seqs[index]

    def __len__(self):
        '''Return the number of stored sequences.'''
        return self.len

    def _cache_path(self, name):
        '''Return the path of the pickle cache file for the given stem.'''
        return os.path.join(self.dataset_dir, name + ".pkl")

    def preprocess(self):
        '''
            Build the sequences from the raw CSV and write the pickle caches.

            Rows without a skill name are dropped, duplicated
            (order_id, skill_name) rows are removed and the remaining rows
            are sorted by order_id before being grouped per user.

            Returns:
                q_seqs: one array of skill indices per user in u_list
                r_seqs: one array of "correct" values per user in u_list
                q_list: the sorted unique skill names
                u_list: the sorted unique user ids
                q2idx: the mapping from skill name to its index in q_list
                u2idx: the mapping from user id to its index in u_list
        '''
        df = pd.read_csv(self.dataset_path, encoding="ISO-8859-1").dropna(subset=["skill_name"])\
            .drop_duplicates(subset=["order_id", "skill_name"])\
            .sort_values(by=["order_id"])

        u_list = np.unique(df["user_id"].values)
        q_list = np.unique(df["skill_name"].values)

        u2idx = {u: idx for idx, u in enumerate(u_list)}
        q2idx = {q: idx for idx, q in enumerate(q_list)}

        # Split the frame per user in a single pass instead of running one
        # full boolean scan per user. Rows keep their order_id order inside
        # each group.
        rows_by_user = dict(iter(df.groupby("user_id", sort=False)))
        # A key without a group (e.g. a NaN user id) yields an empty
        # sequence, exactly as an equality filter on that key would.
        no_rows = df.iloc[0:0]

        q_seqs = []
        r_seqs = []

        for u in u_list:
            df_u = rows_by_user.get(u, no_rows)

            q_seq = np.array([q2idx[q] for q in df_u["skill_name"]])
            r_seq = df_u["correct"].values

            q_seqs.append(q_seq)
            r_seqs.append(r_seq)

        artifacts = (q_seqs, r_seqs, q_list, u_list, q2idx, u2idx)
        for name, obj in zip(self._CACHE_NAMES, artifacts):
            with open(self._cache_path(name), "wb") as f:
                pickle.dump(obj, f)

        return q_seqs, r_seqs, q_list, u_list, q2idx, u2idx

In [8]:
# Choose the FloatTensor constructor that collate_fn uses to build its tensors.
# With CUDA available the CUDA variant is imported and is also registered as
# the default tensor type; otherwise the CPU variant is imported.
# NOTE(review): collate_fn is defined in an earlier cell but only resolves
# FloatTensor when it is called, so this cell must run before any DataLoader
# using collate_fn is iterated.
# NOTE(review): torch.set_default_tensor_type is reported as deprecated in
# recent PyTorch releases -- confirm against the installed version.
if torch.cuda.is_available():
    from torch.cuda import FloatTensor
    torch.set_default_tensor_type(torch.cuda.FloatTensor)
else:
    from torch import FloatTensor

In [9]:
# Hyperparameters of the run, bundled in an EasyDict and read below as
# attributes (args.batch_size, args.num_epochs, ...).
# NOTE(review): in the cells shown here "seq_len" is never read (the dataset
# is built with a literal ASSIST2009(5)), "optimizer" is never read (Adam is
# constructed directly) and "gpu_id" only appears in the commented-out line
# at the end of this cell.
args = easydict.EasyDict(
    {
        "batch_size" : 256,
        "num_epochs" : 10,
        "train_ratio" : 0.9,
        "learning_rate" : 0.001,
        "optimizer" : "adam",
        "seq_len" : 100,
        "emb_size" : 100,
        "hidden_size" : 100 ,
        "gpu_id" : 0
    })

# device = torch.device("cuda:" + str(args.gpu_id) if torch.cuda.is_available() else "cpu")

In [10]:
class DKT(Module):
    '''
        LSTM-based knowledge tracing model: an interaction embedding feeds an
        LSTM whose hidden states are projected to one sigmoid output per
        question(KC).

        Args:
            num_q: the total number of the questions(KCs) in the given dataset
            emb_size: the dimension of the embedding vectors in this model
            hidden_size: the dimension of the hidden vectors in this model
    '''

    def __init__(self, num_q, emb_size, hidden_size):
        super().__init__()
        self.num_q = num_q
        self.emb_size = emb_size
        self.hidden_size = hidden_size

        # One embedding row per (question, response) combination; see forward.
        self.interaction_emb = Embedding(self.num_q * 2, self.emb_size)
        self.lstm_layer = LSTM(
            self.emb_size, self.hidden_size, batch_first=True
        )
        self.out_layer = Linear(self.hidden_size, self.num_q)
        self.dropout_layer = Dropout()

    def forward(self, q, r):
        '''
            Args:
                q: the question(KC) sequence with the size of [batch_size, n]
                r: the response sequence with the size of [batch_size, n]
            Returns:
                y: the sigmoid outputs for all questions(KCs) at every \
                    step, with the size of [batch_size, n, num_q]
        '''
        # Interaction index: q for a response of 0, q + num_q for a response
        # of 1, which is why the embedding table has num_q * 2 rows.
        x = q + self.num_q * r

        h, _ = self.lstm_layer(self.interaction_emb(x))
        y = self.out_layer(h)
        # NOTE(review): dropout is applied to the projected outputs right
        # before the sigmoid rather than to the LSTM hidden states -- confirm
        # this placement is intended.
        y = self.dropout_layer(y)
        y = torch.sigmoid(y)

        return y

    def train_model(
        self, train_loader, test_loader, num_epochs, optimizer, ckpt_path
    ):
        '''
            Train for num_epochs epochs, evaluating on test_loader after
            each epoch and saving the parameters whenever the AUC improves.

            Args:
                train_loader: the PyTorch DataLoader instance for training
                test_loader: the PyTorch DataLoader instance for test
                num_epochs: the number of epochs
                optimizer: the optimizer used to train this model
                ckpt_path: the directory in which "model.ckpt" is saved; \
                    it is created when it does not exist
            Returns:
                aucs: the test AUC of every evaluated epoch
                loss_means: the mean training loss of every evaluated epoch
        '''
        aucs = []
        loss_means = []

        max_auc = 0

        # torch.save does not create missing directories.
        os.makedirs(ckpt_path, exist_ok=True)

        for i in range(1, num_epochs + 1):
            batch_losses = []

            self.train()

            for q, r, qshft, rshft, m in train_loader:
                y = self(q.long(), r.long())
                # Keep only the output for the next question at every step
                # (everything else is zeroed), then collapse the last axis.
                y = (y * one_hot(qshft.long(), self.num_q)).sum(-1)

                # Drop the padded positions from predictions and targets.
                y = torch.masked_select(y, m)
                t = torch.masked_select(rshft, m)

                optimizer.zero_grad()
                loss = binary_cross_entropy(y, t)
                loss.backward()
                optimizer.step()

                batch_losses.append(loss.detach().cpu().numpy())

            # Gather predictions over ALL test batches so that one AUC is
            # computed per epoch, whatever the test batch size is.
            self.eval()

            preds = []
            targets = []

            with torch.no_grad():
                for q, r, qshft, rshft, m in test_loader:
                    y = self(q.long(), r.long())
                    y = (y * one_hot(qshft.long(), self.num_q)).sum(-1)

                    preds.append(torch.masked_select(y, m).detach().cpu())
                    targets.append(
                        torch.masked_select(rshft, m).detach().cpu()
                    )

            if not preds:
                # An empty test loader gives nothing to score; no metric is
                # recorded for this epoch.
                continue

            auc = metrics.roc_auc_score(
                y_true=torch.cat(targets).numpy(),
                y_score=torch.cat(preds).numpy()
            )

            loss_mean = np.mean(batch_losses)

            print(
                "Epoch: {},   AUC: {},   Loss Mean: {}"
                .format(i, auc, loss_mean)
            )

            if auc > max_auc:
                torch.save(
                    self.state_dict(),
                    os.path.join(
                        ckpt_path, "model.ckpt"
                    )
                )
                max_auc = auc

            aucs.append(auc)
            loss_means.append(loss_mean)

        return aucs, loss_means

In [11]:
# Build the dataset with seq_len = 5, i.e. rows of 6 steps after match_seq_len.
# NOTE(review): the literal 5 bypasses args.seq_len (set to 100) -- confirm
# which value is intended.
dataset = ASSIST2009(5)

In [12]:
# Instantiate the DKT model, sized by the number of distinct skills in the
# dataset and the embedding / hidden sizes from args.
model = DKT(dataset.num_q, args.emb_size, args.hidden_size)

In [13]:
# Split sizes: train_ratio of the rows for training, the remainder for testing.
train_size = int(len(dataset) * args.train_ratio)
test_size = len(dataset) - train_size


# train_dataset : [ train_size, seq_len + 1 ] ( rows already padded by match_seq_len )
# test_dataset : [ test_size, seq_len + 1 ]
# NOTE(review): random_split is called without a seeded generator, so a fresh
# kernel draws a different split each time.
train_dataset, test_dataset = random_split(
    dataset, [train_size, test_size]
)

In [14]:
# Persist the train/test split so later runs evaluate on the same rows.
train_indices_path = os.path.join(dataset.dataset_dir, "train_indices.pkl")
test_indices_path = os.path.join(dataset.dataset_dir, "test_indices.pkl")

cached_split_ok = False

# Both files must exist; checking only one of them would crash on the other.
if os.path.exists(train_indices_path) and os.path.exists(test_indices_path):
    # NOTE: pickle.load can execute arbitrary code stored in the file -- only
    # load index files this notebook wrote itself.
    with open(train_indices_path, "rb") as f:
        cached_train_indices = pickle.load(f)
    with open(test_indices_path, "rb") as f:
        cached_test_indices = pickle.load(f)

    # Reuse the cached split only when it still describes the current
    # dataset: same split sizes, and together the indices cover every row
    # exactly once. A cache written for another seq_len or train_ratio fails
    # this check.
    cached_split_ok = (
        len(cached_train_indices) == train_size
        and len(cached_test_indices) == test_size
        and sorted([*cached_train_indices, *cached_test_indices])
        == list(range(len(dataset)))
    )

    if cached_split_ok:
        train_dataset.indices = cached_train_indices
        test_dataset.indices = cached_test_indices
    else:
        print("Cached split does not match the current dataset; rewriting it.")

if not cached_split_ok:
    with open(train_indices_path, "wb") as f:
        pickle.dump(train_dataset.indices, f)
    with open(test_indices_path, "wb") as f:
        pickle.dump(test_dataset.indices, f)

In [15]:
# Training batches are reshuffled every epoch.
train_loader = DataLoader(
    train_dataset, batch_size=args.batch_size, shuffle=True,
    collate_fn=collate_fn
)
# The whole test split is served as a single batch. Evaluation does not need
# shuffling: the AUC does not depend on row order, and a fixed order keeps the
# evaluation pass deterministic.
test_loader = DataLoader(
    test_dataset, batch_size=test_size, shuffle=False,
    collate_fn=collate_fn
)

In [16]:
# Adam over all model parameters, with the learning rate taken from args.
optimizer = Adam(model.parameters(), lr=args.learning_rate)

# Run label assembled from the main hyperparameters.
modelname = (
    f"DKT_bs_{args.batch_size}"
    f"_nemb_{args.emb_size}"
    f"_lr_{args.learning_rate}"
)

In [17]:
# Train for args.num_epochs epochs. train_model returns the AUC and mean
# training loss histories and saves the best-scoring parameters to
# "model.ckpt" inside "./ckpts/".
aucs, loss_means = model.train_model(
        train_loader, test_loader, args.num_epochs, optimizer, "./ckpts/"
    )

Epoch: 1,   AUC: 0.7866887261053133,   Loss Mean: 0.6227133274078369
Epoch: 2,   AUC: 0.7954699291222699,   Loss Mean: 0.6022433042526245
Epoch: 3,   AUC: 0.7990790640814379,   Loss Mean: 0.5994665622711182
Epoch: 4,   AUC: 0.8006111990559164,   Loss Mean: 0.5951282978057861
Epoch: 5,   AUC: 0.8010540372706851,   Loss Mean: 0.5930237174034119
Epoch: 6,   AUC: 0.8006461955373816,   Loss Mean: 0.5913102030754089
Epoch: 7,   AUC: 0.8011501052057696,   Loss Mean: 0.5902721285820007
Epoch: 8,   AUC: 0.8018882441583015,   Loss Mean: 0.5888826847076416
Epoch: 9,   AUC: 0.8015807663536223,   Loss Mean: 0.5885145664215088
Epoch: 10,   AUC: 0.8022189349735289,   Loss Mean: 0.586394727230072
