In [1]:
import os
import torch
import torch.nn as nn
import numpy as np
from glob import glob

import sys
sys.path.append('/opt/ml/develop')

In [2]:
from args_jupyter import parse_args
from dkt.dataloader import Preprocess
from dkt import trainer
from dkt.utils import setSeeds
from dkt.dataloader import get_loaders
from dkt.optimizer import get_optimizer, get_lr
from dkt.scheduler import get_scheduler
from dkt.criterion import get_criterion
from dkt.metric import get_metric

In [3]:
# Parse the training configuration and seed the run from it.
args = parse_args(mode='train')
setSeeds(args.seed)

# Prefer the GPU when one is visible to torch, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [4]:
# Batch preprocessing
def process_batch(batch, args):
    """Convert a raw loader batch into the tensors fed to the model.

    Args:
        batch: 5-tuple ``(test, question, tag, correct, mask)`` of tensors.
            They are indexed as ``[:, 0]`` and rolled along dim 1, so each
            needs at least two dimensions with the sequence on dim 1.
        args: Object exposing ``device``; every returned tensor is moved there.

    Returns:
        Tuple ``(test, question, tag, correct, mask, interaction, gather_index)``:
        * ``test``, ``question``, ``tag``: int64 ids shifted by +1 and zeroed
          where ``mask`` is 0, so 0 acts as the padding id.
        * ``correct``, ``mask``: float32 copies of the inputs.
        * ``interaction``: int64, ``correct + 1`` shifted one step to the right
          with position 0 set to 0, then zeroed where ``mask`` is 0.
        * ``gather_index``: int64 column vector holding, per row, the number of
          non-zero mask entries minus 1.
    """
    test, question, tag, correct, mask = batch

    # Cast with `.float()` so the tensors stay on whatever device they arrived
    # on (casting via `torch.FloatTensor` would force them onto the CPU).
    mask = mask.float()
    correct = correct.float()

    # Interaction is temporarily `correct` shifted one step to the right;
    # for SAINT this is the decoder input.
    # Add 1 first so that 0 stays free for padding.
    interaction = (correct + 1).roll(shifts=1, dims=1)
    interaction[:, 0] = 0  # the first step has no previous answer
    interaction = (interaction * mask).to(torch.int64)

    # test_id, question_id, tag: shift by one and zero out masked positions.
    test = ((test + 1) * mask).to(torch.int64)
    question = ((question + 1) * mask).to(torch.int64)
    tag = ((tag + 1) * mask).to(torch.int64)

    # Index used to pick out only the last step of each sequence.
    # Counted with torch ops so no NumPy round trip (CPU-only) is needed.
    gather_index = (mask != 0).sum(dim=1).view(-1, 1) - 1

    # Move everything to the target device.
    target = args.device
    return (test.to(target), question.to(target),
            tag.to(target), correct.to(target), mask.to(target),
            interaction.to(target), gather_index.to(target))

In [5]:
class PositionalEncoding(nn.Module):
    """Add a scaled sinusoidal position table to the input, then apply dropout.

    The table is stored as the buffer ``pe`` with shape ``(max_len, 1, d_model)``
    and is sliced along dim 0 by ``x.size(0)``, so inputs are expected with the
    sequence on dim 0. ``scale`` is a learnable scalar initialised to 1.
    """

    def __init__(self, d_model, dropout=0.1, max_len=1000):
        """
        Args:
            d_model: Size of the last (feature) dimension; may be odd or even.
            dropout: Dropout probability applied to the output.
            max_len: Number of positions precomputed in the table.
        """
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        self.scale = nn.Parameter(torch.ones(1))

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(
            0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd-indexed column than
        # even-indexed ones, so trim the cosine block to fit.
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``dropout(x + scale * pe[:x.size(0)])``."""
        x = x + self.scale * self.pe[:x.size(0), :]
        return self.dropout(x)

In [6]:
# Load the training file named in the config through the project preprocessor.
preprocess = Preprocess(args)
preprocess.load_train_data(args.file_name)

# Split the loaded data into training and validation parts.
loaded_train_data = preprocess.get_train_data()
train_data, valid_data = preprocess.split_data(loaded_train_data)

In [7]:
# Wrap the two splits in loaders via the project helper.
train_loader, valid_loader = get_loaders(args, train_data, valid_data)

In [8]:
# Grab the first batch from the training loader and print the size of each of
# its five tensors; `batch` stays None if the loader yields nothing.
batch = None
for t in train_loader:
    # train_loader is tuple
    
    print(f"testID size: {t[0].size()}")
    print(f"assessmentItemId size: {t[1].size()}")
    print(f"knowledgetag size: {t[2].size()}")
    print(f"answercode size: {t[3].size()}")
    print(f"mask size: {t[4].size()}")
    
    # Keep this batch for the step-by-step walkthrough below, then stop.
    batch = t
    break

testID size: torch.Size([64, 20])
assessmentItemId size: torch.Size([64, 20])
knowledgetag size: torch.Size([64, 20])
answercode size: torch.Size([64, 20])
mask size: torch.Size([64, 20])


In [9]:
# Display the parsed configuration.
args

Namespace(asset_dir='asset/', batch_size=64, clip_grad=10, data_dir='/opt/ml/input/data/train_dataset', data_id='userID', device='cpu', drop_out=0.2, emb_size=100, file_name='train_data.csv', hidden_dim=64, log_steps=50, lr=0.0001, max_lr=0.0001, max_seq_len=20, min_lr=1e-05, model='lstm', model_dir='models/', model_name='model.pt', n_epochs=20, n_heads=2, n_layers=2, n_questions=9455, n_tag=913, n_test=1538, num_workers=1, optimizer='adam', output_dir='output/', patience=5, scheduler='plateau', scheduler_step=5, seed=5, test_file_name='test_data.csv', wandb_name=None)

In [10]:
import torch
import torch.nn as nn
import torch.nn.functional as F 
import numpy as np
import copy
import math

In [11]:
# Run the batch preprocessing on the batch captured above.
inputs = process_batch(batch, args)
# test, question, tag, correct, mask, interaction, gather_index

In [12]:
# `correct` and `gather_index` are not needed for the forward walkthrough.
test, question, tag, _, mask, interaction, _ = inputs

In [13]:
# Read batch size and sequence length from the tensor itself.
batch_size, seq_len = interaction.shape[:2]
batch_size, seq_len

(64, 20)

In [14]:
# Shape of the padding mask.
mask.shape

torch.Size([64, 20])

In [15]:
# Inspect the padding mask values.
mask

tensor([[1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        ...,
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.]])

In [16]:
# Take the max (values, indices) over a trailing singleton axis of the mask.
# The view uses the mask's own dimensions rather than the configured
# batch_size / max_seq_len, so a shorter final batch does not break it.
temp1, temp2 = mask.view(mask.size(0), mask.size(1), -1).max(2)
temp1.shape

torch.Size([64, 20])

# Saint 모델

In [17]:
# One embedding table per categorical feature. Ids were shifted by +1 during
# batch preprocessing (0 marks masked positions), hence the `+ 1` table sizes.
feature_dim = args.hidden_dim // 3
embedding_test = nn.Embedding(args.n_test + 1, feature_dim)
embedding_question = nn.Embedding(args.n_questions + 1, feature_dim)
embedding_tag = nn.Embedding(args.n_tag + 1, feature_dim)

embed_test, embed_question, embed_tag = (
    embedding_test(test),
    embedding_question(question),
    embedding_tag(tag),
)

for embedded in (embed_test, embed_question, embed_tag):
    print(embedded.size())

torch.Size([64, 20, 21])
torch.Size([64, 20, 21])
torch.Size([64, 20, 21])


In [18]:
# Join the three feature embeddings along the last (feature) axis.
encoder_features = (embed_test, embed_question, embed_tag)
embed_enc = torch.cat(encoder_features, dim=2)

embed_enc.size()

torch.Size([64, 20, 63])

In [19]:
# Project the concatenated features (three chunks of hidden_dim // 3) to hidden_dim.
enc_comb_proj = nn.Linear(in_features=3 * (args.hidden_dim // 3),
                          out_features=args.hidden_dim)
embed_enc = enc_comb_proj(embed_enc)
embed_enc.size()

torch.Size([64, 20, 64])

In [20]:
# Inspect the embedded test ids.
embed_test

tensor([[[ 1.5263,  0.9159,  1.7554,  ...,  0.6618, -1.7855,  0.6752],
         [ 1.5263,  0.9159,  1.7554,  ...,  0.6618, -1.7855,  0.6752],
         [ 1.5263,  0.9159,  1.7554,  ...,  0.6618, -1.7855,  0.6752],
         ...,
         [ 0.3263,  0.9922,  1.2015,  ...,  1.0176, -0.6821,  2.5479],
         [ 0.3263,  0.9922,  1.2015,  ...,  1.0176, -0.6821,  2.5479],
         [ 0.3263,  0.9922,  1.2015,  ...,  1.0176, -0.6821,  2.5479]],

        [[-1.0275, -0.2720,  0.1552,  ..., -2.3512,  1.2479,  1.6195],
         [-1.0275, -0.2720,  0.1552,  ..., -2.3512,  1.2479,  1.6195],
         [-1.0275, -0.2720,  0.1552,  ..., -2.3512,  1.2479,  1.6195],
         ...,
         [ 0.7609, -1.5030,  1.5665,  ...,  0.5125, -0.7370, -2.5342],
         [ 0.7609, -1.5030,  1.5665,  ...,  0.5125, -0.7370, -2.5342],
         [ 0.7609, -1.5030,  1.5665,  ...,  0.5125, -0.7370, -2.5342]],

        [[-1.6326,  1.1773, -0.2609,  ...,  1.1532, -1.4429,  1.3231],
         [-1.6326,  1.1773, -0.2609,  ...,  1

In [21]:
# DECODER embedding.
# `interaction` is currently built from `correct`: values 1 and 2 come from
# `correct + 1`, and 0 is padding, so the table needs three rows.
embedding_interaction = nn.Embedding(3, args.hidden_dim // 3)

embed_test, embed_question, embed_tag, embed_interaction = (
    embedding_test(test),
    embedding_question(question),
    embedding_tag(tag),
    embedding_interaction(interaction),
)

for embedded in (embed_test, embed_question, embed_tag, embed_interaction):
    print(embedded.size())

torch.Size([64, 20, 21])
torch.Size([64, 20, 21])
torch.Size([64, 20, 21])
torch.Size([64, 20, 21])


In [22]:
# Inspect the embedded test ids again after the decoder-side lookup.
embed_test

tensor([[[ 1.5263,  0.9159,  1.7554,  ...,  0.6618, -1.7855,  0.6752],
         [ 1.5263,  0.9159,  1.7554,  ...,  0.6618, -1.7855,  0.6752],
         [ 1.5263,  0.9159,  1.7554,  ...,  0.6618, -1.7855,  0.6752],
         ...,
         [ 0.3263,  0.9922,  1.2015,  ...,  1.0176, -0.6821,  2.5479],
         [ 0.3263,  0.9922,  1.2015,  ...,  1.0176, -0.6821,  2.5479],
         [ 0.3263,  0.9922,  1.2015,  ...,  1.0176, -0.6821,  2.5479]],

        [[-1.0275, -0.2720,  0.1552,  ..., -2.3512,  1.2479,  1.6195],
         [-1.0275, -0.2720,  0.1552,  ..., -2.3512,  1.2479,  1.6195],
         [-1.0275, -0.2720,  0.1552,  ..., -2.3512,  1.2479,  1.6195],
         ...,
         [ 0.7609, -1.5030,  1.5665,  ...,  0.5125, -0.7370, -2.5342],
         [ 0.7609, -1.5030,  1.5665,  ...,  0.5125, -0.7370, -2.5342],
         [ 0.7609, -1.5030,  1.5665,  ...,  0.5125, -0.7370, -2.5342]],

        [[-1.6326,  1.1773, -0.2609,  ...,  1.1532, -1.4429,  1.3231],
         [-1.6326,  1.1773, -0.2609,  ...,  1

In [23]:
# Decoder input: the three feature embeddings plus the interaction embedding,
# joined along the last axis.
decoder_features = (embed_test, embed_question, embed_tag, embed_interaction)
embed_dec = torch.cat(decoder_features, dim=2)

# Decoder combination projection: four chunks of hidden_dim // 3 -> hidden_dim.
dec_comb_proj = nn.Linear(in_features=4 * (args.hidden_dim // 3),
                          out_features=args.hidden_dim)

embed_dec = dec_comb_proj(embed_dec)
embed_dec.size()

torch.Size([64, 20, 64])

In [24]:
# Placeholders for the three attention masks; they are filled in later cells.
enc_mask = dec_mask = enc_dec_mask = None

def get_mask(seq_len):
    """Return a ``(seq_len, seq_len)`` additive attention mask.

    Entries strictly above the main diagonal are ``-inf`` and all others are 0.
    The mask is built directly in torch as float32 (the default dtype), so it
    matches float32 activations; the NumPy-based construction produced float64.

    Args:
        seq_len: Side length of the square mask.

    Returns:
        A float tensor of shape ``(seq_len, seq_len)``.
    """
    blocked = torch.full((seq_len, seq_len), float('-inf'))
    # triu keeps the -inf entries above the diagonal and zeroes the rest.
    return torch.triu(blocked, diagonal=1)

In [25]:
# Build the ATTENTION MASK.
# The encoder and decoder masks have the same height and width,
# so there is actually no need to split them into three.

enc_mask = get_mask(seq_len)
print(enc_mask.size())
enc_mask

torch.Size([20, 20])


tensor([[0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., 0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., 0.,

In [26]:
# (Re)build each mask only when it is missing or sized for another sequence length.
if not (enc_mask is not None and enc_mask.size(0) == seq_len):
    enc_mask = get_mask(seq_len)

if not (dec_mask is not None and dec_mask.size(0) == seq_len):
    dec_mask = get_mask(seq_len)

if not (enc_dec_mask is not None and enc_dec_mask.size(0) == seq_len):
    enc_dec_mask = get_mask(seq_len)

In [27]:
# Swap the first two axes: (batch, seq, hidden) -> (seq, batch, hidden).
embed_enc = embed_enc.transpose(0, 1)
embed_dec = embed_dec.transpose(0, 1)

In [28]:
# Confirm both inputs are now laid out sequence-first.
embed_enc.size(), embed_dec.size()

(torch.Size([20, 64, 64]), torch.Size([20, 64, 64]))

In [29]:
# Positional encoding: separate modules for the encoder and decoder inputs,
# both built from the same configuration.
pe_config = (args.hidden_dim, args.drop_out, args.max_seq_len)
pos_encoder = PositionalEncoding(*pe_config)
pos_decoder = PositionalEncoding(*pe_config)

# Apply them (encoder input first, then decoder input).
embed_enc = pos_encoder(embed_enc)
embed_dec = pos_decoder(embed_dec)

embed_enc.size(), embed_dec.size()

(torch.Size([20, 64, 64]), torch.Size([20, 64, 64]))

In [30]:
# Encoder and decoder stacks share depth, width and dropout from the config.
transformer_config = dict(
    d_model=args.hidden_dim,
    nhead=args.n_heads,
    num_encoder_layers=args.n_layers,
    num_decoder_layers=args.n_layers,
    dim_feedforward=args.hidden_dim,
    dropout=args.drop_out,
    activation='relu',
)
transformer = nn.Transformer(**transformer_config)

In [31]:
# The same kind of mask is applied to encoder self-attention, decoder
# self-attention and encoder-decoder attention.
attention_masks = dict(src_mask=enc_mask,
                       tgt_mask=dec_mask,
                       memory_mask=enc_dec_mask)
out = transformer(embed_enc, embed_dec, **attention_masks)

out.size()

torch.Size([20, 64, 64])

In [32]:
# The transformer output is sequence-first: (seq_len, batch, hidden).
# Swap the first two axes before reshaping; a bare `view` to
# (batch, seq_len, hidden) would keep the memory order and mix up samples
# and time steps.
out = out.permute(1, 0, 2).contiguous().view(batch_size, -1, args.hidden_dim)
out.size()

torch.Size([64, 20, 64])

In [33]:
# Output head: one logit per position, with a sigmoid to be applied afterwards.
fc = nn.Linear(in_features=args.hidden_dim, out_features=1)
activation = nn.Sigmoid()

out = fc(out)
out.size()

torch.Size([64, 20, 1])

In [34]:
# Squash the logits to probabilities and drop the trailing singleton axis.
probabilities = activation(out)
preds = probabilities.view(batch_size, -1)
preds.size()

torch.Size([64, 20])


# 클래스로 만들기

In [35]:
# NOTE(review): not referenced anywhere in the visible part of this notebook — confirm it is still needed.
n_rows_per_step = 2

In [36]:

class PositionalEncoding(nn.Module):
    """Sinusoidal positional encoding with a learnable scale, followed by dropout.

    The precomputed table lives in the buffer ``pe`` with shape
    ``(max_len, 1, d_model)``. ``forward`` slices it by ``x.size(0)``, so the
    input is expected with the sequence on dim 0.
    """

    def __init__(self, d_model, dropout=0.1, max_len=1000):
        """
        Args:
            d_model: Feature dimension of the inputs; odd values are supported.
            dropout: Dropout probability applied after adding the encoding.
            max_len: Number of positions to precompute.
        """
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        self.scale = nn.Parameter(torch.ones(1))

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(
            0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # An odd d_model has one fewer odd-indexed column, so the cosine block
        # is trimmed to d_model // 2 columns.
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Add the scaled encoding for the first ``x.size(0)`` positions, then apply dropout."""
        x = x + self.scale * self.pe[:x.size(0), :]
        return self.dropout(x)

class Saint(nn.Module):
    """Encoder-decoder Transformer that outputs one probability per sequence step.

    The encoder input is built from test / question / tag embeddings; the
    decoder input uses the same three embeddings plus an interaction embedding.
    Encoder self-attention, decoder self-attention and encoder-decoder
    attention all use the same upper-triangular ``-inf`` mask.
    """

    def __init__(self, args):
        """Build all layers from ``args``.

        Reads ``device``, ``hidden_dim``, ``n_test``, ``n_questions``,
        ``n_tag``, ``max_seq_len``, ``n_heads`` and ``n_layers``.
        """
        super(Saint, self).__init__()
        self.args = args
        self.device = args.device

        self.hidden_dim = self.args.hidden_dim
        # NOTE(review): dropout is pinned to 0 here, so no configured rate is
        # applied by this class. The disabled line reads `args.dropout`, while
        # the Namespace printed in this notebook exposes `drop_out` — confirm
        # which behavior is intended.
        # self.dropout = self.args.dropout
        self.dropout = 0.

        ### Embedding
        # ENCODER embedding (table sizes are vocabulary size + 1)
        self.embedding_test = nn.Embedding(self.args.n_test + 1, self.hidden_dim//3)
        self.embedding_question = nn.Embedding(self.args.n_questions + 1, self.hidden_dim//3)
        self.embedding_tag = nn.Embedding(self.args.n_tag + 1, self.hidden_dim//3)

        # encoder combination projection
        self.enc_comb_proj = nn.Linear((self.hidden_dim//3)*3, self.hidden_dim)

        # DECODER embedding
        # interaction is currently built from correct: correct (1, 2) + padding (0)
        self.embedding_interaction = nn.Embedding(3, self.hidden_dim//3)

        # decoder combination projection
        self.dec_comb_proj = nn.Linear((self.hidden_dim//3)*4, self.hidden_dim)

        # Positional encoding
        self.pos_encoder = PositionalEncoding(self.hidden_dim, self.dropout, self.args.max_seq_len)
        self.pos_decoder = PositionalEncoding(self.hidden_dim, self.dropout, self.args.max_seq_len)

        self.transformer = nn.Transformer(
            d_model=self.hidden_dim,
            nhead=self.args.n_heads,
            num_encoder_layers=self.args.n_layers,
            num_decoder_layers=self.args.n_layers,
            dim_feedforward=self.hidden_dim,
            dropout=self.dropout,
            activation='relu')

        self.fc = nn.Linear(self.hidden_dim, 1)
        self.activation = nn.Sigmoid()

        # Attention masks are built on first use and rebuilt when seq_len changes.
        self.enc_mask = None
        self.dec_mask = None
        self.enc_dec_mask = None

    def get_mask(self, seq_len):
        """Return a ``(seq_len, seq_len)`` additive attention mask.

        Entries strictly above the main diagonal are ``-inf``, all others 0.
        The mask is float32 (torch's default dtype) so that it matches float32
        activations; the NumPy-based construction produced float64.
        """
        blocked = torch.full((seq_len, seq_len), float('-inf'))
        # triu keeps the -inf entries above the diagonal and zeroes the rest.
        return torch.triu(blocked, diagonal=1)

    def forward(self, input):
        """Compute per-step probabilities.

        Args:
            input: 7-tuple ``(test, question, tag, _, mask, interaction, _)``.
                Only ``test``, ``question``, ``tag`` and ``interaction`` are
                used; ``interaction`` supplies batch size (dim 0) and sequence
                length (dim 1).

        Returns:
            Tensor of shape ``(batch_size, seq_len)`` with sigmoid outputs.
        """
        test, question, tag, _, _mask, interaction, _ = input

        batch_size = interaction.size(0)
        seq_len = interaction.size(1)

        # Embeddings shared by the encoder and decoder inputs; looked up once.
        embed_test = self.embedding_test(test)
        embed_question = self.embedding_question(question)
        embed_tag = self.embedding_tag(tag)

        # ENCODER
        embed_enc = torch.cat([embed_test,
                               embed_question,
                               embed_tag], 2)
        embed_enc = self.enc_comb_proj(embed_enc)

        # DECODER
        embed_interaction = self.embedding_interaction(interaction)
        embed_dec = torch.cat([embed_test,
                               embed_question,
                               embed_tag,
                               embed_interaction], 2)
        embed_dec = self.dec_comb_proj(embed_dec)

        # Build the ATTENTION MASKs.
        # The encoder and decoder masks have the same height and width,
        # so there is actually no need to split them into three.
        if self.enc_mask is None or self.enc_mask.size(0) != seq_len:
            self.enc_mask = self.get_mask(seq_len).to(self.device)

        if self.dec_mask is None or self.dec_mask.size(0) != seq_len:
            self.dec_mask = self.get_mask(seq_len).to(self.device)

        if self.enc_dec_mask is None or self.enc_dec_mask.size(0) != seq_len:
            self.enc_dec_mask = self.get_mask(seq_len).to(self.device)

        # (batch, seq, hidden) -> (seq, batch, hidden)
        embed_enc = embed_enc.permute(1, 0, 2)
        embed_dec = embed_dec.permute(1, 0, 2)

        # Positional encoding
        embed_enc = self.pos_encoder(embed_enc)
        embed_dec = self.pos_decoder(embed_dec)

        out = self.transformer(embed_enc, embed_dec,
                               src_mask=self.enc_mask,
                               tgt_mask=self.dec_mask,
                               memory_mask=self.enc_dec_mask)

        # Back to (batch, seq, hidden) before the per-step output head.
        out = out.permute(1, 0, 2).contiguous()
        out = self.fc(out)

        preds = self.activation(out).view(batch_size, -1)

        return preds

In [37]:
# Inputs and attention masks are placed on args.device, so the model's
# parameters must be moved there as well.
model = Saint(args).to(args.device)

# Smoke test: push a single training batch through the model.
for step, batch in enumerate(train_loader):
    inputs = process_batch(batch, args)
    output = model(inputs)
    print(f"output size : {output.size()}")
    break

output size : torch.Size([64, 20])


In [38]:
# Inspect the per-step outputs produced by the model for the first batch.
output

tensor([[0.2362, 0.2398, 0.2560,  ..., 0.4950, 0.3964, 0.3946],
        [0.3345, 0.1979, 0.2174,  ..., 0.4468, 0.4816, 0.4916],
        [0.2613, 0.2527, 0.3043,  ..., 0.3738, 0.3584, 0.3135],
        ...,
        [0.3734, 0.3503, 0.3339,  ..., 0.3717, 0.3900, 0.3282],
        [0.2320, 0.2227, 0.2540,  ..., 0.4059, 0.4327, 0.3781],
        [0.2846, 0.4026, 0.4221,  ..., 0.4167, 0.3381, 0.4591]],
       grad_fn=<ViewBackward>)