In [1]:
import os
from args_jupyter import parse_args
from dkt.dataloader import Preprocess
from dkt import trainer
import torch
from dkt.utils import setSeeds

In [2]:
import os
import torch
import numpy as np
from glob import glob


from dkt.dataloader import get_loaders
from dkt.optimizer import get_optimizer, get_lr
from dkt.scheduler import get_scheduler
from dkt.criterion import get_criterion
from dkt.metric import get_metric
from dkt.model import LSTM, LSTMATTN, Bert

In [3]:
# Parse the training configuration and apply the configured seed.
args = parse_args(mode='train')
setSeeds(args.seed)

# Prefer CUDA when PyTorch reports it as available, otherwise use the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [4]:
# Batch preprocessing
def process_batch(batch, args):
    """Convert a raw loader batch into the tuple of tensors the models consume.

    Args:
        batch: 5-tuple ``(test, question, tag, correct, mask)`` of tensors.
            The roll along ``dims=1`` and the ``[:, 0]`` indexing below
            require at least two dimensions; the notebook feeds
            ``(batch, seq_len)`` tensors.
        args: configuration object; only ``args.device`` is read.

    Returns:
        ``(test, question, tag, correct, mask, interaction, gather_index)``,
        all moved to ``args.device``. ``test`` / ``question`` / ``tag`` are
        shifted by +1 and zeroed where ``mask`` is 0, so index 0 is reserved
        for padding. ``correct`` and ``mask`` are float tensors.
        ``interaction`` is int64 with 0 = padding / no previous answer and
        ``correct + 1`` of the previous step otherwise. ``gather_index`` has
        shape ``(batch, 1)`` and holds ``(number of non-zero mask entries) - 1``.
    """
    test, question, tag, correct, mask = batch

    # Work in float so the multiplications by the mask below are well defined.
    mask = mask.type(torch.FloatTensor)
    correct = correct.type(torch.FloatTensor)

    # `interaction` is `correct` shifted one step to the right (for SAINT this
    # is the decoder input). 1 is added first so that 0 can mean "padding".
    interaction = (correct + 1).roll(shifts=1, dims=1)
    interaction[:, 0] = 0  # the first step has no previous answer

    # A position keeps its shifted label only when both the position itself and
    # the position before it are unmasked. Masking with `mask` alone lets a
    # padded neighbour's `correct + 1` show up as a real previous answer when
    # padding precedes the sequence.
    previous_valid = mask.roll(shifts=1, dims=1)
    previous_valid[:, 0] = 0
    interaction = (interaction * mask * previous_valid).to(torch.int64)

    # test_id, question_id, tag: shift by one so 0 is free for padding.
    test = ((test + 1) * mask).to(torch.int64)
    question = ((question + 1) * mask).to(torch.int64)
    tag = ((tag + 1) * mask).to(torch.int64)

    # Index used to pick out only the last step of each sequence:
    # count of non-zero mask entries per row, minus one.
    gather_index = (mask != 0).sum(dim=1).view(-1, 1) - 1

    # Move everything to the target device.
    test = test.to(args.device)
    question = question.to(args.device)
    tag = tag.to(args.device)
    correct = correct.to(args.device)
    mask = mask.to(args.device)
    interaction = interaction.to(args.device)
    gather_index = gather_index.to(args.device)

    return (test, question,
            tag, correct, mask,
            interaction, gather_index)

In [5]:
# Load the training file named in the config, then split what was loaded
# into a training part and a validation part.
preprocess = Preprocess(args)
preprocess.load_train_data(args.file_name)
train_data, valid_data = preprocess.split_data(preprocess.get_train_data())

In [6]:
# Build one loader per split from the train / validation data above.
train_loader, valid_loader = get_loaders(args, train_data, valid_data)

In [7]:
# Pull the first batch from the loader and report the shape of each field.
field_labels = ("testID", "assessmentItemId", "knowledgetag", "answercode", "mask")

batch = None
for t in train_loader:
    # each item yielded by train_loader is a tuple of tensors
    for idx, label in enumerate(field_labels):
        print(f"{label} size: {t[idx].size()}")

    # keep this batch for the cells below and stop after the first one
    batch = t
    break

testID size: torch.Size([64, 20])
assessmentItemId size: torch.Size([64, 20])
knowledgetag size: torch.Size([64, 20])
answercode size: torch.Size([64, 20])
mask size: torch.Size([64, 20])


In [8]:
# Display the parsed configuration namespace.
args

Namespace(asset_dir='asset/', batch_size=64, clip_grad=10, data_dir='/opt/ml/input/data/train_dataset', device='cpu', drop_out=0.2, emb_size=100, file_name='train_data.csv', hidden_dim=64, log_steps=50, lr=0.0001, max_seq_len=20, model='lstm', model_dir='models/', model_name='model.pt', n_epochs=20, n_heads=2, n_layers=2, n_questions=9455, n_tag=913, n_test=1538, num_workers=1, optimizer='adam', output_dir='output/', patience=5, scheduler='plateau', seed=5, test_file_name='test_data.csv', wandb_name=None)

In [9]:
import torch
import torch.nn as nn
import torch.nn.functional as F 
import numpy as np
import copy
import math

In [10]:
# Run the sample batch through the preprocessing step.
inputs = process_batch(batch, args)
# inputs holds, in order: test, question, tag, correct, mask, interaction, gather_index

In [11]:
# Unpack the tensors used below; `correct` and `gather_index` are discarded.
test, question, tag, _, mask, interaction, _ = inputs

In [12]:
# Shape of the padding mask for the sample batch.
mask.shape

torch.Size([64, 20])

In [38]:
# Inspect the mask values (1 = kept position, 0 = masked position).
mask

tensor([[1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        ...,
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.]])

In [13]:
# Append a trailing axis to the mask and reduce over it with max, which yields
# (values, indices). The leading sizes are read from the tensor itself rather
# than from args.batch_size / args.max_seq_len, so a smaller final batch does
# not break the reshape.
temp1, temp2 = mask.view(mask.size(0), mask.size(1), -1).max(2)
temp1.shape

torch.Size([64, 20])

In [14]:
# Additive attention mask: insert two singleton axes after the batch axis,
# then map kept positions (1) to 0 and masked positions (0) to -10000.
extended_attention_mask = (
    1.0 - mask[:, None, None, :].to(dtype=torch.float32)
) * -10000.0

# One (empty) head-mask entry per encoder layer.
head_mask = [None for _ in range(args.n_layers)]

In [15]:
# Check the broadcastable mask shape and the number of head-mask entries.
extended_attention_mask.shape, len(head_mask)

(torch.Size([64, 1, 1, 20]), 2)

In [37]:
# Inspect the additive mask values (0 where the mask is 1, -10000 where it is 0).
extended_attention_mask

tensor([[[[-0., -0., -0.,  ..., -0., -0., -0.]]],


        [[[-0., -0., -0.,  ..., -0., -0., -0.]]],


        [[[-0., -0., -0.,  ..., -0., -0., -0.]]],


        ...,


        [[[-0., -0., -0.,  ..., -0., -0., -0.]]],


        [[[-0., -0., -0.,  ..., -0., -0., -0.]]],


        [[[-0., -0., -0.,  ..., -0., -0., -0.]]]])

In [16]:
hidden_dim = args.hidden_dim
n_layers = args.n_layers

# Embeddings
# `interaction` is currently built from `correct`: values 1 and 2 for answers
# plus 0 for padding, hence 3 rows. The other tables get one extra row because
# their ids are shifted by +1 to keep 0 for padding.
# Every feature gets a quarter of the hidden width.
embed_dim = hidden_dim // 4
embedding_interaction = nn.Embedding(3, embed_dim)
embedding_test = nn.Embedding(args.n_test + 1, embed_dim)
embedding_question = nn.Embedding(args.n_questions + 1, embed_dim)
embedding_tag = nn.Embedding(args.n_tag + 1, embed_dim)

embed_interaction = embedding_interaction(interaction)
embed_test = embedding_test(test)
embed_question = embedding_question(question)
embed_tag = embedding_tag(tag)

print(embed_interaction.size())
print(embed_test.size())
print(embed_question.size())
print(embed_tag.size())

torch.Size([64, 20, 16])
torch.Size([64, 20, 16])
torch.Size([64, 20, 16])
torch.Size([64, 20, 16])


In [17]:
# Join the four feature embeddings along axis 2 (the embedding axis).
embed = torch.cat(
    (embed_interaction, embed_test, embed_question, embed_tag),
    dim=2,
)
embed.size()

torch.Size([64, 20, 64])

In [18]:
# Projection applied to the concatenated embeddings:
# a linear layer followed by layer normalisation, both hidden_dim wide.
comb_proj = nn.Sequential(
    nn.Linear(in_features=hidden_dim, out_features=hidden_dim),
    nn.LayerNorm(normalized_shape=hidden_dim),
)

comb_embed = comb_proj(embed)
comb_embed.size()

torch.Size([64, 20, 64])

In [19]:
# Two module paths are tried for the BERT classes; only a failed import should
# trigger the fallback, so catch ImportError instead of everything.
try:
    from transformers.modeling_bert import BertConfig, BertEncoder, BertModel
except ImportError:
    from transformers.models.bert.modeling_bert import BertConfig, BertEncoder, BertModel

# The first positional argument is marked "not used" by the author: only the
# encoder stack is built here, the inputs are embedded by the cells above.
config = BertConfig(3,  # not used
                    hidden_size=hidden_dim,
                    num_hidden_layers=args.n_layers,
                    num_attention_heads=args.n_heads,
                    intermediate_size=args.hidden_dim,
                    hidden_dropout_prob=args.drop_out,
                    attention_probs_dropout_prob=args.drop_out)

encoder = BertEncoder(config)

In [20]:
# Pass the additive mask built earlier so masked positions are ignored by
# attention, the same way Transformer.forward calls the encoder.
encoded_layers = encoder(comb_embed, attention_mask=extended_attention_mask)
# Element 0 is the encoder's output sequence; indexing from the front stays
# correct even if the encoder returns additional elements after it.
sequence_output = encoded_layers[0]
sequence_output.size()

torch.Size([64, 20, 64])

In [21]:
# Keep only the last time step: [64, 20, 64] -> [64, 64].
# NOTE(review): this rebinds `sequence_output`, so re-running the cell slices
# the already-sliced tensor again.
sequence_output = sequence_output[:, -1]
sequence_output.size()

torch.Size([64, 64])

In [22]:
def get_reg():
    """Build the prediction head: Linear -> LayerNorm -> Dropout -> ReLU -> Linear.

    Layer widths and the dropout probability are read from the notebook-level
    ``args`` (``hidden_dim`` and ``drop_out``); the final layer has one output.
    """
    width = args.hidden_dim
    stages = [
        nn.Linear(width, width),
        nn.LayerNorm(width),
        nn.Dropout(args.drop_out),
        nn.ReLU(),
        nn.Linear(width, 1),
    ]
    return nn.Sequential(*stages)

# Instantiate the head once; it is applied to `sequence_output` in the next cell.
reg_layer = get_reg()

In [23]:
# Apply the head to the last-step features: torch.Size([64, 64]) -> [64, 1]
pred_y = reg_layer(sequence_output)
pred_y.size()

torch.Size([64, 1])

### 클래스로 만들기

In [34]:
# riiid rank 7
class Transformer(nn.Module):
    """BERT-encoder sequence model that emits one sigmoid score per position.

    Four categorical inputs (interaction, test, question, tag) are embedded,
    concatenated, projected to ``args.hidden_dim``, passed through a
    ``BertEncoder`` and then through a small feed-forward head.

    Args:
        args: configuration object. Reads ``hidden_dim``, ``n_test``,
            ``n_questions``, ``n_tag``, ``n_layers``, ``n_heads`` and
            ``drop_out``.
    """

    def __init__(self, args):
        super().__init__()
        self.args = args

        # Every feature gets a quarter of the hidden width.
        embed_dim = args.hidden_dim // 4

        # Embeddings
        # `interaction` is currently built from `correct`: values 1 and 2 plus
        # 0 for padding, hence 3 rows. The other tables get one extra row.
        self.embedding_interaction = nn.Embedding(3, embed_dim)
        self.embedding_test = nn.Embedding(args.n_test + 1, embed_dim)
        self.embedding_question = nn.Embedding(args.n_questions + 1, embed_dim)
        self.embedding_tag = nn.Embedding(args.n_tag + 1, embed_dim)

        # Embedding combination projection. The input width is the real width
        # of the concatenation (4 * embed_dim); using hidden_dim directly
        # mismatches whenever hidden_dim is not a multiple of 4.
        self.comb_proj = nn.Sequential(
            nn.Linear(4 * embed_dim, args.hidden_dim),
            nn.LayerNorm(args.hidden_dim),
        )

        config = BertConfig(3,  # not used
                            hidden_size=args.hidden_dim,
                            num_hidden_layers=args.n_layers,
                            num_attention_heads=args.n_heads,
                            intermediate_size=args.hidden_dim,
                            hidden_dropout_prob=args.drop_out,
                            attention_probs_dropout_prob=args.drop_out)

        self.encoder = BertEncoder(config)

        # Prediction head; the final Sigmoid keeps every score in (0, 1).
        self.reg_layer = nn.Sequential(
            nn.Linear(args.hidden_dim, args.hidden_dim),
            nn.LayerNorm(args.hidden_dim),
            nn.Dropout(args.drop_out),
            nn.ReLU(),
            nn.Linear(args.hidden_dim, 1),
            nn.Sigmoid(),
        )

    def forward(self, inputs):
        """Score every position of every sequence in the batch.

        Args:
            inputs: 7-tuple in the order produced by ``process_batch``:
                ``(test, question, tag, correct, mask, interaction,
                gather_index)``. ``correct`` and ``gather_index`` are ignored.

        Returns:
            Tensor with the trailing size-1 axis of the head output removed;
            ``[64, 20]`` for the notebook's ``[64, 20]`` inputs.
        """
        test, question, tag, _, mask, interaction, _ = inputs

        embed = torch.cat(
            [
                self.embedding_interaction(interaction),
                self.embedding_test(test),
                self.embedding_question(question),
                self.embedding_tag(tag),
            ],
            dim=2,
        )
        comb_embed = self.comb_proj(embed)

        # Additive attention mask: 0 where mask == 1, -10000 where mask == 0,
        # with two singleton axes inserted after the batch axis. It follows
        # the activations' dtype so the addition inside attention matches.
        extended_attention_mask = mask.unsqueeze(1).unsqueeze(2)
        extended_attention_mask = extended_attention_mask.to(dtype=comb_embed.dtype)
        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0

        encoded_layers = self.encoder(comb_embed, attention_mask=extended_attention_mask)
        # The result has a single element here, so index 0 and -1 coincide.
        sequence_output = encoded_layers[0]

        # Head output is [..., 1]; drop the trailing axis to get one score per step.
        pred_y = self.reg_layer(sequence_output).squeeze(-1)

        return pred_y

In [35]:
# Smoke test: build the model and push a single training batch through it.
model = Transformer(args)

# NOTE(review): this loop rebinds the notebook-level `batch`, `inputs` and `output`.
for step, batch in enumerate(train_loader):
    inputs = process_batch(batch, args)
    output = model(inputs)
    print(f"output size : {output.size()}")
    # one batch is enough to check the output shape
    break

output size : torch.Size([64, 20])


In [36]:
# Inspect the per-position scores returned by the model for that batch.
output

tensor([[0.5435, 0.5199, 0.5476,  ..., 0.4855, 0.5477, 0.4420],
        [0.5780, 0.4721, 0.3875,  ..., 0.4301, 0.6226, 0.6035],
        [0.5190, 0.5130, 0.4998,  ..., 0.3284, 0.4155, 0.2817],
        ...,
        [0.5990, 0.5744, 0.4462,  ..., 0.5712, 0.4767, 0.4135],
        [0.5131, 0.5915, 0.6935,  ..., 0.5372, 0.5526, 0.4999],
        [0.3874, 0.5622, 0.6337,  ..., 0.6429, 0.5470, 0.4966]],
       grad_fn=<ViewBackward>)