In [1]:
import os
import torch
import torch.nn as nn
import numpy as np
from glob import glob

import sys
sys.path.append('/opt/ml/develop')

In [2]:
from args_jupyter import parse_args
from dkt.dataloader import Preprocess
from dkt import trainer
from dkt.utils import setSeeds
from dkt.dataloader import get_loaders
from dkt.optimizer import get_optimizer, get_lr
from dkt.scheduler import get_scheduler
from dkt.criterion import get_criterion
from dkt.metric import get_metric

In [3]:
# Parse the training configuration and hand its seed to the project's seeding helper.
args = parse_args(mode='train')
setSeeds(args.seed)

# Prefer the GPU when one is visible, otherwise stay on the CPU.
# NOTE(review): `device` is not referenced again in this notebook; the helpers
# below read `args.device` instead — confirm which of the two is intended.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [4]:
# 배치 전처리
def process_batch(batch, args):
    """Turn a raw loader batch into the model's input tuple.

    Args:
        batch: 5-tuple ``(test, question, tag, correct, mask)`` of tensors that
            share the same 2-D shape; ``mask`` is non-zero on real positions.
        args: configuration object; only ``args.device`` is read.

    Returns:
        Tuple ``(test, question, tag, correct, mask, interaction, gather_index)``
        with every tensor moved to ``args.device``. The id tensors and
        ``interaction`` are int64 with 0 at masked positions; ``correct`` and
        ``mask`` are float; ``gather_index`` has one column holding
        (number of non-zero mask entries - 1) for each row.
    """
    test, question, tag, correct, mask = batch

    # Work in float so the mask can be applied by multiplication.
    mask = mask.type(torch.FloatTensor)
    correct = correct.type(torch.FloatTensor)

    # Interaction is (temporarily) `correct` shifted one step to the right;
    # for SAINT this is what feeds the decoder. Adding 1 keeps 0 free for padding.
    shifted = torch.roll(correct + 1, shifts=1, dims=1)
    shifted[:, 0] = 0  # the first step has no previous answer -> padding index
    interaction = (shifted * mask).to(torch.int64)

    def _reserve_padding(ids):
        # Shift ids up by one and zero out masked positions.
        return ((ids + 1) * mask).to(torch.int64)

    test, question, tag = (_reserve_padding(ids) for ids in (test, question, tag))

    # Index used to pick only the last step of each sequence.
    gather_index = (mask != 0).sum(dim=1).view(-1, 1) - 1

    # Move everything to the target device in the order callers unpack it.
    target = args.device
    return tuple(
        tensor.to(target)
        for tensor in (test, question, tag, correct, mask, interaction, gather_index)
    )

In [5]:
preprocess = Preprocess(args)
preprocess.load_train_data(args.file_name)

# Keep the unsplit data under its own name so the split does not overwrite its input.
full_train_data = preprocess.get_train_data()
train_data, valid_data = preprocess.split_data(full_train_data)

In [6]:
# Build the loaders for the train and validation splits via the project helper.
train_loader, valid_loader = get_loaders(args, train_data, valid_data)

In [7]:
# Grab a single batch (None when the loader yields nothing) and report the
# size of each of its first five fields.
batch = next(iter(train_loader), None)
if batch is not None:
    field_names = ("testID", "assessmentItemId", "knowledgetag", "answercode", "mask")
    for field_name, field in zip(field_names, batch):
        print(f"{field_name} size: {field.size()}")

testID size: torch.Size([64, 20])
assessmentItemId size: torch.Size([64, 20])
knowledgetag size: torch.Size([64, 20])
answercode size: torch.Size([64, 20])
mask size: torch.Size([64, 20])


In [8]:
# Display the parsed configuration namespace.
args

Namespace(asset_dir='asset/', batch_size=64, clip_grad=10, data_dir='/opt/ml/input/data/train_dataset', data_id='userID', device='cpu', drop_out=0.2, emb_size=100, file_name='train_data.csv', hidden_dim=64, log_steps=50, lr=0.0001, max_lr=0.0001, max_seq_len=20, min_lr=1e-05, model='lstm', model_dir='models/', model_name='model.pt', n_epochs=20, n_heads=2, n_layers=2, n_questions=9455, n_tag=913, n_test=1538, num_workers=1, optimizer='adam', output_dir='output/', patience=5, scheduler='plateau', scheduler_step=5, seed=5, test_file_name='test_data.csv', wandb_name=None)

In [9]:
import torch
import torch.nn as nn
import torch.nn.functional as F 
import numpy as np
import copy
import math

In [10]:
# Convert the sampled batch into the model's input tuple.
inputs = process_batch(batch, args)
# order: test, question, tag, correct, mask, interaction, gather_index

In [11]:
# Unpack the input tuple; `correct` (4th item) is discarded and gather_index is bound to `index`.
# NOTE(review): the name `mask` is rebound to the attention mask further down.
test, question, tag, _, mask, interaction, index = inputs

In [12]:
# The first two axes of `interaction` are (batch, sequence).
batch_size, seq_len = interaction.shape[:2]
batch_size, seq_len

(64, 20)

In [13]:
# Shape of the float mask returned by process_batch.
mask.shape

torch.Size([64, 20])

In [14]:
# Inspect the mask values themselves.
mask

tensor([[1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        ...,
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.]])

In [15]:
# view() adds a trailing axis and max(2) reduces it again:
# temp1 receives the max values, temp2 the matching indices along that axis.
# NOTE(review): this uses args.batch_size rather than the actual batch size, so a
# smaller final batch would not fit this view().
temp1, temp2 = mask.view(args.batch_size, args.max_seq_len, -1).max(2)
temp1.shape

torch.Size([64, 20])

# Last Query 모델

### Post Padding 사용시
- post padding을 사용할 경우 last query 구현이 조금 까다롭다. Tensor의 shape이 어떻게 변하는지 한 번쯤 직접 따라가 보면서 아래 2가지를 살펴보자.
- 3D tensor에서 원하는 last query 데이터만 가져오기 위한 `gather`와 `index` 사용법
- last query를 위한 3D mask 만들기

In [16]:
class Feed_Forward_block(nn.Module):
    """Two-layer feed-forward network that keeps the feature width unchanged.

    Computes ``out = ReLU(x @ W1 + b1) @ W2 + b2`` where both linear layers map
    ``dim_ff`` features to ``dim_ff`` features.
    """

    def __init__(self, dim_ff):
        super().__init__()
        self.layer1 = nn.Linear(dim_ff, dim_ff)
        self.layer2 = nn.Linear(dim_ff, dim_ff)

    def forward(self, ffn_in):
        activated = F.relu(self.layer1(ffn_in))
        return self.layer2(activated)

In [17]:
# Re-derive (batch, sequence) sizes from `interaction` for the cells below.
batch_size, seq_len = interaction.shape[:2]

In [18]:
# Embedding tables.
# `interaction` is currently built from `correct`: values 1 and 2, with 0 used for padding.
sub_dim = args.hidden_dim // 3
embedding_interaction = nn.Embedding(3, sub_dim)
embedding_test = nn.Embedding(args.n_test + 1, sub_dim)
embedding_question = nn.Embedding(args.n_questions + 1, sub_dim)
embedding_tag = nn.Embedding(args.n_tag + 1, sub_dim)
embedding_position = nn.Embedding(args.max_seq_len, args.hidden_dim)

# Look up every categorical feature.
embed_interaction = embedding_interaction(interaction)
embed_test = embedding_test(test)
embed_question = embedding_question(question)
embed_tag = embedding_tag(tag)

for embedded in (embed_interaction, embed_test, embed_question, embed_tag):
    print(embedded.size())

torch.Size([64, 20, 21])
torch.Size([64, 20, 21])
torch.Size([64, 20, 21])
torch.Size([64, 20, 21])


In [19]:
# Join the four feature embeddings along the last axis.
embed = torch.cat(
    (embed_interaction, embed_test, embed_question, embed_tag),
    dim=2,
)
embed.size()

torch.Size([64, 20, 84])

In [20]:
# Project the concatenated embeddings (4 * hidden_dim//3 features) down to hidden_dim.
# NOTE(review): `embed` is overwritten here, so re-running only this cell feeds the
# already-projected tensor back into comb_proj and fails on the feature size.
comb_proj = nn.Linear((args.hidden_dim//3)*4, args.hidden_dim)
embed = comb_proj(embed)
embed.size()

torch.Size([64, 20, 64])

In [21]:
# The original solution by keetar does not use a positional embedding.
# Feel free to decide whether to use one :)

# self.embedding_position = nn.Embedding(self.args.max_seq_len, self.hidden_dim)

In [22]:
# Encoder
# Separate hidden_dim -> hidden_dim projections for query, key and value.
query = nn.Linear(args.hidden_dim, args.hidden_dim)
key = nn.Linear(args.hidden_dim, args.hidden_dim)
value = nn.Linear(args.hidden_dim, args.hidden_dim)

# Project every time step first; the last-query row is selected in the next cell.
q = query(embed)
q.size()

torch.Size([64, 20, 64])

In [23]:
# 3-D gather: repeat each row's last-step index across the feature axis so that
# gather() along dim 1 pulls out exactly one time step per sequence.
last_step_index = index.repeat(1, args.hidden_dim).unsqueeze(1)
q = torch.gather(q, 1, last_step_index).permute(1, 0, 2)  # swap the first two axes
q.size()

torch.Size([1, 64, 64])

In [24]:
# Keys and values keep every time step; swap the first two axes to match q's layout.
k, v = (proj(embed).permute(1, 0, 2) for proj in (key, value))
k.size(), v.size()

(torch.Size([20, 64, 64]), torch.Size([20, 64, 64]))

In [25]:
def get_mask(seq_len, index, batch_size, n_heads=None):
    """Build the additive attention mask for a single (last) query per sequence.

    The mask has shape ``(batch_size * n_heads, 1, seq_len)`` rather than
    ``(batch_size * n_heads, seq_len, seq_len)`` because only one query
    position (the last one) is attended from, so the query axis has size 1.

    Args:
        seq_len: length of the key/value sequence.
        index: integer tensor with one last-query position per sequence
            (any shape that flattens to ``batch_size`` entries).
        batch_size: number of sequences in the batch.
        n_heads: number of attention heads. Defaults to the notebook-level
            ``args.n_heads`` when omitted, which is what the original did.

    Returns:
        float32 tensor on ``index``'s device: 0.0 where attention is allowed
        (key position <= last-query position) and ``-inf`` elsewhere. Rows for
        the heads of one sequence are adjacent.
    """
    if n_heads is None:
        n_heads = args.n_heads

    # [[1], -> [1, 2, 3]
    #  [2],
    #  [3]]
    index = index.view(-1)

    # Strict upper-triangular matrix; row i marks the key positions after i.
    # It is built directly in torch on index's device so that indexing works on
    # GPU, and in float32 so that it matches the dtype of the attention inputs.
    causal = torch.triu(
        torch.ones(seq_len, seq_len, dtype=torch.float32, device=index.device),
        diagonal=1,
    )

    # Pick, for every sequence, the row that belongs to its last-query position.
    mask = causal[index]

    # Repeat each row once per head: (batch, seq) -> (batch * n_heads, 1, seq).
    mask = mask.repeat(1, n_heads).view(batch_size * n_heads, -1, seq_len)
    return mask.masked_fill(mask == 1, float('-inf'))

In [26]:
# Multi-head attention layer and the feed-forward block applied after it.
attn = nn.MultiheadAttention(embed_dim=args.hidden_dim, num_heads=args.n_heads)
mask = None # not needed for last query, but kept in case the model is modified (this rebinds `mask`)
ffn = Feed_Forward_block(args.hidden_dim) 

In [27]:
# attention
# last query only
# One mask row per (sequence, head) pair; the query axis has size 1.
mask = get_mask(seq_len, index, batch_size)
mask.size()

torch.Size([128, 1, 20])

In [28]:
# Attend from the single last-query row to all keys; the second return value is discarded.
out, _ = attn(q, k, v, attn_mask=mask)
out.size()

torch.Size([1, 64, 64])

In [29]:
ln1 = nn.LayerNorm(args.hidden_dim)
ln2 = nn.LayerNorm(args.hidden_dim)

# Residual + layer norm: after swapping the first two axes back, the attention
# output has a length-1 sequence axis and broadcasts over every step of `embed`.
out = ln1(embed + out.permute(1, 0, 2))
print(out.size())

# Feed-forward network.
out = ffn(out)
print(out.size())

# Second residual (again from `embed`) + layer norm.
out = ln2(embed + out)
print(out.size())

torch.Size([64, 20, 64])
torch.Size([64, 20, 64])
torch.Size([64, 20, 64])


In [30]:
def init_hidden(batch_size):
    """Return zero-filled initial LSTM state ``(h, c)``.

    Both tensors have shape ``(args.n_layers, batch_size, args.hidden_dim)``
    and are created on the default device; ``args`` is the notebook-level
    configuration namespace.
    """
    state_shape = (args.n_layers, batch_size, args.hidden_dim)
    h = torch.zeros(*state_shape)
    c = torch.zeros(*state_shape)
    return (h, c)

In [31]:
# LSTM over the encoder output (batch-first layout).
lstm = nn.LSTM(
    input_size=args.hidden_dim,
    hidden_size=args.hidden_dim,
    num_layers=args.n_layers,
    batch_first=True,
)

hidden = init_hidden(batch_size)
out, hidden = lstm(out, hidden)

# Shapes of the first sequence's outputs and of the final hidden state h.
for shown in (out[0], hidden[0]):
    print(shown.shape)

torch.Size([20, 64])
torch.Size([2, 64, 64])


In [32]:
# Output head: one logit per time step, squashed to (0, 1).
fc = nn.Linear(args.hidden_dim, 1)
activation = nn.Sigmoid()

out = out.reshape(batch_size, -1, args.hidden_dim)
print(out.size())
out = fc(out)
print(out.size())

# Flatten the trailing size-1 axis: (batch, seq, 1) -> (batch, seq).
preds = activation(out).reshape(batch_size, -1)
preds.size()

torch.Size([64, 20, 64])
torch.Size([64, 20, 1])


torch.Size([64, 20])

### Class로 만들기

In [33]:
class Feed_Forward_block(nn.Module):
    """Feed-forward sub-layer: ``out = ReLU(x @ W1 + b1) @ W2 + b2``.

    Both linear layers map ``dim_ff`` input features to ``dim_ff`` output
    features, so the block preserves the shape of its input.
    """

    def __init__(self, dim_ff):
        super().__init__()
        self.layer1 = nn.Linear(dim_ff, dim_ff)
        self.layer2 = nn.Linear(dim_ff, dim_ff)

    def forward(self, ffn_in):
        hidden = self.layer1(ffn_in)
        hidden = F.relu(hidden)
        return self.layer2(hidden)

class LastQuery(nn.Module):
    """Last-query attention encoder followed by an LSTM and a sigmoid head.

    Only the last real position of each sequence (given by the gather index in
    the input tuple) is used as the attention query. Its attention output has
    a length-1 sequence axis and is broadcast back onto every time step by the
    residual connection, after which an LSTM and a linear layer produce one
    probability per time step.

    Args:
        args: configuration namespace. Attributes read by this class:
            ``device``, ``hidden_dim``, ``n_test``, ``n_questions``, ``n_tag``,
            ``max_seq_len``, ``n_heads`` and ``n_layers``.
    """

    def __init__(self, args):
        super(LastQuery, self).__init__()
        self.args = args
        self.device = args.device

        self.hidden_dim = self.args.hidden_dim
        sub_dim = self.hidden_dim // 3

        # Embeddings.
        # `interaction` is built from `correct`: values 1 and 2, with 0 for padding.
        self.embedding_interaction = nn.Embedding(3, sub_dim)
        self.embedding_test = nn.Embedding(self.args.n_test + 1, sub_dim)
        self.embedding_question = nn.Embedding(self.args.n_questions + 1, sub_dim)
        self.embedding_tag = nn.Embedding(self.args.n_tag + 1, sub_dim)
        # Registered but not applied in forward(): the original last-query
        # solution does not use a positional embedding. The layer is kept so
        # existing checkpoints keep loading and so it can be switched on easily.
        self.embedding_position = nn.Embedding(self.args.max_seq_len, self.hidden_dim)

        # Projects the four concatenated embeddings back to hidden_dim.
        self.comb_proj = nn.Linear(sub_dim * 4, self.hidden_dim)

        # Encoder
        self.query = nn.Linear(in_features=self.hidden_dim, out_features=self.hidden_dim)
        self.key = nn.Linear(in_features=self.hidden_dim, out_features=self.hidden_dim)
        self.value = nn.Linear(in_features=self.hidden_dim, out_features=self.hidden_dim)

        self.attn = nn.MultiheadAttention(embed_dim=self.hidden_dim, num_heads=self.args.n_heads)
        self.mask = None  # holds the attention mask of the most recent forward pass
        self.ffn = Feed_Forward_block(self.hidden_dim)

        self.ln1 = nn.LayerNorm(self.hidden_dim)
        self.ln2 = nn.LayerNorm(self.hidden_dim)

        # LSTM
        self.lstm = nn.LSTM(
            self.hidden_dim,
            self.hidden_dim,
            self.args.n_layers,
            batch_first=True)

        # Fully connected layer
        self.fc = nn.Linear(self.hidden_dim, 1)

        self.activation = nn.Sigmoid()

    def get_mask(self, seq_len, index, batch_size):
        """Build the additive attention mask for one (last) query per sequence.

        The mask has shape ``(batch_size * n_heads, 1, seq_len)`` rather than
        ``(batch_size * n_heads, seq_len, seq_len)`` because the query axis of
        a last-query attention has size 1.

        Args:
            seq_len: length of the key/value sequence.
            index: integer tensor with one last-query position per sequence.
            batch_size: number of sequences in the batch.

        Returns:
            float32 tensor on ``index``'s device: 0.0 for key positions up to
            and including the last-query position, ``-inf`` for later ones.
            Rows belonging to the heads of one sequence are adjacent.
        """
        n_heads = self.args.n_heads

        # [[1], -> [1, 2, 3]
        #  [2],
        #  [3]]
        index = index.view(-1)

        # Strict upper-triangular matrix; row i marks key positions after i.
        # It is created in torch on index's device so that indexing also works
        # on GPU, and in float32 rather than numpy's default float64.
        causal = torch.triu(
            torch.ones(seq_len, seq_len, dtype=torch.float32, device=index.device),
            diagonal=1,
        )
        mask = causal[index]

        # Repeat each row once per head: (batch, seq) -> (batch * n_heads, 1, seq).
        mask = mask.repeat(1, n_heads).view(batch_size * n_heads, -1, seq_len)
        return mask.masked_fill(mask == 1, float('-inf'))

    def get_pos(self, seq_len):
        """Return position ids ``[[0, 1, ..., seq_len - 1]]`` for a learned positional embedding."""
        return torch.arange(seq_len).unsqueeze(0)

    def init_hidden(self, batch_size):
        """Return zero-filled LSTM state ``(h, c)`` on ``self.device``.

        Each tensor has shape ``(n_layers, batch_size, hidden_dim)``.
        """
        state_shape = (self.args.n_layers, batch_size, self.args.hidden_dim)
        h = torch.zeros(*state_shape, device=self.device)
        c = torch.zeros(*state_shape, device=self.device)
        return (h, c)

    def forward(self, input):
        """Compute one probability per time step.

        Args:
            input: 7-tuple ``(test, question, tag, correct, mask, interaction,
                index)``; ``correct`` and ``mask`` are not used here. ``index``
                holds, per sequence, the position used as the single query.

        Returns:
            Tensor of shape ``(batch_size, seq_len)`` with sigmoid outputs.
        """
        test, question, tag, _, _mask, interaction, index = input
        batch_size = interaction.size(0)
        seq_len = interaction.size(1)

        # Embed every categorical feature and merge them into hidden_dim features.
        embed = torch.cat(
            [
                self.embedding_interaction(interaction),
                self.embedding_test(test),
                self.embedding_question(question),
                self.embedding_tag(tag),
            ],
            2,
        )
        embed = self.comb_proj(embed)

        # Positional embedding is intentionally not applied for last query.
        # To enable it:
        #   position = self.get_pos(seq_len).to(embed.device)
        #   embed = embed + self.embedding_position(position)

        ####################### ENCODER #####################
        q = self.query(embed)

        # 3-D gather: repeat each row's index across the feature axis so that
        # gather() along dim 1 selects exactly one time step per sequence.
        q = torch.gather(q, 1, index.repeat(1, self.hidden_dim).unsqueeze(1))
        q = q.permute(1, 0, 2)

        k = self.key(embed).permute(1, 0, 2)
        v = self.value(embed).permute(1, 0, 2)

        ## attention (last query only)
        # The mask is created on index's device; its dtype is aligned with q.
        self.mask = self.get_mask(seq_len, index, batch_size).to(dtype=q.dtype)
        out, _ = self.attn(q, k, v, attn_mask=self.mask)

        ## residual + layer norm (the length-1 sequence axis broadcasts over embed)
        out = out.permute(1, 0, 2)
        out = self.ln1(embed + out)

        ## feed forward network
        out = self.ffn(out)

        ## residual + layer norm
        out = self.ln2(embed + out)

        ###################### LSTM #####################
        hidden = self.init_hidden(batch_size)
        out, hidden = self.lstm(out, hidden)

        ###################### DNN #####################
        out = out.reshape(batch_size, -1, self.hidden_dim)
        out = self.fc(out)

        preds = self.activation(out).view(batch_size, -1)

        return preds

In [34]:
model = LastQuery(args)

# Push a single batch through the model to check the output shape.
first_batch = next(iter(train_loader), None)
if first_batch is not None:
    step, batch = 0, first_batch
    inputs = process_batch(batch, args)
    output = model(inputs)
    print(f"output size : {output.size()}")

output size : torch.Size([64, 20])


In [37]:
# Inspect the per-step sigmoid outputs of the (untrained) model.
output

tensor([[0.4927, 0.4948, 0.4984,  ..., 0.5102, 0.5111, 0.5109],
        [0.4928, 0.4939, 0.4935,  ..., 0.4834, 0.4860, 0.4862],
        [0.4887, 0.4880, 0.4867,  ..., 0.4905, 0.4858, 0.4844],
        ...,
        [0.4900, 0.4838, 0.4813,  ..., 0.4942, 0.4984, 0.5012],
        [0.4880, 0.4864, 0.4850,  ..., 0.4905, 0.4940, 0.5000],
        [0.4897, 0.4874, 0.4914,  ..., 0.4937, 0.4915, 0.4902]],
       grad_fn=<ViewBackward>)