# In [17]:
import pandas as pd
import numpy as np

import urllib.request
import time

import re
import copy
from typing import Optional, Any, Union, Callable


import torch
from torch import nn
from torch import Tensor
from torch import LongTensor

from torch.nn.init import xavier_uniform_
from torch.nn.init import kaiming_uniform_
from torch.nn.init import xavier_normal_
from torch.utils.data import DataLoader

from torch.nn import MultiheadAttention, ModuleList, Dropout, Linear, LayerNorm, functional as F

import torch.optim as optim


import math
import sentencepiece as spm

from torch.optim.lr_scheduler import _LRScheduler

class CosineAnnealingWarmUpRestarts(_LRScheduler):
    """Cosine-annealing learning-rate schedule with linear warm-up and restarts.

    Within one cycle of length ``T_i`` the learning rate of every param group
    rises linearly from its base lr to ``eta_max`` over the first ``T_up``
    steps, then follows half a cosine wave back down to the base lr over the
    remaining ``T_i - T_up`` steps. After each restart the peak is scaled by
    ``gamma``.

    Args:
        optimizer: Wrapped optimizer; its initial lr values act as the minimum lr.
        T_0: Length (in scheduler steps) of the first cycle; positive int.
        T_mult: Integer factor (>= 1) used to lengthen later cycles.
        eta_max: Peak learning rate of the first cycle.
        T_up: Number of linear warm-up steps at the start of each cycle; int >= 0.
        gamma: Multiplicative decay applied to the peak lr at each restart.
        last_epoch: Index of the last step; -1 means a fresh schedule.

    Raises:
        ValueError: If ``T_0``, ``T_mult`` or ``T_up`` fails the checks below.
    """

    def __init__(self, optimizer, T_0, T_mult=1, eta_max=0.1, T_up=0, gamma=1., last_epoch=-1):
        if T_0 <= 0 or not isinstance(T_0, int):
            raise ValueError("Expected positive integer T_0, but got {}".format(T_0))
        if T_mult < 1 or not isinstance(T_mult, int):
            raise ValueError("Expected integer T_mult >= 1, but got {}".format(T_mult))
        # Note: the check accepts T_up == 0 even though the message says "positive".
        if T_up < 0 or not isinstance(T_up, int):
            raise ValueError("Expected positive integer T_up, but got {}".format(T_up))
        self.T_0 = T_0
        self.T_mult = T_mult
        self.base_eta_max = eta_max  # undecayed peak lr
        self.eta_max = eta_max       # peak lr of the current cycle
        self.T_up = T_up
        self.T_i = T_0               # length of the current cycle
        self.gamma = gamma
        self.cycle = 0               # number of restarts so far
        self.T_cur = last_epoch      # position inside the current cycle
        # All schedule state is assigned before the base-class constructor runs.
        # NOTE(review): presumably because the base class may call step()/get_lr()
        # during construction -- confirm for the installed torch version.
        super(CosineAnnealingWarmUpRestarts, self).__init__(optimizer, last_epoch)

    def get_lr(self):
        """Return one learning rate per param group for the current ``T_cur``."""
        if self.T_cur == -1:
            # No step taken yet: keep the optimizer's base learning rates.
            return self.base_lrs
        elif self.T_cur < self.T_up:
            # Warm-up: linear interpolation from base_lr towards eta_max.
            return [(self.eta_max - base_lr)*self.T_cur / self.T_up + base_lr for base_lr in self.base_lrs]
        else:
            # Annealing: half cosine from eta_max (at T_cur == T_up) down to base_lr.
            # NOTE(review): divides by (T_i - T_up); this branch assumes T_i > T_up.
            return [base_lr + (self.eta_max - base_lr) * (1 + math.cos(math.pi * (self.T_cur-self.T_up) / (self.T_i - self.T_up))) / 2
                    for base_lr in self.base_lrs]

    def step(self, epoch=None):
        """Advance the schedule and write the new lr into every param group.

        Args:
            epoch: Optional absolute step index. When omitted the schedule
                advances by one step from its current position.
        """
        if epoch is None:
            epoch = self.last_epoch + 1
            self.T_cur = self.T_cur + 1
            if self.T_cur >= self.T_i:
                # End of cycle: restart and lengthen the annealing part by T_mult.
                self.cycle += 1
                self.T_cur = self.T_cur - self.T_i
                self.T_i = (self.T_i - self.T_up) * self.T_mult + self.T_up
        else:
            # Explicit epoch: recompute cycle index and in-cycle position directly.
            if epoch >= self.T_0:
                if self.T_mult == 1:
                    self.T_cur = epoch % self.T_0
                    self.cycle = epoch // self.T_0
                else:
                    # Cycle lengths form a geometric series; solve for the cycle index n.
                    n = int(math.log((epoch / self.T_0 * (self.T_mult - 1) + 1), self.T_mult))
                    self.cycle = n
                    self.T_cur = epoch - self.T_0 * (self.T_mult ** n - 1) / (self.T_mult - 1)
                    # NOTE(review): this T_i formula does not involve T_up, unlike the
                    # incremental branch above -- confirm which growth rule is intended.
                    self.T_i = self.T_0 * self.T_mult ** (n)
            else:
                # NOTE(review): self.cycle is not reset to 0 in this branch.
                self.T_i = self.T_0
                self.T_cur = epoch

        # Decay the peak lr once per completed cycle.
        self.eta_max = self.base_eta_max * (self.gamma**self.cycle)
        self.last_epoch = math.floor(epoch)
        for param_group, lr in zip(self.optimizer.param_groups, self.get_lr()):
            param_group['lr'] = lr


def preprocessing(data,max_len):
    """Force a list of token ids to exactly ``max_len`` entries, in place.

    Longer lists are truncated and shorter ones are right-padded with 0.
    The original version evaluated ``data[:max_len]`` without using the
    result, so over-long sequences were returned untruncated.

    Args:
        data: Mutable list of token ids; it is modified in place.
        max_len: Target length.

    Returns:
        The same list object, now of length ``max_len``.
    """
    if len(data) > max_len:
        # Truncate in place so both branches keep the same mutate-and-return contract.
        del data[max_len:]
    else:
        data.extend([0] * (max_len - len(data)))

    return data


def tokenize_and_filter(questions, answers):
    """Build next-token-prediction pairs from parallel question/answer lists.

    Each pair is joined as ``question + "[QES]" + answer`` and encoded with
    the module-level SentencePiece processor ``sp``. The encoded ids minus
    the last token form the model input, and the ids minus the first token
    form the shifted target. Both are fitted to ``MAX_LENGTH`` by
    ``preprocessing``.

    Args:
        questions: Iterable of question strings.
        answers: Iterable of answer strings, aligned with ``questions``.

    Returns:
        A list of ``[input_ids, target_ids]`` pairs.
    """
    result = []

    for question, answer in zip(questions, answers):
        # Encode once and slice twice; the original tokenized every sentence twice.
        ids = sp.encode_as_ids(question + "[QES]" + answer)

        # The slices are fresh lists, so the in-place padding cannot alias.
        inputs = preprocessing(ids[:-1], MAX_LENGTH)
        targets = preprocessing(ids[1:], MAX_LENGTH)

        result.append([inputs, targets])

    return result



def collate_fn(batch):
    """Collate ``[input_ids, target_ids]`` samples into two long tensors.

    Args:
        batch: Sequence of samples, each holding two equal-length id lists.

    Returns:
        A tuple ``(inputs, targets)``: index 0 and index 1 of every sample,
        stacked along the batch dimension.
    """
    stacked = torch.Tensor(np.array(batch)).long()

    # Axis 1 distinguishes the input sequence (0) from the shifted target (1).
    return stacked[:, 0], stacked[:, 1]
    
    
    
# Length every token sequence is fitted to before batching.
MAX_LENGTH = 50

# Vocabulary size requested from the SentencePiece trainer (8192).
vocab_size=2**13

# Download the chatbot Q/A corpus and load it into a DataFrame.
urllib.request.urlretrieve("https://raw.githubusercontent.com/songys/Chatbot_data/master/ChatbotData.csv", filename="ChatBotData.csv")
train_data = pd.read_csv('ChatBotData.csv')
# Only displays output in an interactive notebook; no effect when run as a script.
train_data.head()

questions = []
for sentence in train_data['Q']:
    # Surround ? . ! , with spaces so punctuation is split from adjacent words.
    sentence = re.sub(r"([?.!,])", r" \1 ", sentence)
    sentence = sentence.strip()
    questions.append(sentence)

answers = []
for sentence in train_data['A']:
    # Same punctuation spacing as applied to the questions.
    sentence = re.sub(r"([?.!,])", r" \1 ", sentence)
    sentence = sentence.strip()
    answers.append(sentence)

# Print one sample pair (labels are Korean for "data sample", "question", "answer").
print(f"데이터 샘플")
print(f"질문: {questions[0]}")
print(f"대답: {answers[0]}")

# The tokenizer is trained on questions and answers together.
sentences = questions+answers

# Write one sentence per line, the input format handed to the SentencePiece trainer.
with open('subword_train.txt','w',encoding='utf-8') as f:
    for line in sentences:
        f.write(line + '\n')

# Read the file back; `test` is not used anywhere in the visible code.
with open('subword_train.txt','r',encoding='utf-8') as f:
    test = f.read().split('\n')


input_file = 'subword_train.txt'

model_name='subword_tokenizer_kor'

model_type = 'bpe'

# Separator placed between question and answer; registered so it stays a single token.
user_defined_symbols = "[QES]"

input_argument = '--input=%s --model_prefix=%s --vocab_size=%s --model_type=%s --user_defined_symbols=%s'

cmd = input_argument%(input_file, model_name, vocab_size, model_type, user_defined_symbols)

# Train the tokenizer, then load the model file named after model_name.
spm.SentencePieceTrainer.Train(cmd)

sp = spm.SentencePieceProcessor()

sp.Load('subword_tokenizer_kor.model')

# Make every encode call wrap its output in BOS ... EOS.
sp.SetEncodeExtraOptions('bos:eos')


# Run on the GPU when one is available, otherwise on the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Model hyperparameters.
d_hid = 512        # passed to GPT as dim_feedforward
nlayers = 2        # passed to GPT as num_decoder_layers
nhead = 8          # attention heads per layer
dropout = 0.1
d_model = 256      # model width
emb_size = d_model # embedding width, read as a global inside GPT
class_num = 3      # output size of GPT's linear_cls head
batch_size=64

# Special token ids assumed by the masking, loss and decoding code.
# NOTE(review): these are hard-coded rather than read from `sp`; SentencePiece's
# default id 0 is <unk> and padding is disabled unless --pad_id is set -- confirm
# that 0 is really meant to double as PAD and that "[QES]" was assigned id 3.
PAD_IDX, BOS_IDX, EOS_IDX, QES_IDX = 0, 1, 2, 3

# List of [input_ids, target_ids] pairs (the name is reused inside collate_fn for a tensor).
train_iter = tokenize_and_filter(questions,answers)

dataloader = DataLoader(train_iter,batch_size=batch_size,shuffle=True,collate_fn=collate_fn)



class GPT(nn.Module):
    """Decoder-only Transformer with a language-model head and a classification head.

    Token ids are embedded, combined with positional encodings, passed through
    a stack of self-attention decoder layers and projected by two linear heads.

    The embedding and head widths are derived from the ``d_model`` argument.
    The original code read the module-level ``emb_size`` instead, so any
    ``d_model`` different from that global produced a shape mismatch between
    the embedding and the decoder. ``vocab_size`` and ``class_num`` are still
    read from module-level globals.

    Args:
        d_model: Width of embeddings and decoder layers.
        nhead: Number of attention heads.
        num_decoder_layers: Number of stacked decoder layers.
        dim_feedforward: Hidden size of each layer's feed-forward block.
        dropout: Dropout probability.
        activation: Feed-forward activation, as a callable or "relu"/"gelu".
        custom_decoder: Optional replacement for the default decoder stack.
        layer_norm_eps: Epsilon for the LayerNorm modules.
        norm_first: Use pre-norm instead of post-norm residual blocks.
    """

    def __init__(self, d_model: int = 512, nhead: int = 8, num_decoder_layers: int = 6, dim_feedforward: int = 2048,
                 dropout: float = 0.1, activation: Union[str, Callable[[Tensor], Tensor]] = F.relu,
                 custom_decoder: Optional[Any] = None, layer_norm_eps: float = 1e-5,
                 norm_first: bool = False) -> None:

        super(GPT, self).__init__()

        # Sized from d_model so the embedding always matches the decoder width.
        self.tok_emb = TokenEmbedding(vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, dropout=dropout)

        if custom_decoder is not None:
            self.decoder = custom_decoder
        else:
            decoder_layer = TransformerDecoderLayer(d_model, nhead, dim_feedforward, dropout,
                                                    activation, layer_norm_eps, norm_first)
            decoder_norm = LayerNorm(d_model, eps=layer_norm_eps)
            self.decoder = TransformerDecoder(decoder_layer, num_decoder_layers, decoder_norm)

        self.linear_lm = nn.Linear(d_model, vocab_size)
        self.linear_cls = nn.Linear(d_model, class_num)

        # Must run after every submodule is registered so all weights are re-initialised.
        self._reset_parameters()

        self.d_model = d_model
        self.nhead = nhead

    def forward(self, tgt: Tensor, tgt_mask: Optional[Tensor] = None,
                tgt_key_padding_mask: Optional[Tensor] = None) -> "tuple[Tensor, Tensor]":
        """Run the model on a batch of token ids.

        Args:
            tgt: Token ids; the callers in this file pass (seq_len, batch).
            tgt_mask: Optional attention mask forwarded to every decoder layer.
            tgt_key_padding_mask: Optional padding mask forwarded to every layer.

        Returns:
            ``(lm_logits, cls_logits)``: the decoder output projected to
            ``vocab_size`` and to ``class_num`` respectively.
        """
        output = self.positional_encoding(self.tok_emb(tgt))
        output = self.decoder(output, tgt_mask=tgt_mask, tgt_key_padding_mask=tgt_key_padding_mask)
        output1 = self.linear_lm(output)
        output2 = self.linear_cls(output)
        return output1, output2

    def _reset_parameters(self):
        r"""Initiate parameters in the transformer model."""

        # Only matrices (dim > 1) are re-initialised; biases and LayerNorm
        # vectors keep their default initialisation.
        for p in self.parameters():
            if p.dim() > 1:
                xavier_normal_(p)

    def decode(self, tgt: Tensor, tgt_mask: Tensor):
        """Return decoder hidden states for ``tgt`` without applying either head."""
        return self.decoder(self.positional_encoding(self.tok_emb(tgt)),tgt_mask)
                
                
class TransformerDecoder(nn.Module):
    """Stack of identical decoder layers with an optional final normalisation.

    Args:
        decoder_layer: Layer instance that is deep-copied ``num_layers`` times.
        num_layers: Number of layers in the stack.
        norm: Optional module applied to the output of the last layer.
    """

    __constants__ = ['norm']

    def __init__(self, decoder_layer, num_layers, norm=None):
        super().__init__()
        self.layers = _get_clones(decoder_layer, num_layers)
        self.num_layers = num_layers
        self.norm = norm

    def forward(self, tgt: Tensor, tgt_mask: Optional[Tensor] = None, tgt_key_padding_mask: Optional[Tensor] = None) -> Tensor:
        """Feed ``tgt`` through every layer in order, passing both masks to each."""
        hidden = tgt
        for layer in self.layers:
            hidden = layer(hidden, tgt_mask=tgt_mask, tgt_key_padding_mask=tgt_key_padding_mask)

        # The final normalisation is optional.
        return hidden if self.norm is None else self.norm(hidden)
    
    
    
class TransformerDecoderLayer(nn.Module):
    """One decoder block: self-attention followed by a position-wise feed-forward net.

    Each sub-block sits on a residual path. ``norm_first`` selects pre-norm
    (normalise the sub-block input) or post-norm (normalise the residual sum).

    Args:
        d_model: Width of the input and output features.
        nhead: Number of attention heads.
        dim_feedforward: Hidden size of the feed-forward block.
        dropout: Dropout probability used throughout the layer.
        activation: Feed-forward activation, as a callable or "relu"/"gelu".
        layer_norm_eps: Epsilon for both LayerNorm modules.
        norm_first: Use pre-norm when True, post-norm otherwise.
    """

    __constants__ = ['norm_first']

    def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation=F.relu,
                 layer_norm_eps=1e-5, norm_first=False) -> None:
        super().__init__()

        # Attention modules. `multihead_attn` is registered but not referenced
        # by forward(); it is kept so the parameter set stays unchanged.
        self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout)
        self.multihead_attn = MultiheadAttention(d_model, nhead, dropout=dropout)

        # Feed-forward network: d_model -> dim_feedforward -> d_model.
        self.linear1 = Linear(d_model, dim_feedforward)
        self.dropout = Dropout(dropout)
        self.linear2 = Linear(dim_feedforward, d_model)

        self.norm_first = norm_first
        self.norm1 = LayerNorm(d_model, eps=layer_norm_eps)
        self.norm2 = LayerNorm(d_model, eps=layer_norm_eps)

        # Dropout applied to each sub-block output before the residual sum.
        self.dropout1 = Dropout(dropout)
        self.dropout2 = Dropout(dropout)

        # Accept the activation either by name or as a ready callable.
        self.activation = _get_activation_fn(activation) if isinstance(activation, str) else activation

    def __setstate__(self, state):
        # Restored states that lack an activation fall back to ReLU.
        state.setdefault('activation', F.relu)
        super().__setstate__(state)

    def forward(self, tgt: Tensor, tgt_mask: Optional[Tensor] = None, tgt_key_padding_mask: Optional[Tensor] = None) -> Tensor:
        """Apply the self-attention and feed-forward sub-blocks to ``tgt``."""
        if self.norm_first:
            # Pre-norm: normalise before each sub-block, add the raw residual.
            attended = tgt + self._sa_block(self.norm1(tgt), tgt_mask, tgt_key_padding_mask)
            return attended + self._ff_block(self.norm2(attended))

        # Post-norm: normalise the residual sum after each sub-block.
        attended = self.norm1(tgt + self._sa_block(tgt, tgt_mask, tgt_key_padding_mask))
        return self.norm2(attended + self._ff_block(attended))

    def _sa_block(self, x: Tensor, attn_mask: Optional[Tensor], key_padding_mask: Optional[Tensor]) -> Tensor:
        """Self-attention over ``x`` followed by dropout."""
        attn_out, _ = self.self_attn(x, x, x, attn_mask=attn_mask,
                                     key_padding_mask=key_padding_mask, need_weights=False)
        return self.dropout1(attn_out)

    def _ff_block(self, x: Tensor) -> Tensor:
        """Two-layer feed-forward network followed by dropout."""
        hidden = self.activation(self.linear1(x))
        projected = self.linear2(self.dropout(hidden))
        return self.dropout2(projected)




class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to token embeddings, then apply dropout.

    Args:
        emb_size: Embedding width.
        dropout: Dropout probability applied after the addition.
        maxlen: Number of positions precomputed in the table.
    """

    def __init__(self,
                 emb_size: int,
                 dropout: float,
                 maxlen: int = 5000):
        super().__init__()
        # One inverse frequency per sin/cos pair of feature columns.
        inv_freq = torch.exp(- torch.arange(0, emb_size, 2)* math.log(10000) / emb_size)
        positions = torch.arange(0, maxlen).reshape(maxlen, 1)
        angles = positions * inv_freq

        table = torch.zeros((maxlen, emb_size))
        table[:, 0::2] = torch.sin(angles)  # even columns
        table[:, 1::2] = torch.cos(angles)  # odd columns

        self.dropout = nn.Dropout(dropout)
        # Shape (maxlen, 1, emb_size): the singleton axis broadcasts over dim 1 of the input.
        # Stored as a buffer so it follows the module across devices without being trained.
        self.register_buffer('pos_embedding', table.unsqueeze(-2))

    def forward(self, token_embedding: Tensor):
        """Add the encodings for the first ``token_embedding.size(0)`` positions."""
        length = token_embedding.size(0)
        encoded = token_embedding + self.pos_embedding[:length, :]
        return self.dropout(encoded)


class TokenEmbedding(nn.Module):
    """Embedding lookup whose output is scaled by ``sqrt(emb_size)``.

    Args:
        vocab_size: Number of rows in the embedding table.
        emb_size: Width of each embedding vector.
    """

    def __init__(self, vocab_size: int, emb_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.emb_size = emb_size

    def forward(self, tokens: Tensor):
        """Look up ``tokens`` (cast to long) and scale the resulting vectors."""
        scale = math.sqrt(self.emb_size)
        vectors = self.embedding(tokens.long())
        return vectors * scale

    

class CustomSchedule():
    """Warm-up then inverse-square-root learning-rate factor.

    The factor is ``d_model**-0.5 * min(step**-0.5, step * warmup_steps**-1.5)``.

    Args:
        d_model: Model width used to scale the factor.
        warmup_steps: Number of steps over which the factor rises linearly.
    """

    def __init__(self, d_model, warmup_steps=4000):
        self.d_model = d_model
        self.warmup_steps = warmup_steps

    def call(self,epoch):
        """Return the learning-rate factor for step ``epoch``.

        Step 0 returns 0.0, which is the value of the linear warm-up term
        there; the original raised ``ZeroDivisionError`` from ``1/sqrt(0)``.

        Raises:
            ValueError: If ``epoch`` is negative.
        """
        if epoch < 0:
            raise ValueError("epoch must be non-negative, but got {}".format(epoch))
        if epoch == 0:
            return 0.0

        decay = 1 / math.sqrt(epoch)                   # dominates after warm-up
        warmup = epoch * (self.warmup_steps ** -1.5)   # dominates during warm-up

        return 1 / math.sqrt(self.d_model) * min(decay, warmup)

    # Lets an instance be used directly as a callable, e.g. as a LambdaLR lr_lambda.
    __call__ = call
    
    
def _get_clones(module, N):
    return ModuleList([copy.deepcopy(module) for i in range(N)])


def _get_activation_fn(activation):
    if activation == "relu":
        return F.relu
    elif activation == "gelu":
        return F.gelu

    raise RuntimeError("activation should be relu/gelu, not {}".format(activation))        
    
def generate_square_subsequent_mask(sz):
    """Build an additive causal attention mask of shape ``(sz, sz)`` on ``DEVICE``.

    Entries on and below the diagonal are 0.0 and entries above it are
    ``-inf``, so position ``i`` can only attend to positions ``<= i``.
    """
    blocked = torch.full((sz, sz), float('-inf'), dtype=torch.float, device=DEVICE)
    # Keep -inf strictly above the diagonal; triu zeroes everything else.
    return torch.triu(blocked, diagonal=1)


def create_mask(tgt):
    """Return the causal mask and the padding mask for a batch of token ids.

    Args:
        tgt: Token-id tensor whose first dimension is the sequence length.

    Returns:
        ``(tgt_mask, tgt_padding_mask)``: a square causal mask sized by
        ``tgt.shape[0]``, and a boolean tensor (first two dimensions swapped
        relative to ``tgt``) that is True wherever ``tgt`` equals ``PAD_IDX``.
    """
    causal = generate_square_subsequent_mask(tgt.size(0))
    is_pad = tgt.eq(PAD_IDX)
    return causal, is_pad.transpose(0, 1)


    
def greedy_decode(model, ys, max_len):
    """Extend a prompt one token at a time, always taking the most likely token.

    Decoding runs under ``torch.no_grad()``; the original built an autograd
    graph on every step although nothing is back-propagated here.

    Args:
        model: Model exposing ``decode(tokens, mask)`` and a ``linear_lm`` head.
        ys: Prompt token ids of shape (prompt_len, 1). Only a batch of one
            sequence is supported, because the chosen token is read with ``.item()``.
        max_len: At most ``max_len - 1`` new tokens are appended.

    Returns:
        The prompt followed by the generated ids, shape (total_len, 1).
        Generation stops early once ``EOS_IDX`` is produced (it is included).
    """
    ys = ys.to(DEVICE)
    with torch.no_grad():
        for _ in range(max_len - 1):
            # Casting the 0/-inf float mask to bool marks the -inf (future) cells as True.
            tgt_mask = (generate_square_subsequent_mask(ys.size(0))
                        .type(torch.bool)).to(DEVICE)
            out = model.decode(ys, tgt_mask)
            out = out.transpose(0, 1)
            # Only the hidden state of the last position predicts the next token.
            prob = model.linear_lm(out[:, -1])
            _, next_word = torch.max(prob, dim=1)
            next_word = next_word.item()
            ys = torch.cat([ys, torch.ones(1, 1).type_as(ys.data).fill_(next_word)], dim=0).to(DEVICE)
            if next_word == EOS_IDX:
                break
    return ys



def reply(model: torch.nn.Module, src_sentence: str):
    """Generate an answer string for ``src_sentence`` with greedy decoding.

    Fixes relative to the original:
      * The decoded text is returned as-is. ``" ".join(sp.decode(...))``
        joined the *characters* of the decoded string with spaces.
      * A trailing EOS added by the tokenizer is removed from the prompt
        before the separator is appended. Training sequences are encoded as
        one string ``question + "[QES]" + answer``, so no EOS precedes the
        separator there.
      * Generation is capped at ``MAX_LENGTH`` instead of ``vocab_size + 5``,
        which exceeded the 5000 positions held by ``PositionalEncoding``.

    Args:
        model: Trained model; it is switched to eval mode.
        src_sentence: The question text.

    Returns:
        The generated answer text.
    """
    model.eval()

    token = sp.encode_as_ids(src_sentence)
    if token and token[-1] == EOS_IDX:
        token = token[:-1]
    token.append(QES_IDX)

    ys = torch.tensor(token, dtype=torch.long).view(-1, 1)
    tgt_tokens = greedy_decode(model, ys, max_len=MAX_LENGTH).flatten()

    # Keep only the generated part, without the terminating EOS if present.
    generated = tgt_tokens.long().tolist()[len(token):]
    if generated and generated[-1] == EOS_IDX:
        generated = generated[:-1]

    return sp.decode(generated)


# Build the model from the hyperparameters above and move it to the chosen device.
gpt = GPT(d_model = d_model, nhead = nhead, num_decoder_layers = nlayers , dim_feedforward = d_hid, dropout = dropout)
gpt = gpt.to(DEVICE)
# Targets equal to PAD_IDX are excluded from the loss.
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX)
# lr=0 is the scheduler's base lr, i.e. the floor that warm-up starts from.
optimizer = optim.Adam(gpt.parameters(), lr = 0)
# 30-step cycles: 10 warm-up steps up to eta_max, then cosine decay; the peak shrinks by 0.8 per cycle.
scheduler = CosineAnnealingWarmUpRestarts(optimizer, T_0=30, T_mult=1, eta_max=0.008,  T_up=10, gamma=0.8)

def train(model: nn.Module, epoch):
    """Run one pass over the module-level ``dataloader`` and return the mean loss.

    Uses the module-level ``optimizer``, ``loss_fn`` and ``scheduler``; the
    scheduler is stepped once per call, after all batches.

    Args:
        model: Model to optimise; it is switched to training mode.
        epoch: Current epoch number (not used inside the function).

    Returns:
        The average batch loss over the pass.
    """
    model.train()
    running_loss = 0

    for inputs, labels in dataloader:
        # Batches arrive as (batch, seq); the model is fed (seq, batch).
        inputs = inputs.transpose(0, 1).to(DEVICE)
        labels = labels.transpose(0, 1).to(DEVICE)

        causal_mask, padding_mask = create_mask(inputs)
        lm_logits, _ = model(tgt=inputs, tgt_mask=causal_mask, tgt_key_padding_mask=padding_mask)

        optimizer.zero_grad()
        # Flatten so every position counts as one classification example.
        flat_logits = lm_logits.reshape(-1, lm_logits.shape[-1])
        loss = loss_fn(flat_logits, labels.reshape(-1))
        loss.backward()
        running_loss += loss.item()
        optimizer.step()

    scheduler.step()
    return running_loss / len(dataloader)


# Train for a fixed number of epochs, reporting on the first epoch and every 30th.
epochs = 300
for epoch in range(1,epochs+1):
    loss = train(gpt,epoch)
    if epoch%30==0 or epoch==1:

        print("epoch :",epoch , "loss: " ,loss)
        # Sample generation is best-effort: a failure must not stop training,
        # but it is reported instead of being silently discarded. Catching
        # Exception (not a bare except) lets Ctrl-C still interrupt the run.
        try:
            print(reply(gpt, "배고프다"))
        except Exception as err:
            print("reply failed:", repr(err))
            
        