In [None]:
!nvidia-smi

Tue Jun  8 03:45:19 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.27       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   44C    P0    29W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
%%capture
!pip install transformers
!pip install einops

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%%capture
%cd '/content/drive/MyDrive'
!ls

In [None]:
import torch
from transformers import AutoModel, AutoTokenizer
from transformers import RobertaTokenizerFast
import os
import torch
from torch.utils.data.dataset import Dataset
from transformers.tokenization_utils import PreTrainedTokenizer
from filelock import FileLock
from transformers.utils import logging
from typing import Dict, List, Optional
import pickle
import random
import time
from torch.utils.data import DataLoader
import torch.nn.functional as F
import torch.nn as nn
import torch.optim as optim
from pathlib import Path
import numpy as np
from einops import rearrange
import math

In [None]:
class ScaleDotProductAttention(nn.Module):
    """Scaled dot-product attention over per-head query/key/value tensors.

    Computes ``softmax(q @ k^T / sqrt(d)) @ v`` where ``d`` is the size of the
    last dimension of ``k``.
    """

    def __init__(self):
        super(ScaleDotProductAttention, self).__init__()
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, q, k, v, mask=None, e=1e-12):
        """Apply attention.

        Args:
            q, k, v: 4-D tensors laid out as (batch, head, length, d), as
                required by the einsum pattern below.
            mask: optional tensor broadcastable to the (batch, head, length,
                length) score tensor; positions where ``mask == 0`` are
                excluded from attention.
            e: no longer used. Kept only so existing callers that pass it keep
                working.

        Returns:
            Tuple ``(output, attention_weights)``.
        """
        d_tensor = k.size(-1)

        score = torch.einsum("bhid,bhjd->bhij", q, k)
        score = score / math.sqrt(d_tensor)

        if mask is not None:
            # Masked scores must be pushed to a very large negative value so
            # that softmax assigns them ~0 weight. The previous fill value
            # (-1e-12) is practically zero and left masked positions fully
            # attended. The dtype minimum is used instead of -inf so a row
            # that is masked everywhere does not turn into NaN.
            score = score.masked_fill(mask == 0, torch.finfo(score.dtype).min)

        score = self.softmax(score)

        v = score @ v

        return v, score

In [None]:
class MultiHeadAttention(nn.Module):
    """Multi-head self-attention built on ``ScaleDotProductAttention``.

    Each of the ``n_head`` heads works on a ``d_model``-wide slice: the
    query/key/value projections map ``d_model -> d_model * n_head`` and the
    final linear layer maps the concatenated heads back to ``d_model``.
    """

    def __init__(self, d_model, n_head):
        super(MultiHeadAttention, self).__init__()
        self.n_head = n_head
        self.attention = ScaleDotProductAttention()
        inner_dim = d_model * n_head
        self.w_q = nn.Linear(d_model, inner_dim)
        self.w_k = nn.Linear(d_model, inner_dim)
        self.w_v = nn.Linear(d_model, inner_dim)
        self.w_concat = nn.Linear(inner_dim, d_model)

    def _split_heads(self, t):
        """Reshape (batch, length, heads * d) into (batch, heads, length, d)."""
        return rearrange(t, 'b n (h d) -> b h n d', h=self.n_head)

    def forward(self, x, mask=None):
        """Self-attend over ``x`` and return a tensor with the same last-dim width as the input."""
        queries = self._split_heads(self.w_q(x))
        keys = self._split_heads(self.w_k(x))
        values = self._split_heads(self.w_v(x))

        # The attention weights are returned too but are not needed here.
        attended, _ = self.attention(queries, keys, values, mask=mask)

        # Merge the heads back into one feature axis, then project to d_model.
        merged = rearrange(attended, 'b h n d -> b n (h d)')
        return self.w_concat(merged)

In [None]:
class SelfAttentionLstm(nn.Module):
    """Multi-head self-attention followed by an LSTM; yields one vector per sequence.

    Args:
        input_size: feature width of the incoming sequence (also the
            attention ``d_model`` and the LSTM input size).
        hidden_size: LSTM hidden width, i.e. the width of the returned vector.
        num_layers: number of stacked LSTM layers.
        n_head: number of attention heads.
    """

    def __init__(self, input_size, hidden_size, num_layers,n_head):
        super(SelfAttentionLstm, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        # Honour the caller's n_head instead of a hard-coded 4.
        self.multi_attention = MultiHeadAttention(d_model=input_size, n_head=n_head)
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)

    def forward(self, x):
        """Encode ``x`` of shape (batch, length, input_size) into (batch, hidden_size)."""
        x = self.multi_attention(x)

        # nn.LSTM starts from all-zero hidden and cell states when none are
        # given, created on the input's own device. This replaces the
        # explicit zeros that were pinned to "cuda" and crashed on CPU.
        out, _ = self.lstm(x)

        # Keep only the output of the last time step.
        return out[:, -1, :]

In [None]:
# Text files with the training and validation poems. The paths are relative
# to the current working directory.
train_path = 'Data/Poem/train_dataset_27_04.txt'
test_path = 'Data/Poem/valid_dataset_27_04.txt'

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the tokenizer stored under ./Tokenizer_27_04 and register the newline
# character as an additional token.
tokenizer = RobertaTokenizerFast.from_pretrained("./Tokenizer_27_04", max_len=512)
tokenizer.add_tokens('\n')
# The vocabulary size is bumped by one for the added '\n' token; later cells
# treat id `vocab_size - 1` as the newline token.
# NOTE(review): presumably `tokenizer.vocab_size` excludes added tokens and
# '\n' was not already in the vocabulary - confirm.
vocab_size= tokenizer.vocab_size
vocab_size = vocab_size + 1

In [None]:
vocab_size

14673

In [None]:
def add_padding(list_token: list, block_size: int, pad_token_id: int = 1):
    """Return a new list of exactly ``block_size`` token ids.

    Shorter inputs are right-padded with ``pad_token_id``. Longer inputs are
    truncated to their first ``block_size`` ids, so every result has the same
    length and can be batched. Previously an over-long input was returned
    un-truncated.

    Args:
        list_token: token ids to pad; not modified.
        block_size: target length of the returned list.
        pad_token_id: id used for padding (defaults to 1, the value that was
            hard-coded before).

    Returns:
        A new list of length ``block_size``.
    """
    clipped = list(list_token[:block_size])
    return clipped + [pad_token_id] * (block_size - len(clipped))

In [None]:
logger = logging.get_logger(__name__)
class CusTextDataset(Dataset):
    """Fixed-length token-id examples built from a poem text file.

    The file is split on blank lines (``"\\n\\n"``). Every
    ``CHUNKS_PER_EXAMPLE`` consecutive chunks are joined back into one text,
    tokenized, kept only if the token count lies in
    ``[MIN_TOKENS, min(MAX_TOKENS, block_size)]``, and padded to
    ``block_size`` with ``add_padding``. The resulting list is cached as a
    pickle next to the input file (or in ``cache_dir``), guarded by a file
    lock.

    Args:
        tokenizer: tokenizer whose ``encode`` turns text into token ids.
        file_path: path of the UTF-8 text file to read.
        block_size: length every example is padded to.
        overwrite_cache: rebuild the examples even if a cache file exists.
        cache_dir: directory for the cache file; defaults to the directory of
            ``file_path``.
    """

    # Number of blank-line-separated chunks joined into one example.
    CHUNKS_PER_EXAMPLE = 4
    # Inclusive token-count window an example must fall into to be kept.
    MIN_TOKENS = 129
    MAX_TOKENS = 140

    def __init__(
        self,
        tokenizer: PreTrainedTokenizer,
        file_path: str,
        block_size: int,
        overwrite_cache=False,
        cache_dir: Optional[str] = None,
    ):
        assert os.path.isfile(file_path), f"Input file path {file_path} not found"

        directory, filename = os.path.split(file_path)
        cached_features_file = os.path.join(
            cache_dir if cache_dir is not None else directory,
            "cached_lm_{}_{}_{}".format(
                tokenizer.__class__.__name__,
                str(block_size),
                filename,
            ),
        )
        lock_path = cached_features_file + ".lock"
        with FileLock(lock_path):

            if os.path.exists(cached_features_file) and not overwrite_cache:
                start = time.time()
                # NOTE(security): pickle.load can execute arbitrary code, so
                # only load cache files that this code wrote itself.
                with open(cached_features_file, "rb") as handle:
                    self.examples = pickle.load(handle)
                # Lazy %-style arguments only: a '%' inside the path can no
                # longer corrupt the log format string.
                logger.info(
                    "Loading features from cached file %s [took %.3f s]",
                    cached_features_file,
                    time.time() - start,
                )

            else:
                logger.info("Creating features from dataset file at %s", directory)

                with open(file_path, encoding="utf-8") as f:
                    total_poem = f.read()

                chunks = total_poem.split("\n\n")
                step = self.CHUNKS_PER_EXAMPLE
                canto_poem = [
                    "\n\n".join(chunks[x:x + step])
                    for x in range(0, len(chunks), step)
                ]

                # Never keep an example longer than block_size, otherwise the
                # padded examples would not all share one length.
                max_tokens = min(self.MAX_TOKENS, block_size)

                self.examples = []
                for text in canto_poem:
                    token_ids = tokenizer.encode(text)
                    if self.MIN_TOKENS <= len(token_ids) <= max_tokens:
                        self.examples.append(add_padding(token_ids, block_size=block_size))

                start = time.time()
                with open(cached_features_file, "wb") as handle:
                    pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)
                logger.info(
                    "Saving features into cached file %s [took %.3f s]", cached_features_file, time.time() - start
                )

    def __len__(self):
        """Number of examples."""
        return len(self.examples)

    def __getitem__(self, i) -> torch.Tensor:
        """Return example ``i`` as a ``torch.long`` tensor of token ids."""
        return torch.tensor(self.examples[i], dtype=torch.long)

In [None]:
def load_dataset(train_path, test_path, tokenizer, block_size=140):
    """Build the training and evaluation ``CusTextDataset`` objects.

    Args:
        train_path: path of the training text file.
        test_path: path of the evaluation text file.
        tokenizer: tokenizer handed to both datasets.
        block_size: padded example length used for both datasets (defaults to
            140, the value that was hard-coded before).

    Returns:
        Tuple ``(train_dataset, test_dataset)``.
    """
    train_dataset = CusTextDataset(
        tokenizer=tokenizer,
        file_path=train_path,
        block_size=block_size,
    )
    test_dataset = CusTextDataset(
        tokenizer=tokenizer,
        file_path=test_path,
        block_size=block_size,
    )
    return train_dataset, test_dataset

train_dataset,test_dataset = load_dataset(train_path,test_path, tokenizer)

#Initialize Model

In [None]:
# Batches of 8 examples; both loaders iterate in dataset order (shuffle=False).
# NOTE(review): the training data is not shuffled between epochs - confirm
# this is intentional.
train_loader = DataLoader(dataset=train_dataset, batch_size=8, shuffle=False)
test_loader = DataLoader(dataset= test_dataset, batch_size= 8, shuffle=False)

In [None]:
from transformers import Trainer, TrainingArguments, GPT2Config, GPT2LMHeadModel,GPT2Model

In [None]:
# Build a GPT-2 language model with 8 transformer layers whose vocabulary
# matches `vocab_size`.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
configuration = GPT2Config(vocab_size=vocab_size,n_layer = 8)
# Place the model on the detected device rather than a hard-coded "cuda", so
# the cell also runs on CPU-only runtimes.
poem = GPT2LMHeadModel(configuration).to(device)

#Train GPT-2


In [None]:
# Optimisation setup: learning rate, cross-entropy loss and an Adam optimiser.
lr_rate = 3e-5
criterion = nn.CrossEntropyLoss().to(device)
# Only the parameters of `poem` are handed to the optimiser.
optimizer = optim.Adam(poem.parameters(), lr_rate)

In [None]:
def save_checkpoint(state, filename= "GPT-2/gpt_2_custom_loss_v2.pth.tar"):
    """Serialize ``state`` to ``filename`` with ``torch.save``.

    Args:
        state: object to save (the training loop passes a dict with the model
            and optimizer state dicts).
        filename: destination path. Missing parent directories are created
            first, so a long training run is not lost to a missing folder.
    """
    print("Saving checkpoint")
    # torch.save also accepts file-like objects; only path-like targets have
    # a parent directory to create.
    if isinstance(filename, (str, os.PathLike)):
        target_dir = os.path.dirname(os.fspath(filename))
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)
    torch.save(state, filename)

def load_checkpoint(state):
    """Restore the module-level ``poem`` model and ``optimizer`` from ``state``.

    ``state`` is a mapping with a ``'state_dict'`` entry (model weights) and
    an ``'optimizer'`` entry (optimizer state). The model is restored first.
    """
    print("Load checkpoint")
    for target, key in ((poem, 'state_dict'), (optimizer, 'optimizer')):
        target.load_state_dict(state[key])


In [None]:
# map_location remaps tensors that were saved from a GPU onto the current
# device, so the checkpoint also loads on a CPU-only runtime.
load_checkpoint(torch.load("GPT-2/gpt_2_27_04_dataset.pth.tar", map_location=device))

Load checkpoint


In [None]:
# Auxiliary attention+LSTM head applied to GPT-2 hidden states by the custom
# loss. It is placed on the detected `device` instead of a hard-coded 'cuda'.
head_gpt = SelfAttentionLstm(input_size=768,hidden_size=800, num_layers=2,n_head=4).to(device)

In [None]:
def custom_index(list_token:list):
    """Collapse consecutive groups of four entries into ``[first, fourth]`` pairs.

    ``[a0, a1, a2, a3, b0, b1, b2, b3]`` becomes ``[[a0, a3], [b0, b3]]``.
    A trailing group with fewer than four entries raises ``IndexError``.
    """
    return [
        [list_token[start], list_token[start + 3]]
        for start in range(0, len(list_token), 4)
    ]

In [None]:
def get_idx_two_line(lm_logits):
    """Locate the two-line halves of every four-line stanza in the predicted tokens.

    The arg-max token sequence of the first batch element is split into
    stanzas at every double newline (the newline id is ``vocab_size - 1``).
    A stanza is kept only if it contains exactly three newline tokens. For
    each kept stanza the result holds two ``[start, end]`` index pairs: the
    span of its first two lines and the span of its last two lines.

    Args:
        lm_logits: logits tensor indexed as (batch, position, vocab).

    Returns:
        A list with one ``[[start, end], [start, end]]`` entry per kept stanza.
    """
    newline_id = vocab_size - 1
    token = torch.argmax(lm_logits, dim=2)[0].tolist()

    # Stanza boundaries: 0, then (i, i + 2) around every double newline
    # found at position i, then the sequence length.
    boundaries = [0]
    for pos in range(len(token)):
        if token[pos:pos + 2] == [newline_id, newline_id]:
            boundaries.extend((pos, pos + 2))
    boundaries.append(len(token))

    # Start and end index of each stanza.
    stanza_spans = [boundaries[j:j + 2] for j in range(0, len(boundaries), 2)]
    stanza_spans = [span for span in stanza_spans if len(span) == 2]

    # Within each stanza, record the indices around every newline so that
    # custom_index can pick the first and last index of each pair of lines.
    token_final = []
    for start, end in stanza_spans:
        marks = [start]
        for offset, tok in enumerate(token[start:end]):
            if tok == newline_id:
                marks.extend((start + offset, start + offset + 1))
        marks.append(end)
        # Exactly three newlines (four lines) produce eight marks.
        if len(marks) == 8:
            token_final.append(custom_index(marks))

    return token_final

In [None]:
def loss_kho_tho(lm_logits,embedding):
    """Stanza-level auxiliary loss for a single sequence.

    For every stanza found by ``get_idx_two_line`` the hidden states of its
    two halves are each encoded with ``head_gpt``, and the mean-squared
    error between the two encodings is accumulated.

    Args:
        lm_logits: logits of one sequence, without the batch axis.
        embedding: hidden states of the same sequence, without the batch axis.

    Returns:
        The summed MSE (a tensor), or the integer ``0`` when no stanza
        qualifies.
    """
    # Both helpers expect a leading batch axis.
    batched_logits = torch.unsqueeze(lm_logits, 0)
    pair_list = get_idx_two_line(batched_logits)
    batched_embedding = torch.unsqueeze(embedding, 0)

    mse = nn.MSELoss().to(device)
    total_lost = 0
    for first_span, second_span in pair_list:
        if first_span is None or second_span is None:
            continue

        first_repr = head_gpt(batched_embedding[:, first_span[0]:first_span[1], :])
        second_repr = head_gpt(batched_embedding[:, second_span[0]:second_span[1], :])

        total_lost += mse(first_repr, second_repr)

    return total_lost

In [None]:
# Make sure every parameter of both networks takes part in backpropagation.
# The tensor attribute is `requires_grad`; assigning to the misspelled
# `require_grad` only created an unused Python attribute.
# NOTE(review): the optimizer is built from `poem.parameters()` only, so
# `head_gpt` receives gradients but its weights are never updated - confirm
# whether that is intended.
for param in head_gpt.parameters():
    param.requires_grad = True

for param in poem.parameters():
    param.requires_grad = True

In [None]:
 def train(model, train_loader, device, criterion, vocab_size, optimizer,num_epochs,
           stanza_loss_weight=5, detect_anomaly=False):
     """Train ``model`` with next-token loss plus the stanza-level auxiliary loss.

     For every batch the loss is ``criterion`` on the shifted logits plus
     ``stanza_loss_weight`` times the sum of ``loss_kho_tho`` over the
     sequences of the batch. After each epoch a checkpoint with the model
     and optimizer state dicts is written via ``save_checkpoint``.

     Args:
         model: GPT-2 LM-head model; called with ``output_hidden_states=True``.
         train_loader: iterable yielding batches of token-id tensors.
         device: device the batches are moved to.
         criterion: token-level loss taking (logits, targets).
         vocab_size: size of the logits' last dimension.
         optimizer: optimizer stepped once per batch.
         num_epochs: number of passes over ``train_loader``.
         stanza_loss_weight: multiplier of the auxiliary loss (defaults to 5,
             the value that was hard-coded before).
         detect_anomaly: turn on autograd anomaly detection. It slows
             training down and is meant for debugging, so it is now off
             unless requested.
     """
     list_loss = []

     model.train()
     if detect_anomaly:
         torch.autograd.set_detect_anomaly(True)

     for index_epoch in range(num_epochs):
         losses = 0
         for i, batch in enumerate(train_loader):
             inputs = batch.to(device)
             targets = inputs[:, 1:].contiguous()

             # A single forward pass provides both the logits and the final
             # hidden states. Previously the model was run twice, doubling
             # the cost and using different dropout masks for the two.
             outputs = model(inputs, output_hidden_states=True)
             lm_logits = outputs.logits
             embedding = outputs.hidden_states[-1]
             lm_logits_1 = lm_logits[:, :-1].contiguous()

             loss = criterion(lm_logits_1.view(-1, vocab_size), targets.view(-1))
             stanza_loss = sum(
                 loss_kho_tho(lm_logits[row], embedding[row])
                 for row in range(lm_logits.shape[0])
             )
             loss = loss + stanza_loss * stanza_loss_weight

             # Log the plain float rather than the tensor repr.
             loss_value = loss.item()
             print('index: {}, loss: {}'.format(i, loss_value))
             losses += loss_value

             optimizer.zero_grad()
             loss.backward()
             optimizer.step()

         list_loss.append(losses)
         print('='*20)
         print('epoch: {}'.format(index_epoch))

         checkpoint = {'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict()}
         save_checkpoint(checkpoint, filename= "GPT-2/gpt_2_custom_loss_v2.pth.tar")
         print('Loss: {}'.format(list_loss))
      
train(poem, train_loader, device, criterion, vocab_size, optimizer, num_epochs= 100)

In [None]:
class TextGenerator():
    """Autoregressive top-k text generator around a causal language model.

    Args:
        max_tokens: number of tokens to generate.
        start_tokens: prompt token ids.
        maxlen: length of the (right-padded) model input window.
        model: ``nn.Module`` whose call returns an object with ``.logits``
            indexed as (batch, position, vocab).
        tokenizer: object whose ``decode`` turns token ids into text.
        device: device on which the input tensor is created.
        topk: number of highest-scoring candidates to sample from.
    """

    def __init__(self, max_tokens, start_tokens, maxlen, model, tokenizer,device, topk):
        self.max_tokens = max_tokens
        self.start_tokens = start_tokens
        self.maxlen = maxlen
        self.model = model
        self.tokenizer = tokenizer
        self.device = device
        self.k = topk

    def sample_from(self, logits):
        """Draw one token id from the ``k`` best logits, weighted by softmax probability.

        The previous version picked uniformly among the top-k candidates,
        ignoring their scores.
        """
        top_logits, indices = torch.topk(logits, k=self.k, sorted=True)
        probs = torch.softmax(top_logits.detach().cpu().double(), dim=-1).numpy()
        # Guard against rounding so numpy accepts the distribution.
        probs = probs / probs.sum()
        return int(np.random.choice(indices.cpu().numpy(), p=probs))

    def gen_poem(self):
        """Generate ``max_tokens`` tokens after the prompt and print the decoded text."""
        context = list(self.start_tokens)
        tokens_generated = []

        # Generate in eval mode without autograd, so dropout is off and no
        # graph is built. The previous train/eval mode is restored afterwards.
        was_training = self.model.training
        self.model.eval()
        try:
            with torch.no_grad():
                # Strict '<' yields exactly max_tokens tokens; the previous
                # '<=' produced one extra.
                while len(tokens_generated) < self.max_tokens:
                    # Once the context outgrows the window keep the most
                    # recent tokens. Keeping the first maxlen tokens froze
                    # the model input from that point on.
                    if len(context) > self.maxlen:
                        window = context[-self.maxlen:]
                    else:
                        window = context
                    sample_index = len(window) - 1
                    pad_len = self.maxlen - len(window)
                    x = torch.tensor([window + [0] * pad_len], device=self.device)
                    y = self.model(x).logits
                    sample_token = self.sample_from(y[0][sample_index])
                    tokens_generated.append(sample_token)
                    context.append(sample_token)
        finally:
            self.model.train(was_training)

        output_token = list(self.start_tokens) + tokens_generated
        poem = self.tokenizer.decode(output_token)
        print(f"generated text:\n{poem}\n")

In [None]:
# Generate a short continuation of a Vietnamese prompt with the trained model.
num_token_generated = 30
hint = 'mùa thu'
# Drop the last id returned by the tokenizer.
# NOTE(review): presumably that is the end-of-sequence marker appended by
# `encode` - confirm.
start_tokens = tokenizer.encode(hint)[:-1]
# With topk=1 only the single highest-scoring token can be sampled at each step.
generator = TextGenerator(max_tokens= num_token_generated, start_tokens= start_tokens, maxlen= 300, model= poem, tokenizer= tokenizer,device= device, topk= 1)
generator.gen_poem()

generated text:
<s> mùa thu năm ấy gặp em 
 nắng chiều ấm áp nắng oi dòng rồi 
 còn không em nữa đôi môi 
 nụ cười e thẹn đôi môi ngập ngừng 


