In [1]:
import os
os.chdir('/content/drive/My Drive/GPT/07_02_gpt/')

In [2]:
!pip install gluonnlp pandas tqdm
!pip install mxnet-cu101



In [4]:
!pip install sentencepiece==0.1.85
!pip install transformers==2.1.1
!pip install torch==1.3.1



In [5]:
import torch
import torch.nn as nn
import pickle
import pandas as pd
import json

from tqdm import tqdm, tqdm_notebook
from kogpt2.pytorch_kogpt2 import get_pytorch_kogpt2_model
from transformers import GPT2Config

os.chdir('/content/drive/My Drive/GPT/07_02_gpt/BERT_pairwise_text_classification')
from gluonnlp.data import SentencepieceTokenizer
from model.utils import PreProcessor, PadSequence
from torch.utils.data import Dataset,DataLoader
from typing import Tuple, List, Callable
from utils import Config, CheckpointManager, SummaryManager
from model.metric import evaluate, acc

In [6]:
batch_size = 64
gpt2model, vocab = get_pytorch_kogpt2_model()

using cached model
using cached model


In [7]:
#### TEST DATASET ####
class Corpus(Dataset):
    """Corpus class"""
    def __init__(self, filepath: str, transform_fn: Callable[[str], List[int]]) -> None:
        """Instantiating Corpus class
        Args:
            filepath (str): filepath
            transform_fn (Callable): a function that can act as a transformer
        """
        self._corpus = pd.read_csv(filepath, sep='\t')
        self._transform = transform_fn

    def __len__(self) -> int:
        return len(self._corpus)

    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
        q1, q2, is_duplicate = self._corpus.iloc[idx].tolist()
        list_of_indices, list_of_token_types = [torch.tensor(elm) for elm in self._transform(q1, q2)]
        label = torch.tensor(is_duplicate)
        return list_of_indices, list_of_token_types, label

In [8]:

def get_preprocessor(ptr_config_info):
    with open(ptr_config_info.vocab, mode='rb') as io:
        vocab = pickle.load(io)
        
        ptr_tokenizer = SentencepieceTokenizer(ptr_config_info.tokenizer)
        pad_sequence = PadSequence(length=64, pad_val=vocab.to_indices(vocab.padding_token))
        preprocessor = PreProcessor(vocab=vocab, split_fn=ptr_tokenizer, pad_fn=pad_sequence)
    
    return preprocessor

class PairwiseClassifier(nn.Module):
    def __init__(self, 
                 gpt2,
                 num_classes, 
                 vocab, 
                 hidden_size=768, 
                 dr_rate = None):# -> None:
        super(PairwiseClassifier, self).__init__()
        self.gpt2 = gpt2model
        self.classifier = nn.Linear(hidden_size, num_classes)
        self.vocab = vocab
        self.dr_rate = dr_rate
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def forward(self, input_ids, token_type_ids) -> torch.Tensor:
        _, pooled_output = self.gpt2(input_ids = input_ids, token_type_ids=token_type_ids.long())
        pool = pooled_output[:,-1]
        #pooled_output = self.dropout(pooled_output)
        #logits = self.classifier(pooled_output)

        if self.dr_rate:
            out = self.dropout(pool)
        out2 = pool
        return self.classifier(out2)

In [24]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

dataset_config = {
    "train": "qpair/train.txt",
    "validation": "qpair/validation.txt",
    "test": "qpair/test.txt"
    }

ptr_config_info = Config(f"conf/pretrained/skt.json")
preprocessor = get_preprocessor(ptr_config_info)

with open(ptr_config_info.config, mode="r") as io:
    ptr_config = json.load(io)
  
config = GPT2Config()
    
test_ds = Corpus(dataset_config["test"], preprocessor.preprocess)
test_dl = DataLoader(test_ds, batch_size=batch_size, num_workers=4)

model = PairwiseClassifier(gpt2model, num_classes=2, vocab=vocab)    

model.load_state_dict(torch.load('checkpoint2/checkpoint_{}.model'.format(str(19+1).zfill(2))))
model.to(device)     

PairwiseClassifier(
  (gpt2): GPT2LMHeadModel(
    (transformer): GPT2Model(
      (wte): Embedding(50000, 768)
      (wpe): Embedding(1024, 768)
      (drop): Dropout(p=0.1, inplace=False)
      (h): ModuleList(
        (0): Block(
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (attn): Attention(
            (c_attn): Conv1D()
            (c_proj): Conv1D()
            (attn_dropout): Dropout(p=0.1, inplace=False)
            (resid_dropout): Dropout(p=0.1, inplace=False)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): MLP(
            (c_fc): Conv1D()
            (c_proj): Conv1D()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (1): Block(
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (attn): Attention(
            (c_attn): Conv1D()
            (c_proj): Conv1D()
            (attn_dropout): Dropout(p=0.1, inplace=False)
     

In [26]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model.eval()    
test_acc = 0.0

for step, mb in enumerate(tqdm_notebook(test_dl)):
    x_mb, x_types_mb, label = map(lambda elm: elm.to(device), mb)

    out = model(x_mb, x_types_mb)
    test_acc += accuracy(out, label)
print("test acc {}".format(test_acc / (step+1)))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=12.0), HTML(value='')))


test acc 0.8624131679534912
