### KoGPT2

In [None]:
!pip install gluonnlp
!pip install transformers
!pip install mxnet

In [None]:
import torch

# Choose the compute device once, up front.
if not torch.cuda.is_available():
    # No CUDA device is visible: report it and fall back to the CPU.
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    # CUDA is usable: target it, then report the GPU count and the name of GPU 0.
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
We will use the GPU: Tesla V100-SXM2-16GB


In [None]:
import torch
import torch.nn.functional as F
import torch.optim as optim
import os
import tensorflow as tf
import gluonnlp as nlp
import pandas as pd
import re
import numpy as np

from tqdm import tqdm, tqdm_notebook
from torch import nn
from torch.utils.data import Dataset, DataLoader
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from transformers import *
from kogpt2.pytorch_kogpt2 import get_pytorch_kogpt2_model
from kogpt2.utils import get_tokenizer
from gluonnlp.data import SentencepieceTokenizer


In [None]:
from transformers import AdamW, get_linear_schedule_with_warmup


In [None]:
# Re-derive the target device: CUDA when available, otherwise the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [None]:
import numpy as np; np.random.seed(1234)
import pandas as pd

# Read the labelled spreadsheet and keep only the three columns that are
# exported to TSV further down.
data = (
    pd.read_excel('/content/gdrive/My Drive/labeled_0825_924.xlsx')
    [['a_num', 'content', 'inform']]
)


In [None]:

# Hold-out split by position: the first `ntrain` rows go to the training set
# and every remaining row goes to the test set (no shuffling happens here).
ntrain = 800
trn = data[:ntrain]
tst = data[ntrain:]

# Both splits are written as tab-separated text, without the index and with
# the three columns renamed through `header`.
header = 'a_num depend quality'.split()
trn.to_csv(
    'ratings_train.txt',
    sep='\t',
    index=False,
    header=header,
)
tst.to_csv(
    'ratings_test.txt',
    sep='\t',
    index=False,
    header=header,
)

In [None]:
def _load_tsv(path):
    """Open a TSV file, keeping fields 1 and 2 and discarding the first sample (the header row)."""
    return nlp.data.TSVDataset(path, field_indices=[1, 2], num_discard_samples=1)


dataset_train = _load_tsv("ratings_train.txt")
dataset_test = _load_tsv("ratings_test.txt")

In [None]:
# Load the pretrained KoGPT2 resources.
# `get_tokenizer()` returns the value handed to the tokenizer constructor below
# (presumably a path to a SentencePiece model file - confirm against kogpt2).
tok_path = get_tokenizer()
# The helper returns three values: the model, its vocabulary and a config object.
model_gpt, vocab, config = get_pytorch_kogpt2_model()
# Build the tokenizer from the tokenizer resource and the vocabulary;
# `lower=False` is passed so the tokenizer is not asked to lowercase the text.
tok = nlp.data.BERTSPTokenizer(tok_path, vocab, lower=False)

using cached model
using cached model
using cached model


In [None]:
class GPT2Dataset(Dataset):
    """Map-style dataset that transforms every sample once, at construction.

    For each row of ``dataset`` the element at ``sent_idx`` is wrapped in a
    one-element list and passed through ``nlp.data.BERTSentenceTransform``
    (built from ``gpt2``, ``max_len``, ``pad`` and ``pair``); the element at
    ``label_idx`` is converted with ``np.int32``.
    """

    def __init__(self, dataset, sent_idx, label_idx, gpt2,
                 max_len, pad, pair):
        to_features = nlp.data.BERTSentenceTransform(
            gpt2, max_seq_length=max_len, pad=pad, pair=pair)

        # First pass: transform every sentence.
        features = []
        for row in dataset:
            features.append(to_features([row[sent_idx]]))
        self.sentences = features

        # Second pass: convert every label to a 32-bit integer.
        targets = []
        for row in dataset:
            targets.append(np.int32(row[label_idx]))
        self.labels = targets

    def __getitem__(self, i):
        """Return the transformed sample ``i`` with its label appended as the last element."""
        sample = self.sentences[i]
        label = self.labels[i]
        # The stored sample is concatenated with a 1-tuple, so it is expected to be a tuple.
        return sample + (label,)

    def __len__(self):
        """Return the number of samples (one label is stored per sample)."""
        return len(self.labels)

In [None]:
## Setting parameters
# Data pipeline: sequence length handed to the dataset transform, and the
# DataLoader batch size.
max_len, batch_size = 512, 8
# The remaining values are not used in the cells visible here; presumably the
# training loop reads them - verify against the rest of the notebook.
num_epochs, learning_rate = 5, 1e-4
warmup_ratio, max_grad_norm = 0.1, 1
log_interval = 200

In [None]:
# Transform both splits up front: field 0 of each row is the text, field 1 the
# label; padding is enabled and sentence-pair mode is disabled.
data_train = GPT2Dataset(
    dataset_train, sent_idx=0, label_idx=1, gpt2=tok,
    max_len=max_len, pad=True, pair=False,
)
data_test = GPT2Dataset(
    dataset_test, sent_idx=0, label_idx=1, gpt2=tok,
    max_len=max_len, pad=True, pair=False,
)

In [None]:
# Batch both splits with the same batch size and five worker processes each.
train_dataloader = torch.utils.data.DataLoader(
    dataset=data_train,
    batch_size=batch_size,
    num_workers=5,
)
test_dataloader = torch.utils.data.DataLoader(
    dataset=data_test,
    batch_size=batch_size,
    num_workers=5,
)

In [None]:
# Attention mask removed.
class GPT2Classifier(nn.Module):
    """Linear classification head on top of a pretrained GPT-2 backbone.

    Args:
        gpt2: Pretrained model. It is called as
            ``gpt2(input_ids=..., token_type_ids=...)`` and must return two
            values; the second is sliced with ``[:, -1]`` to obtain the vector
            that is classified.
        hidden_size: Input width of the linear classifier.
        num_classes: Number of output logits.
        dr_rate: Dropout probability applied before the classifier. When falsy
            (``None`` or ``0``) no dropout layer is created and the pooled
            vector is classified directly.
        params: Unused; kept so existing callers that pass it keep working.
    """

    def __init__(self, gpt2, hidden_size = 768, num_classes = 2, dr_rate = None, params = None):
        super().__init__()
        self.gpt2 = gpt2  # pre-trained model
        self.dr_rate = dr_rate

        self.classifier = nn.Linear(hidden_size, num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p = dr_rate)

    def forward(self, token_ids, valid_length, segment_ids):
        """Return the classifier logits for a batch.

        Args:
            token_ids: Token ids, forwarded to the backbone as ``input_ids``.
            valid_length: Accepted for interface compatibility; not used.
            segment_ids: Segment ids, cast to ``long`` and forwarded to the
                backbone as ``token_type_ids``.

        Returns:
            The output of ``self.classifier`` applied to the pooled vector
            (after dropout when ``dr_rate`` is set).
        """
        _, hidden_output = self.gpt2(input_ids=token_ids, token_type_ids=segment_ids.long())
        # NOTE(review): `valid_length` is ignored, so for padded batches index -1
        # may point at a padding position - confirm this pooling is intended.
        pooled_output = hidden_output[:, -1]
        # pooler_output = pooler_out[0][:,-1]  ## when using the transformers module

        # Start from the pooled vector so `out` is always bound; previously it
        # was only assigned inside the dropout branch, which made the default
        # `dr_rate=None` configuration fail with UnboundLocalError.
        out = pooled_output
        if self.dr_rate:
            out = self.dropout(out)

        return self.classifier(out)

In [None]:
# Wrap the pretrained model with the classification head (dropout 0.5),
# then move the whole module onto the selected device.
model = GPT2Classifier(model_gpt, dr_rate=0.5)
model = model.to(device)