# GPT2

## Drive mount

In [1]:
!nvidia-smi

Wed Aug  4 09:58:01 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.42.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   42C    P0    29W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Install Transformers

In [None]:
!pip install transformers

## Import modules

In [5]:
import os
import re
import pandas as pd
import numpy as np
from tqdm import tqdm
import warnings

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset

from transformers import AutoTokenizer, AutoModelWithLMHead, GPT2ForSequenceClassification, PreTrainedTokenizerFast
warnings.filterwarnings('ignore')

In [6]:
print(torch.__version__)

1.9.0+cu102


In [23]:
#os.environ["CUDA_LAUNCH_BLOCKING"] = '0'

## Import Data

In [21]:
PATH = '/content/drive/MyDrive/gh/dacon_newstopic'
train      = pd.read_csv(PATH + '/DATA/train_clean_4bert.csv')
test       = pd.read_csv(PATH + '/DATA/test_clean_4bert.csv')
submission = pd.read_csv(PATH + '/DATA/sample_submission.csv')
submission_1 = pd.read_csv(PATH + '/DATA/sample_submission.csv')
topic_dict = pd.read_csv(PATH + '/DATA/topic_dict.csv')

In [22]:
train.dtypes

Unnamed: 0       int64
index            int64
title           object
topic_idx        int64
ko_to_en        object
en_to_ko        object
c_title         object
c_title_enko    object
dtype: object

## Tokenizer

In [None]:
#tokenizer = AutoTokenizer.from_pretrained("skt/kogpt2-base-v2")
tokenizer = PreTrainedTokenizerFast.from_pretrained("skt/kogpt2-base-v2")

model = GPT2ForSequenceClassification.from_pretrained("skt/kogpt2-base-v2")
model.score = torch.nn.Linear(768, 7)
model.cuda()

In [24]:
class TrainDataset(Dataset):
    def __init__(self, data, tokenizer, max_seq_len=40):
        self.data = data
        self.max_seq_len = max_seq_len
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        record = self.data.iloc[index]
        document, label = str(record['c_title']), int(record['topic_idx'])
        tokens = self.tokenizer.tokenize(document)
        encoder_input_id = self.tokenizer.convert_tokens_to_ids(tokens)
        attention_mask = [1] * len(encoder_input_id)
        if len(encoder_input_id) < self.max_seq_len:
            while len(encoder_input_id) < self.max_seq_len:
                encoder_input_id += [tokenizer.convert_tokens_to_ids('<pad>')]
                attention_mask += [0]
        else:
            encoder_input_id = encoder_input_id[:self.max_seq_len - 1] + [
                self.tokenizer.eos_token_id]
            attention_mask = attention_mask[:self.max_seq_len]
        return {'input_ids': np.array(encoder_input_id, dtype=np.int_),
                'attention_mask': np.array(attention_mask, dtype=np.float),
                'labels': np.array(label, dtype=np.int_)}
    
class TestDataset(Dataset):
    def __init__(self, data, tokenizer, max_seq_len=40):
        self.data = data
        self.max_seq_len = max_seq_len
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        record = self.data.iloc[index]
        document = str(record['c_title'])
        tokens = self.tokenizer.tokenize(document)
        encoder_input_id = self.tokenizer.convert_tokens_to_ids(tokens)
        attention_mask = [1] * len(encoder_input_id)
        if len(encoder_input_id) < self.max_seq_len:
            while len(encoder_input_id) < self.max_seq_len:
                encoder_input_id += [tokenizer.convert_tokens_to_ids('<pad>')]
                attention_mask += [0]
        else:
            encoder_input_id = encoder_input_id[:self.max_seq_len - 1] + [
                self.tokenizer.eos_token_id]
            attention_mask = attention_mask[:self.max_seq_len]
        return {'input_ids': np.array(encoder_input_id, dtype=np.int_),
                'attention_mask': np.array(attention_mask, dtype=np.float)}

In [25]:
# train parameters
epochs = 10
#batch_size = 32

my_learning_rate = 3E-6 # default is 5E-5
my_adam_epsilon = 1E-8 # default is 1E-8
my_number_of_epochs = 7
my_warmup = 3
my_mini_batch_size = 128

batch_size = my_mini_batch_size

In [26]:
# train loader
# train_ds = TrainDataset(train, tokenizer)
# loader = DataLoader(train_ds, batch_size=batch_size, num_workers=0, shuffle=True)

train_ds = TrainDataset(train, tokenizer)
loader = DataLoader(train_ds, batch_size=batch_size, num_workers=0, shuffle=True)
total_steps = len(loader) * epochs

In [27]:
from transformers import get_linear_schedule_with_warmup

In [28]:
# # optimizer
# optimizer = torch.optim.AdamW(model.parameters(), lr=2e-4)
# scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, 5, )
# loss_fn = torch.nn.CrossEntropyLoss()


optimizer = torch.optim.AdamW(model.parameters(),
                  lr = my_learning_rate, #args.learning_rate
                  eps = my_adam_epsilon  #args.adam_epsilon
                )
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = my_warmup, 
                                            num_training_steps = total_steps)
loss_fn = torch.nn.CrossEntropyLoss()

In [29]:
import time
import datetime
def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [30]:
model.train()
for e in range(epochs):
    total_loss = 0
    print("")
    print('======== Epoch {:} / {:} ========'.format(e + 1, epochs))
    print('Training...')

    # Measure how long the training epoch takes.
    t0 = time.time()
    for step, batch in enumerate(loader):
        if step % 100 == 0 and not step == 0:
        # Calculate elapsed time in minutes.
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(loader), elapsed))

        optimizer.zero_grad()
        ids, atts, labels = batch['input_ids'], batch['attention_mask'], batch['labels']
        ids = torch.tensor(ids).long().cuda()
        atts = torch.tensor(atts).long().cuda()
        labels = torch.tensor(labels).long().cuda()
        pred = model(ids, attention_mask=atts)
        loss = loss_fn(pred[0], labels)
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.)
        optimizer.step()
        total_loss += loss.item()
    avg_train_loss = total_loss / len(loader) 
    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(format_time(time.time() - t0)))        
    scheduler.step()
    print(e, total_loss)
print("")
print("Training complete!")


Training...
  Batch   100  of    357.    Elapsed: 0:01:00.
  Batch   200  of    357.    Elapsed: 0:01:59.
  Batch   300  of    357.    Elapsed: 0:02:58.

  Average training loss: 2.36
  Training epoch took: 0:03:32
0 843.9357922077179

Training...
  Batch   100  of    357.    Elapsed: 0:00:59.
  Batch   200  of    357.    Elapsed: 0:01:59.
  Batch   300  of    357.    Elapsed: 0:02:58.

  Average training loss: 0.84
  Training epoch took: 0:03:32
1 299.569109082222

Training...
  Batch   100  of    357.    Elapsed: 0:00:59.
  Batch   200  of    357.    Elapsed: 0:01:59.
  Batch   300  of    357.    Elapsed: 0:02:58.

  Average training loss: 0.41
  Training epoch took: 0:03:32
2 146.2905465066433

Training...
  Batch   100  of    357.    Elapsed: 0:00:59.
  Batch   200  of    357.    Elapsed: 0:01:59.
  Batch   300  of    357.    Elapsed: 0:02:58.

  Average training loss: 0.35
  Training epoch took: 0:03:32
3 124.14787873625755

Training...
  Batch   100  of    357.    Elapsed: 0:00:

In [31]:
# test loader
test_ds = TestDataset(test, tokenizer)
test_loader = DataLoader(test_ds, 8)

In [32]:
preds = []
model.eval()
preds_1 = []
true = []
for b in tqdm(test_loader):
    ids, atts = b['input_ids'], b['attention_mask']
    ids = torch.tensor(ids).long().cuda()
    atts = torch.tensor(atts).long().cuda()
    pred = model(ids, attention_mask=atts)
    logits1 = pred[0]
    logits1 = logits1.detach().cpu().numpy()
    preds_1.append(logits1)
    preds += list(np.argmax(pred[0].detach().cpu().numpy(), 1))
#     break
flat_predictions_1 = [item for sublist in preds_1 for item in sublist]

100%|██████████| 1142/1142 [00:23<00:00, 48.27it/s]


## Submission

In [33]:
# Combine the predictions for each batch into a single list of 0s and 1s.
flat_predictions = [item for sublist in preds for item in sublist]
flat_predictions = np.argmax(flat_predictions, axis=1).flatten()
valid_flat_predictions = np.argmax(valid_flat_predictions, axis=1).flatten()
#

TypeError: ignored

In [34]:
submission['topic_idx'] = preds
submission.head(20)


Unnamed: 0,index,topic_idx
0,45654,0
1,45655,3
2,45656,2
3,45657,0
4,45658,3
5,45659,0
6,45660,5
7,45661,3
8,45662,4
9,45663,4


In [35]:
submission.to_csv(PATH + '/koGPT2/maxlen40_gpt2_torch.csv', index = False)

In [None]:
# Combine the predictions for each batch into a single list of 0s and 1s.
flat_predictions = [item for sublist in preds for item in sublist]
flat_predictions = np.argmax(flat_predictions, axis=1).flatten()
valid_flat_predictions = np.argmax(valid_flat_predictions, axis=1).flatten()
#

In [None]:
import torch.nn.functional as F
z = torch.FloatTensor(flat_predictions_1)

probs_1 = F.softmax(z, dim=1)
probs_1 = (probs_1).tolist()

i, j, k=  [], [], []
m, n, o = [], [], []
l = []
for row in probs_1:
  i.append(row[0])
  j.append(row[1])
  k.append(row[2])
  l.append(row[3])
  m.append(row[4])
  n.append(row[5])
  o.append(row[6])


submission_1['0'] = i
submission_1['1'] = j
submission_1['2'] = k
submission_1['3'] = l
submission_1['4'] = m
submission_1['5'] = n
submission_1['6'] = o
submission_1.topic_idx = preds

submission_1.to_csv(PATH + '/koGPT2/koGPT2_proba_.csv',index = False)

## Dacon api

In [None]:
!pip install dacon_submit_api-0.0.4-py3-none-any.whl

Processing ./dacon_submit_api-0.0.4-py3-none-any.whl
[31mERROR: Could not install packages due to an EnvironmentError: [Errno 2] No such file or directory: '/content/dacon_submit_api-0.0.4-py3-none-any.whl'
[0m


In [None]:
from dacon_submit_api import dacon_submit_api 

result = dacon_submit_api.post_submission_file(
'/content/drive/MyDrive/공민표/torch_gpt2/submission_gpt2_torch.csv', # 파일경로
'd4f6a76341e9e0ccf85124528a36319e9617e4b95cf11687c29f3bff7f083aa7',  # token
'235747', # 대회 id
'그냥취미로합니다.',  # 팀이름
'gpt2_test') # 노트