<a href="https://colab.research.google.com/github/cow-coding/Competition/blob/main/Dacon/Dacon_17_news_topic_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the third-party packages this notebook depends on.
!pip install mxnet
!pip install gluonnlp pandas tqdm
!pip install sentencepiece
!pip install transformers==3 # Pinned: installing the latest version raises the error "Input: must be Tensor, not str"
!pip install torch

In [None]:
# Install the KoBERT package from the master branch of the SKTBrain/KoBERT GitHub repository.
# NOTE(review): `master` is a moving reference — consider pinning a commit or tag for reproducibility.
!pip install git+https://git@github.com/SKTBrain/KoBERT.git@master

Collecting git+https://****@github.com/SKTBrain/KoBERT.git@master
  Cloning https://****@github.com/SKTBrain/KoBERT.git (to revision master) to /tmp/pip-req-build-cldwkzb_
  Running command git clone -q 'https://****@github.com/SKTBrain/KoBERT.git' /tmp/pip-req-build-cldwkzb_
Building wheels for collected packages: kobert
  Building wheel for kobert (setup.py) ... [?25l[?25hdone
  Created wheel for kobert: filename=kobert-0.1.2-py3-none-any.whl size=12770 sha256=48ee005b1651cde16c738d48c45b1e1f9bded683c9ed0dea0603c8aaf47c1e6a
  Stored in directory: /tmp/pip-ephem-wheel-cache-el31fr60/wheels/d3/68/ca/334747dfb038313b49cf71f84832a33372f3470d9ddfd051c0
Successfully built kobert
Installing collected packages: kobert
Successfully installed kobert-0.1.2


In [None]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
import numpy as np
from tqdm import tqdm, tqdm_notebook

import pandas as pd

from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model

from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup

In [None]:
# Prefer the GPU when CUDA is available; otherwise run everything on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [None]:
# Fetch the KoBERT PyTorch model and its vocabulary (the call returns them as a pair).
bertmodel, vocab = get_pytorch_kobert_model()

[██████████████████████████████████████████████████]
[██████████████████████████████████████████████████]


In [None]:
# Directory holding the competition CSV files.
# NOTE(review): this looks like a Google Drive mount inside Colab — adjust PATH when running elsewhere.
PATH = '/content/drive/MyDrive/data/17th_comp/'

# Read the three CSV files in a fixed order and bind them to their names.
train, test, topic_dict = (
    pd.read_csv(PATH + csv_name)
    for csv_name in ('train_data.csv', 'test_data.csv', 'topic_dict.csv')
)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Drop the 'index' column and keep the remaining columns as a NumPy array of rows.
X = train.drop(columns='index').to_numpy()

# Hold out 20% of the rows for validation; the fixed seed makes the shuffled split reproducible.
train_data, valid_data = train_test_split(
    X,
    test_size=0.2,
    shuffle=True,
    random_state=2021,
)

In [None]:
# Obtain the KoBERT tokenizer resource through kobert.utils.get_tokenizer().
tokenizer = get_tokenizer()
# Wrap it in GluonNLP's BERTSPTokenizer together with the KoBERT vocabulary.
# NOTE(review): lower=False presumably leaves the text case untouched — confirm against GluonNLP docs.
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

using cached model


In [None]:
class BERTDataset(Dataset):
    """Map-style dataset pairing BERT-transformed sentences with integer labels.

    Each row of ``dataset`` is indexed twice: ``row[sent_idx]`` is passed
    through a ``nlp.data.BERTSentenceTransform`` and ``row[label_idx]`` is cast
    to ``np.int32``. All of this work happens eagerly in the constructor.
    """

    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, max_len, pad, pair):
        """Transform every sentence and collect every label up front.

        Args:
            dataset: Iterable of indexable rows.
            sent_idx: Position of the sentence inside each row.
            label_idx: Position of the label inside each row.
            bert_tokenizer: Tokenizer handed to ``BERTSentenceTransform``.
            max_len: Forwarded to the transform as ``max_seq_length``.
            pad: Forwarded to the transform as ``pad``.
            pair: Forwarded to the transform as ``pair``.
        """
        to_features = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)

        # First pass: featurize the sentences (each one wrapped in a 1-element list).
        self.sentences = []
        for row in dataset:
            self.sentences.append(to_features([row[sent_idx]]))

        # Second pass: collect the labels as int32 scalars.
        self.labels = []
        for row in dataset:
            self.labels.append(np.int32(row[label_idx]))

    def __getitem__(self, i):
        """Return the features of sample ``i`` with its label appended as the last element."""
        features, label = self.sentences[i], self.labels[i]
        return features + (label,)

    def __len__(self):
        """Return the number of samples (one label per sample)."""
        return len(self.labels)

In [None]:
# Training hyperparameters used by the cells below.
max_len = 64          # passed as max_seq_length to BERTSentenceTransform
batch_size = 64       # samples per DataLoader batch
warmup_ratio = 0.1    # fraction of the total scheduler steps used for warmup
num_epochs = 5        # number of passes over the training DataLoader
max_grad_norm = 1     # max_norm argument for clip_grad_norm_
log_interval = 200    # print running loss/accuracy every this many batches
learning_rate = 5e-5  # lr given to AdamW

In [None]:
# Module-level sentence transform built on the same tokenizer and max_len as BERTDataset,
# configured for single-sentence input (pair=False) with pad=True.
# It is applied directly to the unlabeled test titles later in the notebook.
transform = nlp.data.BERTSentenceTransform(
            tok, max_seq_length=max_len, pad=True, pair=False)

In [None]:
# Featurize both splits: in every row, position 0 is the sentence and position 1 is the label.
# Original author's note: the data has to be passed as the NumPy array obtained
# from `.values`, otherwise an error is raised.
data_train = BERTDataset(train_data, sent_idx=0, label_idx=1, bert_tokenizer=tok,
                         max_len=max_len, pad=True, pair=False)
data_valid = BERTDataset(valid_data, sent_idx=0, label_idx=1, bert_tokenizer=tok,
                         max_len=max_len, pad=True, pair=False)
# The test set has no labels, so it cannot go through BERTDataset; only the
# sentence transform is applied to the test titles instead.

In [None]:
# Batch the two splits for the training and evaluation loops.
train_dataloader = torch.utils.data.DataLoader(data_train, batch_size=batch_size, num_workers=5)
# Fix: the validation loader used to wrap `data_train`, so the accuracy reported
# by the evaluation cell was measured on the training samples, not the held-out split.
valid_dataloader = torch.utils.data.DataLoader(data_valid, batch_size=batch_size, num_workers=5)

  cpuset_checked))


In [None]:
class BERTClassifier(nn.Module):
    """BERT encoder followed by optional dropout and a linear classification head.

    Args:
        bert: Encoder module. It is called with ``input_ids``, ``token_type_ids``
            and ``attention_mask`` keyword arguments, and element ``[1]`` of its
            output is used as the pooled sentence representation.
        hidden_size: Width of the pooled representation fed to the linear head.
        num_classes: Number of logits produced per sample.
        dr_rate: Dropout probability applied to the pooled output; a falsy value
            (``None`` or ``0``) disables dropout.
        params: Unused; kept so existing callers keep working.
    """

    def __init__(self, bert, hidden_size=768, num_classes=7, dr_rate=None, params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate

        self.classifier = nn.Linear(hidden_size, num_classes)

        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        """Build a float mask that is 1 for the first ``valid_length[i]`` tokens of row ``i``.

        Args:
            token_ids: 2-D tensor ``(batch, seq_len)``; only its shape and device are used.
            valid_length: Per-sample count of real (non-padding) tokens.

        Returns:
            Float tensor shaped like ``token_ids``, located on ``token_ids.device``.
        """
        # Fix: the parameter used to be misspelled `token_dis`, so the body silently
        # read a *global* `token_ids` (NameError outside the notebook's loops).
        positions = torch.arange(token_ids.size(1), device=token_ids.device).unsqueeze(0)
        lengths = torch.as_tensor(
            valid_length, dtype=torch.long, device=token_ids.device).reshape(-1, 1)
        # Broadcast compare: position j of row i is valid iff j < valid_length[i].
        return (positions < lengths).float()

    def forward(self, token_ids, valid_length, segment_ids):
        """Return classification logits of shape ``(batch, num_classes)``."""
        attention_mask = self.gen_attention_mask(token_ids, valid_length)

        # Index the output instead of unpacking exactly two values, so encoders that
        # return extra elements (or an indexable output object) still work.
        outputs = self.bert(
            input_ids=token_ids,
            token_type_ids=segment_ids.long(),
            attention_mask=attention_mask,
        )
        pooler = outputs[1]

        # Fix: `out` used to be assigned only inside `if self.dr_rate:`, raising
        # UnboundLocalError whenever the classifier was built without dropout.
        out = self.dropout(pooler) if self.dr_rate else pooler

        return self.classifier(out)

In [None]:
# Build the classifier around the KoBERT encoder (dropout p=0.5), then move it to the selected device.
model = BERTClassifier(bertmodel, dr_rate=0.5)
model = model.to(device)

In [None]:
# Parameters whose names contain one of these substrings get no weight decay.
no_decay = ['bias', 'LayerNorm.weight']


def _is_decay_exempt(param_name):
    """Return True when ``param_name`` contains any of the ``no_decay`` substrings."""
    return any(nd in param_name for nd in no_decay)


# Two optimizer groups: decayed parameters (0.01) and exempt parameters (0.0).
optimizer_grouped_parameters = [
    {
        'params': [param for name, param in model.named_parameters() if not _is_decay_exempt(name)],
        'weight_decay': 0.01,
    },
    {
        'params': [param for name, param in model.named_parameters() if _is_decay_exempt(name)],
        'weight_decay': 0.0,
    },
]

In [None]:
# Cross-entropy criterion and AdamW over the two parameter groups.
loss_fn = nn.CrossEntropyLoss()
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)

# The scheduler is stepped once per batch, so the schedule spans
# (batches per epoch) * (epochs) steps; the first `warmup_ratio` share is warmup.
t_total = num_epochs * len(train_dataloader)
warmup_step = int(warmup_ratio * t_total)

scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_step,
    num_training_steps=t_total,
)

In [None]:
def calc_accuracy(X, Y):
    """Fraction of rows in ``X`` whose highest-scoring column equals the target in ``Y``.

    Args:
        X: Score tensor; the maximum is taken along dimension 1.
        Y: Target class indices, compared element-wise with the predictions.

    Returns:
        Number of matches divided by the number of predictions, as a NumPy scalar.
    """
    _, predicted = torch.max(X, dim=1)
    n_correct = (predicted == Y).sum().detach().cpu().numpy()
    return n_correct / predicted.shape[0]

In [None]:
# Fine-tune for `num_epochs` epochs, printing the running loss/accuracy every
# `log_interval` batches and the mean per-batch accuracy at the end of each epoch.
# NOTE: the loop variable `e` is read again by the evaluation cell further down.
for e in range(num_epochs):
    train_acc = 0.0  # running sum of per-batch accuracies for this epoch

    model.train()  # training mode (dropout active)

    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):
        optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        label = label.long().to(device)
        # valid_length is passed on exactly as the DataLoader delivers it.
        out = model(token_ids, valid_length, segment_ids)
        loss = loss_fn(out, label)

        loss.backward()
        # Clip the gradient norm before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()  # the LR schedule advances once per batch

        train_acc += calc_accuracy(out, label)

        if batch_id % log_interval == 0:
            print("epoch {} batch id {} loss {} train acc {}".format(e+1, batch_id+1, loss.item(), train_acc / (batch_id+1)))

    print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))

    

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  import sys


HBox(children=(FloatProgress(value=0.0, max=571.0), HTML(value='')))

  cpuset_checked))


epoch 1 batch id 1 loss 1.9436252117156982 train acc 0.1875
epoch 1 batch id 201 loss 0.3686245083808899 train acc 0.5827114427860697
epoch 1 batch id 401 loss 0.38915586471557617 train acc 0.7228802992518704

epoch 1 train acc 0.7669009387854845


HBox(children=(FloatProgress(value=0.0, max=571.0), HTML(value='')))

epoch 2 batch id 1 loss 0.4619449973106384 train acc 0.859375
epoch 2 batch id 201 loss 0.28017356991767883 train acc 0.8760105721393034
epoch 2 batch id 401 loss 0.31428560614585876 train acc 0.88910536159601

epoch 2 train acc 0.8948276432615159


HBox(children=(FloatProgress(value=0.0, max=571.0), HTML(value='')))

epoch 3 batch id 1 loss 0.324025422334671 train acc 0.875
epoch 3 batch id 201 loss 0.13984699547290802 train acc 0.9153451492537313
epoch 3 batch id 401 loss 0.23750290274620056 train acc 0.9252259975062345

epoch 3 train acc 0.9305921119618784


HBox(children=(FloatProgress(value=0.0, max=571.0), HTML(value='')))

epoch 4 batch id 1 loss 0.24510149657726288 train acc 0.90625
epoch 4 batch id 201 loss 0.12983198463916779 train acc 0.9420087064676617
epoch 4 batch id 401 loss 0.057784244418144226 train acc 0.9502805486284289

epoch 4 train acc 0.9538103795870159


HBox(children=(FloatProgress(value=0.0, max=571.0), HTML(value='')))

epoch 5 batch id 1 loss 0.19626639783382416 train acc 0.90625
epoch 5 batch id 201 loss 0.09043648093938828 train acc 0.9617537313432836
epoch 5 batch id 401 loss 0.02466246485710144 train acc 0.9667238154613467

epoch 5 train acc 0.9690656437095263


In [None]:
# Measure accuracy over `valid_dataloader` (mean of the per-batch accuracies).
test_acc = 0

# Fix: the model was still in train() mode after the training loop, so dropout
# stayed active while evaluating; switch to eval mode first.
model.eval()
# Fix: no gradients are needed here, so skip building the autograd graph.
with torch.no_grad():
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(valid_dataloader)):
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        test_acc += calc_accuracy(out, label)
# `e` is the epoch counter left over from the training loop.
print("epoch {} test acc {}".format(e+1, test_acc / (batch_id+1)))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(FloatProgress(value=0.0, max=571.0), HTML(value='')))

  cpuset_checked))



epoch 5 test acc 0.9728966419582129


In [None]:
# Draw two random test titles (not referenced again in the cells shown here).
test_sentence = test.sample(n=2)['title'].values

In [None]:
# The test split has no labels, so BERTDataset (which reads a label from every row)
# cannot be used; apply the sentence transform directly to each test title instead.
test_set = [transform([title]) for title in test['title'].values]

In [None]:
# batch_size=1 is deliberate: the post-processing cell reads a single prediction
# (index 0) out of every collected batch.
test_input = torch.utils.data.DataLoader(test_set, batch_size=1, num_workers=5)

ans = []  # one logits tensor (on CPU) per test batch

# Fix: switch to eval mode — the model was left in train() mode by the training
# loop, so predictions were being made with dropout still active.
model.eval()
# Fix: inference needs no gradients; avoid building the autograd graph.
with torch.no_grad():
    for batch_id, (token_ids, valid_length, segment_ids) in enumerate(tqdm_notebook(test_input)):
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        ans.append(out.detach().cpu())

  cpuset_checked))
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """


HBox(children=(FloatProgress(value=0.0, max=9131.0), HTML(value='')))






In [None]:
# Sanity check: number of collected prediction batches vs. number of test rows
# (the test DataLoader uses batch_size=1, so the two should match).
len(ans), len(test)

(9131, 9131)

In [None]:
import tensorflow as tf

In [None]:
# Convert the collected logits into predicted class indices.
# `a` holds one array of argmax indices per collected batch. torch's own argmax is
# used instead of routing a torch tensor through np.argmax.
a = [logits.argmax(dim=1).numpy() for logits in ans]

# Flatten every prediction of every batch, in order. The previous version kept only
# element 0 of each batch, which silently dropped predictions for any batch size > 1;
# with batch_size=1 the result is the same list as before.
tmp = [class_idx for batch_pred in a for class_idx in batch_pred]

In [None]:
# Start from the provided submission template and overwrite its prediction column.
sub = pd.read_csv(PATH+'submission/submission.csv')
# Item assignment always targets the column. Attribute assignment (`sub.topic_idx = ...`)
# would only set a plain Python attribute if the column were ever missing from the
# template, leaving the written CSV without predictions.
sub['topic_idx'] = tmp

In [None]:
# Display the filled-in submission frame for a visual check.
sub

Unnamed: 0,index,topic_idx
0,45654,2
1,45655,3
2,45656,2
3,45657,0
4,45658,3
...,...,...
9126,54780,3
9127,54781,2
9128,54782,3
9129,54783,2


In [None]:
# Write the submission next to the template; index=False keeps the DataFrame index out of the CSV.
sub.to_csv(PATH+'submission/submission_koBERT.csv', index=False)