In [9]:
import os
import re
import torch
import tqdm
import pandas as pd
import torch.nn as nn
import numpy as np
from transformers import BertModel
from torch.utils.data import Dataset, DataLoader
from kobert_tokenizer import KoBERTTokenizer
from adamp import AdamP
import torch.nn as nn
from collections import defaultdict

### Additional Tokenization Helpers

In [10]:
# Hyperparameters shared by the cells below.
batch_size = 16  # samples per DataLoader batch
epochs = 10  # number of passes over the training DataLoader
max_len = 512  # fixed length every token-id sequence is padded to

In [11]:
class TokenMaking():
  """Helper that produces mask and segment-id lists for ``data``."""

  def __init__(self, data):
    # Kept as-is; only its length is consulted by the methods below.
    self.data = data

  def attention_mask(self):
    """Return a random 0/1 list with one entry per element of ``self.data``.

    Each entry is drawn independently: 0 with probability 0.8 and 1 with
    probability 0.2.
    """
    n_items = len(self.data)
    drawn = np.random.choice(2, size=(n_items,), p=[0.8, 0.2])
    return drawn.tolist()

  def token_type_id(self):
    """Return a list of ``max_len`` zeros (reads the module-level ``max_len``)."""
    return [0] * max_len

In [5]:
def make_ids(tokenizer, string_):
  """Tokenize ``string_`` with ``tokenizer`` and return the matching vocabulary ids.

  ``tokenizer`` must provide ``tokenize`` and ``convert_tokens_to_ids``.
  """
  return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(string_))

In [12]:
# Switch the working directory to ../data so the bare os.listdir() and the
# relative file names used by the loading cell resolve inside that folder.
os.chdir("../data")

In [13]:
# Load every Excel workbook in the current directory into a list of DataFrames.
# os.listdir() returns names in arbitrary order, yet later cells address the
# frames by position (data[0] / data[1]); sorting the names makes that
# positional assignment reproducible across machines and runs.
# NOTE(review): confirm that alphabetical order puts the training workbook first.
# Excel's "~$..." lock files also end in "xlsx" but are not readable workbooks.
excel_files = sorted(
    file for file in os.listdir()
    if file.endswith("xlsx") and not file.startswith("~$")
)
data = [pd.read_excel(file, engine='openpyxl') for file in excel_files]

In [14]:
# Rows missing any of these columns are unusable downstream and are dropped.
required_columns = ["content", "summary", "중요도(+/0/-)", "극성값(+/0/-)"]

# Regex clean-up steps, applied in this order to each text column:
#   1. the listed punctuation / symbol characters -> a single space
#   2. newlines -> removed
#   3. "<Hangul word> 기자" (reporter by-line) -> removed
#   4. "<Hangul word> 선임기자" (senior-reporter by-line) -> removed
# The character class is a raw string so its regex escapes are not parsed as
# invalid Python string escapes.
# NOTE(review): a literal backslash is not part of the class; add it if
# backslashes should be stripped too.
cleaning_steps = [
    (r'[-=+,#/?:^$@*"※~&%ㆍ!』‘|\[\]<>`\'…》]', " "),
    ('\n', ""),
    ('[가-힣]+ 기자', ""),
    ('[가-힣]+ 선임기자', ""),
]

for i in range(2):
  # Work on an explicit copy and assign results back. Calling
  # `.loc[:, col].replace(..., inplace=True)` mutates a temporary Series, which
  # pandas flags as chained assignment and which leaves the frame untouched
  # under Copy-on-Write.
  frame = data[i].dropna(subset=required_columns).copy()
  for column in ("content", "summary"):
    cleaned = frame[column]
    for pattern, replacement in cleaning_steps:
      cleaned = cleaned.replace(pattern, replacement, regex=True)
    frame[column] = cleaned
  data[i] = frame

In [15]:
# Replace the "+", "0", "-" markers in both annotation columns with 1, 0, -1.
symbol_to_score = (("+", 1), ("0", 0), ("-", -1))
for i in range(2):
  frame = data[i]  # same object as data[i]; the writes below update it in place
  for column in ("중요도(+/0/-)", "극성값(+/0/-)"):
    for symbol, score in symbol_to_score:
      frame.loc[frame[column] == symbol, column] = score

In [16]:
# Cast both annotation columns to int in a single astype call per frame.
label_dtypes = {"중요도(+/0/-)": int, "극성값(+/0/-)": int}
for i in range(2):
  data[i] = data[i].astype(label_dtypes)

### DataLoader

In [17]:
class AIRDataset(Dataset):
    """Map-style dataset yielding ``(text, label)`` pairs from a DataFrame.

    Args:
        df: Frame holding one sample per row.
        text_col: Column holding the input text, given either as a positional
            index or as a column name. Defaults to position 4.
        label_col: Column holding the label, given the same way. Defaults to
            position 7.
    """

    def __init__(self, df, text_col=4, label_col=7):
        self.df = df
        # Resolve names once so __getitem__ stays a plain positional lookup.
        self.text_col = self._to_position(df, text_col)
        self.label_col = self._to_position(df, label_col)

    @staticmethod
    def _to_position(df, col):
        """Return the positional index for ``col`` (a column name or a position)."""
        if isinstance(col, str):
            return df.columns.get_loc(col)
        return col

    def __len__(self):
        """Return the number of rows (samples) in the frame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(text, label)`` for the row at position ``idx``."""
        # iloc is positional, so gaps left in the index by dropna do not matter.
        text = self.df.iloc[idx, self.text_col]
        label = self.df.iloc[idx, self.label_col]
        return text, label

In [18]:
# Both loaders use identical batching options: shuffled batches, loaded in the main process.
loader_options = dict(batch_size=batch_size, shuffle=True, num_workers=0)

# data[0] feeds the training loader, data[1] the test loader.
train_data = AIRDataset(data[0])
test_data = AIRDataset(data[1])
train_dataloader = DataLoader(train_data, **loader_options)
test_dataloader = DataLoader(test_data, **loader_options)

### Model

In [19]:
class FineTuneModel(nn.Module):
    """Classification head: a bias-free 768 -> 3 linear layer followed by softmax."""

    def __init__(self):
        super().__init__()
        # Projects 768-dimensional features onto 3 class scores (no bias term).
        self.linear = nn.Linear(in_features=768, out_features=3, bias=False)
        # Normalises the scores over the last dimension.
        self.act = nn.Softmax(dim=-1)

    def forward(self, x):
        """Return softmax probabilities over the 3 classes for features ``x``."""
        scores = self.linear(x)
        probabilities = self.act(scores)
        return probabilities

In [20]:
# Load the tokenizer stored with the 'skt/kobert-base-v1' checkpoint.
tokenizer = KoBERTTokenizer.from_pretrained('skt/kobert-base-v1')

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'XLNetTokenizer'. 
The class this function is called from is 'KoBERTTokenizer'.


In [21]:
# Load the pretrained BERT encoder from the same 'skt/kobert-base-v1' checkpoint;
# the training cell feeds its pooler_output to the classification head.
pretrain_model = BertModel.from_pretrained('skt/kobert-base-v1')

In [22]:
# Instantiate the classification head (FineTuneModel) with freshly initialised weights.
tuning_model = FineTuneModel()

In [23]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The training loop moves every batch to `device`, so both models have to live
# there as well; otherwise a CUDA machine fails with a device-mismatch error.
pretrain_model = pretrain_model.to(device)
tuning_model = tuning_model.to(device)

# Only the classification head's parameters are handed to the optimizer.
optimizer = AdamP(tuning_model.parameters(), lr=1e-3, weight_decay=1e-5)

In [25]:
# `import tqdm` alone does not guarantee that the `tqdm.notebook` submodule is loaded.
import tqdm.notebook

tuning_model.train()
for epoch in tqdm.notebook.tqdm(range(epochs)):
  for texts, label in tqdm.notebook.tqdm(train_dataloader):
    # Reset gradients per batch so they cannot accumulate across batches once
    # a backward pass is added.
    optimizer.zero_grad()

    # Let the tokenizer truncate at the token level (special tokens included),
    # so no sequence can exceed max_len regardless of how many tokens a
    # character expands to.
    encoded = tokenizer(list(texts), truncation=True, max_length=max_len)

    # Right-pad every sequence to max_len. Padding positions get the
    # tokenizer's pad id and an attention-mask value of 0 so the encoder
    # ignores them.
    pad_id = tokenizer.pad_token_id
    padded_ids, padded_mask = [], []
    for ids, mask in zip(encoded["input_ids"], encoded["attention_mask"]):
      n_pad = max_len - len(ids)
      padded_ids.append(ids + [pad_id] * n_pad)
      padded_mask.append(mask + [0] * n_pad)

    tokenized_train = torch.tensor(padded_ids, dtype=torch.long, device=device)
    attention_mask = torch.tensor(padded_mask, dtype=torch.long, device=device)
    label = label.to(device)

    # The encoder output is used as fixed input features, so skip autograd
    # bookkeeping for it instead of building a graph and discarding it.
    with torch.no_grad():
      pretrain_data = pretrain_model(input_ids=tokenized_train, attention_mask=attention_mask)
    for_train_data = pretrain_data.pooler_output

    output = tuning_model(for_train_data)

    # Smoke test only: show the output shape for the first batch and stop.
    print(output.shape)
    break
  break

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/2104 [00:00<?, ?it/s]

torch.Size([16, 3])


In [29]:
# Index of the highest-probability class for each sample of the last processed batch.
# NOTE(review): these indices are 0..2 while the labels are encoded as -1/0/1,
# so the two are not directly comparable — confirm the intended mapping.
torch.argmax(output, dim = -1)

tensor([2, 0, 2, 0, 0, 0, 2, 0, 2, 0, 2, 0, 2, 0, 0, 2])

In [27]:
# Labels of the last batch processed by the training cell.
label

tensor([ 1, -1, -1, -1,  1, -1, -1,  0, -1, -1, -1, -1,  0, -1, -1,  1])