## Import Libraries

In [1]:
from datetime import datetime

import logging
import re
import time
import sys

# Data preprocessing
import os
from collections import Counter
from nltk.tokenize import word_tokenize

# for transformer 
from soynlp.normalizer import repeat_normalize
from soynlp.normalizer import *
from soynlp.noun import NewsNounExtractor
from transformers import BertModel, BertTokenizer, AdamW
from transformers.optimization import get_cosine_schedule_with_warmup
import gluonnlp as nlp
from kobert.pytorch_kobert import get_pytorch_kobert_model
from kobert.utils import get_tokenizer

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from tqdm import tqdm, tqdm_notebook

# torch library
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import Adam
from sklearn.metrics import f1_score, accuracy_score

import warnings
warnings.filterwarnings(action='ignore')

In [2]:
import os
# Debug aid: CUDA_LAUNCH_BLOCKING=1 makes kernel launches synchronous so CUDA errors
# are reported at the failing call. It slows training; unset it for production runs.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
# Expose only the first physical GPU to this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
# Fall back to the CPU when no GPU is visible so later `.to(device)` calls
# do not fail on CPU-only machines.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

## Data Loading & Preprocessing

In [3]:
# Load the ESG training CSV from a sub-folder of the current working directory.

# The notebook's working directory is used as the project root; later cells reuse
# `root_path` to build the checkpoint path.
root_path = os.getcwd()
print(root_path)
dir_path = root_path +'/2-1. analysis dataset/esg_train.csv'

df = pd.read_csv(dir_path)
# Disabled: dropping the leftover 'Unnamed: 0' index column written by an earlier to_csv.
#df = df.drop(['Unnamed: 0'], axis=1)
# Preview the first rows.
df.head()

/nas1/yongk/kpmg/BERT_for_finetuning


Unnamed: 0.1,Unnamed: 0,date,title,contents,label
0,0,2020-01-01,피부 철벽같이 보호하는 '철벽녀'…3년 연속 홈쇼핑 화장품 히트상품,이종섭 뷰티피플 대표(50·사진)는 1990년대 중반 지인의 권유로 LG화학(현 L...,0
1,1,2020-01-01,"(2020 위기가 기회다)유통가, 온라인에 더 몰린다…배송망·전용 상품 강화 총…",LG생활건강은 세계적인 안무팀과 협업해 온라인 전용 화장품 브랜드 '밀리언뷰티'를 ...,0
2,2,2020-01-01,[2020 뷰티 전망] 럭셔리 아니면 저가 양극화…맞춤형 화장품 시대 열린다,반면 럭셔리 화장품 '설화수'와 '후'를 내세운 아모레퍼시픽과 LG생활건강 등 '빅...,0
3,3,2020-01-01,실험실 홀랑 태울뻔…日화장품 원료 국산화한 군산 중소기업,“아모레퍼시픽·LG생활건강·콜마 등 국내 화장품 제조사에 보낸 샘플은 긍정적인 답변...,0
4,4,2020-01-01,"아모레 안젤라베이비 ‘설화수’ 누른 LG생활건강 이영애 ‘후’, 2020년에도...",2019년 LG생활건강의 럭셔리 브랜드 ‘후’가 선전하며 사상 최대 실적을 기록했다...,0


In [4]:
def preprocess_data(data, data_colname):
    """
      Clean the text stored in one column of a DataFrame.

      The cleaning steps run in a fixed order: bracketed spans and " - ..." tails,
      copyright boilerplate, reporter bylines, photo captions, quotes, e-mail
      addresses and control whitespace are removed first; then every character that
      is not Hangul or a space is replaced by a space and runs of spaces are collapsed.

      Args:
          data : DataFrame holding the raw text (it is NOT modified; a copy is cleaned)
          data_colname : name of the text column to clean
      Returns:
          lucy_data : a new DataFrame whose `data_colname` column holds the cleaned text
    """
    # (pattern, replacement, is_regex) -- applied top to bottom. `regex` is always
    # passed explicitly because the pandas default changed from True to False in 2.0,
    # which silently turned the Hangul filter below into a literal (no-op) replace.
    # NOTE(review): the `.*` patterns are greedy, so e.g. the parenthesis rule removes
    # everything between the first "(" and the last ")" on a line, and ".* 기자"
    # removes everything up to the last " 기자". Kept as-is; confirm this is intended.
    cleaning_steps = [
        (r"\(.*\)|\s-\s.*", " ", True),
        (r"\[.*\]|\s-\s.*", " ", True),
        (r"<.*>|\s-\s.*", " ", True),
        ("무단전재 및 재배포 금지", " ", False),
        ("무단 전재 및 재배포 금지", " ", False),
        ("©", " ", False),
        ("ⓒ", " ", False),
        ("저작권자", " ", False),
        (r".* 기자", " ", True),       # reporter names would create spurious similarity
        (r"사진 = .*", " ", True),     # photo caption
        (r"사진=.*", " ", True),       # photo caption
        ('"', "", False),
        # e-mail addresses; the dot before the TLD is escaped so it only matches "."
        (r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9.-]+", " ", True),
        ("\n", " ", False),
        ("\r", " ", False),
        ("\t", " ", False),
        ("’", "", False),
        (r"[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", " ", True),  # keep Hangul and spaces only
        (r"[ ]{2,}", " ", True),
        # "?" alone is not a valid regex, so it must be a literal replacement.
        ("?", "", False),
    ]

    # Work on a copy so the caller's frame keeps its raw text.
    lucy_data = data.copy()
    column = lucy_data[data_colname]
    for pattern, replacement, is_regex in cleaning_steps:
        column = column.str.replace(pattern, replacement, regex=is_regex)
    lucy_data[data_colname] = column

    return lucy_data

In [9]:
# Data preprocessing
# Build a single `text` column from title + contents.
# - Missing values become empty strings so one NaN does not turn the whole text into NaN
#   (a NaN text would later reach the tokenizer as a float).
# - A space separates the two parts so the last title word does not fuse with the
#   first word of the body.
df['text'] = df['title'].fillna('') + ' ' + df['contents'].fillna('')

clean_data = preprocess_data(df, 'text')
# Literal replacement; explicit regex=False keeps the meaning stable across pandas versions.
clean_data['text'] = clean_data['text'].str.replace(">", " ", regex=False)
esg_data = clean_data

# Keep only the date, text and label columns of esg_data.
df_1 = esg_data[['date', 'text', 'label']]

In [18]:
# Flatten the cleaned frame into [date, text, label] rows; text is stored as str and
# the label as the string form of its integer value.
real_data_list = [
    [date, str(q), str(int(label))]
    for date, q, label in zip(esg_data['date'], esg_data['text'], esg_data['label'])
]

### Config

In [19]:
# Hyper-parameters shared by the dataset, optimizer and training loop.

# Tokenization / batching
max_len = 64             # max_seq_length handed to the BERT sentence transform
batch_size = 64

# Optimization
num_epochs = 1
learning_rate = 6e-5
warmup_ratio = 0.1       # fraction of total steps used for LR warm-up
max_grad_norm = 1        # gradient-clipping threshold

# Logging
log_interval = 200

### Set Up DataLoaders

In [20]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
train_df, test_df = train_test_split(
    esg_data,
    test_size=0.2,
    random_state=0,
    shuffle=True,
)

In [21]:
# Cast the text column to str for BOTH splits: the test split is passed through the
# same tokenizer transform as the train split, so it needs the same guarantee.
# `assign` returns new frames instead of writing into the slices produced by
# train_test_split (which triggers pandas' chained-assignment warning).
train_df = train_df.assign(text=train_df['text'].astype(str))
test_df = test_df.assign(text=test_df['text'].astype(str))

In [22]:
# KoBERT Dataloader
class KoBERTDataset(Dataset):
    """Dataset that pairs tokenized texts with their raw text, date and label.

    All texts are tokenized once, in the constructor, with gluonnlp's
    ``BERTSentenceTransform``. Indexing returns the transform's outputs followed by
    ``[text]``, ``[date]`` and the label (as ``np.int32``).
    """

    def __init__(self, dataset_text, dataset_date, dataset_label, bert_tokenizer, max_len,
                 pad, pair):
        to_bert_inputs = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair
        )
        # Raw date and text are each kept wrapped in a one-element list.
        self.date = [[raw_date] for raw_date in dataset_date]
        self.text = [[raw_text] for raw_text in dataset_text]
        self.sentences = [to_bert_inputs([raw_text]) for raw_text in dataset_text]
        self.labels = [np.int32(raw_label) for raw_label in dataset_label]

    def __getitem__(self, i):
        # Transform outputs first, then text, date and label appended in that order.
        return (*self.sentences[i], self.text[i], self.date[i], self.labels[i])

    def __len__(self):
        return len(self.sentences)

In [23]:
# Load the pretrained KoBERT encoder, its vocabulary and its tokenizer.
# NOTE(review): this import is repeated in a later cell; it belongs in the top import cell.
from transformers import AutoTokenizer, AutoModelForMaskedLM
# KoBERT encoder plus the vocabulary object used by the tokenizer below.
bertmodel, vocab = get_pytorch_kobert_model()
KoBERT_tokenizer = get_tokenizer()
# NOTE(review): BERT_tokenizer (KR-FinBert) is not referenced by any other cell shown
# here; the datasets are tokenized with `tok` -- confirm whether it is still needed.
BERT_tokenizer = AutoTokenizer.from_pretrained("snunlp/KR-FinBert")
# Tokenizer built from the KoBERT tokenizer and vocab, with lower-casing disabled;
# this is the tokenizer handed to KoBERTDataset.
tok = nlp.data.BERTSPTokenizer(KoBERT_tokenizer, vocab, lower=False)

using cached model. /nas1/yongk/kpmg/BERT_for_finetuning/.cache/kobert_v1.zip
using cached model. /nas1/yongk/kpmg/BERT_for_finetuning/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece
using cached model. /nas1/yongk/kpmg/BERT_for_finetuning/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece


In [24]:
# Tokenize both splits up front, then wrap each in a shuffled, batched DataLoader.

data_train = KoBERTDataset(
    train_df['text'], train_df['date'], train_df['label'],
    bert_tokenizer=tok, max_len=max_len, pad=True, pair=False,
)
data_test = KoBERTDataset(
    test_df['text'], test_df['date'], test_df['label'],
    bert_tokenizer=tok, max_len=max_len, pad=True, pair=False,
)

# Both loaders share the same batching options.
loader_options = dict(batch_size=batch_size, shuffle=True, num_workers=5)
KoBERT_train_loader = DataLoader(data_train, **loader_options)
KoBERT_test_loader = DataLoader(data_test, **loader_options)

## Modeling

In [25]:
class BERTClassifier(nn.Module):
    """Linear classification head on top of a BERT encoder's pooled output.

    Args:
        bert: encoder module; called with ``input_ids``, ``token_type_ids`` and
            ``attention_mask`` and expected to return the pooled output at index 1.
        hidden_size: width of the pooled output fed to the linear layer.
        num_classes: number of output logits.
        dr_rate: dropout probability applied to the pooled output; a falsy value
            (``0`` / ``None``) disables dropout.
        params: unused; kept for interface compatibility.
    """

    def __init__(self, bert, hidden_size = 768, num_classes = 4, dr_rate = 0.2, params = None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate

        self.classifier = nn.Linear(hidden_size, num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        """Return a float mask shaped like ``token_ids``: 1.0 for the first
        ``valid_length[i]`` positions of row ``i``, 0.0 elsewhere."""
        # Compare each position index against the row's valid length in one
        # vectorized operation instead of filling the mask row by row.
        positions = torch.arange(token_ids.size(1), device=token_ids.device).unsqueeze(0)
        lengths = torch.as_tensor(valid_length, device=token_ids.device).reshape(-1, 1)
        return (positions < lengths).float()

    def forward(self, token_ids, valid_length, segment_ids):
        """Return classification logits of shape ``(batch, num_classes)``."""
        attention_mask = self.gen_attention_mask(token_ids, valid_length)

        outputs = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask.float().to(token_ids.device))
        # Index instead of tuple-unpacking: works for a plain tuple of any length and
        # for Hugging Face ModelOutput objects (iterating those yields keys, not tensors).
        pooler = outputs[1]

        # Without dropout the pooled output is classified directly; previously `out`
        # was left undefined in that case.
        out = self.dropout(pooler) if self.dr_rate else pooler

        return self.classifier(out)

### Utils

In [26]:
# Accuracy

def calc_accuracy(X, Y):
    """Return the fraction of rows of ``X`` whose highest-scoring column equals ``Y``.

    ``X`` holds per-class scores with classes along dim 1 and ``Y`` the target class
    indices; the result is a NumPy scalar.
    """
    predicted = torch.max(X, 1)[1]
    n_correct = (predicted == Y).sum().data.cpu().numpy()
    return n_correct / predicted.size()[0]

In [27]:
# For training

class AverageMeter(object):
    """Keeps a weighted running average of a named value."""

    def __init__(self, name):
        self.name = name
        self.reset()

    def reset(self):
        """Clear the accumulated sum, count and average."""
        self.sum, self.count, self.avg = 0, 0, 0

    def update(self, val, n=1):
        """Add ``val`` with weight ``n`` and refresh the average."""
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

    def __str__(self):
        # Name padded to 10 characters, average printed with 8 decimals.
        return f'{self.name:10s} {self.avg:.8f}'

class ProgressMeter(object):
    """Groups several AverageMeters and prints them on one self-overwriting line."""

    def __init__(self, meters, loader_length, prefix=""):
        # `meters` is a sequence of meter names; one AverageMeter is created per name.
        self.meters = list(map(AverageMeter, meters))
        self.loader_length = loader_length
        self.prefix = prefix

    def reset(self):
        """Reset every tracked meter."""
        for meter in self.meters:
            meter.reset()

    def update(self, values, n=1):
        """Feed ``values`` to the meters position by position, each weighted by ``n``."""
        for meter, value in zip(self.meters, values):
            meter.update(value, n)
            # Mirror the current average as an attribute named after the meter.
            setattr(self, meter.name, meter.avg)

    def display(self, batch_idx, postfix=""):
        """Write the progress line for zero-based ``batch_idx`` to stdout."""
        batch_info = f'[{batch_idx+1:03d}/{self.loader_length:03d}]'
        parts = [self.prefix + ' ' + batch_info]
        parts.extend(str(meter) for meter in self.meters)
        line = ' | '.join(parts)

        # The carriage return moves the cursor back so the line overwrites the previous one.
        out = sys.stdout
        out.write('\r')
        out.write(line + postfix)
        out.flush()

## KoBERT

In [28]:
# NOTE(review): duplicate of an import made in an earlier cell; consolidate in the import cell.
from transformers import AutoTokenizer, AutoModelForMaskedLM

#tokenizer = AutoTokenizer.from_pretrained("snunlp/KR-FinBert")
# Loads the KR-FinBert masked-LM checkpoint.
# NOTE(review): `kr_fin` is not referenced by any other cell shown here -- the
# classifier below is built from `bertmodel` -- confirm whether this load is needed.
kr_fin = AutoModelForMaskedLM.from_pretrained("snunlp/KR-FinBert")

In [29]:
# Put the classification head on the KoBERT encoder and move it to the target device.
model = BERTClassifier(bertmodel, dr_rate=0.2).to(device)

# Parameters whose name contains one of these substrings are excluded from weight decay.
no_decay = ['bias', 'LayerNorm.weight']
decay_params, no_decay_params = [], []
for param_name, param in model.named_parameters():
    if any(nd in param_name for nd in no_decay):
        no_decay_params.append(param)
    else:
        decay_params.append(param)

optimizer_grouped_parameters = [
    {'params': decay_params, 'weight_decay': 0.01},
    {'params': no_decay_params, 'weight_decay': 0.0},
]

# The scheduler is stepped once per batch, so the schedule length is
# batches-per-epoch times the number of epochs.
t_total = len(KoBERT_train_loader) * num_epochs
warmup_step = int(t_total * warmup_ratio)

loss_fn = nn.CrossEntropyLoss().to(device)

optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_step,
    num_training_steps=t_total,
)

In [30]:
class Trainer(object):
    """Runs training, validation and test passes for a classifier.

    Every loader is expected to yield batches that unpack as
    ``(token_ids, valid_length, segment_ids, text, date, label)``; ``text`` and
    ``date`` are carried by the loader but not used here.

    Relies on notebook-level names: ``ProgressMeter``, ``calc_accuracy``,
    ``tqdm_notebook``, ``max_grad_norm`` (in ``train``) and ``root_path`` (in ``test``).
    """

    def __init__(self, model, criterion, optimizer, scheduler, device):
        self.model = model
        self.criterion = criterion
        self.optimizer = optimizer
        self.scheduler = scheduler
        self.device = device
        # Not updated by any method of this class.
        self.best_epoch, self.best_accuracy = 0, 0

    def _forward_batch(self, token_ids, valid_length, segment_ids, label):
        """Move one batch to ``self.device`` and return ``(logits, label, loss)``."""
        token_ids = token_ids.to(self.device)
        valid_length = valid_length.to(self.device)
        segment_ids = segment_ids.to(self.device)
        # The criterion is fed int64 class indices; shape is left untouched.
        label = label.long().to(self.device)

        logits = self.model(token_ids, valid_length, segment_ids).to(torch.float32)
        loss = self.criterion(logits, label)
        return logits, label, loss

    @staticmethod
    def _summary(progress):
        """Return ``{meter name: running average}`` for a ProgressMeter."""
        return {meter.name: meter.avg for meter in progress.meters}

    def _evaluate(self, loader, progress):
        """Run one no-grad pass over ``loader``, print and return the averaged metrics."""
        self.model.eval()
        batch_id = -1  # stays -1 for an empty loader so display() still works

        with torch.no_grad():
            for batch_id, (token_ids, valid_length, segment_ids, text, date, label) in enumerate(tqdm_notebook(loader)):
                logits, label, loss = self._forward_batch(token_ids, valid_length, segment_ids, label)
                acc = calc_accuracy(logits, label)
                # .item() keeps the meters as plain floats instead of device tensors.
                progress.update([loss.item(), acc], n=token_ids.size(0))

        progress.display(batch_id, '\n')
        return self._summary(progress)

    def train(self, train_loader, epoch):
        """Train for one epoch and return ``{"train_loss": ..., "train_acc": ...}``."""
        progress = ProgressMeter(["train_loss", "train_acc"], len(train_loader), prefix=f'EPOCH {epoch:03d}')
        self.model.train()
        start_time = time.time()
        batch_id = -1  # stays -1 for an empty loader so display() still works

        for batch_id, (token_ids, valid_length, segment_ids, text, date, label) in enumerate(tqdm_notebook(train_loader)):
            self.optimizer.zero_grad()

            logits, label, loss = self._forward_batch(token_ids, valid_length, segment_ids, label)

            loss.backward()
            # Clip the gradients of the model owned by this trainer (not a global one).
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_grad_norm)
            self.optimizer.step()
            # The LR schedule advances once per batch, and only here.
            self.scheduler.step()

            acc = calc_accuracy(logits, label)
            progress.update([loss.item(), acc], n=token_ids.size(0))
            if batch_id % 20 == 0:
                # display() converts the zero-based index to a one-based counter itself.
                progress.display(batch_id)

        epoch_time = time.time() - start_time
        progress.display(batch_id, f' | {epoch_time:.0f}s' + '\n')
        return self._summary(progress)

    def validate(self, val_loader, epoch):
        """Evaluate on ``val_loader`` and return ``{"val_loss": ..., "val_acc": ...}``."""
        progress = ProgressMeter(["val_loss", "val_acc"], len(val_loader), prefix=f'VALID {epoch:03d}')
        return self._evaluate(val_loader, progress)

    def test(self, test_loader):
        """Evaluate on ``test_loader``, save the whole model, and return
        ``{"test_loss": ..., "test_acc": ...}``."""
        progress = ProgressMeter(["test_loss", "test_acc"], len(test_loader), prefix=f'TEST')
        metrics = self._evaluate(test_loader, progress)
        # Side effect: the full model object is pickled under the notebook's root_path.
        torch.save(self.model, root_path + '/2-1. analysis dataset/esg_class_finetuning.pt')
        return metrics

In [31]:
# trainer config define
trainer = Trainer(model, loss_fn, optimizer, scheduler, device)

In [32]:
# Training only: one pass over KoBERT_train_loader per epoch.
for epoch in range(num_epochs):
        trainer.train(KoBERT_train_loader, epoch)

  0%|          | 0/244 [00:00<?, ?it/s]

EPOCH 000 [244/244] | train_loss 0.44685975 | train_acc  0.85877446 | 65s


In [33]:
# Evaluate the trained KoBERT classifier on the held-out loader.
# Trainer.test also writes the model to disk under root_path.
test_esg = trainer.test(KoBERT_test_loader)
test_esg

  0%|          | 0/61 [00:00<?, ?it/s]

TEST [061/061] | test_loss  0.31383690 | test_acc   0.89915319


In [None]:
# Load the fine-tuned classifier from the path Trainer.test saves to.
# NOTE(review): torch.load unpickles arbitrary Python objects -- only load checkpoint
# files you trust. Because the whole model object was saved (not a state_dict), the
# BERTClassifier class presumably has to be defined before this cell runs -- confirm.
esg_class_model = torch.load(root_path + '/2-1. analysis dataset/esg_class_finetuning.pt')

In [183]:
# Run the fine-tuned classifier over the test loader and collect, per sample, the
# predicted class index together with the original text and date.
# Results are gathered in plain lists and turned into DataFrames once at the end
# (DataFrame.append was removed in pandas 2.0 and was quadratic in a loop).
text_list = []
date_list = []
pre_index_list = []

esg_class_model.eval()  # make sure dropout is disabled for inference
with torch.no_grad():
    # The loader yields six fields per batch; the label is not needed for prediction.
    for batch_id, (token_ids, valid_length, segment_ids, text, date, _label) in enumerate(tqdm_notebook(KoBERT_test_loader)):
        text = list(text)
        date = list(date)
        # KoBERTDataset stores each text/date as a one-element list, and the loader's
        # default collate transposes such lists into [(item_0, ..., item_B-1)].
        # Unwrap that outer level so the fields can be indexed per sample.
        if len(text) == 1 and isinstance(text[0], (list, tuple)):
            text = list(text[0])
        if len(date) == 1 and isinstance(date[0], (list, tuple)):
            date = list(date[0])

        token_ids, valid_length, segment_ids = token_ids.to(device), valid_length.to(device), segment_ids.to(device)

        logits = esg_class_model(token_ids, valid_length, segment_ids)
        logits = logits.to(torch.float32)
        pre_index = torch.argmax(logits, dim=1)

        n_samples = pre_index.shape[0]
        text_list.extend(text[:n_samples])
        date_list.extend(date[:n_samples])
        pre_index_list.extend(pre_index.cpu().tolist())

# One row per test sample, sharing a single 0..N-1 index across the three frames.
pre_label = pd.DataFrame(pre_index_list, columns=['pre_label'])
pre_text = pd.DataFrame(text_list, columns=['text'])
pre_date = pd.DataFrame(date_list, columns=['date'])


  0%|          | 0/43 [00:00<?, ?it/s]

In [186]:
# Put pre_text and pre_label side by side (column-wise concat) to pair each text
# with its predicted label.
pre_df = pd.concat([pre_text, pre_label], axis=1)