## Import Library

In [2]:
import re
import os

# for transformer 
from transformers import BertModel, BertTokenizer, AdamW
import gluonnlp as nlp
from kobert.pytorch_kobert import get_pytorch_kobert_model
from kobert.utils import get_tokenizer

import numpy as np
import pandas as pd
from tqdm import tqdm, tqdm_notebook

# torch library
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn

import warnings
warnings.filterwarnings(action='ignore')

In [2]:
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
model_path = os.getcwd()
#device = torch.device("cuda:0")

## Definition classification and sentiment analysis module

In [33]:
# KoBERT Dataloader
class KoBERTDataset(Dataset):
    def __init__(self, dataset_text, dataset_date, sent_idx, label_idx, bert_tokenizer, max_len,
                 pad, pair):
        
        transform = nlp.data.BERTSentenceTransform(bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)
        self.date = [[i] for i in dataset_date]
        self.text = [[i] for i in dataset_text]
        self.sentences = [transform([i[sent_idx]]) for i in dataset_text]
        #self.labels = [np.int32(i[label_idx]) for i in dataset]
        
    def __getitem__(self, i):
        return (self.sentences[i] + (self.text[i],) + (self.date[i],))
    
    def __len__(self):
        return (len(self.sentences))

In [26]:
# For ESG classification
from transformers import AutoTokenizer

_, vocab = get_pytorch_kobert_model()
KoBERT_tokenizer = get_tokenizer()
BERT_tokenizer = AutoTokenizer.from_pretrained("snunlp/KR-FinBert")
tok = nlp.data.BERTSPTokenizer(KoBERT_tokenizer, vocab, lower=False)
esg_model = torch.load(model_path + 'esg_model.pt')

using cached model. /nas1/yongk/bigkinds/.cache/kobert_v1.zip
using cached model. /nas1/yongk/bigkinds/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece
using cached model. /nas1/yongk/bigkinds/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece


In [36]:
# BERT Classifier

class BERTClassifier(nn.Module):
    def __init__(self, bert, hidden_size = 768, num_classes = 7, dr_rate = 0.2, params = None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate
        
        self.classifier = nn.Linear(hidden_size, num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)
    
    def gen_attention_mask(self, token_ids, valid_length):
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        
        return attention_mask.float()
    
    def forward(self, token_ids, valid_length, segment_ids):
        attention_mask = self.gen_attention_mask(token_ids, valid_length)
        
        _, pooler = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask.float().to(token_ids.device))
        # pooler = pooler.logits

        
        if self.dr_rate:
            out = self.dropout(pooler)
            
        return self.classifier(out)

In [None]:
# For sentiment score and labeling
from transformers import AutoModelForSequenceClassification, TextClassificationPipeline

# load model
sent_tokenizer = AutoTokenizer.from_pretrained("jaehyeong/koelectra-base-v3-generalized-sentiment-analysis")
sent_model = AutoModelForSequenceClassification.from_pretrained("jaehyeong/koelectra-base-v3-generalized-sentiment-analysis")
sentiment_classifier = TextClassificationPipeline(tokenizer=sent_tokenizer, model=sent_model)

## main code

In [118]:
# Data Loading to clustering

empty_list = []
root_path = os.getcwd()
root_path = root_path + '/2-1. analysis dataset/test/'
file_list = os.listdir(root_path)
file_list = [file[:-4] for file in file_list]

In [119]:
file_list

['test_hyun', 'test_sin', 'test_lot', 'test_lg', 'test_gs']

In [123]:
for file in file_list:
    
    """
    1. test_gs.csv : GS홈쇼핑
    2. test_hyun.csv : 현대홈쇼핑
    3. test_lot.csv : 롯데홈쇼핑
    4. test_sin.csv : 신세계
    5. test_lg.csv : LG생활건강

    """
    dir_path = root_path + '{}.csv'.format(file)
    df = pd.read_csv(dir_path)
    df = df.drop(['Unnamed: 0'], axis=1)
    df['text'] = df['title'] + " " + df['contents']
    df = df.dropna()
    
    test_esg_df_date = df['date'].tolist()
    test_esg_df_text = df['text'].tolist()
    
    
    """
    config & dataloader
    """
    
    max_len = 64
    batch_size = 64
    
    dataset = KoBERTDataset(test_esg_df_text, test_esg_df_date, 0, 1, tok, max_len, True, False)
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=5)
    
    """
    make a analysis dataframe
    """
    
    # pre_label 이라는 column 2개를 가진 dataframe 생성 
    pre_label = pd.DataFrame()
    pre_text = pd.DataFrame()
    pre_date = pd.DataFrame()

    for batch_id, (token_ids, valid_length, segment_ids, text, date) in enumerate(tqdm_notebook(dataloader)):
        text_list = []
        pre_index_list = []
        date_list = []
        
        text = [text[0][i] for i in range(len(text[0]))]
        date = [date[0][i] for i in range(len(date[0]))]
        token_ids, valid_length, segment_ids = token_ids.to(device), valid_length.to(device), segment_ids.to(device)
                    
        logits = esg_model(token_ids, valid_length, segment_ids)
        logits = logits.to(torch.float32)
        pre_index = torch.argmax(logits, dim=1)
        for i in range(pre_index.shape[0]):
            text_list.append(text[i])
            date_list.append(date[i])
            pre_index_list.append(int(pre_index[i].cpu().detach().numpy()))
        # pre_index_list를 pre_label dataframe에 추가       
        pre_label = pre_label.append(pd.DataFrame(pre_index_list, columns=['pre_label']))
        pre_text = pre_text.append(pd.DataFrame(text_list, columns=['text']))
        pre_date = pre_date.append(pd.DataFrame(date_list, columns=['date']))
        
    # pre_label과 pre_text를 concat 
    pre_df = pd.concat([pre_date, pre_text, pre_label], axis=1)

    # pre_df 인덱스 초기화 및 date 올림차순 정렬    
    pre_df = pre_df.sort_values(by=['date'], axis=0)
    pre_df = pre_df.reset_index(drop=True)
    
    pre_sentiment_list = []
    pre_sent_score_list = []

    
    for idx, review in enumerate(pre_text['text'].tolist()):
        pred = sentiment_classifier(review)
        pre_sentiment_list.append(pred[0]['label'])
        pre_sent_score_list.append(pred[0]['score'])

    pd_sentiment = pd.DataFrame(pre_sentiment_list, columns=['sentiment'])
    pd_sent_score = pd.DataFrame(pre_sent_score_list, columns=['sent_score'])

    analysis_df = pd.concat([pre_df, pd_sentiment, pd_sent_score], axis=1)
    
    output_path = root_path +'/output/'
    # output 폴더 없으면 만들기

    if not os.path.exists(output_path):
        os.makedirs(output_path)
        
    save_file = output_path + "{}_analysis.csv".format(file)

    analysis_df.to_csv(save_file, index = False, encoding = 'utf-8-sig')


  0%|          | 0/34 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  0%|          | 0/43 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  0%|          | 0/36 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  0%|          | 0/43 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  0%|          | 0/43 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av