# Reference
## - bert_naver_movie.ipynb
https://colab.research.google.com/drive/1tIf0Ugdqg4qT7gcxia3tL7und64Rv1dP#scrollTo=P58qy4--s5_x

<br>
<br>

# **준비 사항**

In [3]:
import tensorflow as tf
import torch

from transformers import BertTokenizer
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np
import random
import time
import datetime

In [4]:
# BERT의 토크나이저
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=False)

In [5]:
# 디바이스 설정
if torch.cuda.is_available():    
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print('No GPU available, using the CPU instead.')

There are 3 GPU(s) available.
We will use the GPU: Tesla V100-PCIE-32GB


<br>
<br>

# **학습시킨 모델 불러오기**

In [6]:
# 저장한 모델 불러오기
model = torch.load('nsmc.pth')
model.cuda()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

<br>
<br>

# **민원 텍스트 감정 태깅**
### - 민원 텍스트를 분노와 긍정의 감정으로 태깅

In [7]:
# 입력 데이터 변환
def convert_input_data(sentences):

    # 민원 텍스트의 형식을 변경하는 코드 삽입
    sentences = ' '.join(sentences)
    sentences = ["[CLS]" + sentences + "[SEP]"]
  
    # BERT의 토크나이저로 문장을 토큰으로 분리
    tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]

    # 입력 토큰의 최대 시퀀스 길이
    MAX_LEN = 128

    # 토큰을 숫자 인덱스로 변환
    input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
    
    # 문장을 MAX_LEN 길이에 맞게 자르고, 모자란 부분을 패딩 0으로 채움
    input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

    # 어텐션 마스크 초기화
    attention_masks = []

    # 어텐션 마스크를 패딩이 아니면 1, 패딩이면 0으로 설정
    # 패딩 부분은 BERT 모델에서 어텐션을 수행하지 않아 속도 향상
    for seq in input_ids:
        seq_mask = [float(i>0) for i in seq]
        attention_masks.append(seq_mask)

    # 데이터를 파이토치의 텐서로 변환
    inputs = torch.tensor(input_ids)
    masks = torch.tensor(attention_masks)

    return inputs, masks

In [10]:
import torch.nn.functional as F

In [11]:
# 문장 테스트
def test_sentences(sentences):

    # 평가모드로 변경
    model.eval()

    # 문장을 입력 데이터로 변환
    inputs, masks = convert_input_data(sentences)

    # 데이터를 GPU에 넣음
    b_input_ids = inputs.to(device)
    b_input_mask = masks.to(device)
            
    # 그래디언트 계산 안함
    with torch.no_grad():     
        # Forward 수행
        outputs = model(b_input_ids, 
                        token_type_ids=None, 
                        attention_mask=b_input_mask)

    # 로스 구함
    # outputs에 Softmax function을 사용함
    # 분노의 감정을 0~1로 나타내주기 위함
    logits = F.softmax(outputs[0])

    # CPU로 데이터 이동
    logits = logits.detach().cpu().numpy()

    return logits

In [9]:
import pandas as pd
import dask as dd
import pickle

# 민원 텍스트 블러오기
with open('dasan_CN.bin','rb') as f:
    df_CN = pickle.load(f)

In [None]:
# 민원 텍스트를 하나의 문장으로 만들기
for i in range(df_CN.shape[0]):
    ' '.join(df_CN['CN'][i])

In [16]:
# 민원 텍스트 감정 태깅 후 리스트에 append
stop=[]
for i in range(df_CN['CN'].shape[0]):
    if i%1000 == 0 :
        print(i)
    logits = test_sentences(df_CN['CN'][i])
    stop.append(logits)



2002000
2003000
2004000
2005000
2006000
2007000
2008000
2009000
2010000
2011000
2012000
2013000
2014000
2015000
2016000
2017000
2018000
2019000
2020000
2021000
2022000
2023000
2024000
2025000
2026000
2027000
2028000
2029000
2030000
2031000
2032000
2033000
2034000
2035000
2036000
2037000
2038000
2039000
2040000
2041000
2042000
2043000
2044000
2045000
2046000
2047000
2048000
2049000
2050000
2051000
2052000
2053000
2054000
2055000
2056000
2057000
2058000
2059000
2060000
2061000
2062000
2063000
2064000
2065000
2066000
2067000
2068000
2069000
2070000
2071000
2072000
2073000
2074000
2075000
2076000
2077000
2078000
2079000
2080000
2081000
2082000
2083000
2084000
2085000
2086000
2087000
2088000
2089000
2090000
2091000
2092000
2093000
2094000
2095000
2096000
2097000
2098000
2099000
2100000
2101000
2102000
2103000
2104000
2105000
2106000
2107000
2108000
2109000
2110000
2111000
2112000
2113000
2114000
2115000
2116000
2117000
2118000
2119000
2120000
2121000
2122000
2123000
2124000
2125000
2126000


In [18]:
# 감정 태깅 후 리스트 저장
np.save('bert_arr', stop) # x_save.npy

In [19]:
stop

loat32),
 array([[0.05444962, 0.9455504 ]], dtype=float32),
 array([[0.05444962, 0.9455504 ]], dtype=float32),
 array([[0.8580606 , 0.14193942]], dtype=float32),
 array([[0.8580606 , 0.14193942]], dtype=float32),
 array([[0.70312023, 0.2968798 ]], dtype=float32),
 array([[0.8580606 , 0.14193942]], dtype=float32),
 array([[0.70312023, 0.2968798 ]], dtype=float32),
 array([[0.70312023, 0.2968798 ]], dtype=float32),
 array([[0.70312023, 0.2968798 ]], dtype=float32),
 array([[0.8580606 , 0.14193942]], dtype=float32),
 array([[0.70312023, 0.2968798 ]], dtype=float32),
 array([[0.70312023, 0.2968798 ]], dtype=float32),
 array([[0.70312023, 0.2968798 ]], dtype=float32),
 array([[0.8580606 , 0.14193942]], dtype=float32),
 array([[0.70312023, 0.2968798 ]], dtype=float32),
 array([[0.70312023, 0.2968798 ]], dtype=float32),
 array([[0.8580606 , 0.14193942]], dtype=float32),
 array([[0.70312023, 0.2968798 ]], dtype=float32),
 array([[0.70312023, 0.2968798 ]], dtype=float32),
 array([[0.8580606 , 0

<br>
<br>

# **태깅된 민원 텍스트 확인**

In [2]:
import numpy as np
bert = np.load('bert_arr.npy')

In [8]:
# 왼쪽 - 분노 (0~1)
bert[200:250]

array([[[0.3626623 , 0.6373377 ]],

       [[0.3626623 , 0.6373377 ]],

       [[0.6014517 , 0.3985483 ]],

       [[0.3626623 , 0.6373377 ]],

       [[0.3626623 , 0.6373377 ]],

       [[0.3626623 , 0.6373377 ]],

       [[0.3626623 , 0.6373377 ]],

       [[0.6014517 , 0.3985483 ]],

       [[0.3626623 , 0.6373377 ]],

       [[0.6014517 , 0.3985483 ]],

       [[0.6014517 , 0.3985483 ]],

       [[0.6014517 , 0.3985483 ]],

       [[0.6014517 , 0.3985483 ]],

       [[0.6014517 , 0.3985483 ]],

       [[0.66625905, 0.3337409 ]],

       [[0.02564467, 0.97435534]],

       [[0.02564467, 0.97435534]],

       [[0.02564467, 0.97435534]],

       [[0.02564467, 0.97435534]],

       [[0.02564467, 0.97435534]],

       [[0.02564467, 0.97435534]],

       [[0.02564467, 0.97435534]],

       [[0.02564467, 0.97435534]],

       [[0.02564467, 0.97435534]],

       [[0.02564467, 0.97435534]],

       [[0.70312023, 0.2968798 ]],

       [[0.70312023, 0.2968798 ]],

       [[0.8086117 , 0.19138