In [None]:
from IPython.display import clear_output 

!pip install transformers==4.8.2
!pip install sentencepiece==0.1.96
!pip install tensorflow_addons
!pip install dill==0.2.8.2

clear_output()

import sklearn
import tensorflow
import transformers
import tensorflow_addons

# Report the library versions active in this runtime.
# The trailing values are the versions the notebook was originally written against; the
# recorded output of this run shows newer sklearn / tensorflow / tensorflow_addons
# releases (tensorflow_addons is installed without a version pin above).
print(sklearn.__version__) # originally 1.0.2
print(tensorflow.__version__) # originally 2.8.0
print(transformers.__version__) # 4.8.2 (pinned by the pip install above)
print(tensorflow_addons.__version__) # originally 0.16.1

import sentencepiece

import pandas as pd
import numpy as np

import os
import re
import pickle 
import dill # for saving a function as a file(.pkl)
import logging # for changing the tf's logging level
import urllib.request
from tqdm import tqdm

from sklearn import model_selection
from sklearn.metrics import accuracy_score

import tensorflow as tf
import tensorflow_addons as tfa # for using Rectified-Adam optimizer (instead of Adam optimizer) 
from tensorflow.keras import layers, initializers, losses, optimizers, metrics, callbacks 

import transformers
from transformers import TFBertModel # BertTokenizer 제외

import sentencepiece as spm

 The versions of TensorFlow you are currently using is 2.12.0 and is not supported. 
Some things might work, some things might not.
If you were to encounter a bug, do not file an issue.
If you want to make sure you're using a tested and supported configuration, either change the TensorFlow version or the TensorFlow Addons's version. 
You can find the compatibility matrix in TensorFlow Addon's readme:
https://github.com/tensorflow/addons


1.2.2
2.12.0
4.8.2
0.19.0


In [None]:
# Fix the NumPy and TensorFlow random seeds so repeated runs are comparable.
np.random.seed(1234)
tf.random.set_seed(1234)

# Raise TensorFlow's logger threshold to ERROR.
tf.get_logger().setLevel(logging.ERROR)

# Raise the Transformers logging threshold as well (WARNING -> ERROR),
# see https://huggingface.co/transformers/main_classes/logging.html
transformers.logging.set_verbosity(transformers.logging.ERROR)

# Load Data

In [None]:
# Mount Google Drive at /gdrive; the dataset is read from that mount point below.
# force_remount=True asks for a fresh mount even if the drive is already mounted.
from google.colab import drive
drive.mount('/gdrive', force_remount=True)

Mounted at /gdrive


In [None]:
# Load the labelled sentences (columns 'Sentence' and 'Emotion') from Google Drive.
# The CSV carries its saved row index as an unnamed first column; index_col=0 reads it
# back as the index instead of a spurious "Unnamed: 0" data column.
df = pd.read_csv("/gdrive/MyDrive/omz/train_dataset_4_labels.CSV", encoding='utf-8', index_col=0)

df.head()

Unnamed: 0,Sentence,Emotion
0,언니 동생으로 부르는게 맞는 일인가요..??,놀람공포
1,그냥 내 느낌일뿐겠지?,놀람공포
2,아직너무초기라서 그런거죠?,놀람공포
3,유치원버스 사고 낫다던데,놀람공포
4,근데 원래이런거맞나요,놀람공포


# Preprocessing

In [None]:
# List every sentence that occurs more than once (keep=False flags all copies),
# sorted by text so that the copies sit next to each other.
df.loc[df['Sentence'].duplicated(keep=False)].sort_values('Sentence')

Unnamed: 0,Sentence,Emotion
34827,... 그래. 알았어. 그만 하자. 내가 잘못했어. 그만해.,슬픔
33827,... 그래. 알았어. 그만 하자. 내가 잘못했어. 그만해.,슬픔
34779,... 너 오늘 유난히 대답 잘 한다? 라임 맞춰가면서.,분노혐오
33779,... 너 오늘 유난히 대답 잘 한다? 라임 맞춰가면서.,분노혐오
5468,7로 나온다든데 아니었나,놀람공포
...,...,...
44086,회식?,놀람공포
35447,효자 생색은 지 혼자 다 내시더니 자리도 안 지키고.. 빨랑 와!,분노혐오
38737,효자 생색은 지 혼자 다 내시더니 자리도 안 지키고.. 빨랑 와!,분노혐오
42115,후…,분노혐오


In [None]:
# Keep only the first occurrence of each sentence, then renumber the rows from 0.
df = (
    df
    .drop_duplicates('Sentence')
    .reset_index(drop=True)
)

In [None]:
# Emotion label -> integer class id used as the training target.
# NOTE(review): the rows displayed earlier in this notebook carry labels such as
# "놀람공포", "슬픔" and "분노혐오", none of which is a key here — confirm the label set.
emotion_mapping = {"분노":0, "당황불안":1, "슬픔상처":2,"기쁨":3 }

emotion_ids = df['Emotion'].map(emotion_mapping)

# Series.map() turns every label that is missing from the mapping into NaN without any
# warning, which would train the classifier on NaN targets. Fail loudly instead.
unmapped_labels = df.loc[emotion_ids.isna(), 'Emotion'].unique().tolist()
if unmapped_labels:
    raise ValueError(
        "emotion_mapping has no entry for these 'Emotion' values: {}. "
        "Update emotion_mapping to the labels used in the dataset "
        "(or reload df if this cell has already been run).".format(unmapped_labels)
    )

# Every label was mapped, so the ids can safely be stored as integers.
df['Emotion'] = emotion_ids.astype('int64')

In [None]:
# Class distribution of the encoded labels. dropna=False also counts NaN, so labels
# that were lost during the mapping step show up here instead of silently vanishing.
df['Emotion'].value_counts(dropna=False)

Series([], Name: Emotion, dtype: int64)

In [None]:
# Hold out 20% of the rows as the test set; random_state pins the shuffle.
train_x, test_x, train_y, test_y = model_selection.train_test_split(
    df['Sentence'],
    df['Emotion'],
    test_size=0.2,
    random_state=42,
)

# Modeling

In [None]:
# Fetch KoBERT: clone the KoBERT-Transformers repository and move its tokenizer module
# to /content (presumably the working directory, which is what makes the import below
# resolvable — confirm).
!git clone https://github.com/monologg/KoBERT-Transformers.git
!mv KoBERT-Transformers/kobert_transformers/tokenization_kobert.py /content
clear_output() # clear the output

# Create the tokenizer from the 'monologg/kobert' pretrained files
from tokenization_kobert import KoBertTokenizer 

tokenizer = KoBertTokenizer.from_pretrained('monologg/kobert')

Downloading:   0%|          | 0.00/371k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/77.8k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/51.0 [00:00<?, ?B/s]

In [None]:
# Character lengths of the five longest sentences (index = row label).
df['Sentence'].str.len().sort_values(ascending=False).head(5)

13789    299
27362    298
33259    295
31680    261
42389    209
Name: Sentence, dtype: int64

In [None]:
# Number of tokens the tokenizer produces for the sentence in row 27362
# (one of the longest sentences listed above).
len(tokenizer.tokenize(df.loc[27362, 'Sentence']))

292

In [None]:
# Every encoded sentence is padded / truncated to this many token ids.
SEQ_LEN = 300

# Matches every character that is NOT whitespace, a digit, a Latin letter or Hangul.
# Written as a raw string: in a plain string literal "\s" is an invalid escape sequence
# that newer Python versions warn about. Compiled once instead of on every iteration.
non_text_pattern = re.compile(r"[^\s0-9a-zA-Zㄱ-ㅎㅏ-ㅣ가-힣]")

token_ids = []
token_segments = []
token_masks = []

for train_sentence in tqdm(train_x):

    # Strip special characters
    cleaned_sentence = non_text_pattern.sub("", train_sentence)

    # Tokenize, convert the tokens to ids and pad / truncate to SEQ_LEN
    encoded_dict = tokenizer.encode_plus(text=cleaned_sentence,
                                         padding='max_length',
                                         truncation=True,
                                         max_length=SEQ_LEN)

    token_ids.append(encoded_dict['input_ids'])
    token_masks.append(encoded_dict['attention_mask'])
    token_segments.append(encoded_dict['token_type_ids'])


# Model inputs in the order (token ids, attention masks, segment ids); the labels stay
# aligned with them because both follow the row order of the train split.
train_inputs = (np.array(token_ids), np.array(token_masks), np.array(token_segments))
train_labels = train_y.to_numpy()
     

100%|██████████| 35516/35516 [00:16<00:00, 2159.48it/s]


In [None]:
def create_model(max_length=300, num_classes=4):
    """Build and compile a KoBERT-based sentence classifier.

    The pretrained "monologg/kobert" encoder (loaded from its PyTorch checkpoint via
    ``from_pt=True``) is followed by dropout and a softmax classification layer.

    Args:
        max_length: Length of the token-id / mask / segment sequences the model accepts.
            Inputs must be padded or truncated to exactly this length.
        num_classes: Number of output classes (units of the final softmax layer).
            Defaults to 4, the number of emotion classes used in this notebook.

    Returns:
        A compiled ``tf.keras.Model`` taking ``[token_ids, masks, segments]`` and
        returning one row of class probabilities per input sentence.
    """
    bert_base_model = TFBertModel.from_pretrained("monologg/kobert", from_pt=True)

    input_token_ids = layers.Input((max_length,), dtype=tf.int32, name='input_token_ids')  # tokens_tensor
    input_masks = layers.Input((max_length,), dtype=tf.int32, name='input_masks')          # masks_tensor
    input_segments = layers.Input((max_length,), dtype=tf.int32, name='input_segments')    # segments_tensor

    # The three tensors are passed positionally as one list, in the order ids, masks, segments.
    bert_outputs = bert_base_model([input_token_ids, input_masks, input_segments])

    pooled_output = bert_outputs[1]  # ('pooler_output', )
    pooled_output = layers.Dropout(0.2)(pooled_output)
    final_output = layers.Dense(units=num_classes,
                                activation='softmax',
                                kernel_initializer=initializers.TruncatedNormal(stddev=0.02),
                                name="classifier")(pooled_output)

    model = tf.keras.Model(inputs=[input_token_ids, input_masks, input_segments],
                           outputs=final_output)

    # The sparse loss / metric take integer class ids as targets (no one-hot encoding).
    model.compile(optimizer=tfa.optimizers.RectifiedAdam(learning_rate=1e-5, weight_decay=0.0025, warmup_proportion=0.05),
                  loss=losses.SparseCategoricalCrossentropy(),
                  metrics=[metrics.SparseCategoricalAccuracy()])

    return model

In [None]:
# Connect to the Colab TPU runtime (address taken from COLAB_TPU_ADDR) and initialise it.
tpu_address = 'grpc://' + os.environ['COLAB_TPU_ADDR']
resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu=tpu_address)
tf.config.experimental_connect_to_cluster(resolver)
tf.tpu.experimental.initialize_tpu_system(resolver)

tpu_devices = tf.config.list_logical_devices('TPU')
print("\nAll devices: ", tpu_devices)

# Build and compile the model under the TPU distribution strategy.
strategy = tf.distribute.TPUStrategy(resolver)
with strategy.scope():
    model = create_model(max_length=300)


All devices:  [LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:0', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:1', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:2', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:3', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:4', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:5', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:6', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:7', device_type='TPU')]


# Train

In [None]:
# Checkpoint setting for saving the best model
from google.colab import drive
drive.mount('/gdrive')

checkpoint_path = '/gdrive/MyDrive/colab_data_4/temp_data/saved_models/'

# exist_ok=True creates the directory when it is missing and is a no-op otherwise,
# without a separate (race-prone) existence check.
os.makedirs(checkpoint_path, exist_ok=True)

# Keep only the weights of the epoch with the best validation accuracy.
callback_checkpoint = callbacks.ModelCheckpoint(filepath=os.path.join(checkpoint_path, 'best_bert_weights.h5'),
                                                monitor='val_sparse_categorical_accuracy',
                                                save_best_only=True,
                                                save_weights_only=True,
                                                verbose=1)

# Early-stopping for preventing the overfitting: stop once the validation accuracy has
# not improved by at least min_delta for `patience` consecutive epochs.
callback_earlystop = callbacks.EarlyStopping(monitor='val_sparse_categorical_accuracy',
                                             min_delta=0.0001,
                                             patience=5)

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [None]:
# Fine-tune for at most 7 epochs, validating on 20% of the training arrays; the callbacks
# save the best weights and stop early when validation accuracy stalls.
history = model.fit(
    train_inputs,
    train_labels,
    batch_size=100,
    epochs=7,
    validation_split=0.2,
    callbacks=[callback_checkpoint, callback_earlystop],
    verbose=1,
)

Epoch 1/7


INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, (((<tf.Tensor 'cond/Identity:0' shape=(None, 300) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 300) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 300) dtype=int64>), <tf.Tensor 'cond/Identity_24:0' shape=(None,) dtype=float32>),), {}]
INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, (((<tf.Tensor 'cond/Identity:0' shape=(None, 300) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 300) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 300) dtype=int64>), <tf.Tensor 'cond/Identity_24:0' shape=(None,) dtype=float32>),), {}]




INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, (((<tf.Tensor 'cond/Identity:0' shape=(None, 300) dtype=int64>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 300) dtype=int64>, <tf.Tensor 'cond/Identity_16:0' shape=(None, 300) dtype=int64>), <tf.Tensor 'cond/Identity_24:0' shape=(None,) dtype=float32>),), {}]



Epoch 1: val_sparse_categorical_accuracy improved from -inf to 0.00000, saving model to /gdrive/MyDrive/colab_data_4/temp_data/saved_models/best_bert_weights.h5
Epoch 2/7
Epoch 2: val_sparse_categorical_accuracy did not improve from 0.00000
Epoch 3/7
Epoch 3: val_sparse_categorical_accuracy did not improve from 0.00000
Epoch 4/7
Epoch 4: val_sparse_categorical_accuracy did not improve from 0.00000
Epoch 5/7
Epoch 5: val_sparse_categorical_accuracy did not improve from 0.00000
Epoch 6/7
Epoch 6: val_sparse_categorical_accuracy did not improve from 0.00000


# Evaluate

In [None]:
SEQ_LEN = 300 # set to a value at or above the largest token count

# Matches every character that is NOT whitespace, a digit, a Latin letter or Hangul.
# Written as a raw string: in a plain string literal "\s" is an invalid escape sequence
# that newer Python versions warn about. Compiled once instead of on every iteration.
non_text_pattern = re.compile(r"[^\s0-9a-zA-Zㄱ-ㅎㅏ-ㅣ가-힣]")

token_ids = []
token_masks = []
token_segments = []

for test_sentence in tqdm(test_x):

    # Strip special characters
    cleaned_sentence = non_text_pattern.sub("", test_sentence)

    # Tokenizing / Tokens to sequence numbers / Padding
    encoded_dict = tokenizer.encode_plus(text=cleaned_sentence,
                                         padding='max_length',
                                         truncation=True,
                                         max_length=SEQ_LEN)

    token_ids.append(encoded_dict['input_ids']) # tokens_tensor
    token_masks.append(encoded_dict['attention_mask']) # masks_tensor
    token_segments.append(encoded_dict['token_type_ids']) # segments_tensor


# Model inputs in the order (token ids, attention masks, segment ids); the labels stay
# aligned with them because both follow the row order of the test split.
test_inputs = (np.array(token_ids), np.array(token_masks), np.array(token_segments))
test_labels = test_y.to_numpy()

100%|██████████| 8879/8879 [00:01<00:00, 4489.67it/s]


In [None]:
checkpoint_path = '/gdrive/MyDrive/colab_data_4/temp_data/saved_models/'

# Load the best model's weights from the checkpoint file
model.load_weights(filepath=checkpoint_path + 'best_bert_weights.h5')

# model.predict returns one row of class scores per sentence; the predicted class is the
# column with the highest score. np.argmax keeps `preds` a plain NumPy array, which is
# what the scikit-learn metric functions work with (tf.argmax would yield a TF tensor).
pred_probs = model.predict(test_inputs)
preds = np.argmax(pred_probs, axis=1)

# scikit-learn metrics take (y_true, y_pred) in that order.
print("Accuracy: ",accuracy_score(test_labels, preds))

In [None]:
from sklearn.metrics import precision_score , recall_score , confusion_matrix, classification_report

# Class id -> emotion name (position in the list is the class id):
#   0: 분노, 1: 당황불안, 2: 슬픔상처, 3: 기쁨
target_names = ['분노','당황불안','슬픔상처','기쁨']

# Passing the full id list explicitly keeps the matrix / report at one row per class even
# when a class is absent from the test labels and predictions; without it,
# classification_report raises because target_names no longer matches the classes found.
class_ids = list(range(len(target_names)))

precision = precision_score(test_labels, preds, average="macro")
recall = recall_score(test_labels, preds, average="macro")

print("< Confusion Matrix >\n\n",confusion_matrix(test_labels, preds, labels=class_ids))
print("\n")
print("< Classification Report >\n\n",classification_report(test_labels, preds, labels=class_ids, target_names=target_names))

# Save Model

In [None]:
data_path = '/gdrive/MyDrive/colab_data_4/temp_data/'

# Create the target directory if needed (no-op when it already exists).
os.makedirs(data_path, exist_ok=True)

# Save the model-builder function with dill, not pickle: pickle stores a function only as
# a reference to its module and name, so a function defined in the notebook cannot be
# restored in a fresh session, whereas dill serialises the function itself
# (https://j.mp/3CeSIzP & https://j.mp/3AaXxYW).
with open(data_path + 'model_BERTfunction_v1.pkl', 'wb') as f:
    dill.dump(create_model, f)

# The tokenizer is a regular object and is stored with pickle.
with open(data_path + 'tokenizer-bert.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)

# Load Model

In [None]:
from google.colab import drive
drive.mount('/gdrive', force_remount=True)

data_path = '/gdrive/MyDrive/colab_data_4/temp_data/'

# NOTE(security): unpickling (pickle or dill) can execute arbitrary code. Only load
# files that were written by this notebook / a trusted source.

# 1) Load the Model-builder (function). dill restores functions that were serialised by
#    value; it reads ordinary pickle files as well.
with open(data_path + 'model_BERTfunction_v1.pkl', 'rb') as f:
    create_model = dill.load(f)

# 2) Load the Bert-tokenizer
with open(data_path + 'tokenizer-bert.pkl', 'rb') as f:
    tokenizer = pickle.load(f)


# 3) Create the model & load the Model-weights (from checkpoint file)
resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='grpc://' + os.environ['COLAB_TPU_ADDR'])
tf.config.experimental_connect_to_cluster(resolver)
tf.tpu.experimental.initialize_tpu_system(resolver)
strategy = tf.distribute.TPUStrategy(resolver) # Obsolete : tf.distribute.experimental.TPUStrategy()

# max_length=100 matches the 100-token padding used by predict_sentiment.
# NOTE(review): training used max_length=300; presumably the saved weights still load
# because no layer's shape depends on the sequence length — confirm.
with strategy.scope():
    model = create_model(max_length=100)

checkpoint_path = '/gdrive/MyDrive/colab_data_4/temp_data/saved_models/'
model.load_weights(filepath=checkpoint_path + 'best_bert_weights.h5')

Mounted at /gdrive


In [None]:
def predict_sentiment(sentence, tokenizer, model, max_length=100):
    """Print the most likely emotion class of one sentence and its probability.

    Args:
        sentence: Raw input text. Characters other than whitespace, digits, Latin
            letters and Hangul are removed before tokenizing.
        tokenizer: Tokenizer offering ``encode_plus``; its 'input_ids',
            'attention_mask' and 'token_type_ids' outputs are fed to the model.
        model: Classifier whose ``predict`` accepts the tuple
            ``(token_ids, masks, segments)`` and returns class probabilities of
            shape ``(1, 4)`` for a single sentence.
        max_length: Length every encoded sentence is padded / truncated to. It has to
            equal the sequence length the model was built with (default 100).

    Returns:
        None. The result is printed.
    """
    # Position in this list is the class id produced by the model.
    class_names = ['분노','당황불안','슬픔상처','기쁨']

    # Raw string: "\s" in a plain string literal is an invalid escape sequence.
    cleaned_sentence = re.sub(r"[^\s0-9a-zA-Zㄱ-ㅎㅏ-ㅣ가-힣]", "", sentence)

    # Tokenizing / Tokens to sequence numbers / Padding
    encoded_dict = tokenizer.encode_plus(text=cleaned_sentence,
                                         padding='max_length',
                                         truncation=True,
                                         max_length=max_length)

    # One batch row per array: each has shape (1, max_length).
    new_inputs = tuple(
        np.array(encoded_dict[key]).reshape(1, -1)
        for key in ('input_ids', 'attention_mask', 'token_type_ids')
    )

    # Prediction
    prediction = model.predict(new_inputs)
    predicted_probability = np.round(np.max(prediction) * 100, 2)
    predicted_class = class_names[np.argmax(prediction, axis=1)[0]]

    print("{}% 확률로 {} 텍스트입니다.".format(predicted_probability, predicted_class))

In [None]:
# Try the classifier on one example sentence.
song = "오예 오늘 저녁 불고기! ㅎㅎ"
predict_sentiment(sentence=song, tokenizer=tokenizer, model=model)