In [1]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split

In [2]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from konlpy.tag import Okt

In [3]:
year_4_data = pd.read_csv("data/4차년도.csv", encoding="cp949")
year_5_data = pd.read_csv("data/5차년도.csv", encoding="cp949")
year_5_data_2 = pd.read_csv("data/5차년도_2차.csv", encoding="cp949")

okt = Okt()

In [4]:
print(f"4 : {len(year_4_data)}, 5_1 : {len(year_5_data)}, 5_2 : {len(year_5_data_2)}")

4 : 14606, 5_1 : 10011, 5_2 : 19374


In [5]:
data = pd.concat((year_4_data,year_5_data,year_5_data_2))
print(len(data))

43991


In [25]:
# 기쁨 : 0, 슬픔 : 1, 화남 : 2, 두려움 : 3, 역겨움 : 4, 중립 : 5, 놀라움 : 6
emot_indexing_dic = {'happiness' : 0, 'sadness' : 1, 'angry' : 2, 'fear' : 3, 'disgust' : 4, 'neutral' : 5, 'surprise' : 6}
emotion_dic = {}
for i, emot in enumerate(data['상황'].unique()):
    emotion_dic[emot] = i
    
print(emotion_dic)

{'anger': 0, 'sad': 1, 'fear': 2, 'disgust': 3, 'neutral': 4, 'happiness': 5, 'sadness': 6, 'angry': 7, 'surprise': 8}


In [61]:
x_origin = []
y = []
for i in range(len(data)):
    emot = [0]*7
    line = data.iloc[i]
    x_origin.append(line['발화문'])
    for j in range(1,6):
        emot[emot_indexing_dic[line[f'{j}번 감정'].lower()]] += line[f'{j}번 감정세기']
    y.append(emot)

In [62]:
x = []
for i in x_origin:
    x.append(okt.morphs(i))

In [None]:
y = np.array(y)

In [63]:
word_num = 10000


tokenizer = Tokenizer(num_words=word_num)
tokenizer.fit_on_texts(x)
x = tokenizer.texts_to_sequences(x)
padded_x = pad_sequences(x, padding='post')

In [27]:

emotion_y = np.array([emotion_dic[emot] for emot in data['상황']])

In [28]:
x_train, x_test, y_train, y_test = train_test_split(padded_x, emotion_y, test_size=0.3, random_state=777)

In [29]:
maxlen = padded_x.shape[1]

In [31]:
emotion_y.shape

(43991,)

In [32]:
model = tf.keras.Sequential([
  tf.keras.layers.Embedding(20000, 300, input_length=maxlen), 
  tf.keras.layers.LSTM(units=50), 
  tf.keras.layers.Dense(9, activation='softmax')
])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [33]:
history = model.fit(x_train, y_train, epochs=50, batch_size=64, validation_split=0.2)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [None]:
import matplotlib.pyplot as plt
plt.figure(figsize=(12, 4))

plt.subplot(1, 2, 1)
plt.plot(history.history['loss'], 'b-', label='loss')
plt.plot(history.history['val_loss'], 'r--', label='val_loss')
plt.xlabel('Epoch')
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(history.history['accuracy'], 'g-', label='accuracy')
plt.plot(history.history['val_accuracy'], 'k--', label='val_accuracy')
plt.xlabel('Epoch')
plt.legend()

plt.show()

In [35]:
model.save_weights('./checkpoints/my_checkpoint')

In [68]:
test_sentence = '나는 사과만 보면 너무 좋아'
print(1,test_sentence)
test_sentence = [okt.morphs(test_sentence)]
print(2,test_sentence)


test_sentence = tokenizer.texts_to_sequences(test_sentence)
print(3,test_sentence)
test_sentence = pad_sequences(test_sentence, padding='post', maxlen=maxlen)
print(4,test_sentence)

print(np.argmax(model.predict(test_sentence)))

1 나는 사과만 보면 너무 좋아
2 [['나', '는', '사과', '만', '보면', '너무', '좋아']]
3 [[5, 18, 397, 54, 633, 7, 233]]
4 [[  5  18 397  54 633   7 233   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]]
5
