In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import re
import urllib.request
from konlpy.tag import Okt
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [4]:
import json

with open('dataset/EmotionLines/Friends/friends_train.json') as f:
    data = f.read()
    train_json_obj = json.loads(re.sub(r'\\u[\d+]*', '', data))

In [5]:
import itertools

In [6]:
train_json_arr = []
for obj in train_json_obj:
    train_json_arr.extend(obj)

In [10]:
train_data = pd.DataFrame(train_json_arr)
test_data = pd.read_csv('dataset/en_data.csv')

In [11]:
import nltk

In [12]:
test_data

Unnamed: 0,id,i_dialog,i_utterance,speaker,utterance
0,0,0,0,Phoebe,"Alright, whadyou do with him?"
1,1,0,1,Monica,Oh! You're awake!
2,2,0,2,Joey,Then you gotta come clean with Ma! This is not...
3,3,0,3,Mr. Tribbiani,"Yeah, but this is"
4,4,0,4,Joey,I don't wanna hear it! Now go to my room!
...,...,...,...,...,...
1618,1618,150,14,Joey,Nooo.
1619,1619,150,15,Lauren,"Hi, Kate!"
1620,1620,150,16,Kate,"Hi, Lauren."
1621,1621,150,17,Joey,"Hi, Lauren."


In [13]:
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 

In [14]:
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

ps = PorterStemmer()

In [19]:
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize

X_train = []

train_data['utterance'] = train_data['utterance'].str.replace(r'[-=\\+,#/\?:^$.@*\"※~&%ㆍ!』\\‘|\(\)\[\]\<\>`\'…》]',"")
stop_words = set(stopwords.words('english'))

for sentence in train_data['utterance']:
    temp_X = []
    temp_X = word_tokenize(sentence) # 토큰화
    temp_X = [word for word in temp_X if not word in stop_words] # 불용어 제거
    X_train.append(temp_X)

In [20]:
X_test = []

test_data['utterance'] = test_data['utterance'].str.replace(r'[-=+,\\#/\?:^$.@*\"※~&%ㆍ!』\\‘|\(\)\[\]\<\>`\'…》]',"")
stop_words = set(stopwords.words('english')) 

for sentence in test_data['utterance']:
    temp_X = []
    temp_X = word_tokenize(sentence) # 토큰화
    temp_X = [word for word in temp_X if not word in stop_words] # 불용어 제거
    X_test.append(temp_X)

In [21]:
X_train

[['also',
  'I',
  'point',
  'person',
  'companys',
  'transition',
  'KL5',
  'GR6',
  'system'],
 ['You', 'mustve', 'hands', 'full'],
 ['That', 'I', 'That', 'I'],
 ['So', 'lets', 'talk', 'little', 'bit', 'duties'],
 ['My', 'duties', 'All', 'right'],
 ['Now', 'youll', 'heading', 'whole', 'division', 'youll', 'lot', 'duties'],
 ['I', 'see'],
 ['But', 'therell', 'perhaps', '30', 'people', 'dump', 'certain', 'amount'],
 ['Good', 'know'],
 ['We', 'go', 'detail'],
 ['No', 'dont', 'I', 'beg'],
 ['All',
  'right',
  'well',
  'definite',
  'answer',
  'Monday',
  'I',
  'think',
  'I',
  'say',
  'confidence',
  'youll',
  'fit',
  'well'],
 ['Really'],
 ['Absolutely', 'You', 'relax', ';', 'great'],
 ['But', 'The', 'waitress', 'I', 'went', 'last', 'month'],
 ['You', 'know', 'Forget'],
 ['Nononono', 'Who', 'talking'],
 ['No', 'IIII', 'dont', 'I', 'actually', 'dont', 'know'],
 ['Ok'],
 ['All', 'right', 'well'],
 ['Im', 'gon', 'na', 'see', 'I', 'get', 'room', 'night', 'Ill'],
 ['Ill', 'see', 

In [21]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

In [22]:
threshold = 3
total_cnt = len(tokenizer.word_index) # 단어의 수
rare_cnt = 0 # 등장 빈도수가 threshold보다 작은 단어의 개수를 카운트
total_freq = 0 # 훈련 데이터의 전체 단어 빈도수 총 합
rare_freq = 0 # 등장 빈도수가 threshold보다 작은 단어의 등장 빈도수의 총 합

# 단어와 빈도수의 쌍(pair)을 key와 value로 받는다.
for key, value in tokenizer.word_counts.items():
    total_freq = total_freq + value

    # 단어의 등장 빈도수가 threshold보다 작으면
    if(value < threshold):
        rare_cnt = rare_cnt + 1
        rare_freq = rare_freq + value

print('단어 집합(vocabulary)의 크기 :',total_cnt)
print('등장 빈도가 %s번 이하인 희귀 단어의 수: %s'%(threshold - 1, rare_cnt))
print("단어 집합에서 희귀 단어의 비율:", (rare_cnt / total_cnt)*100)
print("전체 등장 빈도에서 희귀 단어 등장 빈도 비율:", (rare_freq / total_freq)*100)

단어 집합(vocabulary)의 크기 : 6033
등장 빈도가 2번 이하인 희귀 단어의 수: 4092
단어 집합에서 희귀 단어의 비율: 67.8269517652909
전체 등장 빈도에서 희귀 단어 등장 빈도 비율: 9.549337601480996


In [23]:
# 전체 단어 개수 중 빈도수 2이하인 단어 개수는 제거.
# 0번 패딩 토큰과 1번 OOV 토큰을 고려하여 +2
vocab_size = total_cnt - rare_cnt + 2
print('단어 집합의 크기 :',vocab_size)

단어 집합의 크기 : 1943


In [24]:
tokenizer = Tokenizer(vocab_size, oov_token = 'OOV') 
tokenizer.fit_on_texts(X_train)
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

In [25]:
num_of_label = len(train_data['emotion'].unique())
label_tokenizer = Tokenizer(num_of_label, split=None, oov_token = 'OOV') 
label_tokenizer.fit_on_texts(train_data['emotion'])
y_train = label_tokenizer.texts_to_sequences(train_data['emotion'])
y_train = to_categorical(y_train)                                   

In [26]:
label_tokenizer.word_index

{'OOV': 1,
 'neutral': 2,
 'nonneutral': 3,
 'joy': 4,
 'surprise': 5,
 'anger': 6,
 'sadness': 7,
 'disgust': 8,
 'fear': 9}

In [27]:
y_train

array([[0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [28]:
drop_train = [index for index, sentence in enumerate(X_train) if len(sentence) < 1]

In [29]:
# 빈 샘플들을 제거
X_train = np.delete(X_train, drop_train, axis=0)
y_train = np.delete(y_train, drop_train, axis=0)
print(len(X_train))
print(len(y_train))

10537
10537


  return array(a, dtype, copy=False, order=order)


In [30]:
max_len = 30


In [31]:
X_train = pad_sequences(X_train, maxlen = max_len)
X_test = pad_sequences(X_test, maxlen = max_len)

In [32]:
X_train

array([[  0,   0,   0, ...,   1,   1, 830],
       [  0,   0,   0, ..., 749, 628, 750],
       [  0,   0,   0, ...,   2,  81,   2],
       ...,
       [  0,   0,   0, ...,  43,   1,  12],
       [  0,   0,   0, ...,   0,   0,   6],
       [  0,   0,   0, ...,   2,  18, 162]])

In [33]:
from tensorflow.keras.layers import Embedding, Dense, LSTM, Dropout, Bidirectional
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

In [34]:
# LSTM
model = Sequential()
model.add(Embedding(vocab_size, 100))
model.add(LSTM(128))
model.add(Dense(8, activation='softmax'))

In [35]:
# Transformer
import tensorflow as tf
class MultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self, embedding_dim, num_heads=8):
        super(MultiHeadAttention, self).__init__()
        self.embedding_dim = embedding_dim # d_model
        self.num_heads = num_heads

        assert embedding_dim % self.num_heads == 0

        self.projection_dim = embedding_dim // num_heads
        self.query_dense = tf.keras.layers.Dense(embedding_dim)
        self.key_dense = tf.keras.layers.Dense(embedding_dim)
        self.value_dense = tf.keras.layers.Dense(embedding_dim)
        self.dense = tf.keras.layers.Dense(embedding_dim)

    def scaled_dot_product_attention(self, query, key, value):
        matmul_qk = tf.matmul(query, key, transpose_b=True)
        depth = tf.cast(tf.shape(key)[-1], tf.float32)
        logits = matmul_qk / tf.math.sqrt(depth)
        attention_weights = tf.nn.softmax(logits, axis=-1)
        output = tf.matmul(attention_weights, value)
        return output, attention_weights

    def split_heads(self, x, batch_size):
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.projection_dim))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, inputs):
        # x.shape = [batch_size, seq_len, embedding_dim]
        batch_size = tf.shape(inputs)[0]

        # (batch_size, seq_len, embedding_dim)
        query = self.query_dense(inputs)
        key = self.key_dense(inputs)
        value = self.value_dense(inputs)

        # (batch_size, num_heads, seq_len, projection_dim)
        query = self.split_heads(query, batch_size)  
        key = self.split_heads(key, batch_size)
        value = self.split_heads(value, batch_size)

        scaled_attention, _ = self.scaled_dot_product_attention(query, key, value)
        # (batch_size, seq_len, num_heads, projection_dim)
        scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])  

        # (batch_size, seq_len, embedding_dim)
        concat_attention = tf.reshape(scaled_attention, (batch_size, -1, self.embedding_dim))
        outputs = self.dense(concat_attention)
        return outputs
    
class TransformerBlock(tf.keras.layers.Layer):
    def __init__(self, embedding_dim, num_heads, dff, rate=0.1):
        super(TransformerBlock, self).__init__()
        self.att = MultiHeadAttention(embedding_dim, num_heads)
        self.ffn = tf.keras.Sequential(
            [tf.keras.layers.Dense(dff, activation="relu"),
             tf.keras.layers.Dense(embedding_dim),]
        )
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)

    def call(self, inputs, training):
        attn_output = self.att(inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)
    
class TokenAndPositionEmbedding(tf.keras.layers.Layer):
    def __init__(self, max_len, vocab_size, embedding_dim):
        super(TokenAndPositionEmbedding, self).__init__()
        self.token_emb = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.pos_emb = tf.keras.layers.Embedding(max_len, embedding_dim)

    def call(self, x):
        max_len = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=max_len, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions
    
    
import tensorflow as tf
embedding_dim = 128  # Embedding size for each token
num_heads = 8  # Number of attention heads
dff = 128 # Hidden layer size in feed forward network inside transformer

inputs = tf.keras.layers.Input(shape=(max_len,))
embedding_layer = TokenAndPositionEmbedding(max_len, vocab_size, embedding_dim)
x = embedding_layer(inputs)
transformer_block = TransformerBlock(embedding_dim, num_heads, dff)
x = transformer_block(x)
x = tf.keras.layers.GlobalAveragePooling1D()(x)
x = tf.keras.layers.Dropout(0.5)(x)
x = tf.keras.layers.Dense(128, activation="relu")(x)
x = tf.keras.layers.Dropout(0.5)(x)
outputs = tf.keras.layers.Dense(8, activation="softmax")(x)

model = tf.keras.Model(inputs=inputs, outputs=outputs)
model = tf.keras.Model(inputs=inputs, outputs=outputs)

In [36]:
model.compile("adam", "categorical_crossentropy", metrics=["accuracy"])
history = model.fit(X_train, y_train, batch_size=32, epochs=10, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [37]:
pred = pd.DataFrame(model.predict(X_test))

In [38]:
reverse_word_map = dict(map(reversed, label_tokenizer.word_index.items()))
pred = pred.apply(lambda a:np.argmax(a), axis=1).apply(lambda a: reverse_word_map[a])

In [39]:
test_data['Predicted'] = np.where(pred == 'nonneutral', 'non-neutral', pred)

In [40]:
test_data[['id', 'Predicted']].to_csv('en_submission.csv', index=False)