In [2]:
import os
import re
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from konlpy.tag import Okt
from tensorflow.keras import Sequential
from tensorflow.keras.layers import LSTM, Embedding, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tqdm.auto import tqdm

In [3]:
# Locations of the training and test CSV files, relative to this notebook.
train_data_path, test_data_path = (
    "../../data/garments_train.csv",
    "../../data/garments_test.csv",
)

In [4]:
# Column holding the review text (model input).
X_col = "SentimentText"
# Column holding the aspect label (classification target).
y_col = "Aspect"

In [5]:
# Load the training CSV, keep only the text/label columns, drop exact duplicate
# rows and renumber the index from zero.
train_df = (
    pd.read_csv(train_data_path)[[X_col, y_col]]
    .drop_duplicates()
    .reset_index(drop=True)
)
train_df.head(2)

Unnamed: 0,SentimentText,Aspect
0,사이즈가잘맞네요,사이즈
1,좀크게나온듯,사이즈


In [6]:
# Learn the label -> integer mapping from the training labels, then encode them.
label_encoder = LabelEncoder().fit(train_df[y_col])
enc_data = label_encoder.transform(train_df[y_col])
# One output unit per distinct class seen in the training labels.
num_labels = len(label_encoder.classes_)

In [7]:
# Show which integer each label string was assigned by the encoder.
label_items = label_encoder.classes_
label_numbers = label_encoder.transform(label_items)
{label: number for label, number in zip(label_items, label_numbers)}

{'가격': 0, '기능': 1, '디자인': 2, '사이즈': 3, '품질': 4}

In [8]:
# Raw training sentences as a plain Python list; targets are the encoded labels.
X_train = train_df[X_col].tolist()
y_train = enc_data

In [9]:
# Single shared Okt (KoNLPy) tagger instance, used by `discompose` below.
okt = Okt()

# Tokens removed after morphological analysis. Built once (not per call) and
# stored as a frozenset so membership checks are O(1).
# NOTE(review): with stem=True the sample outputs show verbs ending in '다'
# (e.g. '나오다'), so stem-only entries such as '하' or '있' may never match --
# confirm whether '하다' / '있다' were intended.
_STOPWORDS = frozenset(
    ['은','는','이','가','하','아','것','들','의','있','되','수','보','주','등','한']
)


def discompose(text):
    """Split a Korean sentence into normalised, stemmed morphemes.

    Every character outside the Hangul syllable range 가-힣 (spaces, digits,
    Latin letters, punctuation, ...) is stripped first; the remainder is passed
    to the module-level ``okt`` tagger with ``norm=True, stem=True`` and
    stopwords are filtered out.

    Args:
        text: The raw sentence. Non-string values (e.g. ``None`` or the float
            NaN that pandas uses for missing cells) are treated as empty.

    Returns:
        list[str]: The remaining morphemes in their original order; an empty
        list when there is nothing to analyse.
    """
    # Missing cells read by pandas arrive as float NaN; re.sub would raise on them.
    if not isinstance(text, str):
        return []

    hangul_only = re.sub("[^가-힣]", "", text)
    # Nothing left after cleaning -> skip the tagger call entirely.
    if not hangul_only:
        return []

    morph_list = okt.morphs(hangul_only, norm=True, stem=True)
    return [morph for morph in morph_list if morph not in _STOPWORDS]

In [10]:
# Tokenise every training sentence into morphemes (tqdm shows progress).
X_train_pos = [discompose(sentence) for sentence in tqdm(X_train)]
X_train_pos[:2]

  0%|          | 0/45024 [00:00<?, ?it/s]

[['사이즈', '잘맞다'], ['좀', '크게', '나오다']]

In [11]:
# Fixed length of every encoded sequence fed to the model.
MAX_LEN = 27

# Build the word -> integer-id vocabulary from the training morphemes only.
tokenizer = Tokenizer(num_words=20000)
tokenizer.fit_on_texts(X_train_pos)


def encode(x):
    """Convert tokenised sentences into fixed-length integer-id arrays.

    Args:
        x: An iterable of token lists, as produced by ``discompose``.

    Returns:
        A 2-D integer array with ``MAX_LEN`` columns; shorter sequences are
        zero-padded at the end (``padding="post"``).
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(x),
        maxlen=MAX_LEN,
        padding="post",
    )


X_train_encoding = encode(X_train_pos)
X_train_encoding[:5]

array([[  7,  50,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0],
       [ 21,  56,  45,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0],
       [ 14,   2,   3,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0],
       [  5,   6,  86,   6,  13,   1,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0],
       [  2,  18,  12, 127,  87,  10,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0]], dtype=int32)

In [12]:
# Fix TensorFlow's global seed before any weights are created so that weight
# initialisation (and other ops drawing from the global generator) start from
# the same state on every Restart & Run All.
# NOTE(review): some GPU kernels may still be non-deterministic -- confirm if
# bit-exact reproducibility is required.
SEED = 42
tf.random.set_seed(SEED)

# Embedding -> LSTM -> softmax classifier over the aspect labels.
model = Sequential([
    # 20000 must stay in sync with the tokenizer's num_words so that every
    # emitted token id has an embedding row; 300 is the embedding width.
    Embedding(20000, 300, input_length=MAX_LEN),
    LSTM(units=50),
    # One probability per aspect class.
    Dense(num_labels, activation='softmax')
])

# Targets are integer class ids (not one-hot), hence the sparse loss.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 27, 300)           6000000   
                                                                 
 lstm (LSTM)                 (None, 50)                70200     
                                                                 
 dense (Dense)               (None, 5)                 255       
                                                                 
Total params: 6,070,455
Trainable params: 6,070,455
Non-trainable params: 0
_________________________________________________________________


In [13]:
# Train for three epochs, holding out 10% of the training rows for validation.
model.fit(
    x=X_train_encoding,
    y=y_train,
    batch_size=32,
    epochs=3,
    validation_split=0.1,
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f8fb796e9e0>

In [14]:
# Load the test CSV the same way as the training set: text/label columns only,
# exact duplicates removed, index renumbered from zero.
test_df = (
    pd.read_csv(test_data_path)[[X_col, y_col]]
    .drop_duplicates()
    .reset_index(drop=True)
)
test_df.head(2)

Unnamed: 0,SentimentText,Aspect
0,싸구려 느낌이 팍팍.,품질
1,털빠짐이 심함.,품질


In [15]:
# Raw test sentences, and their labels encoded with the mapping fitted on the
# training labels.
X_test = test_df[X_col].tolist()
y_test = label_encoder.transform(test_df[y_col].tolist())

In [16]:
# Tokenise every test sentence with the same routine used for training.
X_test_pos = [discompose(sentence) for sentence in tqdm(X_test)]
X_test_pos[:2]

  0%|          | 0/11898 [00:00<?, ?it/s]

[['싸구려', '느낌', '팍팍'], ['털', '빠지다', '심하다']]

In [17]:
# Map test tokens to ids with the tokenizer fitted on the training data, padded to MAX_LEN.
X_test_encoding = encode(x=X_test_pos)
X_test_encoding[:5]

array([[ 497,   82, 1729,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0],
       [ 125,   96,  267,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0],
       [ 225,    3,   11, 1327,  117,   52,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0],
       [   2,   18,  155,    1,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0],
       [ 390, 2512,  203,   43,   26,   41,    4,   46,   25,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0]], dtype=int32)

In [18]:
# Per-class softmax probabilities for every test sentence (one row per sentence).
predictions = model.predict(x=X_test_encoding)
predictions



array([[2.7134223e-02, 5.1220419e-04, 1.5911283e-03, 1.1143129e-03,
        9.6964812e-01],
       [1.1973479e-03, 2.5191248e-04, 9.4067486e-04, 5.7386683e-04,
        9.9703616e-01],
       [2.0812490e-04, 9.9603766e-01, 1.6273013e-03, 3.0940448e-04,
        1.8174039e-03],
       ...,
       [8.1041327e-04, 5.1795912e-04, 8.0723449e-04, 6.4148568e-04,
        9.9722290e-01],
       [1.6996168e-03, 6.9988979e-04, 5.8401295e-04, 3.7289242e-04,
        9.9664366e-01],
       [1.2847998e-03, 2.5479507e-04, 1.6072300e-03, 7.2692498e-04,
        9.9612623e-01]], dtype=float32)

In [19]:
# Predicted class id = index of the highest probability in each row.
y_pred = predictions.argmax(axis=1)
y_pred

array([4, 4, 1, ..., 4, 4, 4])

In [20]:
# Fraction of test sentences whose predicted aspect matches the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.9453689695747184