In [2]:
import os
import re
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from konlpy.tag import Okt
from tensorflow.keras import Sequential
from tensorflow.keras.layers import LSTM, Embedding, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tqdm.auto import tqdm

In [3]:
# Relative locations of the CSV files that the loading cells pass to pd.read_csv.
train_data_path = "../../data/garments_train.csv"
test_data_path = "../../data/garments_test.csv"

In [4]:
# Keep only the review text and its polarity label, drop repeated rows,
# and renumber the index from zero.
train_df = (
    pd.read_csv(train_data_path)[["SentimentText", "SentimentPolarity"]]
    .drop_duplicates()
    .reset_index(drop=True)
)
train_df.head(2)

Unnamed: 0,SentimentText,SentimentPolarity
0,사이즈가잘맞네요,1
1,좀크게나온듯,-1


In [5]:
# Map the raw polarity values onto consecutive integer class ids.
label_encoder = LabelEncoder()
enc_data = label_encoder.fit_transform(train_df['SentimentPolarity'])
# The encoder was fitted on this very column, so every learned class occurs in enc_data.
num_labels = len(label_encoder.classes_)

In [6]:
# Show which encoded id each original polarity value was assigned.
label_items = label_encoder.classes_
label_numbers = label_encoder.transform(label_items)
{item: number for item, number in zip(label_items, label_numbers)}

{-1: 0, 0: 1, 1: 2}

In [7]:
# Raw review strings as a plain list; targets are the encoded polarity ids.
X_train = train_df["SentimentText"].tolist()
y_train = enc_data

In [8]:
# Shared KoNLPy Okt analyzer instance; discompose() below calls okt.morphs on it.
okt = Okt()

# Morphemes discarded after analysis. Built once as a frozenset so membership
# checks are O(1) instead of rebuilding and scanning a list on every call.
# NOTE(review): morphs() is called with stem=True and the sample outputs show
# dictionary forms such as '나오다', so bare stems like '하' or '있' may never
# match - verify whether the '-다' forms were intended.
_STOPWORDS = frozenset(
    ['은', '는', '이', '가', '하', '아', '것', '들', '의', '있', '되', '수', '보', '주', '등', '한']
)


def discompose(text):
    """Split a Korean review into normalised, stemmed morphemes.

    Every character outside the Hangul syllable range 가-힣 is removed, the
    remainder is passed to ``okt.morphs(..., norm=True, stem=True)``, and
    morphemes listed in ``_STOPWORDS`` are dropped.

    Args:
        text: Raw review text. Non-string values (for example the float NaN
            pandas uses for a missing cell) are treated as empty input.

    Returns:
        list: The remaining morphemes in their original order; an empty list
        when the input is not a string or contains no Hangul syllables.
    """
    # re.sub raises TypeError on NaN/None, which would abort the whole
    # tokenisation loop because of a single missing review.
    if not isinstance(text, str):
        return []

    # NOTE(review): this pattern also removes whitespace, so word boundaries
    # are gone before the analyzer sees the text - confirm this is intended.
    hangul_only = re.sub("[^가-힣]", "", text)
    if not hangul_only:
        # Nothing left to analyse; skip the analyzer call entirely.
        return []

    morph_list = okt.morphs(hangul_only, norm=True, stem=True)
    return [morph for morph in morph_list if morph not in _STOPWORDS]

In [9]:
# Tokenise every training review; tqdm renders a progress bar over the list.
X_train_pos = [discompose(review) for review in tqdm(X_train)]
X_train_pos[:2]

  0%|          | 0/45047 [00:00<?, ?it/s]

[['사이즈', '잘맞다'], ['좀', '크게', '나오다']]

In [16]:
# Width of every encoded sequence; used by encode() and by the Embedding layer's input_length.
MAX_LEN = 16

# num_words=20000 is the same figure the Embedding layer is given as its first argument.
tokenizer = Tokenizer(num_words=20000)
# The vocabulary is fitted on the tokenised training reviews only.
tokenizer.fit_on_texts(X_train_pos)

def encode(x):
    """Convert tokenised texts into a padded matrix of integer ids.

    Args:
        x: Collection of token lists, as produced by discompose().

    Returns:
        The result of pad_sequences on the tokenizer's id sequences, using
        maxlen=MAX_LEN and padding="post" (padding goes after the tokens).
    """
    token_ids = tokenizer.texts_to_sequences(x)
    padded = pad_sequences(token_ids, maxlen=MAX_LEN, padding="post")
    return padded

# Integer-encode and pad the tokenised training reviews, then preview the first rows.
X_train_encoding = encode(X_train_pos)
X_train_encoding[:5]

array([[  7,  50,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0],
       [ 21,  56,  42,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0],
       [ 14,   2,   3,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0],
       [  5,   6,  86,   6,  13,   1,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0],
       [  2,  18,  12, 127,  87,  10,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0]], dtype=int32)

In [17]:
# Seed the Python, NumPy and TensorFlow RNGs before any weights are created so
# that re-running this cell (and the training that follows) starts from the same
# initial state. Some GPU kernels may still be nondeterministic.
tf.keras.utils.set_random_seed(42)

# Embedding -> LSTM -> softmax classifier over the encoded polarity classes.
# NOTE(review): encode() pads with trailing zeros and the Embedding does not
# set mask_zero, so the LSTM also steps over the padding positions - consider
# masking (check behaviour for all-padding rows first).
model = Sequential([
    # Vocabulary size is read from the tokenizer instead of repeating 20000.
    Embedding(tokenizer.num_words, 300, input_length=MAX_LEN),
    LSTM(units=50),
    # One output unit per encoded label; softmax yields class probabilities.
    Dense(num_labels, activation='softmax')
])

# y_train holds integer class ids, hence the sparse categorical loss.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 16, 300)           6000000   
                                                                 
 lstm_2 (LSTM)               (None, 50)                70200     
                                                                 
 dense_2 (Dense)             (None, 3)                 153       
                                                                 
Total params: 6070353 (23.16 MB)
Trainable params: 6070353 (23.16 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [18]:
# Train for three epochs in mini-batches of 32, holding out 10% of the rows for validation.
model.fit(
    x=X_train_encoding,
    y=y_train,
    batch_size=32,
    epochs=3,
    validation_split=0.1,
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x7d182e4310f0>

In [19]:
# Load the test split the same way as the training split:
# two columns only, repeated rows dropped, index renumbered from zero.
test_df = (
    pd.read_csv(test_data_path)[["SentimentText", "SentimentPolarity"]]
    .drop_duplicates()
    .reset_index(drop=True)
)
test_df.head(2)

Unnamed: 0,SentimentText,SentimentPolarity
0,싸구려 느낌이 팍팍.,-1
1,털빠짐이 심함.,-1


In [20]:
# Raw test reviews as a list; labels go through the encoder fitted on the training data.
X_test = test_df["SentimentText"].tolist()
test_polarities = test_df["SentimentPolarity"].tolist()
y_test = label_encoder.transform(test_polarities)

In [21]:
# Tokenise every test review with the same discompose() used for training.
X_test_pos = [discompose(review) for review in tqdm(X_test)]
X_test_pos[:2]

  0%|          | 0/11906 [00:00<?, ?it/s]

[['싸구려', '느낌', '팍팍'], ['털', '빠지다', '심하다']]

In [22]:
# Encode the test tokens with the tokenizer fitted on the training data, then preview the first rows.
X_test_encoding = encode(X_test_pos)
X_test_encoding[:5]

array([[ 497,   82, 1729,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0],
       [ 125,   96,  267,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0],
       [ 225,    3,   11, 1327,  117,   52,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0],
       [   2,   18,  155,    1,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0],
       [ 390, 2512,  203,   44,   26,   41,    4,   46,   25,    0,    0,
           0,    0,    0,    0,    0]], dtype=int32)

In [23]:
# One row per test review; the softmax output layer gives one probability per encoded class.
predictions = model.predict(X_test_encoding)
predictions



array([[0.9949713 , 0.00283124, 0.00219748],
       [0.99525183, 0.00264492, 0.00210317],
       [0.9785859 , 0.00972326, 0.01169086],
       ...,
       [0.65406466, 0.32915244, 0.01678291],
       [0.9971762 , 0.00116743, 0.00165635],
       [0.990433  , 0.00714785, 0.00241917]], dtype=float32)

In [24]:
# Pick the most probable class id for each row.
y_pred = predictions.argmax(axis=1)
y_pred

array([0, 0, 0, ..., 0, 0, 0])

In [25]:
# Fraction of test reviews whose predicted class id equals the encoded label.
# NOTE(review): the y_pred preview shown above is all class 0; check the class
# balance of the test set before relying on accuracy alone.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.89274315471191