In [22]:
import os
import re
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from konlpy.tag import Okt
from tensorflow.keras import Sequential
from tensorflow.keras.layers import LSTM, Embedding, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tqdm.auto import tqdm

In [3]:
train_data_path = "../../data/garments_train.csv"
test_data_path = "../../data/garments_test.csv"

In [4]:
train_df = pd.read_csv(train_data_path).loc[:, ["RawText", "GeneralPolarity"]].drop_duplicates().reset_index(drop=True)
train_df.head(2)

Unnamed: 0,RawText,GeneralPolarity
0,이번에구매한데님은사이즈가잘맞네요 색상구성도괜찮고맘에든답니다 잘입겠습니다,1.0
1,바지는 너무 편하고 좋은데 좀크게나온듯 그리고 허리고리 하나가 안달려서 밑단수선하면...,0.0


In [5]:
label_encoder = LabelEncoder()
enc_data = label_encoder.fit_transform(train_df['GeneralPolarity'])
num_labels = len(set(enc_data))

In [6]:
label_items = label_encoder.classes_
label_numbers = label_encoder.transform(label_items)
dict(zip(label_items, label_numbers))

{-1.0: 0, 0.0: 1, 1.0: 2}

In [7]:
X_train, y_train = train_df.loc[:, "RawText"].to_list(), enc_data

In [8]:
okt = Okt()

def discompose(text):
    result = []
    text = re.sub("[^가-힣]", "", text)
    morph_list = okt.morphs(text, norm=True, stem=True)
    stopwords = ['은','는','이','가','하','아','것','들','의','있','되','수','보','주','등','한']
    for morph in morph_list:
        if morph not in stopwords:
            result.append(morph)
    return result

In [10]:
X_train_pos = []
for x in tqdm(X_train):
  X_train_pos.append(discompose(x))
X_train_pos[:2]

  0%|          | 0/28559 [00:00<?, ?it/s]

[['이번',
  '에',
  '구',
  '매',
  '데님',
  '사이즈',
  '잘맞다',
  '색상',
  '구성',
  '도',
  '괜찮다',
  '맘',
  '에든',
  '답',
  '니',
  '다',
  '자다',
  '입다'],
 ['바지',
  '너무',
  '편하다',
  '좋다',
  '좀',
  '크게',
  '나오다',
  '그리고',
  '허리',
  '고',
  '리하나',
  '가안',
  '달다',
  '밑',
  '단수',
  '선하다',
  '달다',
  '남자',
  '벨트',
  '고리',
  '없다',
  '안되다',
  '다해',
  '서',
  '밑단',
  '먼저',
  '하다',
  '반품',
  '하다',
  '없다',
  '신경',
  '좀',
  '써다',
  '확인',
  '후',
  '배송',
  '하다',
  '바',
  '래',
  '요',
  '바지',
  '편하다',
  '좋다',
  '대요']]

In [11]:
MAX_LEN = 27

tokenizer = Tokenizer(num_words=20000)
tokenizer.fit_on_texts(X_train_pos)

def encode(x):
    sequence = tokenizer.texts_to_sequences(x)
    return pad_sequences(sequence, maxlen=MAX_LEN, padding="post")

X_train_encoding = encode(X_train_pos)
X_train_encoding[:5]

array([[ 321,    4,  160,  541,  572,    8,   82,   19,  230,    3,   41,
          31,  510, 1412,   42,   27,   28,    5,    0,    0,    0,    0,
           0,    0,    0,    0,    0],
       [ 777,  832, 1051,   37,  203, 1052,  100,  896,  946,    1,   88,
           1,   37,  562,   21,  511,  542,  292,   62,    1, 1134,  443,
           9,   25,   11,    2,  823],
       [ 176,   25,   37,  341,  166,    5,   11,  131,  725,  504,   30,
        1005,  277,   22,    1,   24,    6,    4,   86,   99,   11,  162,
          28,   22,    1,    0,    0],
       [  39,   12,   12,    2,   10,    3,   19,    3,   12,    2,    6,
          45,   34,  191,  115,    9,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0],
       [  85,    4,  170,   42,   27,   43,  782,   20,    7, 2247,    4,
          85,  217,  322, 1166,  986, 2374,  186,    2,    6,    3,    2,
          35,   19,    3,   12,   18]], dtype=int32)

In [12]:
model = Sequential([
    Embedding(20000, 300, input_length=MAX_LEN),
    LSTM(units=50),
    Dense(num_labels, activation='softmax')
])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 27, 300)           6000000   
                                                                 
 lstm (LSTM)                 (None, 50)                70200     
                                                                 
 dense (Dense)               (None, 3)                 153       
                                                                 
Total params: 6070353 (23.16 MB)
Trainable params: 6070353 (23.16 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [13]:
model.fit(X_train_encoding, y_train, epochs=3, batch_size=32, validation_split=0.1)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x7fcd6cf95f30>

In [14]:
test_df = pd.read_csv(test_data_path).loc[:, ["RawText", "GeneralPolarity"]].drop_duplicates().reset_index(drop=True)
test_df.head(2)

Unnamed: 0,RawText,GeneralPolarity
0,바늘질 마감처리 불량. 싸구려 느낌이 팍팍. 털빠짐이 없다해서 구매했는데 털빠짐이 ...,-1.0
1,가격대비 퀄리티 좋습니다. 두께도 적당하고 자켓안에 있기에도 좋네요~~ 키 178 ...,1.0


In [15]:
X_test = test_df.loc[:, "RawText"].to_list()
y_test = label_encoder.transform(test_df.loc[:, "GeneralPolarity"].to_list())

In [16]:
X_test_pos = []
for x in tqdm(X_test):
  X_test_pos.append(discompose(x))
X_test_pos[:2]

  0%|          | 0/7073 [00:00<?, ?it/s]

[['바늘',
  '질',
  '마감',
  '처리',
  '불량',
  '싸구려',
  '느낌',
  '팍팍',
  '털',
  '빠지다',
  '없다',
  '하다',
  '구매',
  '하다',
  '털',
  '빠지다',
  '심하다',
  '한겨울',
  '에',
  '도입',
  '을',
  '있다',
  '서사',
  '데',
  '너무',
  '얇다',
  '한겨울',
  '에',
  '입',
  '을',
  '만',
  '수준',
  '못',
  '되다',
  '상품',
  '평이',
  '괜찮다',
  '사다',
  '제품',
  '좋다',
  '사람',
  '이해',
  '불가'],
 ['가격',
  '대비',
  '퀄리티',
  '좋다',
  '두께',
  '도',
  '적당하다',
  '자켓',
  '안',
  '에',
  '있다',
  '좋다',
  '네',
  '요키',
  '몸무게',
  '블랙',
  '사이즈',
  '구매',
  '하다',
  '딱',
  '맞다']]

In [17]:
X_test_encoding = encode(X_test_pos)
X_test_encoding[:5]

array([[ 435,  474,    4,  758,   14,   17, 2222,  189,   12,   51,  474,
           4,   71,   14,   75, 1741,  148,   48,   55,  877,   41,   43,
          80,    2,  380, 1561, 2685],
       [   6,   45,  404,    2,  252,    3,  139,  348,  108,    4,   17,
           2,   40,  708, 1492,  135,    8,   22,    1,   79,   47,    0,
           0,    0,    0,    0,    0],
       [  29,   15,  258,  433,  496, 3263,  730,    5,  213,    6,    3,
           2,  304,  103,    3, 3247, 1295, 2817,  807,   40,    9,  176,
         552,   31,    4,   23,    1],
       [ 235,  169,   90,   24,   57,  655, 3314,  151,   26,  201,    8,
         415,  210,   29,   11,  512,  100,  114,    3,   56,  323,  852,
          35,  406,    1,    0,    0],
       [  29,   35,   10,    2,  147, 1215,   30,  253,   65,  202,  362,
          91,  144,  225,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0]], dtype=int32)

In [20]:
predictions = model.predict(X_test_encoding)
predictions



array([[3.1893176e-01, 5.6719154e-01, 1.1387659e-01],
       [8.9921039e-03, 7.3618256e-02, 9.1738969e-01],
       [9.2898554e-05, 1.0711761e-03, 9.9883598e-01],
       ...,
       [9.8322153e-01, 1.6197540e-02, 5.8104109e-04],
       [9.9110067e-01, 6.9396119e-03, 1.9597721e-03],
       [9.7767681e-01, 2.1388277e-02, 9.3495194e-04]], dtype=float32)

In [21]:
y_pred = np.argmax(predictions, axis=1)
y_pred

array([1, 2, 2, ..., 0, 0, 0])

In [23]:
accuracy_score(y_test, y_pred)

0.806588434893256