In [1]:
!pip install konlpy

Collecting konlpy
  Downloading konlpy-0.6.0-py2.py3-none-any.whl (19.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.4/19.4 MB[0m [31m31.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting JPype1>=0.7.0 (from konlpy)
  Downloading JPype1-1.4.1-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (465 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m465.3/465.3 kB[0m [31m38.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: JPype1, konlpy
Successfully installed JPype1-1.4.1 konlpy-0.6.0


In [39]:
import re
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from konlpy.tag import Okt
from tensorflow.keras import Sequential
from tensorflow.keras.layers import LSTM, Embedding, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [3]:
base_df = pd.read_csv("garments_train.csv")
base_df.head(2)

Unnamed: 0,Index,RawText,Source,Domain,MainCategory,ProductName,ReviewScore,Syllable,Word,RDate,GeneralPolarity,Aspect,SentimentText,SentimentWord,SentimentPolarity
0,128484,이번에구매한데님은사이즈가잘맞네요 색상구성도괜찮고맘에든답니다 잘입겠습니다,쇼핑몰,패션,남성의류,OO 남성 매** 데님 3종,100,39,3,20180315,1.0,사이즈,사이즈가잘맞네요,1,1
1,128494,바지는 너무 편하고 좋은데 좀크게나온듯 그리고 허리고리 하나가 안달려서 밑단수선하면...,쇼핑몰,패션,남성의류,OO 남성 매** 데님 3종,60,118,24,20180317,0.0,사이즈,좀크게나온듯,1,-1


In [4]:
label_encoder = LabelEncoder()
enc_data = label_encoder.fit_transform(base_df['Aspect'])
num_labels = len(set(enc_data))

In [5]:
label_items = label_encoder.classes_
label_numbers = label_encoder.transform(label_items)
dict(zip(label_items, label_numbers))

{'가격': 0, '기능': 1, '디자인': 2, '사이즈': 3, '품질': 4}

In [6]:
X_train, y_train = base_df.loc[:, "SentimentText"].to_list(), enc_data

In [7]:
X_train[:3]

['사이즈가잘맞네요', '좀크게나온듯', '저렴한가격에']

In [8]:
okt = Okt()

def discompose(text):
    result = []
    text = re.sub("[^가-힣ㄱ-ㅎㅏ-ㅣ\\s]", '', text)
    text = re.sub(r"[^가-힣A-Za-z)-9(),!?\'\`]", "", text)
    morph_list = okt.morphs(text, stem=True)
    stopwords = ['은','는','이','가','하','아','것','들','의','있','되','수','보','주','등','한']
    for morph in morph_list:
        if morph not in stopwords:
            result.append(morph)
    return result

In [9]:
discompose("가격은 비싸지만 품질은 좋습니다.")

['가격', '비싸다', '품질', '좋다']

In [10]:
X_train_pos = [discompose(x) for x in X_train]
X_train_pos[:5]

[['사이즈', '잘맞다'],
 ['좀', '크게', '나오다'],
 ['저렴하다', '가격', '에'],
 ['디자인', '도', '색상', '도', '너무', '좋다'],
 ['가격', '대비', '품질', '짱', '이네', '요']]

In [25]:
MAX_LEN = 27

tokenizer = Tokenizer(num_words=20000)
tokenizer.fit_on_texts(X_train_pos)

def encode(x):
    sequence = tokenizer.texts_to_sequences(x)
    return pad_sequences(sequence, maxlen=MAX_LEN, padding="post")

X_train_encoding = encode(X_train_pos)
X_train_encoding[:5]

array([[  7,  38,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0],
       [ 23,  58,  48,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0],
       [ 11,   1,   4,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0],
       [  3,   5,  90,   5,  16,   2,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0],
       [  1,  18,  13, 106,  87,  10,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0]], dtype=int32)

In [18]:
model = Sequential([
    Embedding(20000, 300, input_length=MAX_LEN),
    LSTM(units=50),
    Dense(5, activation='softmax')
])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 27, 300)           6000000   
                                                                 
 lstm_2 (LSTM)               (None, 50)                70200     
                                                                 
 dense_2 (Dense)             (None, 5)                 255       
                                                                 
Total params: 6,070,455
Trainable params: 6,070,455
Non-trainable params: 0
_________________________________________________________________


In [19]:
model.fit(X_train_encoding, y_train, epochs=2, batch_size=32, validation_split=0.1)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x780613c21a20>

In [21]:
test_df = pd.read_csv("garments_test.csv")
test_df.head(2)

Unnamed: 0,Index,RawText,Source,Domain,MainCategory,ProductName,ReviewScore,Syllable,Word,RDate,GeneralPolarity,Aspect,SentimentText,SentimentWord,SentimentPolarity
0,112814,바늘질 마감처리 불량. 싸구려 느낌이 팍팍. 털빠짐이 없다해서 구매했는데 털빠짐이 ...,쇼핑몰,패션,남성의류,OO 프** 경량 다운 2종,20,128,29,20181215,-1.0,품질,싸구려 느낌이 팍팍.,3,-1
1,112814,바늘질 마감처리 불량. 싸구려 느낌이 팍팍. 털빠짐이 없다해서 구매했는데 털빠짐이 ...,쇼핑몰,패션,남성의류,OO 프** 경량 다운 2종,20,128,29,20181215,-1.0,품질,털빠짐이 심함.,2,-1


In [24]:
def test(x):
    labels = ["디자인", "사이즈", "가격", "품질", "기능"]
    aspects = x["Aspect"].to_list()
    result = []
    for label in labels:
        if label in aspects:
            result.append(1)
        else:
            result.append(0)
    return np.array(result)

onehot_df = test_df.groupby("RawText").apply(test).reset_index().rename(columns={ 0: "LabelList" })
onehot_df.head(2)

Unnamed: 0,RawText,LabelList
0,(12/25)생각 이상으로 아주 좋습니다. 한번 세탁해서 착용하라는 스티커대로 세탁...,"[1, 0, 0, 1, 0]"
1,*****최악최악최악최악최악***** 엄마가 선물 받아 기분 좋다고 바로 신고 나가...,"[0, 0, 0, 1, 0]"


In [None]:
X_test_pos = [discompose(x) for x in onehot_df["RawText"].to_list()]
X_test_pos[:2]

In [27]:
X_test_encoding = encode(X_test_pos)
X_test_encoding[:2]

array([[ 152,   12,  737,   74,  234,  202, 1739, 2580,  737, 1868,  321,
         832,   59, 1937,  834,  611,  127,  330,  472,  130,   74,  234,
        1751,    9, 5225,  425, 1051],
       [ 619,   60,   54, 6294,    5,   51,  611,  117,  745, 1984,   97,
         429,  232,  311, 3326,  375,   46,  429,    6, 2752,  224,  639,
        3699,  909,   59,  266,  799]], dtype=int32)

In [29]:
prediction = model.predict(X_test_encoding)
prediction[:5]



array([[1.4545528e-02, 3.2968950e-03, 8.0520630e-02, 1.1466084e-01,
        7.8697616e-01],
       [5.9797075e-02, 2.3533706e-02, 9.9059284e-02, 4.0637933e-02,
        7.7697206e-01],
       [3.6829806e-04, 1.4085379e-04, 2.4446310e-03, 9.9358630e-01,
        3.4598748e-03],
       [3.5146443e-04, 1.3524953e-04, 2.6279863e-03, 9.9385440e-01,
        3.0308936e-03],
       [2.2293854e-01, 1.0744626e-02, 3.0387370e-02, 5.4922527e-01,
        1.8670411e-01]], dtype=float32)

In [37]:
onehot_df["PredLogits"] = pd.Series(list(prediction))
onehot_df.head(2)

Unnamed: 0,RawText,LabelList,PredLogits
0,(12/25)생각 이상으로 아주 좋습니다. 한번 세탁해서 착용하라는 스티커대로 세탁...,"[1, 0, 0, 1, 0]","[0.014545528, 0.003296895, 0.08052063, 0.11466..."
1,*****최악최악최악최악최악***** 엄마가 선물 받아 기분 좋다고 바로 신고 나가...,"[0, 0, 0, 1, 0]","[0.059797075, 0.023533706, 0.099059284, 0.0406..."


In [42]:
tf.keras.activations.sigmoid(onehot_df["PredLogits"].to_list()).numpy()

array([[0.50363636, 0.5008242 , 0.52011925, 0.5286339 , 0.68718165],
       [0.5149448 , 0.50588316, 0.52474463, 0.5101581 , 0.6850272 ],
       [0.5000921 , 0.5000352 , 0.5006111 , 0.7297957 , 0.500865  ],
       ...,
       [0.5000678 , 0.5002806 , 0.73026884, 0.50026387, 0.50039107],
       [0.5000839 , 0.5000341 , 0.5007427 , 0.72981375, 0.5007198 ],
       [0.50147736, 0.69569874, 0.5252115 , 0.50955325, 0.5070102 ]],
      dtype=float32)

In [44]:
def logits_to_onehot_aspects(logits, threshold):
    sig_probabilities = tf.keras.activations.sigmoid(logits).numpy()
    return list(np.where(sig_probabilities > threshold, 1, 0))

# 0.5 일 시 모두 1임...
aspect_bools = logits_to_onehot_aspects(onehot_df["PredLogits"].to_list(), 0.6)
pred_series = pd.Series(aspect_bools)
pred_series.head()

0    [0, 0, 0, 0, 1]
1    [0, 0, 0, 0, 1]
2    [0, 0, 0, 1, 0]
3    [0, 0, 0, 1, 0]
4    [0, 0, 0, 1, 0]
dtype: object

In [55]:
count_series = pred_series.apply(lambda x: np.count_nonzero(x))
count_series.value_counts()

1    6772
0     202
2      99
dtype: int64

In [45]:
onehot_df["PredList"] = pred_series
onehot_df.head()

Unnamed: 0,RawText,LabelList,PredLogits,PredList
0,(12/25)생각 이상으로 아주 좋습니다. 한번 세탁해서 착용하라는 스티커대로 세탁...,"[1, 0, 0, 1, 0]","[0.014545528, 0.003296895, 0.08052063, 0.11466...","[0, 0, 0, 0, 1]"
1,*****최악최악최악최악최악***** 엄마가 선물 받아 기분 좋다고 바로 신고 나가...,"[0, 0, 0, 1, 0]","[0.059797075, 0.023533706, 0.099059284, 0.0406...","[0, 0, 0, 0, 1]"
2,",한사이즈 크게 주문했는데 많이크진 않네요~~스판럭은 좋구요. 그냥 입을만 해요","[0, 1, 0, 0, 0]","[0.00036829806, 0.0001408538, 0.002444631, 0.9...","[0, 0, 0, 1, 0]"
3,.사이즈가 정사이즈가 아닌듯 넘 작네요밝은색 빼고는 별루네요 화이트는넘두꺼워요,"[0, 1, 0, 0, 0]","[0.00035146443, 0.00013524953, 0.0026279863, 0...","[0, 0, 0, 1, 0]"
4,1+1 가격저렴 ~완전~ 맘에 들어요 사이즈도 딱맞고 색상도 그림이랑 또같아요,"[0, 0, 1, 0, 0]","[0.22293854, 0.010744626, 0.03038737, 0.549225...","[0, 0, 0, 1, 0]"


In [47]:
test_series = onehot_df.apply(lambda x: x["LabelList"] == x["PredList"], axis=1)
test_series.head()

0    [False, True, True, False, False]
1     [True, True, True, False, False]
2     [True, False, True, False, True]
3     [True, False, True, False, True]
4     [True, True, False, False, True]
dtype: object

In [48]:
# 완전일치
def check_full_accord(x):
    for each in x:
        if not each:
            return 0
    return 1

test_series.apply(check_full_accord).mean()

0.02601442103774919

In [49]:
# 부분일치
def check_partial_accord(x):
    result = 0
    for each in x:
        if each:
            result += 1
    return result / 5

test_series.apply(check_partial_accord).mean()

0.5591969461331825