In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import urllib.request
from konlpy.tag import Okt
from tqdm import tqdm
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, Dense, LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

def get_data(file_url, file_name):
    urllib.request.urlretrieve(file_url, filename=file_name)

    return pd.read_table(file_name)

def text_processing(train_data):
    processed_data = train_data
    processed_data.drop_duplicates(subset=['document'], inplace=True)   # document 열의 중복 제거 (원본 값 변경 허용)
    processed_data = processed_data.dropna(how = 'any') # document 열의 NaN 제거
    processed_data['document'] = processed_data['document'].str.replace("[^ㄱ-힣 ]", "")    # document 열의 값들 한국어와 공백만 남기고 모두 제거
    processed_data['document'].replace("", np.nan, inplace=True)    # 빈칸 document값은 NaN으로 변환 (원본 값 변경 허용)
    processed_data = processed_data.dropna(how = 'any') # document 열의 NaN 제거

    return processed_data

def tokenization(data):
    okt = Okt()
    tokenized = list()

    for sentence in tqdm(data['document']):
        tokenized.append([pos_set[0] for pos_set in okt.pos(okt.normalize(sentence)) if pos_set[1] != 'Josa'])

    return tokenized

def integer_encoding(tokenized_data, processed_data, min_freq=3):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(tokenized_data)

    word_count = len(tokenizer.word_index) + 1
    for (_, freq) in tokenizer.word_counts.items():
        if freq < min_freq: word_count -= 1

    tokenizer = Tokenizer(word_count)
    tokenizer.fit_on_texts(tokenized_data)

    ret = tokenizer.texts_to_sequences(tokenized_data)
    sol = np.array(processed_data['label'])
    drop = [idx for (idx, sentence) in enumerate(ret) if len(sentence) == 0]

    return {
        'encoded': np.delete(ret, drop, axis=0),
        'solution': np.delete(sol, drop, axis=0),
        'size': word_count
    }

def padding(integer_encoded, max_len=None):
    return pad_sequences(integer_encoded) if max_len is None else pad_sequences(integer_encoding, max_len=max_len)

def train(train_data, solution, size, show_accuracy=True, best_model_name='best_model', embedding_dim=100, hidden_units=128, verbose=1, patience=4, epochs=15, batch_size=64, validation_split=0.2, save_best_only=True):
    model = Sequential()
    model.add(Embedding(size, embedding_dim))
    model.add(LSTM(hidden_units))
    model.add(Dense(1, activation='sigmoid'))

    es = EarlyStopping(monitor='val_loss', mode='min', verbose=verbose, patience=patience)
    mc = ModelCheckpoint(best_model_name + '.h5', monitor='val_acc', mode='max', verbose=verbose, save_best_only=save_best_only)

    model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
    model.fit(train_data, solution, epochs=epochs, callbacks=[es, mc], batch_size=batch_size, validation_split=validation_split)

    if show_accuracy: print("accuracy: %.4f" % (load_model(best_model_name + '.h5').evaluate(train_data, solution)[1]))

    return model

def predict(review, word_count, pad_max_len=None, best_model_name='best_model'):
    okt = Okt()
    tokenizer = Tokenizer(word_count)

    review = review.replace("[^ㄱ-힣 ]", "")
    review = [[pos_set[0] for pos_set in okt.pos(okt.normalize(review)) if pos_set[1] != 'Josa']]
    encoded = tokenizer.texts_to_sequences(review)
    pad_new = padding(encoded, max_len=pad_max_len)
    score = float(load_model(best_model_name + '.h5').predict(pad_new)) # 예측
    
    return score

class AI:
    def __init__(self, train_file_url, train_file_name):
        self.data = text_processing(get_data(train_file_url, train_file_name))
        self.tokenized = tokenization(self.data)
        self.encoded = integer_encoding(self.tokenized, self.data)
        self.padded = padding(self.encoded['encoded'])
        self.model = train(self.padded, self.encoded['solution'], self.encoded['size'])

    def is_positive(self, review, is_print=True):
        score = predict(review, self.encoded['size'])
        if is_print: print("{:.2f}% positive" % (score * 100))
        else: return score

In [2]:
ai = AI("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt", "ratings_train.txt")

  processed_data['document'] = processed_data['document'].str.replace("[^ㄱ-힣 ]", "")    # document 열의 값들 한국어와 공백만 남기고 모두 제거
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  processed_data['document'] = processed_data['document'].str.replace("[^ㄱ-힣 ]", "")    # document 열의 값들 한국어와 공백만 남기고 모두 제거
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  processed_data['document'].replace("", np.nan, inplace=True)    # 빈칸 document값은 NaN으로 변환 (원본 값 변경 허용)
100%|██████████| 145804/145804 [09:30<00:00, 255.68it/s]
  arr = asarray(arr)


Epoch 1/15
Epoch 1: val_acc improved from -inf to 0.84346, saving model to best_model.h5
Epoch 2/15
Epoch 2: val_acc improved from 0.84346 to 0.85536, saving model to best_model.h5
Epoch 3/15
Epoch 3: val_acc improved from 0.85536 to 0.86184, saving model to best_model.h5
Epoch 4/15
Epoch 4: val_acc did not improve from 0.86184
Epoch 5/15
Epoch 5: val_acc did not improve from 0.86184
Epoch 6/15
Epoch 6: val_acc did not improve from 0.86184
Epoch 7/15
Epoch 7: val_acc did not improve from 0.86184
Epoch 8/15
Epoch 8: val_acc did not improve from 0.86184
Epoch 8: early stopping
accuracy: 0.8868


In [3]:
ai.is_positive("올해 최고의 영화! 세 번 넘게 봐도 질리지가 않네요.")

ValueError: in user code:

    File "c:\Python310\lib\site-packages\keras\engine\training.py", line 2041, in predict_function  *
        return step_function(self, iterator)
    File "c:\Python310\lib\site-packages\keras\engine\training.py", line 2027, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\Python310\lib\site-packages\keras\engine\training.py", line 2015, in run_step  **
        outputs = model.predict_step(data)
    File "c:\Python310\lib\site-packages\keras\engine\training.py", line 1983, in predict_step
        return self(x, training=False)
    File "c:\Python310\lib\site-packages\keras\utils\traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "c:\Python310\lib\site-packages\keras\backend.py", line 4950, in <listcomp>
        inputs, [inp[0] for inp in flatted_inputs]

    ValueError: Exception encountered when calling layer "lstm" "                 f"(type LSTM).
    
    slice index 0 of dimension 0 out of bounds. for '{{node strided_slice_1}} = StridedSlice[Index=DT_INT32, T=DT_FLOAT, begin_mask=0, ellipsis_mask=0, end_mask=0, new_axis_mask=0, shrink_axis_mask=1](transpose, strided_slice_1/stack, strided_slice_1/stack_1, strided_slice_1/stack_2)' with input shapes: [0,?,100], [1], [1], [1] and with computed input tensors: input[1] = <0>, input[2] = <1>, input[3] = <1>.
    
    Call arguments received by layer "lstm" "                 f"(type LSTM):
      • inputs=tf.Tensor(shape=(None, 0, 100), dtype=float32)
      • mask=None
      • training=False
      • initial_state=None
