In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import re
from konlpy.tag import Okt
from tensorflow.keras.preprocessing.text import Tokenizer
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences

## word2vec 임포트

In [11]:
from gensim.models import Word2Vec
ko_model= Word2Vec.load('word2vec_movie.model')

In [12]:
# 모델 확인 
ko_model['평점']

  


array([ 1.37894905e+00,  9.87708807e-01, -5.03856897e-01, -5.77842712e-01,
        2.71781266e-01, -4.84355837e-01,  7.09133685e-01,  4.83562052e-01,
       -2.12013578e+00, -1.53143942e+00, -2.21780896e+00, -4.36312646e-01,
       -7.34049320e-01,  1.32041967e+00, -5.64270079e-01, -1.27724886e+00,
       -6.67323589e-01,  1.25785872e-01,  8.86485755e-01,  5.52864611e-01,
        1.83218852e-01,  3.81816566e-01, -1.45142281e+00,  1.32863295e+00,
       -2.28901446e-01, -8.47232699e-01, -1.07557023e+00,  1.31017220e+00,
       -1.11492407e+00,  4.67573434e-01, -1.73956227e+00,  5.30944943e-01,
       -7.81446993e-01,  8.03073198e-02,  1.06718056e-02, -7.76855171e-01,
       -5.18462360e-01,  1.82522148e-01, -3.42364371e-01,  8.30587745e-01,
       -6.77727163e-01,  2.17546612e-01, -8.99270713e-01,  3.50216419e-01,
        1.27970874e-01, -1.43166259e-02, -6.87184393e-01,  7.87147760e-01,
        3.13703835e-01, -1.34181023e+00, -2.88948655e-01,  4.87908751e-01,
       -4.40034330e-01, -

In [13]:
print(ko_model.wv.most_similar("최민식"))

[('한석규', 0.9022693634033203), ('이민호', 0.8778703808784485), ('이미숙', 0.8716947436332703), ('설경구', 0.8695714473724365), ('김명민', 0.8679181933403015), ('이정재', 0.863978385925293), ('정재영', 0.8619986176490784), ('메릴', 0.8613458871841431), ('이주승', 0.861168622970581), ('공리', 0.8594560027122498)]


## Train Test Data 불러오기

In [4]:
import pickle
import pandas as pd
test = pd.read_pickle("token_test_data_ver2.pkl")
train = pd.read_pickle("token_train_data_ver2.pkl")

# train, test 명시 
training_sentences = train['token']
testing_sentences = test['token']

training_labels = train['label']
testing_labels = test['label']

training_sentences.shape, testing_sentences.shape, training_labels.shape, testing_labels.shape

((146182,), (49157,), (146182,), (49157,))

In [5]:
training_sentences

0                                       [더빙, 진짜, 짜증나다, 목소리]
1                  [흠, 포스터, 보고, 초딩, 영화, 줄, 오버, 연기, 가볍다, 않다]
2                                  [무재, 밓었, 다그, 래서, 보다, 추천]
3                      [교도소, 이야기, 구먼, 솔직하다, 재미, 없다, 평점, 조정]
4         [몬페, 의, 익살스럽다, 연기, 돋보이다, 영화, 스파이더맨, 늙다, 보이다, 하...
                                ...                        
149995                                      [인간, 문제, 소, 죄인]
149996                                             [평점, 낮다]
149997                  [이, 뭐, 한국인, 거들다, 먹거리, 필리핀, 혼혈, 착하다]
149998                      [청춘, 영화, 최고봉, 방황, 우울하다, 날, 자화상]
149999                    [한국, 영화, 최초, 수간, 하다, 내용, 담기다, 영화]
Name: token, Length: 146182, dtype: object

## Tokenize / Padding 

In [6]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# 파라미터 명시 
vocab_size = 20000
embedding_dim = 200
max_length = 30
truct_type = 'post'
padding_type = 'post'
oov_tok = '<OOV>'

In [7]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index

word_index, len(word_index)

({'<OOV>': 1,
  '영화': 2,
  '하다': 3,
  '보다': 4,
  '없다': 5,
  '있다': 6,
  '좋다': 7,
  '재밌다': 8,
  '정말': 9,
  '것': 10,
  '되다': 11,
  '같다': 12,
  '진짜': 13,
  '이다': 14,
  '이': 15,
  '점': 16,
  '아니다': 17,
  '않다': 18,
  '만들다': 19,
  '연기': 20,
  '나오다': 21,
  '평점': 22,
  '최고': 23,
  '왜': 24,
  '스토리': 25,
  '생각': 26,
  '드라마': 27,
  '1': 28,
  '감동': 29,
  '사람': 30,
  'ㅋㅋㅋ': 31,
  '보고': 32,
  '이렇다': 33,
  '말': 34,
  '아깝다': 35,
  '더': 36,
  '때': 37,
  '배우': 38,
  'ㅋㅋ': 39,
  '내': 40,
  '거': 41,
  '감독': 42,
  '재미있다': 43,
  '뭐': 44,
  '내용': 45,
  '재미': 46,
  '그냥': 47,
  '주다': 48,
  '그': 49,
  '좀': 50,
  '자다': 51,
  '지루하다': 52,
  '재미없다': 53,
  '시간': 54,
  '쓰레기': 55,
  '가다': 56,
  '수': 57,
  '모르다': 58,
  '들다': 59,
  '그렇다': 60,
  '10': 61,
  '싶다': 62,
  '나': 63,
  '작품': 64,
  '알다': 65,
  '사랑': 66,
  '하나': 67,
  '다시': 68,
  '마지막': 69,
  '볼': 70,
  '2': 71,
  '오다': 72,
  'ㅋ': 73,
  '처음': 74,
  '완전': 75,
  '정도': 76,
  '많다': 77,
  '장면': 78,
  '액션': 79,
  '주인공': 80,
  '3': 81,
  '차다': 82,
  '안되다': 83,
  'ㅠㅠ': 

In [8]:
# Sequence 만들기 / Padding하기 - train, test 모두 
training_sequences  = tokenizer.texts_to_sequences(training_sentences)
training_padded = pad_sequences(training_sequences, maxlen=max_length, 
                                padding=padding_type, truncating=truct_type)


testing_sequences  = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(testing_sequences, maxlen=max_length, 
                                padding=padding_type, truncating=truct_type)

## Word2Vec weight 만들기 

In [14]:
ko_model

<gensim.models.word2vec.Word2Vec at 0x22c5d871cc0>

In [16]:
import numpy as np

vocab_size = len(tokenizer.index_word) + 1
embedding_dim = 200

embedding_matrix = np.zeros((vocab_size, embedding_dim))

# tokenizer에 있는 단어 사전을 순회하면서 word2vec의 200차원 vector를 가져오기
for word, idx in tokenizer.word_index.items():
    embedding_vector = ko_model[word] if word in ko_model else None
    if embedding_vector is not None:
        embedding_matrix[idx] = embedding_vector
        
embedding_matrix.shape

  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.


(44612, 200)

In [17]:
word_index['최고']

23

In [18]:
ko_model['최고']

  """Entry point for launching an IPython kernel.


array([-0.8588482 ,  0.92817163,  0.02203734,  1.3697246 , -0.4709991 ,
       -1.5010548 , -0.49414608,  0.4925356 , -0.23244537, -0.04845205,
        0.09304108, -0.11300896,  0.5571053 , -0.11795222,  0.49380097,
       -0.5556328 ,  0.11953602, -0.18062858, -1.2581393 , -0.81152546,
        0.42466506,  0.8579601 ,  0.19897878, -0.81691873,  0.8666564 ,
       -1.0465446 , -0.4982905 ,  0.2539313 , -0.09490417, -0.93356013,
       -1.643275  ,  0.3243457 ,  1.2613564 ,  1.1039764 , -0.21867326,
       -0.55674183,  0.90235335, -1.1716101 ,  0.77494115, -0.5213071 ,
       -0.21292363, -0.302229  , -0.80103284, -0.12620486, -1.195164  ,
       -0.22341141,  0.38163522,  0.00649113,  0.2602783 ,  0.02779087,
        0.95976293, -0.24270368, -0.39551368,  0.12866431, -0.59832066,
        0.12882997, -0.12808815,  0.87104553,  0.00889622, -0.32881373,
        0.03664391,  0.78057766,  0.28789434,  2.0120773 ,  0.0571333 ,
        1.2447547 , -0.7004512 , -0.26596418, -0.51164573,  0.31

In [19]:
embedding_matrix[16]

array([-0.65613872, -0.1878451 , -2.28804922, -0.60850179, -0.20944199,
       -0.73954386,  0.45581877,  0.21562536, -2.23071456,  0.82642019,
       -2.05929804, -1.13468277,  0.03894617,  0.85293633, -0.44432551,
       -1.03018618, -0.01358046,  0.33014327, -0.36980888,  0.31315118,
       -0.07146841, -0.06222904,  0.22098194,  1.10881281,  0.55845213,
       -1.3358885 , -0.57032239,  0.09883324,  0.27421305,  0.37706429,
       -0.09907344,  0.66710508, -0.56833786,  0.21092594,  0.28369167,
       -1.00925922,  0.43218195,  1.21249568,  1.07392037,  1.28291428,
       -1.90316272,  1.17173433, -0.47084916,  0.65514529, -0.10202166,
       -1.52839541, -0.12795947,  1.60741544,  1.15357125, -1.92110121,
       -0.47887081,  0.40243226, -1.28478301, -0.75054759,  0.51739645,
       -0.70001084, -2.03475833, -0.02315634, -1.20234525, -0.87681413,
       -0.80264157, -1.0190345 ,  0.43861088,  0.06535848, -0.78121197,
        0.28091148,  0.85786098, -0.21858822, -1.00957274, -0.46

## 모델링

In [20]:
# 모델링  - L2 추가 
embedding_dim = 200
filter_sizes = (3, 4, 5)
num_filters = 100
dropout = 0.5
hidden_dims = 100

conv_blocks =[]
input_shape = (30)
model_input = tf.keras.layers.Input(shape=input_shape)
z = model_input
for sz in filter_sizes:
    embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length,
                                         weights = [embedding_matrix], trainable = False)(z)
    conv = tf.keras.layers.Conv1D(filters=num_filters,
                         kernel_size=sz,
                         padding="valid",
                         activation="relu",
                         strides=1)(embedding)
    conv = tf.keras.layers.GlobalAveragePooling1D()(conv)
    conv = tf.keras.layers.Flatten()(conv)
    conv_blocks.append(conv)
z = tf.keras.layers.Concatenate()(conv_blocks) if len(conv_blocks) > 1 else conv_blocks[0]

z = tf.keras.layers.Dense(hidden_dims, activation="relu", kernel_regularizer=tf.keras.regularizers.l2(0.003), bias_regularizer=tf.keras.regularizers.l2(0.003))(z)
z = tf.keras.layers.Dropout(dropout)(z)
model_output = tf.keras.layers.Dense(1, activation="sigmoid")(z)
model = tf.keras.Model(model_input, model_output)

In [21]:
model.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 30)]         0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 30, 200)      8922400     input_1[0][0]                    
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 30, 200)      8922400     input_1[0][0]                    
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 30, 200)      8922400     input_1[0][0]                    
_______________________________________________________________________________________

In [None]:
def plot_graphs(history, string, name='model'):
    plt.plot(history.history[string])
    plt.plot(history.history['val_' + string])
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.title(name)
    plt.legend([string, 'val_' + string])

    fig = plt.gcf()
    ##저장될 폴더생성
    result_dir = './result_file'
    if not os.path.exists(result_dir):
        os.makedirs(result_dir)
    fig.savefig(result_dir+'/{}.png'.format(name), dpi = fig.dpi)
    print('<{}.png> result_file폴더에 결과 그래프 저장 완료'.format(name))
    plt.show()

In [None]:
batch_size = 50
num_epochs = 10
min_word_count = 1
context = 10

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

checkpoint_dir = './ckpt2'
if not os.path.exists(checkpoint_dir):
  os.makedirs(checkpoint_dir)
callbacks = [
  keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=0),   
  keras.callbacks.ModelCheckpoint(
      filepath=checkpoint_dir + '/ckpt2-loss={loss:.3f}')
  ]

history = model.fit(training_padded, training_labels, epochs=10, callbacks=callbacks, batch_size = batch_size, validation_data=(testing_padded, testing_labels))
accuracy_graph = plot_graphs(history, 'accuracy',name='model2_accuracy')
loss_graph= plot_graphs(history, 'loss',name='model2_loss')  