In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import re
import pickle

from konlpy.tag import Okt

In [2]:
text_morph_df = pd.read_csv('morph_text.csv', encoding='utf-8')
text_nouns_df = pd.read_csv('noun_text.csv', encoding='utf-8')
label_df = pd.read_csv('classifi_corpus_label.csv', encoding='utf-8') 

In [3]:
data_df = pd.DataFrame()
data_df['ntext'] = text_nouns_df['text']
data_df['mtext'] = text_morph_df['text']
data_df['label'] = label_df['label']


In [4]:
with open('index_to_word_byOkt.pickle', 'rb') as f :
    index_to_word = pickle.load(f)
with open('index_to_noums_byOkt.pickle', 'rb') as f :
    index_to_noums = pickle.load(f)
with open('word_to_index_byOkt.pickle', 'rb') as f :
    word_to_index = pickle.load(f)
with open('noums_to_index_byOkt.pickle', 'rb') as f :
    noums_to_index = pickle.load(f)

In [5]:
(len(index_to_word), len(word_to_index)), (len(index_to_noums), len(noums_to_index))

((9997, 9997), (9999, 9999))

In [6]:
num_data_df = data_df.drop('mtext', axis= 1)
mor_data_df = data_df.drop('ntext', axis= 1)

In [7]:
num_data_df = num_data_df.dropna()
mor_data_df = mor_data_df.dropna()

In [8]:
num_data_df.ntext = num_data_df.ntext.apply(lambda x: x.split())
mor_data_df.mtext = mor_data_df.mtext.apply(lambda x: x.split())

In [9]:
#단어 -> 인덱스 변환
def n_to_int(x):
    pad_index = []
    for i in x:
        pad_index.append(noums_to_index[i])
    return pad_index

#morph -> 인덱스 변환
def w_to_int(x):
    pad_index = []
    for i in x:
        pad_index.append(word_to_index[i])
    return pad_index

In [10]:
nom_X = num_data_df.ntext.apply(lambda x: n_to_int(x))
mor_X = mor_data_df.mtext.apply(lambda x: w_to_int(x))

In [11]:
from keras.preprocessing.sequence import pad_sequences

X_nom =pad_sequences(nom_X, maxlen=15, padding='post', truncating='pre')
X_mor =pad_sequences(mor_X, maxlen=15, padding='post', truncating='pre')


print(X_nom[:5])
print(X_mor[:5])

Using TensorFlow backend.


[[2860 2329 8723 9849 4787  391    0    0    0    0    0    0    0    0
     0]
 [6440 8723 9849 4467 7516    0    0    0    0    0    0    0    0    0
     0]
 [6440 8723 9849 4467 7516    0    0    0    0    0    0    0    0    0
     0]
 [7241  359    0    0    0    0    0    0    0    0    0    0    0    0
     0]
 [3267 8723 3884 1464 3742 5649 4380    0    0    0    0    0    0    0
     0]]
[[1733 4535 3053 3843 6070  637  889  755    0    0    0    0    0    0
     0]
 [1910 3053 3843 9477 5769 4793    0    0    0    0    0    0    0    0
     0]
 [1910 3053 3843 9477 5769 4793    0    0    0    0    0    0    0    0
     0]
 [7808   61 3688    0    0    0    0    0    0    0    0    0    0    0
     0]
 [ 169 3053 4934 3923 2480 1667 2248 4346  169 3390 8004 9250 5123 7137
   158]]


In [12]:
X_nom.shape , X_mor.shape

((250614, 15), (256913, 15))

In [13]:
Y_num = num_data_df['label'].values
Y_mor = mor_data_df['label'].values

In [14]:
from sklearn.model_selection import train_test_split

x_n_train, x_n_test, y_n_train, y_n_test = train_test_split(X_nom, Y_num, test_size = 0.2, random_state = 77)

In [15]:
x_n_train.shape, y_n_train.shape

((200491, 15), (200491,))

In [16]:
from keras.models import Sequential
from keras.layers import Embedding, SimpleRNN, Dense
from keras.layers import LSTM
model1 = Sequential()
model1.add(Embedding(9999, 32,input_length=15))
model1.add(LSTM(64, return_sequences= True))
model1.add(LSTM(32))
model1.add(Dense(1, activation="sigmoid"))






In [17]:
model1.compile(loss= 'binary_crossentropy', optimizer = 'adam', metrics = ['acc'])
history = model1.fit(x_n_train, y_n_train, batch_size =128, epochs =10 , validation_split =0.2)



Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where

Train on 160392 samples, validate on 40099 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [18]:
from sklearn.model_selection import train_test_split

x_m_train, x_m_test, y_m_train, y_m_test = train_test_split(X_mor, Y_mor, test_size = 0.2, random_state = 77)

In [19]:
from keras.models import Sequential
from keras.layers import Embedding, SimpleRNN, Dense
from keras.layers import LSTM
model2 = Sequential()
model2.add(Embedding(9997, 32,input_length=15))
model2.add(LSTM(64, return_sequences= True))
model2.add(LSTM(32))
model2.add(Dense(1, activation="sigmoid"))

In [20]:
model2.compile(loss= 'binary_crossentropy', optimizer = 'adam', metrics = ['acc'])
history = model1.fit(x_m_train, y_m_train, batch_size =128, epochs =10 , validation_split =0.2)

Train on 164424 samples, validate on 41106 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [21]:
model1.evaluate(x_n_test, y_n_test)



[0.7555449545238375, 0.8393751371613909]

In [22]:
model2.evaluate(x_m_test, y_m_test)



[0.6910165342005722, 0.8018021524547567]