In [1]:
#import trainer
#import data_loader

from tensorflow.keras.preprocessing.text import Tokenizer
import pandas as pd
import urllib.request
%matplotlib inline
import matplotlib.pyplot as plt
import re
from konlpy.tag import Okt
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from collections import Counter
from konlpy.tag import Mecab

In [2]:
class DataLoader():
    def __init__(self, train_path, test_path):
        self.train_data = pd.read_table('~/aiffel/sentiment_classification/data/ratings_train.txt')
        self.test_data = pd.read_table('~/aiffel/sentiment_classification/data/ratings_test.txt')

    def load_data(self, num_words=10000):
        tokenizer = Mecab()
        stopwords = ['의', '가', '이', '은', '들', '는', '좀', '잘', '걍', '과', '도', '를', '으로', '자', '에', '와', '한', '하다']
        self.train_data.drop_duplicates(subset=['document'], inplace=True)
        self.train_data = self.train_data.dropna(how='any')
        self.test_data.drop_duplicates(subset=['document'], inplace=True)
        self.test_data = self.test_data.dropna(how='any')

        X_train = []
        for sentence in self.train_data['document']:
            temp_X = tokenizer.morphs(sentence)  # 토큰화
            temp_X = [word for word in temp_X if not word in stopwords]  # 불용어 제거
            X_train.append(temp_X)

        X_test = []
        for sentence in self.test_data['document']:
            temp_X = tokenizer.morphs(sentence)  # 토큰화
            temp_X = [word for word in temp_X if not word in stopwords]  # 불용어 제거
            X_test.append(temp_X)

        words = np.concatenate(X_train).tolist()
        counter = Counter(words)
        counter = counter.most_common(num_words - 4)
        vocab = ['<PAD>', '<BOS>', '<UNK>', '<UNUSED>'] + [key for key, _ in counter]
        self.word_to_index = {word: index for index, word in enumerate(vocab)}

        def wordlist_to_indexlist(wordlist):
            return [self.word_to_index[word] if word in self.word_to_index else self.word_to_index['<UNK>'] for word in wordlist]

        self.X_train = list(map(wordlist_to_indexlist, X_train))
        self.X_test = list(map(wordlist_to_indexlist, X_test))
        self.Y_train = np.array(list(self.train_data['label']))
        self.Y_test = np.array(list(self.test_data['label']))
        #return self.X_train, self.Y_train, self.X_test, self.Y_test, self.word_to_index

    def get_maxlen(self):
        total_data_text = list(self.X_train) + list(self.X_test)
        # 텍스트데이터 문장길이의 리스트를 생성한 후
        num_tokens = [len(tokens) for tokens in total_data_text]
        num_tokens = np.array(num_tokens)
        # 문장길이의 평균값, 최대값, 표준편차를 계산해 본다.
        print('문장길이 평균 : ', np.mean(num_tokens))
        print('문장길이 최대 : ', np.max(num_tokens))
        print('문장길이 표준편차 : ', np.std(num_tokens))

        # 예를들어, 최대 길이를 (평균 + 2*표준편차)로 한다면,
        max_tokens = np.mean(num_tokens) + 2 * np.std(num_tokens)
        self.maxlen = int(max_tokens)
        print('pad_sequences maxlen : ', self.maxlen)
        print('전체 문장의 {}%가 maxlen 설정값 이내에 포함됩니다. '.format(np.sum(num_tokens < max_tokens) / len(num_tokens)))
        return self.maxlen

    def set_pad(self, padding='post'):
        self.X_train = keras.preprocessing.sequence.pad_sequences(self.X_train,
                                                             value=self.word_to_index["<PAD>"],
                                                             padding='post',  # 혹은 'pre'
                                                             maxlen=self.maxlen)

        self.X_test = keras.preprocessing.sequence.pad_sequences(self.X_test,
                                                            value=self.word_to_index["<PAD>"],
                                                            padding='post',  # 혹은 'pre'
                                                            maxlen=self.maxlen)
        return self.X_train, self.X_test

    def get_data(self, train_idx=30000):
        x_train = self.X_train[:train_idx]
        x_val = self.X_train[train_idx:]

        y_train = self.Y_train[:train_idx]
        y_val = self.Y_train[train_idx:]

        return x_train, y_train, x_val, y_val, self.X_test, self.Y_test

In [None]:
from sklearn.model_selection import train_test_split

class Trainer():
    def data_split(self, src_data, tgt_data,  val_size = 0.2, test_size = 0.2):
        x_data, self.test_x, y_data, self.test_y = train_test_split(src_data, tgt_data, test_size=test_size)
        self.train_x, self.val_x, self.train_y, self.val_y = train_test_split(x_data, y_data, test_size=val_size)

    def train(self, model, optimizer, loss, epochs=100, batch_size=512, verbose=2):
        model.compile(loss=loss, optimizer=optimizer, metrics=['accuracy'])
        self.hist = model.fit(self.train_x, self.train_y, epochs=epochs, batch_size=batch_size, validation_data=(self.val_x, self.val_y), verbose=verbose)

    def test(self, model):
        #result = model.evaluate(self.test_x, self.test_y, verbose=2)
        #print(result)
        test_loss, test_acc = self.model.evaluate(self.test_x, self.test_y)
        print("test_loss    :{}".format(test_loss))
        print("test_accuracy:{}".format(test_acc))

    def visualization(self):
        history_dict = self.hist.history
        
        acc = history_dict['accuracy']
        val_acc = history_dict['val_accuracy']
        loss = history_dict['loss']
        val_loss = history_dict['val_loss']

        epochs = range(1, len(acc) + 1)

        plt.figure(figsize=(12, 8))

        # loss 그래프
        plt.subplot(1, 2, 1)
        plt.plot(epochs, loss, 'b', label='Training loss')
        plt.plot(epochs, val_loss, 'r', label='Validation loss')
        plt.title('Training and validation loss')
        plt.xlabel('Epochs')
        plt.ylabel('Loss')
        plt.legend()

        # accuracy 그래프
        plt.subplot(1, 2, 2)
        plt.plot(epochs, acc, 'b', label='Training acc')
        plt.plot(epochs, val_acc, 'r', label='Validation acc')
        plt.title('Training and validation accuracy')
        plt.xlabel('Epochs')
        plt.ylabel('Accuracy')
        plt.legend()

        plt.show()

Error: Kernel is dead

In [4]:
train_path = '~/aiffel/sentiment_classification/data/ratings_train.txt'
test_path = '~/aiffel/sentiment_classification/data/ratings_test.txt'

In [5]:
data_loader = DataLoader(train_path, test_path)

In [6]:
data_loader.load_data(10000)
data_loader.get_maxlen()
data_loader.set_pad('post')
x_train, y_train , x_val, y_val, x_test, y_test = data_loader.get_data(train_idx=30000)

문장길이 평균 :  15.96940191154864
문장길이 최대 :  116
문장길이 표준편차 :  12.843571191092
pad_sequences maxlen :  41
전체 문장의 0.9342988343341575%가 maxlen 설정값 이내에 포함됩니다. 


# Base Model
LSTM Model

In [7]:
vocab_size = 10000    # 어휘 사전의 크기입니다(10,000개의 단어)
word_vector_dim = 25

lstm_model = keras.Sequential()
lstm_model.add(keras.layers.Embedding(vocab_size, word_vector_dim))
lstm_model.add(keras.layers.LSTM(512, activation = 'relu'))
lstm_model.add(keras.layers.Dense(512, activation='relu'))
lstm_model.add(keras.layers.Dense(128, activation='relu'))
lstm_model.add(keras.layers.Dense(1, activation='sigmoid'))

lstm_model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 25)          250000    
_________________________________________________________________
lstm (LSTM)                  (None, 512)               1101824   
_________________________________________________________________
dense (Dense)                (None, 512)               262656    
_________________________________________________________________
dense_1 (Dense)              (None, 128)               65664     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 129       
Total params: 1,680,273
Trainable params: 1,680,273
Non-trainable params: 0
_________________________________________________________________


# Attention model

In [23]:
input_dim = 41

# input layer
inputs = keras.layers.Input(shape=(input_dim,))

# attention layer
attention_probs = keras.layers.Dense(input_dim, activation='softmax')(inputs)
attention_mul = keras.layers.multiply([inputs, attention_probs])

# fc layer
fc = keras.layers.Dense(64)(attention_mul)
output = keras.layers.Dense(1, activation='sigmoid')(attention_mul)

attention_model = keras.Model(inputs=[inputs], outputs=output)

attention_model.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            [(None, 41)]         0                                            
__________________________________________________________________________________________________
dense_9 (Dense)                 (None, 41)           1722        input_3[0][0]                    
__________________________________________________________________________________________________
multiply_2 (Multiply)           (None, 41)           0           input_3[0][0]                    
                                                                 dense_9[0][0]                    
__________________________________________________________________________________________________
dense_11 (Dense)                (None, 1)            42          multiply_2[0][0]           

In [52]:
trainer = Trainer()

In [53]:
trainer.train_x = x_train
trainer.val_x = x_val
trainer.test_x = x_test
trainer.train_y = y_train
trainer.val_y = y_val
trainer.test_y = y_test

In [54]:
trainer.train(model=attention_model, optimizer='adam', loss='binary_crossentropy', epochs=10, verbose=2)

Epoch 1/10
59/59 - 1s - loss: 0.7634 - val_loss: 0.7441
Epoch 2/10
59/59 - 0s - loss: 0.7118 - val_loss: 0.7331
Epoch 3/10
59/59 - 0s - loss: 0.7040 - val_loss: 0.7348
Epoch 4/10
59/59 - 0s - loss: 0.7022 - val_loss: 0.7147
Epoch 5/10
59/59 - 0s - loss: 0.7007 - val_loss: 0.7255
Epoch 6/10
59/59 - 0s - loss: 0.7041 - val_loss: 0.7391
Epoch 7/10
59/59 - 0s - loss: 0.7205 - val_loss: 0.7530
Epoch 8/10
59/59 - 0s - loss: 0.7079 - val_loss: 0.7287
Epoch 9/10
59/59 - 0s - loss: 0.7054 - val_loss: 0.7291
Epoch 10/10
59/59 - 0s - loss: 0.7027 - val_loss: 0.7415


In [55]:
trainer.visualization()

KeyError: 'accuracy'

In [56]:
trainer.test(attention_model)


1537/1537 - 2s - loss: 0.7302
0.7302318811416626


In [None]:
trainer.test()
