<a href="https://colab.research.google.com/github/emiatej9/sentence-classifier/blob/master/colab/cnn_yoon_kim_2014.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [18]:
# Colab line magic requesting TensorFlow 2.x. As the message printed for this
# cell shows, it has no effect once TensorFlow is already loaded, so it must run
# on a fresh runtime before the cell that imports tensorflow.
%tensorflow_version 2.x

TensorFlow is already loaded. Please restart the runtime to change versions.


In [0]:
import json
import requests
import os
import tensorflow as tf
import numpy as np

# Base URL of the raw files on the repository's master branch. The dataset, the
# parameter JSON files and the model source files are all fetched relative to it.
git_url = 'https://raw.githubusercontent.com/emiatej9/sentence-classifier/master'

# Reset the global state Keras keeps between model builds, so that re-running
# the notebook in the same runtime does not accumulate old graphs/layers.
tf.keras.backend.clear_session()

# nsmc 데이터셋 다운로드 


*   mecab을 이용하여 미리 전처리된 데이터셋을 다운로드(./data/nsmc)
*   문장(sentences.txt)과 긍/부정 라벨(labels.txt)을 train, dev, test 별로 저장함



In [58]:
dataset_path = 'data/nsmc'
# split name -> (sentences file, labels file), both relative to dataset_path
dataset_files = {
    'train': ('train/sentences.txt', 'train/labels.txt'),
    'dev': ('dev/sentences.txt', 'dev/labels.txt'),
    'test': ('test/sentences.txt', 'test/labels.txt')}


def _download_text(url, local_path, timeout=30):
    """Download `url`, store it as UTF-8 text at `local_path`, and return the text.

    The HTTP request is completed and validated *before* the local file is
    opened, so a failed download never creates or overwrites the local copy.

    Args:
        url: address of the remote text file.
        local_path: where to save the downloaded text; parent directories are
            created when missing.
        timeout: seconds to wait for the server before giving up.

    Returns:
        The decoded file content as a single string.

    Raises:
        requests.HTTPError: the server answered with a 4xx/5xx status.
        requests.Timeout: the server did not answer within `timeout` seconds.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    text = response.content.decode('utf-8')

    os.makedirs(os.path.dirname(local_path), exist_ok=True)
    with open(local_path, 'w', encoding='utf-8') as f:
        f.write(text)
    return text


# all dataset splits are loaded into a dict: split -> {'sentences': [...], 'labels': [...]}
dataset = {}

for split, (sentences_file, labels_file) in dataset_files.items():
    _path = f'{dataset_path}/{split}'

    # download each file from github and keep a local copy as txt
    sentences = _download_text(f'{git_url}/{dataset_path}/{sentences_file}',
                               f'{_path}/sentences.txt')
    labels = _download_text(f'{git_url}/{dataset_path}/{labels_file}',
                            f'{_path}/labels.txt')

    dataset[split] = {
        'sentences': sentences.split('\n'),
        'labels': list(map(int, labels.split('\n'))),
    }

    # Sentences and labels are paired by line number, so a length mismatch
    # means the two files are out of sync; fail here rather than during training.
    n_sentences = len(dataset[split]['sentences'])
    n_labels = len(dataset[split]['labels'])
    if n_sentences != n_labels:
        raise ValueError(
            f'{split}: {n_sentences} sentences but {n_labels} labels')

평점 나쁘 않다 더더욱 아니


# 파라미터 다운로드

*   파라미터 JSON 파일 dataset_params.json, model_params.json, training_params.json 을 ./params 밑에 다운로드
* 다운로드 후, 각 JSON 파일을 data_params, model_params, training_params 객체로 초기화.



In [0]:
params_files = ('dataset_params.json',
                'model_params.json',
                'training_params.json')

os.makedirs('./params', exist_ok=True)

# Download every parameter file and keep a local copy under ./params.
for params_file in params_files:
    _url = f'{git_url}/params/{params_file}'
    # A timeout keeps the cell from hanging forever on a stalled connection;
    # raise_for_status() reports HTTP errors (including the URL) even when
    # Python runs with assertions disabled.
    _res = requests.get(_url, timeout=30)
    _res.raise_for_status()

    with open(f'params/{params_file}', 'w', encoding='utf-8') as f:
        f.write(_res.content.decode('utf-8'))


def _load_params(file_name):
    """Parse params/<file_name> as JSON and return the resulting object.

    The file is read as UTF-8, the same encoding it was written with, instead
    of relying on the platform's default encoding.
    """
    with open(f'params/{file_name}', encoding='utf-8') as f:
        return json.load(f)


data_params = _load_params('dataset_params.json')
model_params = _load_params('model_params.json')
training_params = _load_params('training_params.json')

In [0]:
import importlib

os.makedirs('./model', exist_ok=True)

# Download input_fn.py and model_fn.py into ./model so they can be imported below.
# NOTE(review): this fetches code from a moving branch and executes it; consider
# pinning git_url to a commit hash for reproducible, auditable runs.
for _module_file in ('input_fn.py', 'model_fn.py'):
    _url = f'{git_url}/model/{_module_file}'
    # A timeout keeps the cell from hanging on a stalled connection;
    # raise_for_status() reports HTTP errors together with the URL.
    _res = requests.get(_url, timeout=30)
    _res.raise_for_status()

    with open(f'model/{_module_file}', 'w', encoding='utf-8') as f:
        f.write(_res.content.decode('utf-8'))

# The modules were created after the interpreter started, so the import
# system's cached directory listings must be refreshed before importing them.
importlib.invalidate_caches()

from model.input_fn import input_fn
from model.model_fn import model_fn

# 데이터 정수 인코딩


*   input_fn에서 각 단어를 정수 형태로 인코딩하여, 문장을 벡터로 변환 후 동일한 길이로 padding 처리.
*   vocab의 크기와 문장 벡터의 차원은 data_params['vocab_size']과 data_params['max_sentence_length']을 이용.
*   긍정(1)과 부정(0) label은 바로 numpy 형태로 변환



In [0]:
# Features: input_fn converts the raw sentences of a split into model inputs
# (integer-encoded and padded, according to the notebook's description).
training_sentences, dev_sentences = (
    input_fn(dataset[split]['sentences'], data_params)
    for split in ('train', 'dev'))

# Targets: the 0/1 sentiment labels of each split as NumPy integer arrays.
training_labels, dev_labels = (
    np.asarray(list(map(int, dataset[split]['labels'])))
    for split in ('train', 'dev'))

# 모델 생성

In [65]:
# Build the network from the downloaded parameter sets, then configure it for
# binary classification.
model = model_fn(data_params, model_params)
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
# To inspect the layers, call model.summary() here; it prints on its own, so
# wrapping it in print() is unnecessary.

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


# 모델 학습

In [0]:
# Training hyper-parameters come from training_params.json.
batch_size = training_params['batch_size']
epochs = training_params['epochs']

# Keyword arguments for fit(): mini-batch size, number of passes over the
# training data, the dev split for per-epoch validation, and shuffling of the
# training samples before each epoch.
_fit_options = {
    'batch_size': batch_size,
    'epochs': epochs,
    'validation_data': (dev_sentences, dev_labels),
    'shuffle': True,
}

# Run the training ops on the first GPU; `history` keeps the per-epoch metrics.
with tf.device('/GPU:0'):
    history = model.fit(training_sentences, training_labels, **_fit_options)

Train on 136630 samples, validate on 29278 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [0]:
import matplotlib.pyplot as plt
%matplotlib inline

plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='upper left')
plt.show()

plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='upper left')
plt.show()

In [0]:
# Persist the trained weights (weights only, not the model architecture) to
# weight.h5 in the current working directory.
model.save_weights('weight.h5')

In [0]:
# Prepare the held-out test split the same way as the train/dev splits.
_test_split = dataset['test']
test_sentences = input_fn(_test_split['sentences'], data_params)
test_labels = np.asarray(list(map(int, _test_split['labels'])))

# evaluate() yields the loss followed by the compiled metric (accuracy).
_test_scores = model.evaluate(test_sentences, test_labels)
loss, acc = _test_scores
print(loss, acc)