# Text Generation using LSTM Model for Korean Community Website Posts

### Importing Libraries

In [60]:
from __future__ import print_function
from keras.callbacks import LambdaCallback
from keras.callbacks import ModelCheckpoint
from keras.models import Sequential
from keras.models import model_from_json
from keras.layers import Dense, Activation, Dropout
from keras.layers import LSTM
from keras.utils.data_utils import get_file
from keras.optimizers import RMSprop
from keras.optimizers import Adam

In [61]:
import pandas as pd
import numpy as np

from urllib.request import urlopen
import json
import io
import random
import sys

### Importing Data

In [62]:
# # When running on Google Colab
# from google.colab import drive
# drive.mount('/content/drive')

# url = 'https://raw.githubusercontent.com/duckonomy/cs344/master/project/api/models/dcinside.json'
# data = urlopen(url).read().decode('utf-8')
# data = json.loads(data)

# load_path = 'drive/My Drive/Colab Notebooks/checkpoint/model-ilbe.h5'
# load_model = 'drive/My Drive/Colab Notebooks/checkpoint/model.json'

In [63]:
# When working on a Local System
load_weights_dcinside = 'api/models/model-dcinside.h5'
load_model_dcinside = 'api/models/model.json'
data_file_dcinside = 'api/models/dcinside.json'

load_weights_opgg = 'api/models/model-opgg.h5'
load_model_opgg = 'api/models/model.json'
data_file_opgg = 'api/models/opgg.json'

For this example we will be using a single community datafile

In [64]:
data_new = {}
data_title = {}
data_content = {}

In [65]:
with open(data_file_dcinside, encoding='utf-8') as json_file:
    j = 0

    data = json.load(json_file)

    for i in data:
        title = i['title']
        title = [c for c in title if '\xa0' not in c]
        title = [c for c in title if '\n' not in c]
        title = [c for c in title if 'jpg' not in c]
        title = [c for c in title if 'gif' not in c]
        title = [c for c in title if 'fact' not in c]

        title_str = ''.join(map(str, title))
        data_title[str(j)] = title_str.strip()
        content = i['content']
        content = [c for c in content if 'http' not in c]
        content = [c for c in content if '\xa0' not in c]
        content = [c for c in content if '\n' not in c]
        content = [c for c in content if '- dc official App' not in c]
        content = [c for c in content if '\.jpg' not in c]
        content = [c for c in content if '\.gif' not in c]
        content = [c for c in content if '\.' not in c]

        content_str = ''.join(map(str, content))
        data_content[str(j)] = content_str.strip()
        j += 1

    data_new['title'] = data_title
    data_new['content'] = data_content
    
json_final = json.dumps(data_new, ensure_ascii=False)

In [66]:
df = pd.read_json(json_final)
title_text_arr = df['title'].to_numpy()
content_text_arr = df['content'].to_numpy()

text = df['content'].str.lower()
text_content = df['title'].str.lower()

text = text.append(text_content)

In [67]:
text = text.map(lambda s: ' '.join([x for x in s.split() if '\u200b' not in x]))

# Eliminate text that isn't as long
text = text[text.map(len) > 13]

# Map the characters bidirectionally for encoding
chars = sorted(list(set(''.join(text))))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

chars_length = len(chars)

In [74]:
max_sequence_length = 20
step = 2

In [68]:
def sample(predictions, temperature=0.2):
    predictions = np.asarray(predictions).astype('float64')
    predictions = np.log(predictions) / temperature
    exp_preds = np.exp(predictions)
    predictions = exp_preds / np.sum(exp_preds)
    probabilities = np.random.multinomial(1, predictions, 1)
    return np.argmax(probabilities)

# Similar to print_current_model() in lstm_train.ipynb
def generate_text(sequence, diversity):
    sequence = sequence[0:max_sequence_length]
    generated = ''
    generated += sequence

    sys.stdout.write(generated)

    for i in range(40):
        x_pred = np.zeros((1, max_sequence_length, len(chars)))
        for t, char in enumerate(sequence):
            x_pred[0, t, char_indices[char]] = 1.

        preds = model.predict(x_pred, verbose=0)[0]
        next_index = sample(preds, diversity)
        next_char = indices_char[next_index]

        generated += next_char
        sequence = sequence[1:] + next_char

    return generated

Load model to evaluate

In [69]:
json_file = open(load_model_dcinside, 'r')
model_json = json_file.read()
json_file.close()
model = model_from_json(model_json)
model.load_weights(load_weights_dcinside)

In [70]:
optimizer = Adam()
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

In [71]:
blacklist = [
    "어",
    "듬",
    "음",
    "슴",
    "지",
    "삼",
    "듯",
    "음",
    "야",
    "럼",
    "옴",
    "임",
    "며",
    "좀",
    "김",
    "림",
    "금"
    "나",
    "네",
    "다",
    "점",
    "함",
    "셈",
    "자",
    "써",
    "데",
    "요",
    "라",
    "퍼",
    "중",
    "됨",
    "셈",
    "까",
    "짐",
    "당",
    "님",
    "분",
    "니",
    "햇",
    "가",
    "냐",
    "븃",
    "죠",
    "구나",
    "조음",
    "웃김",
    "들아",
    "있노",
    "냐고",
    "앗서",
    "잖아",
    "ㅅㄱ",
    "ㅋㅋ",
    "ㄹㅇ",
    "ㄷㄷ",
    "ㅜㅜ",
    "ㅎㅎ",
    "네ㅔ",
]

In [72]:
def blacklist_contains(blacklist, seed):
    for i in blacklist:
        if i in seed[-2:]:
            return True
    return False

In [73]:
my_text = random.choice(list(text))

while True:
    if (blacklist_contains(blacklist, my_text)):
        my_text = random.choice(list(text))
    else:
        break

print(generate_text(my_text, 0.2))

더워서 창문열엇더니 바로 재채기 나오더워서 창문열엇더니 바로 재채기 나오내버릴놓구도지부터 커스리랑 소용한 사업이 굴치보루 해담히었는 전이어보고하


  predictions = np.log(predictions) / temperature
