# Text Generation using LSTM Model for Korean Community Website Posts

### Importing Libraries

In [1]:
from __future__ import print_function
from keras.callbacks import LambdaCallback
from keras.callbacks import ModelCheckpoint
from keras.models import Sequential
from keras.models import model_from_json
from keras.layers import Dense, Activation, Dropout
from keras.layers import LSTM
from keras.utils.data_utils import get_file
from keras.optimizers import RMSprop
from keras.optimizers import Adam

Using TensorFlow backend.


In [2]:
import pandas as pd
import numpy as np

from urllib.request import urlopen
import json
import io
import random
import sys

### Importing Data

In [None]:
# # When running on Google Colab
# from google.colab import drive
# drive.mount('/content/drive')

# url = 'https://raw.githubusercontent.com/duckonomy/cs344/master/project/api/models/dcinside.json'
# data = urlopen(url).read().decode('utf-8')
# data = json.loads(data)

# load_path = 'drive/My Drive/Colab Notebooks/checkpoint/model-ilbe.h5'
# load_model = 'drive/My Drive/Colab Notebooks/checkpoint/model.json'

In [13]:
# Paths used when working on a local system (relative to the working directory).
_MODELS_DIR = 'api/models'

# DC Inside artifacts: trained weights, architecture JSON, scraped posts.
load_weights_dcinside = _MODELS_DIR + '/model-dcinside.h5'
load_model_dcinside = _MODELS_DIR + '/model.json'
data_file_dcinside = _MODELS_DIR + '/dcinside.json'

# OP.GG artifacts.
# NOTE(review): both communities point at the same model.json -- confirm this is intended.
load_weights_opgg = _MODELS_DIR + '/model-opgg.h5'
load_model_opgg = _MODELS_DIR + '/model.json'
data_file_opgg = _MODELS_DIR + '/opgg.json'

For this example we will use a single community data file (the DC Inside dataset).

In [14]:
# Containers filled by the loading cell: cleaned titles and cleaned bodies,
# each keyed by the post's position (as a string), plus the combined mapping
# with 'title' and 'content' entries.
data_new, data_title, data_content = {}, {}, {}

In [15]:
# A title fragment containing any of these substrings is discarded.
TITLE_NOISE = ('\xa0', '\n', 'jpg', 'gif', 'fact')

# A content fragment containing any of these substrings is discarded.
# NOTE(review): the last three entries are a literal backslash followed by
# '.jpg' / '.gif' / '.'. The original wrote them as '\.jpg' etc., which Python
# reads as backslash + dot (an invalid escape sequence that newer interpreters
# warn about). They are spelled explicitly here so the runtime behaviour is
# unchanged; they were probably meant to match file extensions, but changing
# the filter would change the character vocabulary derived from this data.
CONTENT_NOISE = ('http', '\xa0', '\n', '- dc official App',
                 '\\.jpg', '\\.gif', '\\.')


def _join_clean(fragments, noise):
    """Concatenate the fragments that contain none of the `noise` substrings.

    Iterates `fragments`, keeps every element in which no entry of `noise`
    occurs, joins the survivors without a separator and strips surrounding
    whitespace.

    NOTE(review): assumes each post field is a list of string fragments. If a
    field is a plain string, iteration is per character and the
    multi-character entries can never match -- TODO confirm the schema.
    """
    kept = [frag for frag in fragments
            if not any(bad in frag for bad in noise)]
    return ''.join(map(str, kept)).strip()


with open(data_file_dcinside, encoding='utf-8') as json_file:
    data = json.load(json_file)

# Key every cleaned title/body by the post's position in the file.
for j, post in enumerate(data):
    data_title[str(j)] = _join_clean(post['title'], TITLE_NOISE)
    data_content[str(j)] = _join_clean(post['content'], CONTENT_NOISE)

data_new['title'] = data_title
data_new['content'] = data_content

In [16]:
# Serialize the cleaned posts to a JSON string; ensure_ascii=False keeps
# non-ASCII (Korean) characters as-is instead of emitting \uXXXX escapes.
json_final = json.dumps(data_new, ensure_ascii=False)

In [17]:
# Build a two-column frame ('title', 'content') from the serialized posts.
# read_json is handed a file-like object: passing a literal JSON string is
# deprecated in recent pandas releases.
df = pd.read_json(io.StringIO(json_final))
title_text_arr = df['title'].to_numpy()
content_text_arr = df['content'].to_numpy()

In [18]:
# Lower-case both columns and stack them into one Series of training text:
# post bodies first, then titles.
# NOTE: despite its name, `text_content` holds the lower-cased *titles*.
text_content = df['title'].str.lower()

# Series.append was removed in pandas 2.0; pd.concat is the equivalent.
text = pd.concat([df['content'].str.lower(), text_content])

In [19]:
# Whitespace-delimited tokens containing any of these substrings are dropped.
# NOTE(review): '\\.' is a literal backslash followed by a dot. The original
# wrote '\.', an invalid escape sequence that Python reads the same way but
# warns about on newer versions. It is kept behaviour-identical on purpose:
# the vocabulary built below sets the one-hot width fed to the loaded model,
# so changing what is filtered could change that width.
NOISE_SUBSTRINGS = ('http', 'gif', 'jpg', 'fact', '\\.', '\u200b')

# Entries with this many characters or fewer are discarded.
MIN_TEXT_LEN = 13


def _drop_noise_tokens(s):
    """Re-join the whitespace-split tokens of `s` with single spaces,
    omitting every token that contains one of NOISE_SUBSTRINGS."""
    return ' '.join(tok for tok in s.split()
                    if not any(bad in tok for bad in NOISE_SUBSTRINGS))


# One pass over the corpus; equivalent to filtering once per substring.
text = text.map(_drop_noise_tokens)

text = text[text.map(len) > MIN_TEXT_LEN]

# Character vocabulary and the two lookup tables (char -> index, index -> char).
chars = sorted(set(''.join(text)))
print('total chars:', len(chars))
char_indices = {c: i for i, c in enumerate(chars)}
indices_char = {i: c for i, c in enumerate(chars)}

total chars: 2057


In [27]:
# maxlen: number of characters in the window that is one-hot encoded and fed
# to the model when generating text.
# step: presumably the stride between training windows; not used by the cells
# shown here -- TODO confirm.
maxlen, step = 20, 2

In [21]:
def sample(preds, temperature=1.0):
    """Draw one index from a probability vector after temperature scaling.

    Args:
        preds: array-like of probabilities (e.g. a softmax output).
        temperature: divisor applied to the log-probabilities. Values below 1
            sharpen the distribution, values above 1 flatten it.

    Returns:
        The index of the sampled entry.
    """
    probs = np.asarray(preds).astype('float64')

    # Zero-probability entries are legitimate: log(0) = -inf maps back to a
    # weight of exactly 0, so silence the divide-by-zero warning locally.
    with np.errstate(divide='ignore'):
        logits = np.log(probs) / temperature

    # Shift so the largest logit is 0. Without this, a small temperature makes
    # every exp() underflow to 0 and the normalisation below yields NaN.
    logits = logits - np.max(logits)

    weights = np.exp(logits)
    probs = weights / np.sum(weights)
    draw = np.random.multinomial(1, probs, 1)
    return np.argmax(draw)

def generate_w2_seed(sentence, diversity, length=40):
    """Generate text character by character, starting from a seed sentence.

    The seed is truncated to its first `maxlen` characters and written to
    stdout. Each step one-hot encodes the current window, asks the model for
    the next-character distribution, samples from it with `sample`, appends
    the sampled character and slides the window forward by one.

    Args:
        sentence: seed text; every character must be a key of `char_indices`,
            otherwise a KeyError is raised.
        diversity: temperature passed to `sample`.
        length: number of characters to generate (defaults to 40).

    Returns:
        The truncated seed followed by the generated characters.
    """
    sentence = sentence[0:maxlen]
    generated = sentence

    sys.stdout.write(generated)

    vocab_size = len(chars)
    for _ in range(length):
        # Fresh one-hot window each step; positions past the end of a seed
        # shorter than maxlen stay all-zero.
        x_pred = np.zeros((1, maxlen, vocab_size))
        for t, char in enumerate(sentence):
            x_pred[0, t, char_indices[char]] = 1.

        preds = model.predict(x_pred, verbose=0)[0]
        next_char = indices_char[sample(preds, diversity)]

        generated += next_char
        # Slide the window: drop the oldest character, append the new one.
        sentence = sentence[1:] + next_char

    return generated

In [22]:
# Rebuild the network from its JSON architecture description, then restore
# the trained weights. The context manager closes the file even if the read
# fails.
with open(load_model_dcinside, 'r') as json_file:
    model_json = json_file.read()
model = model_from_json(model_json)
model.load_weights(load_weights_dcinside)

Loaded model from disk


In [23]:
# Attach an Adam optimizer (default hyperparameters) and the categorical
# cross-entropy loss to the restored model.
optimizer = Adam()
model.compile(optimizer=optimizer, loss='categorical_crossentropy')

In [31]:
# Endings that disqualify a text from being used as a generation seed; they
# are matched against the last two characters of the candidate.
# Fix: the original was missing the comma after "금", which silently merged it
# with "구나" into the single entry "금구나" -- a three-character string that
# can never occur inside a two-character tail. Exact duplicates ("음", "셈")
# are listed once.
blacklist = [
    "어", "듬", "음", "슴", "지", "삼", "듯", "야", "럼", "옴",
    "임", "며", "좀", "김", "ㄷㄷ", "림", "금", "구나", "조음", "웃김",
    "ㅅㄱ", "나", "ㅋㅋ", "들아", "다", "네", "있노", "냐", "냐고", "점",
    "함", "네ㅔ", "셈", "앗서", "자", "써", "ㄹㅇ", "데", "요", "라",
    "퍼", "중", "됨", "ㅜㅜ", "ㅎㅎ", "까", "짐", "당", "님", "분",
    "니", "햇", "가", "죠", "잖아", "븃",
]

In [32]:
def blacklist_contains(blacklist, seed):
    """Return True when any blacklist entry occurs within the last two
    characters of `seed`, otherwise False."""
    tail = seed[-2:]
    return any(entry in tail for entry in blacklist)

In [34]:
# Keep only the texts whose last two characters contain no blacklisted ending.
# Filtering once up front gives the same uniform choice as retrying random
# picks, but cannot loop forever when every text is rejected and does not
# rebuild the list on each attempt.
seed_candidates = [s for s in text if not blacklist_contains(blacklist, s)]
if not seed_candidates:
    raise ValueError('no seed text passes the blacklist filter')

my_text = random.choice(seed_candidates)
print(my_text)
print(my_text[-2:])

print(generate_w2_seed(my_text, 0.2))

이 귀여운 생물체는 정체가 뭐냐
heh
밥 먹으러 왔는데 옆자리에 페미언냐 앉음
heh
딘 얼굴에서 눈꼬리 살짝 내리고안경 씌워주면 딱 나일듯
heh
오버워치 잘 하면 발로란트도 잘함
heh
우리 인싸들 열심히 싸돌아다니고있노
heh
초심 잃었네진짜 개구리나 뉴트로지나나 삶아먹으라고
heh
시盧엘리스 7월 1일에 오픈한다는데
heh
우리들의 처음을 앗아갔던 게임 속 그녀들
녀들


  preds = np.log(preds) / temperature


녀로 결리에 실부한가 남면 악가 있어나도 좋을 생각이나 세명하고대도 좀해서 울준 고능인 남자들고 당신가거서 뭔가 자리 게 알아버렸다고거그거네ㅋㅋ내리 아니리사에 기가고 있다는거고 울려고 신리가 연관의 포적당 야성기, 
