# LSTM Model Training for Korean Community Website Posts

**DISCLAIMER**: *I primarily used a paid instance of Google Colab (https://colab.research.google.com/signup) to train this model and have not tested the notebook on my local system, so I cannot guarantee that it will run everywhere. However, I did modify the weight-import code to account for running on a local system. On Colab, the weights have to be loaded from Google Drive so that they persist between sessions.*

### Importing Libraries

In [27]:
from __future__ import print_function
from keras.callbacks import LambdaCallback
from keras.callbacks import ModelCheckpoint
from keras.models import Sequential
from keras.models import model_from_json
from keras.layers import Dense, Activation, Dropout
from keras.layers import LSTM
from keras.utils.data_utils import get_file
from keras.optimizers import RMSprop
from keras.optimizers import Adam

In [28]:
import pandas as pd
import numpy as np

from urllib.request import urlopen
import json
import io
import random
import sys

### Importing Data

In [29]:
# # When running on Google Colab
# from google.colab import drive
# drive.mount('/content/drive')

# url = 'https://raw.githubusercontent.com/duckonomy/cs344/master/project/api/models/dcinside.json'
# data = urlopen(url).read().decode('utf-8')
# data = json.loads(data)

# load_path = 'drive/My Drive/Colab Notebooks/checkpoint/model-ilbe.h5'
# load_model = 'drive/My Drive/Colab Notebooks/checkpoint/model.json'

In [30]:
# Paths used when the notebook runs on a local system (relative to the
# working directory). Each community has its own scraped data file and its
# own weights file; both point at the same architecture file, model.json.

# --- dcinside ---
data_file_dcinside = 'api/models/dcinside.json'
load_model_dcinside = 'api/models/model.json'
load_weights_dcinside = 'api/models/model-dcinside.h5'

# --- opgg ---
data_file_opgg = 'api/models/opgg.json'
load_model_opgg = 'api/models/model.json'
load_weights_opgg = 'api/models/model-opgg.h5'

For this example, we will use the data file of a single community (dcinside).

In [31]:
# Three independent, initially empty dicts: the combined record plus the
# per-field maps that the loading cell fills in.
data_new, data_title, data_content = {}, {}, {}

In [32]:
# Substrings that disqualify a title / content fragment.
# The backslash patterns are raw strings so that they keep exactly the
# meaning they had before (a literal backslash followed by the text) without
# relying on an invalid escape sequence, which newer Python versions warn
# about and will eventually reject.
# NOTE(review): these are plain substring tests, not regular expressions, so
# r'\.jpg', r'\.gif' and r'\.' only match text that contains a literal
# backslash -- confirm whether '.jpg' / '.gif' was intended.
_TITLE_MARKERS = ('\xa0', '\n', 'jpg', 'gif', 'fact')
_CONTENT_MARKERS = (
    'http', '\xa0', '\n', '- dc official App', r'\.jpg', r'\.gif', r'\.',
)


def _clean_fragments(fragments, markers):
    """Join the fragments that contain none of *markers* and strip the result.

    *fragments* is iterated element by element and each element is tested
    with ``marker in element``.
    NOTE(review): if a post field is a plain string rather than a list of
    strings, the elements are single characters and only the one-character
    markers can ever match -- verify against the scraper's output schema.
    """
    kept = [c for c in fragments if not any(m in c for m in markers)]
    return ''.join(map(str, kept)).strip()


with open(data_file_dcinside, encoding='utf-8') as json_file:
    data = json.load(json_file)

# Key every post by its running position (as a string) in both maps.
for post_idx, post in enumerate(data):
    key = str(post_idx)
    data_title[key] = _clean_fragments(post['title'], _TITLE_MARKERS)
    data_content[key] = _clean_fragments(post['content'], _CONTENT_MARKERS)

data_new['title'] = data_title
data_new['content'] = data_content

In [33]:
# Serialise the cleaned {'title': {...}, 'content': {...}} mapping to a JSON
# string; ensure_ascii=False keeps non-ASCII characters as-is instead of
# \uXXXX escapes.
json_final = json.dumps(data_new, ensure_ascii=False)

In [34]:
# Parse the JSON text into a DataFrame with 'title' and 'content' columns.
# Passing a literal JSON string to read_json is deprecated in recent pandas,
# so the text is wrapped in a file-like buffer; the parsed result is the same.
df = pd.read_json(io.StringIO(json_final))

# Plain numpy views of the two columns.
title_text_arr = df['title'].to_numpy()
content_text_arr = df['content'].to_numpy()

In [35]:
# Lower-case both columns so the character vocabulary is case-insensitive.
text = df['content'].str.lower()
text_content = df['title'].str.lower()  # despite its name, this holds the titles

# Stack post bodies and titles into one Series of training strings.
# Series.append was removed in pandas 2.0; pd.concat is the supported
# equivalent and, like append, keeps the original index labels.
text = pd.concat([text, text_content])

In [36]:
# Whitespace-separated tokens containing any of these substrings are dropped.
# r'\.' is a raw string so it keeps its previous two-character meaning
# (backslash + dot) without relying on an invalid escape sequence.
# NOTE(review): this is a substring test, not a regex, so r'\.' only matches
# tokens containing a literal backslash -- confirm whether '.' was intended.
_UNWANTED_TOKEN_PARTS = ('http', 'gif', 'jpg', 'fact', r'\.', '\u200b')


def _drop_unwanted_tokens(s):
    """Return *s* with unwanted tokens removed and whitespace normalised.

    The string is split on whitespace, every token containing one of
    ``_UNWANTED_TOKEN_PARTS`` is discarded, and the rest are re-joined with
    single spaces. One pass gives the same result as filtering once per
    substring, because split/join is idempotent on the surviving tokens.
    """
    return ' '.join(
        tok for tok in s.split()
        if not any(part in tok for part in _UNWANTED_TOKEN_PARTS)
    )


text = text.map(_drop_unwanted_tokens)

# Keep only entries that are longer than 13 characters after cleaning.
text = text[text.map(len) > 13]

# Character vocabulary: every distinct character in the corpus, sorted, with
# lookup tables in both directions (char -> index, index -> char).
chars = sorted(set(''.join(text)))
print('total chars:', len(chars))
char_indices = {c: i for i, c in enumerate(chars)}
indices_char = {i: c for i, c in enumerate(chars)}

total chars: 2057


In [44]:
# maxlen: number of characters in one input window (the time dimension of
#         the one-hot arrays fed to the network).
# step:   not referenced by the cells shown here -- presumably the stride
#         used when cutting training windows; verify against the
#         vectorisation code.
maxlen, step = 20, 2

In [672]:
def sample(preds, temperature=1.0):
    """Draw one index from a temperature-rescaled probability vector.

    The probabilities are moved to log space, divided by *temperature*,
    exponentiated and renormalised; a single multinomial draw is then taken
    from the resulting distribution and the index of the drawn entry is
    returned.
    """
    scaled_log_probs = np.log(np.asarray(preds).astype('float64')) / temperature
    weights = np.exp(scaled_log_probs)
    distribution = weights / np.sum(weights)
    # One trial, one experiment: a 1 x len(preds) one-hot row.
    draw = np.random.multinomial(1, distribution, 1)
    return np.argmax(draw)

def generate_w2_seed(sentence, diversity):
    """Extend a seed string by 40 characters sampled from ``model``.

    Only the first ``maxlen`` characters of *sentence* are used as the seed;
    the seed is echoed to stdout. At every step the current window is
    one-hot encoded via ``char_indices``, ``model`` predicts the
    next-character distribution, ``sample`` picks an index at the given
    *diversity* (temperature), and the window slides forward by one
    character.

    Returns the seed followed by the 40 generated characters. A seed
    character that is missing from ``char_indices`` raises ``KeyError``.
    """
    window = sentence[:maxlen]
    pieces = [window]

    sys.stdout.write(window)

    vocab_size = len(chars)
    for _ in range(40):
        x_pred = np.zeros((1, maxlen, vocab_size))
        for pos, ch in enumerate(window):
            x_pred[0, pos, char_indices[ch]] = 1.

        probabilities = model.predict(x_pred, verbose=0)[0]
        picked = indices_char[sample(probabilities, diversity)]

        pieces.append(picked)
        # Slide the window: drop the oldest character, append the new one.
        window = window[1:] + picked

    return ''.join(pieces)

In [693]:
def on_epoch_end(epoch, _):
    """Print sample generations after each epoch (Keras callback hook).

    A random entry of ``text`` is chosen and its first ``maxlen`` characters
    are used as the seed. For each diversity (sampling temperature) in
    0.2, 0.5, 1.0 and 1.2, thirty characters are generated with ``model``
    and streamed to stdout.

    Args:
        epoch: Index of the epoch that just finished; only used in the
            printed header.
        _: Second callback argument supplied by Keras; unused.
    """
    print()
    print('Generating text after Epoch: %d' % epoch)

    tweet = np.random.choice(text)
    start_index = 0

    for diversity in [0.2, 0.5, 1.0, 1.2]:
        print('diversity:', diversity)

        # Every diversity restarts from the same seed window.
        sentence = tweet[start_index: start_index + maxlen]
        print('Generating with seed: "' + sentence + '"')
        sys.stdout.write(sentence)

        for _step in range(30):
            # One-hot encode the current window.
            x_pred = np.zeros((1, maxlen, len(chars)))
            for t, char in enumerate(sentence):
                x_pred[0, t, char_indices[char]] = 1.

            # Predict with `model`, the network that is built and trained in
            # this notebook (the previous `model2` is not defined anywhere).
            preds = model.predict(x_pred, verbose=0)[0]
            next_index = sample(preds, diversity)
            next_char = indices_char[next_index]

            # Slide the window forward by one character.
            sentence = sentence[1:] + next_char

            sys.stdout.write(next_char)
            sys.stdout.flush()
        print()

In [697]:
print('Build model...')

# Character-level network: two stacked 128-unit LSTM layers, each followed by
# 20% dropout, and a softmax over the character vocabulary. The input is a
# window of `maxlen` one-hot vectors of width len(chars).
model = Sequential()
# return_sequences=True so the second LSTM receives the full sequence.
model.add(LSTM(128, input_shape=(maxlen, len(chars)), return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(128))
model.add(Dropout(0.2))
model.add(Dense(len(chars), activation='softmax'))

optimizer = Adam()
# Compile the model that was just built. The previous `model2.compile(...)`
# referred to a name that is never defined and left `model` uncompiled.
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

Build model...


In [700]:
epochs = 400

# Callbacks: print sample generations after every epoch, and overwrite the
# weights file whenever the training loss reaches a new minimum.
print_callback = LambdaCallback(on_epoch_end=on_epoch_end)
checkpointer = ModelCheckpoint(
    filepath=load_weights_dcinside,
    monitor='loss',
    verbose=1,
    save_best_only=True,
    mode='min',
)

# Start from the previously saved weights; the checkpoint above writes back
# to the same file.
model.load_weights(load_weights_dcinside)

# NOTE(review): `x` and `y` are not defined in any cell shown here; they
# presumably come from a vectorisation step (one-hot windows and next-char
# targets) -- confirm it has been run before this cell.
model.fit(
    x,
    y,
    batch_size=128,
    epochs=epochs,
    callbacks=[print_callback, checkpointer],
)


