# LSTM Model Training for Korean Community Website Posts

**DISCLAIMER**: *I primarily used a paid instance of Google Colab (https://colab.research.google.com/signup) to train my model and have not tested this on my local system, so I cannot guarantee that it will run on every system. However, I did modify the weight-import code to account for running on a local system. On Colab, the weights have to be loaded from Google Drive so that they persist between sessions*.

### Importing Libraries

In [27]:
from __future__ import print_function
from keras.callbacks import LambdaCallback
from keras.callbacks import ModelCheckpoint
from keras.models import Sequential
from keras.models import model_from_json
from keras.layers import Dense, Activation, Dropout
from keras.layers import LSTM
from keras.utils.data_utils import get_file
from keras.optimizers import RMSprop
from keras.optimizers import Adam

In [28]:
import pandas as pd
import numpy as np

from urllib.request import urlopen
import json
import io
import random
import sys

### Importing Data

In [716]:
# # When running on Google Colab
# from google.colab import drive
# drive.mount('/content/drive')

# url = 'https://raw.githubusercontent.com/duckonomy/cs344/master/project/api/models/dcinside.json'
# data = urlopen(url).read().decode('utf-8')
# data = json.loads(data)

# load_path = 'drive/My Drive/Colab Notebooks/checkpoint/model-ilbe.h5'
# load_model = 'drive/My Drive/Colab Notebooks/checkpoint/model.json'

In [717]:
# When working on a Local System
# Paths for the dcinside community. `load_weights_dcinside` is both the file
# the weights are loaded from and the checkpoint target during training;
# `data_file_dcinside` is the JSON file of posts opened by the import cell.
# NOTE(review): `load_model_dcinside` is not referenced in the visible cells.
load_weights_dcinside = 'api/models/model-dcinside.h5'
load_model_dcinside = 'api/models/model.json'
data_file_dcinside = 'api/models/dcinside.json'

# The same three paths for the opgg community.
# NOTE(review): none of the opgg paths are referenced in the visible cells, and
# both communities point at the same 'model.json' -- confirm that is intended.
load_weights_opgg = 'api/models/model-opgg.h5'
load_model_opgg = 'api/models/model.json'
data_file_opgg = 'api/models/opgg.json'

For this example we will be using a single community datafile

In [718]:
# Containers filled by the import cell: `data_title` and `data_content` map a
# stringified post index to the cleaned text, and `data_new` groups both under
# the 'title' and 'content' keys before it is serialised back to JSON.
data_new = {}
data_title = {}
data_content = {}

Importing the data from the JSON exported from Scrapy.

In [747]:
# Load the posts, drop unwanted fragments from every title/content, and
# re-serialise the cleaned text as a JSON string (`json_final`).
#
# A fragment is discarded when it contains any substring from the matching
# blocklist. The substrings are value-for-value the same as before, so the
# resulting character set (and therefore the shape of saved weights) is unchanged.
title_blocklist = ('\xa0', '\n', 'jpg', 'gif', 'fact')
# NOTE(review): the last three entries contain a literal backslash (they were
# written as '\.jpg', '\.gif', '\.'), so they only match text that contains a
# backslash followed by a dot. Raw strings keep exactly that value while
# removing the invalid-escape-sequence warning. If the intent was to drop
# fragments containing '.jpg' / '.gif', change them deliberately -- doing so
# alters the vocabulary and invalidates previously saved weights.
content_blocklist = (
    'http',
    '\xa0',
    '\n',
    '- dc official App',
    r'\.jpg',
    r'\.gif',
    r'\.',
)

with open(data_file_dcinside, encoding='utf-8') as json_file:
    data = json.load(json_file)

for j, post in enumerate(data):
    # Keep only the fragments that contain none of the blocked substrings.
    title = [
        c for c in post['title']
        if not any(blocked in c for blocked in title_blocklist)
    ]
    data_title[str(j)] = ''.join(map(str, title)).strip()

    content = [
        c for c in post['content']
        if not any(blocked in c for blocked in content_blocklist)
    ]
    data_content[str(j)] = ''.join(map(str, content)).strip()

data_new['title'] = data_title
data_new['content'] = data_content

# Ensure utf-8 is used (keep non-ASCII characters instead of \u escapes)
json_final = json.dumps(data_new, ensure_ascii=False)

Convert the data to Pandas -> Numpy

In [727]:
# Parse the cleaned JSON into a DataFrame with 'title' and 'content' columns.
# The string is wrapped in StringIO because passing a literal JSON string to
# read_json is deprecated in recent pandas releases.
df = pd.read_json(io.StringIO(json_final))
title_text_arr = df['title'].to_numpy()
content_text_arr = df['content'].to_numpy()

# Lower-case both columns; `text` starts as the contents and `text_content`
# (despite its name) holds the titles.
text = df['content'].str.lower()
text_content = df['title'].str.lower()

# Stack contents and titles into a single Series of training strings.
# Series.append was removed in pandas 2.0; pd.concat is the equivalent call.
text = pd.concat([text, text_content])

Clean up the data further

In [728]:
# Rebuild every string from its whitespace-separated tokens, leaving out any
# token that contains a zero-width space (U+200B).
text = text.map(
    lambda line: ' '.join(token for token in line.split() if '\u200b' not in token)
)

# Keep only the strings that are longer than 13 characters
is_long_enough = text.map(len) > 13
text = text[is_long_enough]

# Vocabulary: every distinct character in the corpus, in sorted order, with
# lookup tables in both directions (char -> index and index -> char).
chars = sorted(set(''.join(text)))
char_indices = {char: index for index, char in enumerate(chars)}
indices_char = dict(enumerate(chars))

chars_length = len(chars)

Create `sequences` that serve as the input training examples and `next_chars` that serve as the expected output for each training example. A shorter `max_sequence_length` and a smaller `step` both produce more examples to iterate through, so training takes longer.

In [731]:
# Window length (in characters) and the stride between consecutive windows.
max_sequence_length = 20
step = 2

# Slide a fixed-size window over every string: each window is one training
# input and the character immediately after it is the target.
sequences = []
next_chars = []
for line in text:
    last_start = len(line) - max_sequence_length
    for start in range(0, last_start, step):
        end = start + max_sequence_length
        sequences.append(line[start:end])
        next_chars.append(line[end])

sequences_length = len(sequences)

Generating a one-hot encoding, with the input `x` being a 3D matrix of `sequences_length` (number of sequences) * `max_sequence_length` (length of each sequence) * `chars_length` (number of distinct characters). Furthermore, the output `y` is a 2D matrix of `sequences_length` * `chars_length`.

In [732]:
# One-hot tensors: x[i, t, c] is True when character c sits at position t of
# sequence i; y[i, c] is True when c is the character following sequence i.
# The builtin `bool` is used as dtype because the `np.bool` alias was
# deprecated in NumPy 1.20 and removed in 1.24.
x = np.zeros((sequences_length, max_sequence_length, chars_length), dtype=bool)
y = np.zeros((sequences_length, chars_length), dtype=bool)

for i, sequence in enumerate(sequences):
    for t, char in enumerate(sequence):
        x[i, t, char_indices[char]] = True
    y[i, char_indices[next_chars[i]]] = True

Sampler that picks the next character index by drawing from the temperature-scaled probabilities rather than always returning the most probable one.

In [733]:
def sample(predictions, temperature=0.2):
    """Draw one index from `predictions` after temperature scaling.

    Args:
        predictions: 1-D array-like of probabilities (e.g. a softmax output).
        temperature: Scaling factor applied to the log-probabilities. Values
            below 1 sharpen the distribution, values above 1 flatten it.
            A temperature of exactly 0 selects the most probable index.

    Returns:
        The sampled index into `predictions`.
    """
    predictions = np.asarray(predictions).astype('float64')

    # Greedy limit: dividing by zero would otherwise produce NaN probabilities.
    if temperature == 0:
        return np.argmax(predictions)

    # log(0) is -inf, which is the correct logit for an impossible outcome;
    # silence the divide-by-zero warning it would raise.
    with np.errstate(divide='ignore'):
        logits = np.log(predictions) / temperature

    # Subtract the maximum before exponentiating so that very small
    # temperatures cannot underflow every term to zero (which gave NaN).
    exp_preds = np.exp(logits - np.max(logits))
    scaled = exp_preds / np.sum(exp_preds)

    # A single multinomial trial yields a one-hot vector; argmax recovers
    # the index that was drawn.
    probabilities = np.random.multinomial(1, scaled, 1)
    return np.argmax(probabilities)

Callback function used for printing out the generated text for each epoch

In [734]:
def print_current_model(epoch, _):
    """Print text generated by the model being trained, at several temperatures.

    Intended as the `on_epoch_end` hook of a Keras `LambdaCallback`. A random
    entry of `text` supplies the seed; for each temperature the seed is
    extended by 30 sampled characters, which are written to stdout as they
    are produced.

    Args:
        epoch: Index of the epoch that just finished.
        _: Second callback argument (unused here).
    """
    print()
    print('Current Epoch: %d' % epoch)
    tweet = np.random.choice(text)
    start_index = 0

    # Iterate through various temperatures, reusing the same seed for each
    for var_temp in [0.2, 0.5, 1.0, 1.2]:
        print('diversity:', var_temp)

        generated = ''
        sentence = tweet[start_index: start_index + max_sequence_length]
        generated += sentence
        print('Seed: "' + sentence + '"')
        sys.stdout.write(generated)

        for i in range(30):
            # One-hot encode the current window as a batch of one.
            x_pred = np.zeros((1, max_sequence_length, chars_length))
            for t, char in enumerate(sentence):
                x_pred[0, t, char_indices[char]] = 1.

            # Predict with `model`, the network defined and trained in this
            # notebook (the previous `model2` is not defined anywhere here).
            predictions = model.predict(x_pred, verbose=0)[0]
            next_index = sample(predictions, var_temp)
            next_char = indices_char[next_index]

            # Slide the window: drop the oldest character, append the new one.
            generated += next_char
            sentence = sentence[1:] + next_char

            sys.stdout.write(next_char)
            sys.stdout.flush()
        print()

Finalized Model

In [735]:
# Two stacked 128-unit LSTM layers, each followed by 20% dropout, feeding a
# softmax over the character vocabulary.
model = Sequential()
# return_sequences=True so the second LSTM receives the full output sequence.
model.add(LSTM(128, input_shape=(max_sequence_length, chars_length), return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(128))
model.add(Dropout(0.2))
model.add(Dense(chars_length, activation='softmax'))

optimizer = Adam()
# Compile the model that was just built. The previous call compiled `model2`,
# which is not defined in this notebook and left `model` uncompiled for fit().
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

Train for 400 epochs with two callbacks: one prints generated text at the end of every epoch, and the other saves a checkpoint whenever the training loss improves.

In [700]:
epochs = 400

# Resume from the weights stored at the dcinside checkpoint path.
model.load_weights(load_weights_dcinside)

# Print sample text after every epoch.
print_callback = LambdaCallback(on_epoch_end=print_current_model)
# Write to the same checkpoint path, but only when the training loss reaches
# a new minimum.
checkpoint_callback = ModelCheckpoint(
    filepath=load_weights_dcinside,
    monitor='loss',
    verbose=1,
    save_best_only=True,
    mode='min',
)
training_callbacks = [print_callback, checkpoint_callback]

model.fit(x, y, batch_size=128, epochs=epochs, callbacks=training_callbacks)


