In [None]:
from keras.engine import *
from keras.engine.topology import Container
from keras.models import Sequential
from keras.layers import *
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.losses import *
from keras.optimizers import *
import numpy as np
import keras.backend as K
from functools import reduce
from tqdm import tqdm_notebook
from keras.utils.np_utils import to_categorical
from matplotlib import pylab as plt
from IPython.display import clear_output
from prefetch_generator import BackgroundGenerator
import pickle
import os
# Enumerate GPUs in PCI bus order and expose only device "0" to this process.
# NOTE(review): these variables are set after the keras imports above; presumably the
# backend has not initialised CUDA yet at that point — confirm, or move them first.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [None]:
# Maximum sequence length in tokens: training batches are padded/truncated to this
# length, and text generation stops once the sampled sequence reaches it.
TEXT_LENGTH = 100

In [None]:
# Load the corpus: one text per line of ./parsed.txt.
# NOTE(review): no explicit encoding is passed, so the platform default is used —
# confirm it matches the encoding of the corpus file.
with open("./parsed.txt", "r") as f:
    # Skip empty lines (e.g. the empty string left behind by a trailing newline) so
    # they do not turn into empty training sequences.
    texts = [line for line in f.read().split("\n") if line]
# Keep case as-is and filter out nothing but newlines; tokens are separated with the
# Tokenizer's default split string.
tokenizer = Tokenizer(filters="\n", lower=False)

In [None]:
# Build the vocabulary from the corpus.
tokenizer.fit_on_texts(texts)
# Number of distinct tokens in the fitted vocabulary. The model sizes its embedding
# and output layers with VOCAB + 1 so that index 0 stays available for padding.
VOCAB = len(tokenizer.word_index)
# Every text encoded as a list of integer token indices.
X = tokenizer.texts_to_sequences(texts)

In [None]:
class InvTokenizer:
    """Inverse of a token -> index mapping: turns token indices back into text.

    Parameters
    ----------
    dic : dict
        Mapping from token string to integer index. Every index must lie in
        ``0 .. len(dic)``; indices that have no token decode to the empty string.

    Attributes
    ----------
    dic : dict
        The mapping passed to the constructor.
    num : int
        Number of entries in ``dic``.
    invdec : list of str
        Lookup table with ``invdec[index] == token``.
    """

    def __init__(self, dic):
        self.dic = dic
        self.num = len(dic)
        # One slot per index in 0..num; slots never assigned below stay "".
        self.invdec = ["" for _ in range(self.num + 1)]
        for c, i in self.dic.items():
            self.invdec[i] = c

    def __call__(self, array):
        """Decode an iterable of token indices into one string.

        Tokens are concatenated without a separator. ``array`` may be any iterable
        (list, NumPy array, generator, ...) whose items can be converted with
        ``int``; an empty iterable gives ``""``.
        """
        # str.join is linear in the output size and handles the empty case itself,
        # unlike pairwise concatenation.
        return "".join(self.invdec[int(i)] for i in array)
            
# Decoder that maps sequences of token indices back to text using the fitted vocabulary.
texgen = InvTokenizer(tokenizer.word_index)

In [None]:
def temperature_softmax(y, tau):
    """Return the softmax of ``y / tau``, computed in a numerically stable way.

    Parameters
    ----------
    y : array_like
        Unnormalised log-weights (e.g. log-probabilities). ``-inf`` entries are
        allowed and receive weight 0.
    tau : float
        Temperature, must be non-zero. Values above 1 flatten the distribution,
        values between 0 and 1 sharpen it.

    Returns
    -------
    numpy.ndarray
        float64 array with the same shape as ``y`` whose entries sum to 1
        (normalisation is over all elements).
    """
    scaled = np.asarray(y, dtype=np.float64) / tau
    if scaled.size == 0:
        return scaled
    # Softmax is invariant to adding a constant; shifting by the maximum makes the
    # largest exponent exp(0) == 1, so exp() can neither overflow nor underflow to
    # an all-zero vector (which would give NaN after normalisation).
    weights = np.exp(scaled - scaled.max())
    return weights / weights.sum()
    
def get_model():
    """Build and compile the token-level language model.

    Architecture: variable-length sequences of token indices (index 0 is masked)
    -> 512-dimensional embedding -> dropout -> two GRU layers of 512 units that
    return the full sequence, each followed by dropout -> a bias-free softmax
    layer over ``VOCAB + 1`` classes applied at every timestep.

    Returns
    -------
    Model
        The model, compiled with sparse categorical cross-entropy and
        ``RMSprop(1e-3)``.
    """
    token_ids = Input(shape=(None,))
    hidden = Embedding(VOCAB+1, 512, mask_zero=True)(token_ids)
    hidden = Dropout(0.1)(hidden)
    # Two identical recurrent stages: GRU followed by dropout.
    for _ in range(2):
        hidden = GRU(512, return_sequences=True, recurrent_dropout=0.1)(hidden)
        hidden = Dropout(0.1)(hidden)
    per_step_classifier = TimeDistributed(
        Dense(VOCAB+1, activation="softmax", use_bias=False)
    )
    next_token_probs = per_step_classifier(hidden)
    model = Model(token_ids, next_token_probs)
    model.compile(loss="sparse_categorical_crossentropy", optimizer=RMSprop(1e-3))
    return model

def _pad_batch(inputs, targets):
    """Pad one batch of token lists to TEXT_LENGTH; targets get a trailing axis of size 1."""
    # NOTE(review): pad_sequences truncates from the front by default, so texts longer
    # than TEXT_LENGTH lose their leading tokens — confirm this is intended.
    x = pad_sequences(inputs, maxlen=TEXT_LENGTH, padding="post")
    y = pad_sequences(targets, maxlen=TEXT_LENGTH, padding="post")
    return x, np.expand_dims(y, -1)


def datagen(X, batch_size=128, drop_last=False):
    """Yield shuffled next-token training batches for one pass over ``X``.

    Parameters
    ----------
    X : sequence of token-index lists
        The encoded texts. It is read in a random order but not modified.
    batch_size : int
        Number of texts per batch.
    drop_last : bool
        If true, a final batch with fewer than ``batch_size`` texts is discarded;
        by default it is yielded so that no text is skipped and a dataset smaller
        than ``batch_size`` still produces a batch.

    Yields
    ------
    (x, y) : tuple of numpy.ndarray
        ``x`` has shape ``(batch, TEXT_LENGTH)`` and holds each text without its
        last token; ``y`` has shape ``(batch, TEXT_LENGTH, 1)`` and holds each text
        without its first token, i.e. the token following each position of ``x``.
        Both are zero-padded at the end.
    """
    inputs, targets = [], []
    # Shuffle a list of indices instead of X itself so the caller's data keeps its order.
    order = np.random.permutation(len(X))
    for idx in tqdm_notebook(order):
        text = X[idx]
        inputs.append(text[:-1])
        targets.append(text[1:])
        if len(targets) >= batch_size:
            # The ragged lists go to pad_sequences as-is: wrapping them in np.array
            # first fails on NumPy versions that reject inhomogeneous shapes.
            yield _pad_batch(inputs, targets)
            inputs, targets = [], []
    if targets and not drop_last:
        yield _pad_batch(inputs, targets)

def get_text(mod, tau=1.0):
    """Sample one text from the model, one token at a time.

    Generation starts from the "@SOS" token. At every step the whole sequence so
    far is fed to ``mod``, the predicted distribution at the last position is
    re-weighted with temperature ``tau`` and the next token is drawn from it.
    Sampling stops when token index 1 is drawn or when the sequence (including
    "@SOS") reaches ``TEXT_LENGTH`` tokens.

    Parameters
    ----------
    mod : model
        Object whose ``predict`` returns per-timestep probabilities over
        ``VOCAB + 1`` token indices for a batch of index sequences.
    tau : float
        Sampling temperature passed to ``temperature_softmax``.

    Returns
    -------
    str
        The decoded text, without the leading "@SOS" token and without the
        stopping token.
    """
    tokens = [tokenizer.word_index["@SOS"]]
    while len(tokens) < TEXT_LENGTH:
        probs = mod.predict(np.array([tokens]))[0][-1]
        log_probs = np.log(probs)
        # Index 0 is the padding/mask index (the embedding uses mask_zero=True), not a
        # real token, so make sure it can never be sampled.
        log_probs[0] = -np.inf
        probs = temperature_softmax(log_probs, tau)
        token = np.random.choice(VOCAB+1, p=probs)
        # Index 1 acts as the stop token.
        # NOTE(review): presumably this is the end-of-text marker — confirm against
        # tokenizer.word_index.
        if token == 1:
            break
        tokens.append(token)
    return texgen(tokens[1:])

In [None]:
# Build the network and print its layer summary.
mod = get_model()
mod.summary()
# Resume from the checkpoint written by the training loop when there is one. On a
# first run the file does not exist yet, so keep the freshly initialised weights
# instead of failing.
if os.path.exists("./auto_twitter_2.h5"):
    mod.load_weights("./auto_twitter_2.h5")
else:
    print("No checkpoint found at ./auto_twitter_2.h5 - starting from random weights")
# Per-batch training losses, appended to by the training loop.
history = []

In [None]:
# Training configuration.
epoch = 100
batch_size = 200
# Train with Adam instead of the optimizer chosen in get_model(). Recompiling is the
# supported way to change the optimizer: it rebuilds the training function and keeps
# the current weights, whereas assigning mod.optimizer directly is ignored once a
# training function has already been built.
mod.compile(loss="sparse_categorical_crossentropy", optimizer=Adam())
for e in range(epoch):
    # One pass over the corpus; batches are prepared in a background thread.
    gen = datagen(X, batch_size=batch_size)
    for x, y in BackgroundGenerator(gen):
        loss = mod.train_on_batch(x, y)
        history.append(loss)
    # After every epoch: redraw the loss curve, show one sample and save a checkpoint.
    clear_output()
    plt.plot(history)
    plt.show()
    print(get_text(mod))
    mod.save(filepath="./auto_twitter_2.h5", overwrite=True)

In [None]:
# Draw 20 samples at temperature 1.5 and print them one per line. All samples are
# generated before anything is written.
print(*(get_text(mod, tau=1.5) for _ in range(20)), sep="\n")