In [None]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from keras.layers import Dense,LSTM,Embedding, Dropout
from keras.models import Sequential

from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
import keras.utils as ku 
from keras.callbacks import EarlyStopping

# Set seeds for reproducibility.
# NOTE(review): this seeds only NumPy's global RNG; no other library's random
# state is seeded in this cell — confirm that is sufficient for the Keras backend.
from numpy.random import seed
seed(1)

import pandas as pd
import numpy as np
import string, os 

import warnings
# Install a blanket "ignore" filter so no warnings are shown in the notebook output.
warnings.filterwarnings("ignore")
# FutureWarning is additionally ignored explicitly.
warnings.simplefilter(action='ignore', category=FutureWarning)

import os
# Show which files are available in the input directory.
print(os.listdir("../input"))


In [None]:
# Read the input file without a header row, so the first line is treated as data.
df = pd.read_csv('../input/latin.txt', header=None)

# Take column 0 of every row and keep only the text before its first '.'.
corpus = [entry.split('.')[0] for entry in df.loc[:, 0].tolist()]

# Display the number of corpus entries.
len(corpus)

In [None]:
def get_sequence_of_tokens(corpus, min_context=3):
    """Encode every corpus entry character by character and expand it into prefixes.

    Each distinct character is assigned an integer id in order of first
    appearance (the first character seen gets id 0). Every entry is then
    turned into all of its prefixes that are longer than ``min_context``
    tokens, so each returned sequence has at least ``min_context`` leading
    tokens followed by one more token.

    Args:
        corpus: iterable of strings (it is iterated twice, so it must be
            re-iterable, e.g. a list).
        min_context: minimum number of tokens that precede the final token
            of a generated prefix. Defaults to 3, the value that was
            previously hard-coded.

    Returns:
        A tuple ``(input_sequences, vocabulary_size, unique)`` where
        ``input_sequences`` is a list of lists of character ids,
        ``vocabulary_size`` is ``len(unique)`` and ``unique`` is the list of
        characters indexed by id.

    Raises:
        ValueError: if ``min_context`` is negative.
    """
    if min_context < 0:
        raise ValueError("min_context must be non-negative")

    # Build the vocabulary with a dict for constant-time membership tests and
    # lookups; `unique` keeps the id -> character list that callers index into.
    unique = []
    char_to_index = {}
    for word in corpus:
        for char in word:
            if char not in char_to_index:
                char_to_index[char] = len(unique)
                unique.append(char)

    # Convert each entry to ids and emit every prefix of length > min_context.
    input_sequences = []
    for word in corpus:
        token_list = [char_to_index[ch] for ch in word]
        for i in range(min_context, len(token_list)):
            input_sequences.append(token_list[:i + 1])

    return input_sequences, len(unique), unique

In [None]:
# Tokenise the corpus: `seq` holds the prefix sequences, `lookup` maps id -> character.
seq, length, lookup = get_sequence_of_tokens(corpus)
total_chars = len(lookup)

# Reversed copy of every prefix sequence, used as data for the second model.
# NOTE(review): every prefix starts at the entry's first character, so each
# reversed sequence ends with that first character, which is what becomes the
# label once the last column is split off — confirm this is the intended target.
r_seq = [el[::-1] for el in seq]
r_seq[:10]

In [None]:
def generate_padded_sequences(input_sequences, num_classes=None):
    """Left-pad the sequences to a common length and split them into inputs and labels.

    Args:
        input_sequences: non-empty list of integer token sequences.
        num_classes: number of classes for the one-hot labels. When omitted,
            the module-level ``total_chars`` is used, which matches the
            previous behaviour.

    Returns:
        A tuple ``(predictors, label, max_sequence_len)``: ``predictors`` is
        every column of the padded array except the last, ``label`` is the
        one-hot encoding of the last column, and ``max_sequence_len`` is the
        length of the longest input sequence.

    Raises:
        ValueError: if ``input_sequences`` is empty.
    """
    if len(input_sequences) == 0:
        raise ValueError("input_sequences must contain at least one sequence")
    if num_classes is None:
        # Fall back to the notebook-level vocabulary size.
        num_classes = total_chars

    max_sequence_len = max(len(x) for x in input_sequences)

    # pad_sequences already returns a NumPy array, so no extra conversion is needed.
    # NOTE(review): pad_sequences pads with 0 by default, and
    # get_sequence_of_tokens also assigns id 0 to a real character, so padding
    # and that character share a token id — confirm this is acceptable.
    padded = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')

    # The final token of every row is the prediction target.
    predictors, label = padded[:, :-1], padded[:, -1]
    label = ku.to_categorical(label, num_classes=num_classes)
    return predictors, label, max_sequence_len

# Training arrays for the forward sequences.
predictors, label, max_sequence_len = generate_padded_sequences(seq)
# Training arrays for the reversed sequences; reversing does not change any
# sequence length, so r_max_sequence_len equals max_sequence_len.
r_predictors, r_label, r_max_sequence_len = generate_padded_sequences(r_seq)

In [None]:
def create_model(max_sequence_len, total_chars, embedding_dim=10, lstm_units=100, dropout_rate=0.2):
    """Build and compile an Embedding -> LSTM -> Dropout -> softmax model.

    Args:
        max_sequence_len: length of the padded sequences including the label
            token; the model input length is ``max_sequence_len - 1``.
        total_chars: vocabulary size, used both as the embedding input
            dimension and as the number of softmax outputs.
        embedding_dim: size of the embedding vectors (default 10).
        lstm_units: number of units in the LSTM layer (default 100).
        dropout_rate: dropout rate applied after the LSTM (default 0.2).

    Returns:
        The compiled ``Sequential`` model (categorical cross-entropy loss,
        Adam optimizer).
    """
    # The last token of each padded sequence is the label, not an input.
    input_len = max_sequence_len - 1
    model = Sequential()

    # Input embedding layer
    model.add(Embedding(total_chars, embedding_dim, input_length=input_len))

    # Hidden layer: LSTM followed by dropout
    model.add(LSTM(lstm_units))
    model.add(Dropout(dropout_rate))

    # Output layer: one probability per character id
    model.add(Dense(total_chars, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam')

    return model

# Model trained on the forward sequences.
model = create_model(max_sequence_len, total_chars)
# Model trained on the reversed sequences; size it from the length computed
# for the reversed data so it stays matched to r_predictors.
r_model = create_model(r_max_sequence_len, total_chars)
model.summary()

In [None]:
# Both models are trained with the same settings.
fit_options = dict(epochs=100, verbose=2)

# Forward model first, then the model for the reversed sequences.
model.fit(predictors, label, **fit_options)
r_model.fit(r_predictors, r_label, **fit_options)

In [None]:
def generate_text(seed_text, next_chars, model, max_sequence_len, vocab=None):
    """Extend ``seed_text`` by ``next_chars`` characters predicted by ``model``.

    On every step the current text is encoded to character ids, left-padded
    to ``max_sequence_len - 1`` tokens, and the highest-probability class
    returned by the model is appended as the next character.

    Args:
        seed_text: starting string; every character must be in the vocabulary.
        next_chars: number of characters to append.
        model: object whose ``predict(x, verbose=0)`` returns per-class scores
            with the class axis last.
        max_sequence_len: padded sequence length including the label token.
        vocab: id -> character list. When omitted, the module-level
            ``lookup`` is used, which matches the previous behaviour.

    Returns:
        ``seed_text`` with the predicted characters appended.

    Raises:
        ValueError: if the text contains a character missing from the vocabulary.
    """
    chars = lookup if vocab is None else vocab

    # Character -> id map; setdefault keeps the first occurrence, like list.index.
    char_to_index = {}
    for idx, ch in enumerate(chars):
        char_to_index.setdefault(ch, idx)

    for _ in range(next_chars):
        try:
            token_list = [char_to_index[ch] for ch in seed_text]
        except KeyError as exc:
            raise ValueError("character %r is not in the vocabulary" % (exc.args[0],))
        token_list = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='pre')

        # predict_classes is not available in current Keras releases; taking the
        # argmax over the class axis of predict() gives the same class ids.
        probabilities = model.predict(token_list, verbose=0)
        predicted = np.argmax(probabilities, axis=-1)

        output_word = ''.join([chars[x] for x in predicted])
        seed_text += output_word
    return seed_text

In [None]:
# Extend the seed "cori" by three characters with the forward model and show the result.
generated = generate_text("cori", 3, model, max_sequence_len)
print(generated)


In [None]:
   
# Persist both trained models: the reversed-sequence model first, then the forward one.
for net, path in ((r_model, "r_latinize.h5"), (model, "latinize.h5")):
    net.save(path)
print("Saved model to disk")