In [1]:
import pandas as pd
import numpy as np
import re
import string
from pickle import dump
from keras.utils import to_categorical
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.models import load_model
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.





In [2]:
def load_doc(filename):
    """Read a text file and return its entire contents as one string.

    Args:
        filename: Path of the file to read; it is opened in text mode.

    Returns:
        The full text of the file.
    """
    # The context manager closes the handle even when read() raises,
    # which a manual open()/close() pair does not guarantee.
    with open(filename, 'r') as file:
        return file.read()

In [3]:
def clean_doc(doc):
    """Collapse every run of whitespace in ``doc`` into a single space.

    The text is split on whitespace (spaces, tabs, newlines) and the pieces
    are re-joined with one space, which also removes leading and trailing
    whitespace. Punctuation and letter case are left untouched.
    """
    words = doc.split()
    normalized = ' '.join(words)
    return normalized

In [4]:
# Read the raw rhyme text; the relative path is resolved against the
# notebook's current working directory.
data = load_doc('rhyme.txt')

In [5]:
data

"Sing a song of sixpence,\nA pocket full of rye.\nFour and twenty blackbirds, Baked in a pie. \nWhen the pie was opened The birds began to sing; Wasn't that a dainty dish, To set before the king. \nThe king was in his counting house, Counting out his money;\nThe queen was in the parlour, Eating bread and honey. \nThe maid was in the garden, Hanging out the clothes, When down came a blackbird And pecked off her nose. \n\n"

In [6]:
# Collapse all whitespace (including the newlines) in the raw text to single spaces.
raw_text = clean_doc(data)

In [7]:
def prepare_sequences(length, raw_text):
    """Slice ``raw_text`` into overlapping windows of ``length + 1`` items.

    One window is produced for every index ``i`` from ``length`` to the last
    index of ``raw_text``; it covers ``raw_text[i - length]`` through
    ``raw_text[i]`` inclusive, so consecutive windows are shifted by one.

    Returns:
        The windows as a list, in order of position.
    """
    window = length + 1
    total = len(raw_text)
    return [raw_text[end - window:end] for end in range(window, total + 1)]

In [8]:
# Number of context characters per sample; prepare_sequences returns windows
# of length + 1 characters, the extra one being the character to predict.
length = 10
sequences = prepare_sequences(length, raw_text)

In [9]:
print("Length Of Sequences : ", len(sequences))

Length Of Sequences :  399


In [10]:
def save_list(filename, sequences):
    """Write ``sequences`` to ``filename``, one sequence per line.

    Args:
        filename: Path of the file to create or overwrite.
        sequences: Iterable of strings. They are joined with a newline
            character, so no trailing newline follows the last one.
    """
    data = '\n'.join(sequences)
    # ``with`` guarantees the handle is flushed and closed even if the
    # write fails part-way through.
    with open(filename, 'w') as file:
        file.write(data)

In [11]:
# Persist the character sequences, one per line, so they can be reloaded for training.
save_list('char_sequences.txt', sequences)

# Train Language Model

### Load Data

In [12]:
# Reload the saved sequences; every line of the file is one training sequence.
raw_data = load_doc('char_sequences.txt')
lines = raw_data.split('\n')

### Encoding

In [13]:
# Sorted list of the distinct characters found in the file.
# NOTE: raw_data still holds the '\n' separators between lines, so whenever
# the file has more than one line the newline character is part of this
# vocabulary even though no entry of `lines` contains it.
chars = sorted(set(raw_data))

In [14]:
# Character -> integer index lookup, numbered in the order of `chars`.
mapping = {char: index for index, char in enumerate(chars)}

In [15]:
# Integer-encode every line: one list of character indices per sequence.
sequences = [[mapping[char] for char in line] for line in lines]

In [16]:
# Number of distinct characters in the mapping; used below as the one-hot
# width (num_classes) and passed to make_model for the output layer size.
vocabulary_size = len(mapping)
vocabulary_size

38

In [17]:
# Stack the encoded sequences into one integer array, then split every row
# into its context (all columns but the last) and its target (last column).
sequences = np.array(sequences)
X = sequences[:, :-1]
y = sequences[:, -1]

In [18]:
print(X.shape)

(399, 10)


In [19]:
print(X.shape)

(399, 10)


In [20]:
X[0]

array([12, 23, 27, 21,  1, 15,  1, 32, 28, 27])

In [21]:
# One-hot encode the integer context columns in a single call.
# X is derived from `sequences` (the integer-encoded array) rather than from
# X itself, so re-running this cell always yields the same
# (samples, length, vocabulary_size) tensor, and `sequences` keeps holding
# the integer-encoded data.
X = to_categorical(sequences[:, :-1], num_classes=vocabulary_size)

In [22]:
X.shape

(399, 10, 38)

In [23]:
# One-hot encode the target characters; to_categorical already returns a
# NumPy array, so it is assigned directly.
# NOTE: y is reassigned from itself, so this cell must run only once after
# the X/y split cell.
y = to_categorical(y, num_classes=vocabulary_size)

In [24]:
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [25]:
X.shape[2]

38

In [26]:
def make_model(X, vocabulary_size):
    """Build and compile a single-layer LSTM character model.

    Args:
        X: Input array; only ``X.shape[1]`` and ``X.shape[2]`` are read, to
            define the LSTM ``input_shape``.
        vocabulary_size: Number of units in the softmax output layer.

    Returns:
        A compiled ``Sequential`` model (categorical cross-entropy loss,
        Adam optimizer, accuracy metric).
    """
    steps, width = X.shape[1], X.shape[2]
    layers = [
        LSTM(75, input_shape=(steps, width), activation='relu'),
        Dense(vocabulary_size, activation='softmax'),
    ]
    model = Sequential(layers)
    model.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return model

In [35]:
# Build a fresh, untrained model sized from X's shape and the vocabulary size.
model = make_model(X, vocabulary_size)

In [36]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_2 (LSTM)                (None, 75)                34200     
_________________________________________________________________
dense_2 (Dense)              (None, 38)                2888      
Total params: 37,088
Trainable params: 37,088
Non-trainable params: 0
_________________________________________________________________


### The model was run on AWS EC2

In [37]:
# Train on the whole data set for 100 epochs. No validation data is passed,
# so the accuracy printed per epoch is measured on the training samples.
model.fit(X,y, epochs = 100, verbose = 2)

Epoch 1/100
 - 2s - loss: 3.6057 - acc: 0.1529
Epoch 2/100
 - 0s - loss: 3.5165 - acc: 0.1830
Epoch 3/100
 - 0s - loss: 3.2307 - acc: 0.1905
Epoch 4/100
 - 0s - loss: 3.0644 - acc: 0.1905
Epoch 5/100
 - 0s - loss: 3.0255 - acc: 0.1905
Epoch 6/100
 - 0s - loss: 3.0121 - acc: 0.1905
Epoch 7/100
 - 0s - loss: 2.9991 - acc: 0.1905
Epoch 8/100
 - 0s - loss: 2.9625 - acc: 0.1905
Epoch 9/100
 - 0s - loss: 2.9462 - acc: 0.1905
Epoch 10/100
 - 0s - loss: 2.9278 - acc: 0.1905
Epoch 11/100
 - 0s - loss: 2.9079 - acc: 0.1905
Epoch 12/100
 - 0s - loss: 2.8910 - acc: 0.1905
Epoch 13/100
 - 0s - loss: 2.8754 - acc: 0.1905
Epoch 14/100
 - 0s - loss: 2.8539 - acc: 0.1905
Epoch 15/100
 - 0s - loss: 2.8257 - acc: 0.2080
Epoch 16/100
 - 0s - loss: 2.7877 - acc: 0.2306
Epoch 17/100
 - 0s - loss: 2.7725 - acc: 0.1980
Epoch 18/100
 - 0s - loss: 2.7431 - acc: 0.2807
Epoch 19/100
 - 0s - loss: 2.7884 - acc: 0.2581
Epoch 20/100
 - 0s - loss: 2.7012 - acc: 0.2481
Epoch 21/100
 - 0s - loss: 2.6138 - acc: 0.2782
E

<keras.callbacks.History at 0x7f34d403b358>

In [38]:
def make_text(sequence_length, seed_text, char_length, mapping, model):
    """Extend ``seed_text`` by ``char_length`` characters predicted by ``model``.

    Args:
        sequence_length: Number of characters fed to the model per step.
            Longer text is truncated and shorter text is zero-padded, both at
            the front (``pad_sequences`` with ``padding='pre'``).
        seed_text: Starting text. Every character must be a key of
            ``mapping``; otherwise ``KeyError`` is raised.
        char_length: How many characters to generate.
        mapping: Dict from character to integer class index.
        model: Trained model whose ``predict`` returns one score per class
            for a one-hot input of shape
            ``(1, sequence_length, len(mapping))``.

    Returns:
        ``seed_text`` followed by the generated characters.
    """
    # Invert the mapping once instead of scanning it for every generated
    # character; setdefault keeps the first character listed for an index.
    index_to_char = {}
    for char, index in mapping.items():
        index_to_char.setdefault(index, char)

    out_char = ''
    in_text = seed_text
    for _ in range(char_length):
        # Integer-encode everything generated so far.
        encoded = [mapping[char] for char in in_text]

        # Keep exactly `sequence_length` indices.
        # NOTE: front padding uses 0, which the one-hot step below encodes as
        # class 0, i.e. the same as the character mapped to index 0.
        encoded = pad_sequences([encoded], maxlen=sequence_length, padding='pre')
        encoded = to_categorical(encoded, num_classes=len(mapping))

        # argmax over predict() is used instead of predict_classes(), which
        # newer Keras/TensorFlow releases no longer provide.
        scores = model.predict(encoded, verbose=0)
        predicted = int(np.argmax(scores, axis=-1)[0])

        # An index missing from the mapping repeats the previous character.
        out_char = index_to_char.get(predicted, out_char)
        in_text += out_char
    return in_text

In [39]:
print(make_text(10, 'sing a son', 20, mapping, model))

sing a song of sixpence, A poc


In [40]:
print(make_text(10, 'king was i', 20, mapping, model))

king was in his counting house
