In [1]:
import numpy as np
from keras.models import Sequential
from keras.models import Model
from keras.layers import Input, LSTM, Embedding, Dense, RepeatVector, TimeDistributed, merge

Using Theano backend.


In [2]:
abbreviation, meaning = list(zip(*[(key, value) for key, value in {
    'ao': 'adult only',
    'u': 'you',
    'gf': 'girl friend',
    'sul': 'see you later'
#   'tmi': 'too much information'
}.iteritems()]))

# abbreviation, meaning = list(zip(*[tuple(s.split()) for s in [
#     'gf': 'girl friend',
#     'sul': 'see you later',
#     '2mor': 'tomorrow',
#     'tmi': 'too much information']]))

In [3]:
# Define the symbol alphabet, including the padding symbol '#'
letters = '#'  +''.join(sorted({c for s in abbreviation+meaning for c in s}))

# Map from letters to indexes
letter_index = {c:i for i,c in enumerate(letters)}

# Compute the size of our alphabet
n_letters = len(letters)
nb_filters=10

# Compute the maximum string length
max_length = max(map(len, abbreviation+meaning))

n_letters, max_length, letters

(16, 13, '# adefgilnorstuy')

In [4]:
'''
From now on, we need to work with numerical types rather than unicode strings.
So, we define functions to convert between lists of integers and strings using our dataset:
'''
def encode_string(s): 
    return [letter_index[c] for c in s]
def decode_string(a): 
    return ''.join(letters[i] for i in a)

In [5]:
'''
For efficiency reasons all computations will be done using dense matrices. Since we deal with strings of different
length we need to pad them into dense rectangular matrices. While this results in some unneccesary computation, 
it is well worth the increased speed of dense matrix operations (especially on a GPU). 
Keras has the helper function pad_sequences for this purpose.
'''
from keras.preprocessing.sequence import pad_sequences

def encode_and_pad(words):
    return pad_sequences(list(map(encode_string, words)), padding='post', maxlen=max_length)

# Our inputs (x) will be the participles
padded_x = encode_and_pad(abbreviation)

# And the outputs (y) will be the canonial forms of the abbreviation
padded_y = encode_and_pad(meaning)

# As in the previous output, but with the padding.
padded_x[0], padded_y[0], decode_string(padded_x[0]), decode_string(padded_y[0])

(array([6, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32),
 array([ 6,  7, 11,  8,  1,  5, 11,  7,  4,  9,  3,  0,  0], dtype=int32),
 'gf###########',
 'girl friend##')

In [6]:
# Constructing the Model now

# 1st keras requires to specify the maximum length of input sequences, 
# So, we use the number of columns from our padded_x matrix.
word = Input((max_length,))

# Letters are represented by 32-dimensional embeddings covering the alphabet (of n_letters).
embedded_word = Embedding(32, n_letters, mask_zero=False)(word)

# Our LSTM encoder will produce a 128-dimensional vector as output.
encoding = LSTM(128)(embedded_word)

# Repeat the encoding over the length of the output string.
repeated_encoding = RepeatVector(max_length)(encoding)

# We concatenate the repeated encoding with the embedded input string (in the last dimension).
merged_encoding = merge([repeated_encoding, embedded_word], mode='concat')

# Our LSTM decoder will produce a _sequence_ of 128-dimensional vectors.
decoded = LSTM(128, return_sequences=True)(merged_encoding)

# Each of these vectors will be passed through (the same) fully connected layer with softmax activations.
# The result is a sequence of distributions over the alphabet.
output_word = TimeDistributed(Dense(n_letters, activation='softmax'))(decoded)

In [7]:
# Training the model
model = Model(input=word, output=output_word)
# Here optimization algorithm called 'adam' was a good default choice to use and for loss function.
# The goal is to make the output distributions as close as possible to the values in the training set,
# so we use categorical cross-entropy loss.
model.compile(optimizer='adam', loss='categorical_crossentropy')

In [8]:
padded_y_dist = np.zeros(padded_y.shape+(n_letters,), dtype=np.float32)
for i,row in enumerate(padded_y):
    for j,letter in enumerate(row):
        padded_y_dist[i,j,letter] = True

# Note that all elements are zero except the one at the position of 'r' in our alphabet 
# (this is the first letter of 'running')
padded_y_dist[0,0], padded_y_dist[0,0,letter_index['r']]

(array([ 0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.], dtype=float32), 0.0)

In [9]:
from time import time
t0 = time()
history = model.fit(padded_x, padded_y_dist, nb_epoch=200, verbose=0)
'%.2f seconds' % (time()-t0)

'40.61 seconds'

In [10]:
def predict_and_decode(x):
    predicted_y = model.predict(x)
    return [decode_string(row.argmax(axis=-1)).replace("#",'') for row in predicted_y]

# Do predictions on the training set.
predict_and_decode(padded_x)

['girl friend', 'see you later', 'you', 'adult only']

In [11]:
predict_and_decode(encode_and_pad(['ao','sul','u']))

['adult only', 'see you later', 'you']

In [52]:
import numpy
raw_text='i will see you tomorrow, please be prepare for me because i dont want to be late. Thank you and see you then.Regards to the family'
unique = []
for char in raw_text[::]:
    if char not in unique:
        unique.append(char)
print(unique), len(unique)

['i', ' ', 'w', 'l', 's', 'e', 'y', 'o', 'u', 't', 'm', 'r', ',', 'p', 'a', 'b', 'f', 'c', 'd', 'n', '.', 'T', 'h', 'k', 'R', 'g'] 26


In [64]:
raw_text = raw_text.lower()
chars = sorted(list(set(raw_text)))
char_to_int = dict((c,i) for i,c in enumerate(chars))
n_chars = len(raw_text)
n_vocab = len(chars)
print n_chars, n_vocab
seq_length = 25
dataX=[]
dataY=[]
for i in range(0, n_chars - seq_length, 1):
#     print i
    seq_in = raw_text[i:i + seq_length]
    seq_out = raw_text[i + seq_length]
# print len(seq_in), len(seq_out)
# print seq_in, seq_out
    dataX.append(char_to_int[char] for char in seq_in)
    dataY.append(char_to_int[seq_out])
print len(dataX), len(dataY)

130 24
105 105
