In [1]:
import numpy as np
import urllib
from matplotlib import pyplot as plt

from notebook.services.config import ConfigManager

# Presentation settings written to the 'livereveal' configuration section
# (presumably consumed by the RISE / live-reveal slideshow extension).
# The boolean-looking options are stored as strings.
livereveal_settings = {
    'theme': 'league',
    'transition': 'fade',
    'center': 'false',
    'overview': 'true',
    'start_slideshow_at': 'selected',
}
cm = ConfigManager()
cm.update('livereveal', livereveal_settings)

%matplotlib inline



# Playing With LSTMs for Language Modeling
[Fabio A. González](http://dis.unal.edu.co/~fgonza/), Universidad Nacional de Colombia

In [2]:
from __future__ import print_function
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.layers import LSTM
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
from keras.layers.wrappers import TimeDistributed
import numpy as np
import random
import sys

Using Theano backend.


## Dataset

In [63]:
# Fetch the Nietzsche corpus through Keras' get_file helper and build the
# character vocabulary used by the cells below.
path = get_file('nietzsche.txt', origin="https://s3.amazonaws.com/text-datasets/nietzsche.txt")
# A context manager closes the file handle deterministically instead of
# leaving it to the garbage collector.
with open(path) as corpus_file:
    text = corpus_file.read().lower()
# Sorted so that every character always gets the same one-hot index.
chars = sorted(set(text))
vocab_size = len(chars)
# Two-way lookup tables between characters and their one-hot indices.
char_indices = {c: i for i, c in enumerate(chars)}
indices_char = {i: c for i, c in enumerate(chars)}
print("Total number of chars:", len(text))
print("Vocabulary size:", vocab_size)

Total number of chars: 600901
Vocabulary size: 59


In [64]:
# Show a 500-character excerpt of the lower-cased corpus.
excerpt_start = 31000
print(text[excerpt_start:excerpt_start + 500])

ts object purely and simply as "the thing in itself," without any
falsification taking place either on the part of the subject or the
object. i would repeat it, however, a hundred times, that "immediate
certainty," as well as "absolute knowledge" and the "thing in itself,"
involve a contradictio in adjecto; we really ought to free ourselves
from the misleading significance of words! the people on their part may
think that cognition is knowing all about things, but the philosopher
must say to him


# Defining the neural network

In [65]:
# Character-level language model: the LSTM returns its state at each of the
# `maxlen` time steps and a time-distributed dense layer plus softmax turns
# every state into a distribution over the vocabulary.
maxlen = 40
model = Sequential([
    LSTM(128, input_shape=(maxlen, vocab_size), return_sequences=True, name="lstm_1"),
    # Layer names are set explicitly; the original author notes that they
    # should be checked when loading saved weights.
    TimeDistributed(Dense(vocab_size), name="dense_1"),
    Activation('softmax', name="activation_1"),
])
model.summary(70)

______________________________________________________________________
Layer (type)           Output Shape   Param # Connected to            
lstm_1 (LSTM)          (None, 40, 128)96256   lstm_input_4[0][0]      
______________________________________________________________________
dense_1 (TimeDistribute(None, 40, 59) 7611    lstm_1[0][0]            
______________________________________________________________________
activation_1 (Activatio(None, 40, 59) 0       dense_1[0][0]           
Total params: 103867
______________________________________________________________________


# Load pretrained weights

In [66]:
# Restore previously trained parameters into the model built above, then
# compile it. The weights file is referenced by a relative path, so it is
# looked up in the current working directory.
h5file = 'weights_nietzche.hdf5'
optimizer = RMSprop(lr=0.01)
model.load_weights(h5file)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

# Calculating the probability of a text

* The probability of a text is:  
$$P(c_1, \dots, c_n) = P(c_1)\prod_{i=2}^{n}\ P(c_i | c_{1},\dots, c_{i-1})$$

In [7]:
# Encode text as a one-hot tensor
def parse_text(text, vocab_size, padding=False):
    """Encode ``text`` as a one-hot boolean array of shape (1, T, vocab_size).

    Parameters
    ----------
    text : sequence of characters
        Every character must be a key of the global ``char_indices``.
    vocab_size : int
        Length of the one-hot axis.
    padding : bool, optional
        When True, T is the global ``maxlen`` and the positions after the end
        of ``text`` stay all-zero. When False, T is ``len(text)``.

    Returns
    -------
    numpy.ndarray
        Boolean array with a single leading batch dimension.

    Raises
    ------
    KeyError
        If a character is missing from ``char_indices``.
    IndexError
        If ``padding`` is True and ``text`` is longer than ``maxlen``.
    """
    n_steps = maxlen if padding else len(text)
    # The builtin ``bool`` replaces the ``np.bool`` alias, which was deprecated
    # in NumPy 1.20 and removed in 1.24; the resulting dtype is the same.
    X = np.zeros((1, n_steps, vocab_size), dtype=bool)
    for t, char in enumerate(text):
        X[0, t, char_indices[char]] = 1
    return X

In [8]:
# Log-likelihood of a text under the language model
def log_likelihood(model, text):
    """Return the summed log-probability the model assigns to ``text``.

    The text is one-hot encoded with ``parse_text`` (padded to the global
    ``maxlen``) and fed to ``model.predict``. The output at position ``i`` is
    read as the distribution of character ``i + 1``, so the sum covers the
    second character onwards; no term is included for the first character.

    Parameters
    ----------
    model : object with a ``predict`` method returning per-step probabilities.
    text : str whose characters are all keys of the global ``char_indices``.

    Returns
    -------
    The sum of the log-probabilities (``0`` for texts shorter than two
    characters).
    """
    one_hot = parse_text(text, vocab_size, padding=True)
    probs = model.predict(one_hot).squeeze()
    total = 0
    for step, next_char in enumerate(text[1:]):
        total = total + np.log(probs[step, char_indices[next_char]])
    return total

In [68]:
# Compare two word orders of the same phrase and a variant without spaces.
for phrase in ("the of faculty", "the faculty of", "thefacultyof"):
    print(log_likelihood(model, phrase))

-25.7494921312
-18.5531748161
-40.4845399261


## Most likely phrases from a BOW

In [71]:
from itertools import permutations

# Rank every ordering of a bag of words by model log-likelihood and show the
# ten most likely ones. Each candidate is wrapped in spaces.
bow = ['philosopher', 'kant', 'is', 'a']
perms = [' ' + ' '.join(perm) + ' ' for perm in permutations(bow)]
# The loop variable is deliberately not called `text`: under Python 2 a
# list-comprehension variable leaks into the enclosing scope and would
# overwrite the global corpus `text`. A generator expression does not leak.
scored = sorted(((log_likelihood(model, phrase), phrase) for phrase in perms), reverse=True)
for p, t in scored[:10]:
    print(p, t)

-21.5725630657  is a philosopher kant 
-24.0264622647  is kant a philosopher 
-24.0737350786  kant a philosopher is 
-24.3340368036  a philosopher is kant 
-25.7607139803  kant is a philosopher 
-25.9217395697  a philosopher kant is 
-26.3409298765  is a kant philosopher 
-26.4319313318  kant philosopher is a 
-27.5698824866  is philosopher kant a 
-28.0389502062  a kant philosopher is 


## Least likely phrases

In [69]:
# Score every ordering of `bow` (joined by single spaces, no surrounding
# spaces) and show the ten least likely ones.
perms = [' '.join(perm) for perm in permutations(bow)]
# The loop variable is deliberately not called `text`: under Python 2 a
# list-comprehension variable leaks into the enclosing scope and would
# overwrite the global corpus `text`. A generator expression does not leak.
scored = sorted(((log_likelihood(model, phrase), phrase) for phrase in perms), reverse=True)
for p, t in scored[-10:]:
    print(p, t)

-27.7351272798 a kant is philosopher
-27.7881720141 philosopher kant is a
-28.7193570025 philosopher is a kant
-29.3433935877 a is philosopher kant
-30.0071513033 kant philosopher a is
-30.5771951154 philosopher kant a is
-30.5931856568 kant a is philosopher
-32.0148858503 philosopher a kant is
-32.6038281068 is philosopher a kant
-33.0956074744 philosopher a is kant


## Morphological structure

In [12]:
from itertools import permutations
from random import shuffle

# Score every arrangement of a few characters to see which ones the model
# considers word-like.
# The characters are stored in `letters` rather than `text` so the corpus
# held in the global `text` is not overwritten.
letters = list(u' ywh')
perms = [''.join(perm) for perm in permutations(letters)]
# Score each permutation once and reuse the ranking for both listings.
scored = sorted(((log_likelihood(model, candidate), candidate) for candidate in perms), reverse=True)
for p, t in scored[:5]:
    print(p, t)
print('-'*50)
for p, t in scored[-5:]:
    print(p, t)

-4.47106042504 why 
-5.80633699894 y wh
-6.51182210445  why
-7.26245993376 hy w
-9.79217171669 wy h
--------------------------------------------------
-27.7011355162 y hw
-28.8524632454  wyh
-32.0772004128  ywh
-32.9385781288  hwy
-34.9063544273  yhw


## Generating text

* The model calculates the probability of the next character given the previous characters:  
$$P(c_t | c_{1}, c_{2},\dots, c_{t-1})$$
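* We sample from the model using this conditional probability; the temperature rescales the distribution before each draw (values below 1 make sampling closer to a greedy `argmax`, values above 1 make it more diverse)
  ```python
  for i in [1..n]:
      P = predict_next()
      P = softmax(log(P) / temperature)
      c_i = sample_multinomial(P)
  ```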

In [27]:
# Draw an index from a probability vector after temperature scaling
def sample(preds, temperature=1.0):
    """Sample one index from ``preds`` after rescaling it by ``temperature``.

    The probabilities are converted to float64, their logarithm is divided by
    ``temperature``, and the result is exponentiated and renormalised. A
    single multinomial draw from that distribution selects the index.

    Parameters
    ----------
    preds : array-like of probabilities.
    temperature : float, divisor applied to the log-probabilities.

    Returns
    -------
    Index of the drawn entry (as returned by ``np.argmax``).
    """
    scaled_logits = np.log(np.asarray(preds).astype('float64')) / temperature
    weights = np.exp(scaled_logits)
    distribution = weights / np.sum(weights)
    draw = np.random.multinomial(1, distribution, 1)
    return np.argmax(draw)

In [28]:
def generate_text(diversity, model, sentence, n_chars, padding=True):
    """Print ``sentence`` followed by ``n_chars`` characters sampled from ``model``.

    At every step the current context is one-hot encoded, right-aligned in a
    window of the global ``maxlen`` positions, and passed to
    ``model.predict``. The distribution at the last time step is handed to
    ``sample`` with ``diversity`` as temperature, and the drawn character is
    appended to the context.

    Parameters
    ----------
    diversity : float, temperature passed to ``sample``.
    model : object whose ``predict(x, verbose=0)`` returns per-step
        probabilities for a batch of one sequence.
    sentence : str, seed text; its characters must be keys of the global
        ``char_indices``. Only its last ``maxlen`` characters are used as
        context.
    n_chars : int, number of characters to generate.
    padding : bool, when True the unused leading positions of the window are
        filled with the space character; when False they stay all-zero.

    Returns
    -------
    True. The generated text is printed, not returned.
    """
    print()
    print('----- Generating with seed: "' + sentence + '"')
    generated = sentence
    # Only the last `maxlen` characters fit in the model input. Without this
    # cut a longer seed yields negative offsets, which NumPy indexing would
    # silently wrap around to the end of the window.
    window = sentence[-maxlen:]

    for _ in range(n_chars):
        x = np.zeros((1, maxlen, vocab_size))
        offset = maxlen - len(window)
        if padding and offset > 0:
            # Left-pad the window with spaces.
            x[0, :offset, char_indices[" "]] = 1.
        for t, char in enumerate(window, offset):
            x[0, t, char_indices[char]] = 1.

        preds = model.predict(x, verbose=0)[0]
        next_index = sample(preds[-1], diversity)
        next_char = indices_char[next_index]

        generated += next_char
        # Let the context grow until it fills the window and only then slide
        # it, so a short seed is not discarded one character per step while
        # padding still occupies the window.
        window = (window + next_char)[-maxlen:]

    print(generated)
    return True

In [41]:
# Generate 1000 characters after the seed phrase, sampling at diversity 0.4.
# The text is printed by the function; its return value (True) becomes the cell output.
generate_text(0.4, model, 'the meaning of life is ', 1000)


----- Generating with seed: "the meaning of life is "
the meaning of life is to the great than it is progress"--the most probably a standpoin and something enough in the part of the philosophers, who are experience, in the sense, that is the religion of proposition of the order, and the experiences and sometimes the "many proposished the religion of the people, and an art of so littles, to be an aspideous thre will to the schuman in the spirit, not not all their religion of the scientific morality and consideration of the philosophical process of the soul-many of the contrary they is a strong of the most problem of the spirituality and standing, there and the conditions are the instinct and the eternal many of the same not of the logical races of the suplicald the conception, the nature with the most probably the spirituality in the univershens--and in the greater conditions, the part of all people and the delight, and all the strength of the fact that the fact that even in the formou

True

# Using other datasets

In [100]:
# Hard-coded vocabulary for the next pretrained model. The position of each
# character in this list is its one-hot index, so the order matters.
chars = ['\n', '\r', ' ', '!', '(', ')', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'x', 'y', 'z', '\x81', '\x89', '\x8d', '\x91', '\x93', '\x97', '\x9a', '\xa1', '\xa9', '\xad', '\xb1', '\xb3', '\xba', '\xbc', '\xbf', '\xc2', '\xc3']
vocab_size = len(chars)
# Rebuild both lookup tables for the new vocabulary.
char_indices = {c: i for i, c in enumerate(chars)}
indices_char = {i: c for i, c in enumerate(chars)}

# NOTE(review): `text` is not assigned in this cell, so the reported length
# depends on whatever `text` currently holds in the kernel - confirm where the
# corpus for this dataset is loaded.
print("Total number of chars:", len(text))
print("Vocabulary size:", vocab_size)

Total number of chars: 1001776
Vocabulary size: 63


In [102]:
# Rebuild the same architecture (128-unit LSTM, per-step dense layer, softmax)
# sized for the current vocabulary.
maxlen = 40
print('Building model...')
model = Sequential([
    LSTM(128, input_shape=(maxlen, vocab_size), return_sequences=True, name="lstm_1"),
    # Layer names are set explicitly; the original author notes that they
    # should be checked when loading saved weights.
    TimeDistributed(Dense(vocab_size), name="dense_1"),
    Activation('softmax', name="activation_1"),
])

Building model...


In [103]:
# Load the stored weights for this dataset into the freshly built model, then
# compile it. The weights file is referenced by a relative path, so it is
# looked up in the current working directory.
h5file = 'weights_bib.hdf5'
optimizer = RMSprop(lr=0.01)
model.load_weights(h5file)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

In [104]:
# Generate 1000 characters after a Spanish seed phrase, sampling at diversity 0.3.
# The text is printed by the function; its return value (True) becomes the cell output.
generate_text(0.3, model, 'el sentido de la vida es ', 1000)


----- Generating with seed: "el sentido de la vida es "
el sentido de la vida es el hijo del hombre de la verdad, y de ellos de los pecados, y ha cumplido a los hombres, y los que habían entonces que es mi padre de los hombres que tenían con ellos. 

12 pero estaba con ellos, y a la carne, salió vosotros es de la carne, y es de la ciudad, y la vida en él le dijo: ¿qué entranteis en el camino; 14 y le dijo: ¿qué se había a sus discípulos. 12 y le es causa de la carne. 

14 pero escondó a los que se habían crucho en el padre, y en la ciudad en el padre de dios y los que se habían re

1 es de la verdad, y los discípulos se había en la carne, y por el señor es de la sinagoga de los hombres de los padres, y le dijo: ¿qué como estaban en la carne, y no desde los hombres habían dicho por el hijo del hombre. 22 por tanto, es camino, y le dijo: ¿qué decía: señor no se había entre el camino, y los hombres habían entonces de la ciudad de dios. 

8 pero se había entre el cielo y le había en la si

True

## One more dataset...

In [93]:
# Hard-coded vocabulary for the next pretrained model. The position of each
# character in this list is its one-hot index, so the order matters.
chars = ['\t', '\n', ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '|', '}', '~', '\x80', '\x81', '\x83', '\x84', '\x85', '\x8a', '\x8b', '\x8c', '\x92', '\x93', '\x94', '\x97', '\x98', '\x99', '\x9c', '\x9d', '\xa0', '\xa1', '\xa2', '\xa3', '\xa4', '\xa5', '\xa6', '\xa7', '\xa8', '\xa9', '\xaa', '\xab', '\xac', '\xad', '\xae', '\xaf', '\xb0', '\xb1', '\xb2', '\xb3', '\xb4', '\xb6', '\xb7', '\xb9', '\xba', '\xbb', '\xbc', '\xbd', '\xbf', '\xc2', '\xc3', '\xc5', '\xce', '\xd0', '\xd1', '\xd7', '\xe1', '\xe2', '\xe9', '\xed', '\xef', '\xf1', '\xf3']
vocab_size = len(chars)
# Rebuild both lookup tables for the new vocabulary.
char_indices = {c: i for i, c in enumerate(chars)}
indices_char = {i: c for i, c in enumerate(chars)}

# NOTE(review): `text` is not assigned in this cell, so the reported length
# depends on whatever `text` currently holds in the kernel - confirm where the
# corpus for this dataset is loaded.
print("Total number of chars:", len(text))
print("Vocabulary size:", vocab_size)

Total number of chars: 11211916
Vocabulary size: 129


In [95]:
# Same layout as the earlier models but with a wider, 512-unit LSTM, sized
# for the current vocabulary.
maxlen = 40
print('Building model...')
model = Sequential([
    LSTM(512, input_shape=(maxlen, vocab_size), return_sequences=True, name="lstm_1"),
    # Layer names are set explicitly; the original author notes that they
    # should be checked when loading saved weights.
    TimeDistributed(Dense(vocab_size), name="dense_1"),
    Activation('softmax', name="activation_1"),
])

Building model...


In [96]:
# Load the stored weights for this dataset into the freshly built model, then
# compile it. The weights file is referenced by a relative path, so it is
# looked up in the current working directory.
h5file = 'weights_reg.hdf5'
optimizer = RMSprop(lr=0.01)
model.load_weights(h5file)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

In [97]:
# Generate 400 characters after the seed phrase at diversity 0.6, with the
# seed left-padded by spaces (padding=True).
generate_text(0.6, model, 'el sentido de la vida es', 400, padding=True)


----- Generating with seed: "el sentido de la vida es"
el sentido de la vida esa maldad
preparate con tu cantante sin ti...

dale mami que te vas
con la coche
con el alta coming soon

lo que se te converti lo de la gente
mi espolo el papo de nada
parte de la ghetto
si se sienen mi calle yo te una ves

white blook
que nadie se trate de activar la vida dispuesta
prendeen tran mi monthe garrote
forma de la calle en de tilas y mi calle me lo di con mi corazón,
pero no puedes ne


True

# Training of the model

This is the code used for training

In [None]:
# Cut the corpus into overlapping windows of `maxlen` characters, taken every
# `step` characters, and one-hot encode each window together with its targets.
maxlen = 40
step = 3
sentences = []
next_chars = []
for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i: i + maxlen])
    next_chars.append(text[i + maxlen])
print('Number of sequences:', len(sentences))
# The builtin `bool` replaces the `np.bool` alias, which was deprecated in
# NumPy 1.20 and removed in 1.24; the resulting dtype is the same.
X = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
# The models in this notebook use return_sequences=True followed by a
# TimeDistributed softmax, so they emit one distribution per time step. The
# target therefore needs one one-hot row per time step too: position t is
# trained to predict the character at position t + 1.
y = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
for i, sentence in enumerate(sentences):
    # The window shifted left by one, completed by the character after it.
    target = sentence[1:] + next_chars[i]
    for t, (char, target_char) in enumerate(zip(sentence, target)):
        X[i, t, char_indices[char]] = 1
        y[i, t, char_indices[target_char]] = 1
print('Shape X', X.shape)
print('Shape y', y.shape)

In [None]:
# Train one epoch at a time; after each epoch print 400 generated characters
# for several sampling temperatures, all starting from the same random
# corpus window.
for iteration in range(1, 60):
    print()
    print('-' * 50)
    print('Iteration', iteration)
    model.fit(X, y, batch_size=128, nb_epoch=1)

    start_index = random.randint(0, len(text) - maxlen - 1)

    for diversity in [0.2, 0.5, 1.0, 1.2]:
        print()
        print('----- diversity:', diversity)

        generated = ''
        sentence = text[start_index: start_index + maxlen]
        generated += sentence
        print('----- Generating with seed: "' + sentence + '"')
        sys.stdout.write(generated)

        for i in range(400):
            x = np.zeros((1, maxlen, len(chars)))
            for t, char in enumerate(sentence):
                x[0, t, char_indices[char]] = 1.

            preds = model.predict(x, verbose=0)[0]
            # The sequence models in this notebook return one distribution per
            # time step; only the last one predicts the next character, as in
            # generate_text. atleast_2d keeps a model that returns a single
            # distribution working too.
            next_index = sample(np.atleast_2d(preds)[-1], diversity)
            next_char = indices_char[next_index]

            generated += next_char
            # Slide the window: drop the oldest character, append the new one.
            sentence = sentence[1:] + next_char

            # Stream each character as soon as it is sampled.
            sys.stdout.write(next_char)
            sys.stdout.flush()
        print()