# LSTM

- ## Preliminaries

- ### Imports

In [1]:
from __future__ import print_function
from keras.models import Sequential
from keras.layers import Dense, Activation,Dropout,TimeDistributed
from keras.layers import LSTM,SimpleRNN
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
import numpy as np
import random
import sys
import os, os.path
from os import listdir
from os.path import isfile, join
from unicodedata import normalize
import re

Using TensorFlow backend.


- ### Check GPU usage

In [2]:
from tensorflow.python.client import device_lib

def get_available_gpus():
    local_device_protos = device_lib.list_local_devices()
    return [x.name for x in local_device_protos if x.device_type == 'GPU']

In [3]:
get_available_gpus()

[u'/gpu:0']

----------

- ### Check and set Twitter's API

In [4]:
import twitter
api = twitter.Api(consumer_key='LyNVanTEQEOEGKfXAMeLv6AKG',
                    consumer_secret='0lJvhaaOP5cRZWm6rxwyBIAypd1P7eiDx9f74KBDlLrSldNuBQ',
                    access_token_key='855852332034265088-geTEVmA7xIsOD3WCZyfBNnqjRdS1MhW',
                    access_token_secret='kJMwMl67e3nYrqaGWzIizxzQpRZhtBfOnwPflO1fk3cOt')

In [5]:
print(api.VerifyCredentials())

{"created_at": "Sat Apr 22 18:34:31 +0000 2017", "default_profile": true, "description": "Learning how to be creative", "followers_count": 2, "friends_count": 3, "id": 855852332034265088, "lang": "en", "location": "Somewhere in the cloud", "name": "ArtistBot", "profile_background_color": "F5F8FA", "profile_banner_url": "https://pbs.twimg.com/profile_banners/855852332034265088/1492892354", "profile_image_url": "http://pbs.twimg.com/profile_images/855878764143804417/r55Z2Js5_normal.jpg", "profile_link_color": "1DA1F2", "profile_sidebar_fill_color": "DDEEF6", "profile_text_color": "333333", "screen_name": "TheTalkativeBot", "status": {"created_at": "Tue Apr 25 15:36:21 +0000 2017", "id": 856894659007844352, "id_str": "856894659007844352", "in_reply_to_screen_name": "dvp_tran", "in_reply_to_status_id": 856879788518236161, "in_reply_to_user_id": 747074580754403328, "lang": "de", "source": "<a href=\"http://www.google.com\" rel=\"nofollow\">TheScenarioBot</a>", "text": "@dvp_tran Automamte a

-------------------

# I. Learning from corpus

** 1. Load and convert data**

'''Example script to generate text from Nietzsche's writings.
At least 20 epochs are required before the generated text
starts sounding coherent.
It is recommended to run this script on GPU, as recurrent
networks are quite computationally intensive.
If you try this script on new data, make sure your corpus
has at least ~100k characters. ~1M is better.
'''

In [6]:
#Load and concatenate files:

DIR="../../LSTM/data/Gutenberg/ebooks-unzipped/French/"
all_files = [f for f in listdir(DIR) if isfile(join(DIR, f))]

#choose how many files to concatenate:
nb_files=7
if nb_files>len(all_files):
    nb_files=len(all_files)
    
    
out_path="french/data/"
if not os.path.exists(out_path):
    os.makedirs(out_path)
if not os.path.exists(out_path+"input/"):
    os.makedirs(out_path+"input/")
    
with open(out_path+'input/french.txt', 'w') as outfile:
    for fname in all_files[0:nb_files]:
        with open(DIR+fname) as infile:
            i=0
            for line in infile:
                if i>=50:
                    outfile.write(line)
                i=i+1
        print ("Done concatenating file : %s" %fname)

Done concatenating file : 249.txt
Done concatenating file : 4740-8.txt
Done concatenating file : 799-0.txt
Done concatenating file : 4548-8.txt
Done concatenating file : 4791-8.txt
Done concatenating file : 803-8.txt
Done concatenating file : 2650-0.txt


In [7]:
#load file
file_name=out_path+'input/french.txt'
text = open(file_name).read()
text=normalize('NFKD',text.decode('latin1')).encode('ASCII', 'ignore')

In [8]:
text = re.sub("\n\n+" , "\n", text)

In [9]:
print('corpus length:', len(text))

chars = sorted(list(set(text)))
VOCAB_SIZE = len(chars)
print('total chars:',VOCAB_SIZE)

corpus length: 2508723
total chars: 89


**Warning:** The RNN takes in input numerical data hence the necessity to convert strings into numerical values.

In [10]:
#creating mapping between indexes and characters
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

We’re gonna use Keras to create and train our Network, so we must convert the data into this form: (number_of_sequences, length_of_sequence, number_of_features).
- nb of features = length of the char array
- length of sequence = batch size
- nb of sequence = len(data) divided by batch size.

**Warning : ** target sequence is setted by shifting the source/input sequence by one character with both having the same length.

In [11]:
%%time

SEQ_LENGTH=100
#Build three dimensional arrays
X = np.zeros((len(text)/SEQ_LENGTH, SEQ_LENGTH, VOCAB_SIZE)) #input
y = np.zeros((len(text)/SEQ_LENGTH, SEQ_LENGTH, VOCAB_SIZE)) #target

#Build sequences
for i in range(0, len(text)/SEQ_LENGTH):
    X_sequence = text[i*SEQ_LENGTH:(i+1)*SEQ_LENGTH]
    X_sequence_ix = [char_indices[value] for value in X_sequence]
    input_sequence = np.zeros((SEQ_LENGTH, VOCAB_SIZE))
    for j in range(SEQ_LENGTH):
        input_sequence[j][X_sequence_ix[j]] = 1.
    X[i] = input_sequence

    y_sequence = text[i*SEQ_LENGTH+1:(i+1)*SEQ_LENGTH+1]
    y_sequence_ix = [char_indices[value] for value in y_sequence]
    target_sequence = np.zeros((SEQ_LENGTH, VOCAB_SIZE))
    for j in range(SEQ_LENGTH):
        target_sequence[j][y_sequence_ix[j]] = 1.
    y[i] = target_sequence

CPU times: user 2.02 s, sys: 192 ms, total: 2.22 s
Wall time: 2.22 s


** 2. Build the network**

In [12]:
HIDDEN_DIM= 500 #500
LAYER_NUM = 2


model = Sequential()
model.add(LSTM(HIDDEN_DIM, input_shape=(None, VOCAB_SIZE), return_sequences=True))
for i in range(LAYER_NUM - 1):
    model.add(LSTM(HIDDEN_DIM, return_sequences=True))
model.add(TimeDistributed(Dense(VOCAB_SIZE)))
model.add(Activation('softmax'))
model.compile(loss="categorical_crossentropy", optimizer="rmsprop")

In [13]:
def generate_text(model, length, vocab_size, ix_to_char):
    # starting with random character
    ix = [np.random.randint(vocab_size)]
    y_char = [ix_to_char[ix[-1]]]
    X = np.zeros((1, length, vocab_size))
    for i in range(length):
        # appending the last predicted character to sequence
        X[0, i, :][ix[-1]] = 1
        print(ix_to_char[ix[-1]], end="")
        ix = np.argmax(model.predict(X[:, :i+1, :])[0], 1)
        y_char.append(ix_to_char[ix[-1]])
    return ('').join(y_char)

In [14]:
# Generate some sample before training to know how bad it is!
bla = generate_text(model, 100, VOCAB_SIZE, indices_char)
#api.PostUpdate(status=bla[0:123])

[[[[@@@SSCCCCCBDD;;;GG3333q3333vvvvzzzzzzIIzMMnnnnnNNEEEiiimm4jjjE.SS!!!!!!!AAAAAAIkkk-SSSS!

**3. Train network**

In [15]:
def get_iternb(string):
    return re.findall(r'checkpoint_500_epoch_(.*).hdf5', string)[0]

In [28]:
#batch size equals to seq length here
BATCH_SIZE=100
#len of desired output
GENERATE_LENGTH=140
DIR=out_path+"weights/weight_attempt_s03/"
flag=True

try:
    onlyfiles = [f for f in listdir(DIR) if isfile(join(DIR, f))]
    iteration=[]
    for files in onlyfiles:
        iteration.append(int(get_iternb(files)))
    iteration=max(iteration)

    last_checkpoint=DIR+onlyfiles[0][0:21]+str(iteration)+'.hdf5'
except Exception as e:
    print(e)
    onlyfiles=[]
    if not os.path.exists(DIR):
        os.makedirs(DIR)
    nb_files=0

if nb_files>0:
    model.load_weights(last_checkpoint)
else:
    iteration=0
    
print("Starting at iteration : %s" %iteration)
while flag==True:
    print('\n')
    print('-'*20)
    model.fit(X, y, batch_size=BATCH_SIZE, verbose=2, nb_epoch=1)
    iteration += 1
    bla=generate_text(model, GENERATE_LENGTH,VOCAB_SIZE, indices_char)
    if iteration % 10 == 0:
        print("\n\nIteration nb : %s" %iteration)
        #api.PostUpdate(status=bla[0:123])
        model.save_weights(DIR+'checkpoint_{}_epoch_{}.hdf5'.format(HIDDEN_DIM, iteration))
        #remove unecessary files:
        for files in onlyfiles:
            try:
                if files:
                    os.remove(DIR+files)
            except:
                pass
        onlyfiles = [f for f in listdir(DIR) if isfile(join(DIR, f))]

    if iteration>=800:
        print("Stopping...")
        flag=False

Starting at iteration : 700


--------------------
Epoch 1/1
93s - loss: 0.1897
je sentais que je t'ai l'anA de croire
que je m'en ailuissais assez pour un verre A  un de ses amis vraiment
aux investigations de la barr

--------------------
Epoch 1/1
92s - loss: 0.1903
"Violantie, _s-ellie
en mirieur, et au moment de la gravite verte, que le bas discute pour moi, j'aurais
perdue de savoir A  tous les mots

--------------------
Epoch 1/1
92s - loss: 0.1898


--Je n'irai pas.

En ce moment le docteur rentra dans son cabarade, et depuis le
rivage.  Sa lunette de la situation, illus ou compren

--------------------
Epoch 1/1
92s - loss: 0.1897
; il n'y avait aucun dame en revenant
s'en approcher.  On apercevait dans une pluie d'etifice, ils revirent au milieu des
flammes, la bour

--------------------
Epoch 1/1
91s - loss: 0.1892
it plus
signe d'existence depuis quelques heures, ce symptome fut accueilli
par un redoublement de mousquet aucun retombait en lui presant

--------------------
E

**4. Generate text**

In [17]:
def save_text(model, length, vocab_size, ix_to_char):
    # starting with random character
    ix = [np.random.randint(vocab_size)]
    y_char = [ix_to_char[ix[-1]]]
    X = np.zeros((1, length, vocab_size))
    for i in range(length):
        # appending the last predicted character to sequence
        X[0, i, :][ix[-1]] = 1
        print(ix_to_char[ix[-1]], end="")
        ix = np.argmax(model.predict(X[:, :i+1, :])[0], 1)
        y_char.append(ix_to_char[ix[-1]])
    if not os.path.exists(out_path+"generate/"):
        os.makedirs(out_path+"generate/")
    with open(out_path+"generate/output.txt","w") as f:
        f.write(('').join(y_char))
    return ('').join(y_char)


In [18]:
#seed with particular text:
def generate_text_seeded(model,seed,length, vocab_size, ix_to_char):
    # starting with random character
    # char_indices
    ix = [char_indices[x] for x in seed]
    y_char = [x for x in seed]
    X = np.zeros((1, length, vocab_size))
    for i in range(len(ix)) :
        X[0, i, :][ix[i]] = 1
        print(ix_to_char[ix[i]], end="")
    to_substract = len(ix)
    for i in range(length-to_substract):
        # appending the last predicted character to sequence
        X[0, i, :][ix[-1]] = 1
        print(ix_to_char[ix[-1]], end="")
        ix = np.argmax(model.predict(X[:, :i+1, :])[0], 1)
        y_char.append(ix_to_char[ix[-1]])
    return ('').join(y_char)

In [27]:
generate_text_seeded(model,normalize('NFKD',"le roi est mort ".decode('latin1')), 1000, VOCAB_SIZE, indices_char)

le roi est mort  aupassett aude de leur plus de cinq cents
livres; en bienhe soutenue filpacorie, s'eloignait
des souffrances et d'ascendions avec eux qu'il n'eut pas le courage; il
semblait qu'il ne pouvait enliver, il se complaisait de son arrivee aux travaux des capitaines
Burton et Speke a faire un terrain superbatif, et les trois mille
appelees s'est evidemment sculptee a sa nuit
de chair et d'activite. Le docteur fit le plus complet.

Le vent, que nous l'avions apportee au moyen d'un cote et
descend d'epsace la lange de Zinzereun Fergusson.

Messieurs, dit le docteur, nous n'avons pas traverse!

--D'ailleurs, vous venez a deje devant nous, disait le professeur en decoulant
des courants stuptu soudent la perspective de m'envoyant
encore! Mais il est de monter a degurer notre
adroit de son habitude. A  l'explosion couverte de leur apprendre et en
laissant filer nos viergeants.

 En effet, c'etaient des appareils, des heures, la densition de mon
inglieusement eta

u"le roi est mort aupassett aude de leur plus de cinq cents\r\nlivres; en bienhe soutenue filpacorie, s'eloignait\r\ndes souffrances et d'ascendions avec eux qu'il n'eut pas le courage; il\r\nsemblait qu'il ne pouvait enliver, il se complaisait de son arrivee aux travaux des capitaines\r\nBurton et Speke a faire un terrain superbatif, et les trois mille\r\nappelees s'est evidemment sculptee a sa nuit\r\nde chair et d'activite. Le docteur fit le plus complet.\r\n\r\nLe vent, que nous l'avions apportee au moyen d'un cote et\r\ndescend d'epsace la lange de Zinzereun Fergusson.\r\n\r\nMessieurs, dit le docteur, nous n'avons pas traverse!\r\n\r\n--D'ailleurs, vous venez a deje devant nous, disait le professeur en decoulant\r\ndes courants stuptu soudent la perspective de m'envoyant\r\nencore! Mais il est de monter a degurer notre\r\nadroit de son habitude. A  l'explosion couverte de leur apprendre et en\r\nlaissant filer nos viergeants.\r\n\r\n En effet, c'etaient des appareils, des heures,

In [23]:
%%time
out = save_text(model, 1500, VOCAB_SIZE, indices_char)

our de la voir, celle-ci
Atait lourdA et magnifique.  Il apercevait qu'il fallait laigner
s'il ne l'a pas renouveler l'habitude.

Le mont PArous, que dA s que j'eus reconnu le plus de vrai, et A  aucune autre question a
l'excussionnalitA, la verdure de ses cheveux de plus de cinq pouces
au projectile.  Mais auprA s d'eux, tous les deux. Le honneme sentiment
rompu que son regard s'arrAata, un instant de la vivanon ligne de l'honorable EuropAenne: car
certaines fAates de la fonte de fonte peuvent Aatre retard et qui
malhent comme une pAtille angalotique, M. de Forcheville, Swann de
s'orateur, parmi les sapaquements relatifs aux accessoires A  l'amour du tableau ne vint A  l'antillitA, qui passait la
souscription profession d'access pour les enveloppes de son zenet.

Pendant une heure aprA s, elle s'interloguA intire quand il apercevait, aprA s avoir
rApondre avec calleurs, au coin de ses ouvrages cordiges de ses attachAtes. OA1
pour mon compte, meme souscription agrAmAe: un Acla frisque 