# LSTM

- ## Preliminaries

- ### Imports

In [40]:
from __future__ import print_function
from keras.models import Sequential
from keras.layers import Dense, Activation,Dropout,TimeDistributed
from keras.layers import LSTM,SimpleRNN
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
import numpy as np
import random
import sys
import os, os.path
from os import listdir
from os.path import isfile, join
from unicodedata import normalize

- ### Check CPU usage

In [2]:
from tensorflow.python.client import device_lib

def get_available_gpus():
    local_device_protos = device_lib.list_local_devices()
    return [x.name for x in local_device_protos if x.device_type == 'GPU']

In [3]:
get_available_gpus()

[u'/gpu:0']

----------

- ### Check and set Twitter's API

In [4]:
import twitter
api = twitter.Api(consumer_key='LyNVanTEQEOEGKfXAMeLv6AKG',
                    consumer_secret='0lJvhaaOP5cRZWm6rxwyBIAypd1P7eiDx9f74KBDlLrSldNuBQ',
                    access_token_key='855852332034265088-geTEVmA7xIsOD3WCZyfBNnqjRdS1MhW',
                    access_token_secret='kJMwMl67e3nYrqaGWzIizxzQpRZhtBfOnwPflO1fk3cOt')

In [5]:
print(api.VerifyCredentials())

{"created_at": "Sat Apr 22 18:34:31 +0000 2017", "default_profile": true, "description": "Learning how to be creative", "friends_count": 2, "id": 855852332034265088, "lang": "en", "location": "Somewhere in the cloud", "name": "ArtistBot", "profile_background_color": "F5F8FA", "profile_banner_url": "https://pbs.twimg.com/profile_banners/855852332034265088/1492892354", "profile_image_url": "http://pbs.twimg.com/profile_images/855878764143804417/r55Z2Js5_normal.jpg", "profile_link_color": "1DA1F2", "profile_sidebar_fill_color": "DDEEF6", "profile_text_color": "333333", "screen_name": "TheTalkativeBot", "status": {"created_at": "Sun Apr 23 03:40:57 +0000 2017", "favorite_count": 1, "id": 855989845294010368, "id_str": "855989845294010368", "lang": "en", "source": "<a href=\"http://www.google.com\" rel=\"nofollow\">TheScenarioBot</a>", "text": "Today I am learning French, watch me improve! ;)"}, "statuses_count": 2}


# I. Toy examples

- ### Test

** 1. Load and convert data**

'''Example script to generate text from Nietzsche's writings.
At least 20 epochs are required before the generated text
starts sounding coherent.
It is recommended to run this script on GPU, as recurrent
networks are quite computationally intensive.
If you try this script on new data, make sure your corpus
has at least ~100k characters. ~1M is better.
'''

In [6]:
#Load and concatenate files:

DIR="../../LSTM/data/Gutenberg/ebooks-unzipped/"
all_files = [f for f in listdir(DIR) if isfile(join(DIR, f))]

#choose how many files to concatenate:
nb_files=10
if nb_files>len(all_files):
    nb_files=len(all_files)
    
    
out_path="french/data/"
if not os.path.exists(out_path):
    os.makedirs(out_path)
if not os.path.exists(out_path+"input/"):
    os.makedirs(out_path+"input/")
    
with open(out_path+'input/input_action.txt', 'w') as outfile:
    for fname in all_files[0:nb_files]:
        with open(DIR+fname) as infile:
            for line in infile:
                outfile.write(line)
        print ("Done concatenating file : %s" %fname)

Done concatenating file : 8541-8.txt
Done concatenating file : 11176-8.txt
Done concatenating file : 11300-8.txt
Done concatenating file : 10604-8.txt
Done concatenating file : 7173-8.txt
Done concatenating file : 9053-8.txt
Done concatenating file : 5126-0.txt
Done concatenating file : 5178-8.txt
Done concatenating file : 7012-8.txt
Done concatenating file : 8524-8.txt


In [7]:
#load file
file_name=out_path+'input/input_action.txt'
text = open(file_name).read()
print('corpus length:', len(text))

chars = sorted(list(set(text)))
VOCAB_SIZE = len(chars)
print('total chars:',VOCAB_SIZE)

corpus length: 4493427
total chars: 151


**Warning:** The RNN takes in input numerical data hence the necessity to convert strings into numerical values.

In [8]:
#creating mapping between indexes and characters
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

We’re gonna use Keras to create and train our Network, so we must convert the data into this form: (number_of_sequences, length_of_sequence, number_of_features).
- nb of features = length of the char array
- length of sequence = batch size
- nb of sequence = len(data) divided by batch size.

**Warning : ** target sequence is setted by shifting the source/input sequence by one character with both having the same length.

In [9]:
%%time

SEQ_LENGTH=100
#Build three dimensional arrays
X = np.zeros((len(text)/SEQ_LENGTH, SEQ_LENGTH, VOCAB_SIZE)) #input
y = np.zeros((len(text)/SEQ_LENGTH, SEQ_LENGTH, VOCAB_SIZE)) #target

#Build sequences
for i in range(0, len(text)/SEQ_LENGTH):
    X_sequence = text[i*SEQ_LENGTH:(i+1)*SEQ_LENGTH]
    X_sequence_ix = [char_indices[value] for value in X_sequence]
    input_sequence = np.zeros((SEQ_LENGTH, VOCAB_SIZE))
    for j in range(SEQ_LENGTH):
        input_sequence[j][X_sequence_ix[j]] = 1.
    X[i] = input_sequence

    y_sequence = text[i*SEQ_LENGTH+1:(i+1)*SEQ_LENGTH+1]
    y_sequence_ix = [char_indices[value] for value in y_sequence]
    target_sequence = np.zeros((SEQ_LENGTH, VOCAB_SIZE))
    for j in range(SEQ_LENGTH):
        target_sequence[j][y_sequence_ix[j]] = 1.
    y[i] = target_sequence

CPU times: user 3.82 s, sys: 452 ms, total: 4.27 s
Wall time: 4.26 s


** 2. Build the network**

In [10]:
HIDDEN_DIM= 500 #500
LAYER_NUM = 2


model = Sequential()
model.add(LSTM(HIDDEN_DIM, input_shape=(None, VOCAB_SIZE), return_sequences=True))
for i in range(LAYER_NUM - 1):
    model.add(LSTM(HIDDEN_DIM, return_sequences=True))
model.add(TimeDistributed(Dense(VOCAB_SIZE)))
model.add(Activation('softmax'))
model.compile(loss="categorical_crossentropy", optimizer="rmsprop")

In [11]:
def generate_text(model, length, vocab_size, ix_to_char):
    # starting with random character
    ix = [np.random.randint(vocab_size)]
    y_char = [ix_to_char[ix[-1]]]
    X = np.zeros((1, length, vocab_size))
    for i in range(length):
        # appending the last predicted character to sequence
        X[0, i, :][ix[-1]] = 1
        print(ix_to_char[ix[-1]], end="")
        ix = np.argmax(model.predict(X[:, :i+1, :])[0], 1)
        y_char.append(ix_to_char[ix[-1]])
    return ('').join(y_char)

In [30]:
# Generate some sample before training to know how bad it is!
bla = generate_text(model, 100, VOCAB_SIZE, indices_char)
api.PostUpdate(status=unicode(bla, errors='replace')[0:123])

c888___<�����BB������**������~~���VVV22nn���tNNNNNNHNNVVAAAAAAAVVAAoooo��8888����uuu��+++QQ���������

Status(ID=855993910816124928, ScreenName=TheTalkativeBot, Created=Sun Apr 23 03:57:06 +0000 2017, Text=u'c888___&lt;\ufffd\ufffd\ufffd\ufffd\ufffdBB\ufffd\ufffd\ufffd\ufffd\ufffd\ufffd**\ufffd\ufffd\ufffd\ufffd\ufffd\ufffd~~\ufffd\ufffd\ufffdVVV22nn\ufffd\ufffd\ufffdtNNNNNNHNNVVAAAAAAAVVAAoooo\ufffd\ufffd8888\ufffd\ufffd\ufffd\ufffduuu\ufffd\ufffd+++QQ\ufffd\ufffd\ufffd\ufffd\ufffd\ufffd\ufffd\ufffd\ufffd\ufffd')

**3. Train network**

In [36]:
#batch size equals to seq length here
BATCH_SIZE=100
#len of desired output
GENERATE_LENGTH=140
DIR=out_path+"weights/weight_attempt_s01/"
flag=True

try:
    onlyfiles = [f for f in listdir(DIR) if isfile(join(DIR, f))]
    nb_files = len(onlyfiles)
    name_template = onlyfiles[0][0:15]
except Exception as e:
    print(e)
    if not os.path.exists(DIR):
        os.makedirs(DIR)
    nb_files=0

if nb_files>0:
    nb_iteration=nb_files*20
    model.load_weights(DIR+onlyfiles[0][0:21]+str(nb_iteration)+'.hdf5')
else:
    nb_iteration=0
    
print("Starting at iteration : %s" %nb_iteration)
while flag==True:
    print('\n')
    print('-'*20)
    model.fit(X, y, batch_size=BATCH_SIZE, verbose=2, nb_epoch=1)
    nb_iteration += 1
    bla=generate_text(model, GENERATE_LENGTH,VOCAB_SIZE, indices_char)
    if nb_iteration % 20 == 0:
        print("\n\nIteration nb : %s" %nb_iteration)
        api.PostUpdate(status=unicode(bla, errors='replace')[0:123])
        model.save_weights(DIR+'checkpoint_{}_epoch_{}.hdf5'.format(HIDDEN_DIM, nb_iteration))
    if nb_iteration>=200:
        print("Stopping...")
        flag=False

list index out of range
Starting at iteration : 0


--------------------
Epoch 1/1
176s - loss: 1.2146
� se constituer des moins de trois hommes et des champs de la chambre de la place de la ville de la ville
de la chambre de la chambre de la

--------------------
Epoch 1/1
170s - loss: 1.1682
E




CHAPITRE IV


LES PARISIS DU LE PROJESTINGES EF LA PRESS D'ANTI DE LA PROJERT DU PORTE IV LICENTILIT DU CONSERINES


Le roi

--------------------
Epoch 1/1
184s - loss: 1.1335
Z se confiant de la conscience de la conscience de la conscience de la
conscience de l'endroit o� nous �tions pr�s de l'endroit o� nous �ti

--------------------
Epoch 1/1
170s - loss: 1.1044
� de la route, et le roi avait remarqu� quelques
mots de fusil, et les propos de la r�publique et les plus grands de la
propri�t� de la ro

--------------------
Epoch 1/1
169s - loss: 1.0798
fait le plus grand point de la part du comte de Margival.

--Ce sera un peu plus loin, dit le vieillard et le monde de la
force de me lai

-

KeyboardInterrupt: 

In [50]:
print(out.("latin1"))

UnicodeDecodeError: 'ascii' codec can't decode byte 0xb0 in position 0: ordinal not in range(128)

**4. Generate text**

In [37]:
def save_text(model, length, vocab_size, ix_to_char):
    # starting with random character
    ix = [np.random.randint(vocab_size)]
    y_char = [ix_to_char[ix[-1]]]
    X = np.zeros((1, length, vocab_size))
    for i in range(length):
        # appending the last predicted character to sequence
        X[0, i, :][ix[-1]] = 1
        print(ix_to_char[ix[-1]], end="")
        ix = np.argmax(model.predict(X[:, :i+1, :])[0], 1)
        y_char.append(ix_to_char[ix[-1]])
    if not os.path.exists(out_path+"generate/"):
        os.makedirs(out_path+"generate/")
    with open(out_path+"generate/output.txt","w") as f:
        f.write(('').join(y_char))
    return ('').join(y_char)


In [38]:
%%time
out = save_text(model, 1500, VOCAB_SIZE, indices_char)

� 190, ��vitement de la
ville, nous venions de viendre �� l���o�� les accumulateurs de l���a��ronef s�����taient
apais��s au-dehors �� l���horizon.

A cette ��pout��, occupaient avec le ma��tre aux trompettes de ce
r��fuse ou deux s��riens propulseurs un ton attach�� �� l���avant. A sa surface
s�����tait d��pass��e en un si grand degr�� fut acc��t��, foust�� entre deux heures et
volantes sous une ��norme balle du silence, l���occasion de l���a��ronef avec le
coucherks - ce qui se passe tr��s revenues sur toute sa
voix. Si Uncle Prudent et Phil Evans se pr��cipit��rent par la place de l���a��ronef.

En somme, aux experts du parc, l���appareil avec lui, il reconnut que, plaisant des
nombres ouvriers comme un homme de fianc��e, comme des a��rospates, les autres des
montagnes, des plaines qu���il convient de maintenir cette science, ce mot d���avantigue e��t agen
de peines et de prendre un moyen finir��ment. De longs nuits glaci��
plus bien r��clament en effet ceci d���une succr�he ��
ces 