# LSTM

- ## Preliminaries

- ### Imports

In [2]:
from __future__ import print_function
from keras.models import Sequential
from keras.layers import Dense, Activation,Dropout,TimeDistributed
from keras.layers import LSTM,SimpleRNN
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
import numpy as np
import random
import sys
import os, os.path
from os import listdir
from os.path import isfile, join
from unicodedata import normalize
import re

Using TensorFlow backend.


- ### Check GPU usage

In [3]:
from tensorflow.python.client import device_lib

def get_available_gpus():
    local_device_protos = device_lib.list_local_devices()
    return [x.name for x in local_device_protos if x.device_type == 'GPU']

In [4]:
get_available_gpus()

[u'/gpu:0']

----------

- ### Check and set Twitter's API

In [5]:
import twitter
api = twitter.Api(consumer_key='LyNVanTEQEOEGKfXAMeLv6AKG',
                    consumer_secret='0lJvhaaOP5cRZWm6rxwyBIAypd1P7eiDx9f74KBDlLrSldNuBQ',
                    access_token_key='855852332034265088-geTEVmA7xIsOD3WCZyfBNnqjRdS1MhW',
                    access_token_secret='kJMwMl67e3nYrqaGWzIizxzQpRZhtBfOnwPflO1fk3cOt')

In [6]:
print(api.VerifyCredentials())

{"created_at": "Sat Apr 22 18:34:31 +0000 2017", "default_profile": true, "description": "Learning how to be creative", "followers_count": 2, "friends_count": 1, "id": 855852332034265088, "lang": "en", "location": "Somewhere in the cloud", "name": "ArtistBot", "profile_background_color": "F5F8FA", "profile_banner_url": "https://pbs.twimg.com/profile_banners/855852332034265088/1492892354", "profile_image_url": "http://pbs.twimg.com/profile_images/855878764143804417/r55Z2Js5_normal.jpg", "profile_link_color": "1DA1F2", "profile_sidebar_fill_color": "DDEEF6", "profile_text_color": "333333", "screen_name": "TheTalkativeBot", "status": {"created_at": "Tue Apr 25 22:31:31 +0000 2017", "id": 856999137753145349, "id_str": "856999137753145349", "in_reply_to_screen_name": "TheTalkativeBot", "in_reply_to_status_id": 856894659007844352, "in_reply_to_user_id": 855852332034265088, "lang": "en", "media": [{"display_url": "pic.twitter.com/lMV1EeegY0", "expanded_url": "https://twitter.com/TheTalkativeB

-------------------

# I. Learning from corpus

** 1. Load and convert data**

'''Example script to generate text from Nietzsche's writings.
At least 20 epochs are required before the generated text
starts sounding coherent.
It is recommended to run this script on GPU, as recurrent
networks are quite computationally intensive.
If you try this script on new data, make sure your corpus
has at least ~100k characters. ~1M is better.
'''

In [7]:
#Load and concatenate files:

DIR="../../LSTM/data/Gutenberg/ebooks-unzipped/French/"
all_files = [f for f in listdir(DIR) if isfile(join(DIR, f))]

#choose how many files to concatenate:
nb_files=7
if nb_files>len(all_files):
    nb_files=len(all_files)
    
    
out_path="french/data/"
if not os.path.exists(out_path):
    os.makedirs(out_path)
if not os.path.exists(out_path+"input/"):
    os.makedirs(out_path+"input/")
    
with open(out_path+'input/french.txt', 'w') as outfile:
    for fname in all_files[0:nb_files]:
        with open(DIR+fname) as infile:
            i=0
            for line in infile:
                if i>=50:
                    outfile.write(line)
                i=i+1
        print ("Done concatenating file : %s" %fname)

Done concatenating file : 249.txt
Done concatenating file : 4740-8.txt
Done concatenating file : 799-0.txt
Done concatenating file : 4548-8.txt
Done concatenating file : 4791-8.txt
Done concatenating file : 803-8.txt
Done concatenating file : 2650-0.txt


In [8]:
"ISO-8859-1"
"UTF-8"

'UTF-8'

In [9]:
#load file
file_name=out_path+'input/french.txt'
text = open(file_name).read()
text=normalize('NFKD',text.decode('latin1')).encode('ASCII', 'ignore')

In [10]:
text = re.sub("\n\n+" , "\n", text)

In [11]:
print('corpus length:', len(text))

chars = sorted(list(set(text)))
VOCAB_SIZE = len(chars)
print('total chars:',VOCAB_SIZE)

corpus length: 2508723
total chars: 89


**Warning:** The RNN takes in input numerical data hence the necessity to convert strings into numerical values.

In [12]:
#creating mapping between indexes and characters
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

We’re gonna use Keras to create and train our Network, so we must convert the data into this form: (number_of_sequences, length_of_sequence, number_of_features).
- nb of features = length of the char array
- length of sequence = batch size
- nb of sequence = len(data) divided by batch size.

**Warning : ** target sequence is setted by shifting the source/input sequence by one character with both having the same length.

In [13]:
%%time

SEQ_LENGTH=100
#Build three dimensional arrays
X = np.zeros((len(text)/SEQ_LENGTH, SEQ_LENGTH, VOCAB_SIZE)) #input
y = np.zeros((len(text)/SEQ_LENGTH, SEQ_LENGTH, VOCAB_SIZE)) #target

#Build sequences
for i in range(0, len(text)/SEQ_LENGTH):
    X_sequence = text[i*SEQ_LENGTH:(i+1)*SEQ_LENGTH]
    X_sequence_ix = [char_indices[value] for value in X_sequence]
    input_sequence = np.zeros((SEQ_LENGTH, VOCAB_SIZE))
    for j in range(SEQ_LENGTH):
        input_sequence[j][X_sequence_ix[j]] = 1.
    X[i] = input_sequence

    y_sequence = text[i*SEQ_LENGTH+1:(i+1)*SEQ_LENGTH+1]
    y_sequence_ix = [char_indices[value] for value in y_sequence]
    target_sequence = np.zeros((SEQ_LENGTH, VOCAB_SIZE))
    for j in range(SEQ_LENGTH):
        target_sequence[j][y_sequence_ix[j]] = 1.
    y[i] = target_sequence

CPU times: user 1.82 s, sys: 196 ms, total: 2.02 s
Wall time: 2.02 s


** 2. Build the network**

In [14]:
HIDDEN_DIM= 500 #500
LAYER_NUM = 2


model = Sequential()
model.add(LSTM(HIDDEN_DIM, input_shape=(None, VOCAB_SIZE), return_sequences=True))
for i in range(LAYER_NUM - 1):
    model.add(LSTM(HIDDEN_DIM, return_sequences=True))
model.add(TimeDistributed(Dense(VOCAB_SIZE)))
model.add(Activation('softmax'))
model.compile(loss="categorical_crossentropy", optimizer="rmsprop")

In [15]:
def generate_text(model, length, vocab_size, ix_to_char):
    # starting with random character
    ix = [np.random.randint(vocab_size)]
    y_char = [ix_to_char[ix[-1]]]
    X = np.zeros((1, length, vocab_size))
    for i in range(length):
        # appending the last predicted character to sequence
        X[0, i, :][ix[-1]] = 1
        print(ix_to_char[ix[-1]], end="")
        ix = np.argmax(model.predict(X[:, :i+1, :])[0], 1)
        y_char.append(ix_to_char[ix[-1]])
    return ('').join(y_char)

In [16]:
# Generate some sample before training to know how bad it is!
bla = generate_text(model, 100, VOCAB_SIZE, indices_char)
#api.PostUpdate(status=bla[0:123])

PZggglyyd_ddns89999955555555555555555555555555555555555555555555555555555555555555555555555555555555

**3. Train network**

In [None]:
def get_iternb(string):
    return re.findall(r'checkpoint_500_epoch_(.*).hdf5', string)[0]

In [None]:
#batch size equals to seq length here
BATCH_SIZE=100
#len of desired output
GENERATE_LENGTH=140
DIR=out_path+"weights/weight_attempt_s03/"
flag=True

try:
    onlyfiles = [f for f in listdir(DIR) if isfile(join(DIR, f))]
    iteration=[]
    for files in onlyfiles:
        iteration.append(int(get_iternb(files)))
    iteration=max(iteration)

    last_checkpoint=DIR+onlyfiles[0][0:21]+str(iteration)+'.hdf5'
except Exception as e:
    print(e)
    onlyfiles=[]
    if not os.path.exists(DIR):
        os.makedirs(DIR)
    nb_files=0

if nb_files>0:
    model.load_weights(last_checkpoint)
else:
    iteration=0
    
print("Starting at iteration : %s" %iteration)
while flag==True:
    print('\n')
    print('-'*20)
    model.fit(X, y, batch_size=BATCH_SIZE, verbose=2, nb_epoch=1)
    iteration += 1
    bla=generate_text(model, GENERATE_LENGTH,VOCAB_SIZE, indices_char)
    if iteration % 10 == 0:
        print("\n\nIteration nb : %s" %iteration)
        #api.PostUpdate(status=bla[0:123])
        model.save_weights(DIR+'checkpoint_{}_epoch_{}.hdf5'.format(HIDDEN_DIM, iteration))
        #remove unecessary files:
        for files in onlyfiles:
            try:
                if files:
                    os.remove(DIR+files)
            except:
                pass
        onlyfiles = [f for f in listdir(DIR) if isfile(join(DIR, f))]

    if iteration>=900:
        print("Stopping...")
        flag=False

Starting at iteration : 800


--------------------




Epoch 1/1
92s - loss: 0.1989
2.00
_________________________


Un soupir moribond
trainait sur la place
de l'espoir
respirain, ils avaient lance les memes Ateints 

--------------------
Epoch 1/1
97s - loss: 0.1869
Mais Joe, Samuel?

--Je ne l'abandonne pas! non certes! et dut l'ouragan s'enfoncer toute la partie de la
construction en la tablier!  je

--------------------
Epoch 1/1
94s - loss: 0.1822
] et
neuf cent soixante-seize fois moins vite que dans cette atmosphA re place A 
une vie de mon amie avec collA gues, et enfin aux belles

--------------------
Epoch 1/1
93s - loss: 0.1836
%sul.

L'orbre fut des morts, mais des siennes paraissait chasser un des
foyers, et qu'il n'eut pas le seul a faire aux deux angegues de

--------------------
Epoch 1/1
94s - loss: 0.1849
Il ne manquait plus que cela! Voila donc ce
quartzez-li.  Nous nous demandons, devant ce souvenir isolAme
oA1 les palisites on garaissaien

--------------------
Epoch 1/1
94s - loss: 0.1855
Je ne crois pas, dit-il, que 

**4. Generate text**

In [None]:
def save_text(model, length, vocab_size, ix_to_char):
    # starting with random character
    ix = [np.random.randint(vocab_size)]
    y_char = [ix_to_char[ix[-1]]]
    X = np.zeros((1, length, vocab_size))
    for i in range(length):
        # appending the last predicted character to sequence
        X[0, i, :][ix[-1]] = 1
        print(ix_to_char[ix[-1]], end="")
        ix = np.argmax(model.predict(X[:, :i+1, :])[0], 1)
        y_char.append(ix_to_char[ix[-1]])
    if not os.path.exists(out_path+"generate/"):
        os.makedirs(out_path+"generate/")
    with open(out_path+"generate/output.txt","w") as f:
        f.write(('').join(y_char))
    return ('').join(y_char)


In [None]:
#seed with particular text:
def generate_text_seeded(model,seed,length, vocab_size, ix_to_char):
    # starting with random character
    # char_indices
    ix = [char_indices[x] for x in seed]
    y_char = [x for x in seed]
    X = np.zeros((1, length, vocab_size))
    for i in range(len(ix)) :
        X[0, i, :][ix[i]] = 1
        print(ix_to_char[ix[i]], end="")
    to_substract = len(ix)
    for i in range(length-to_substract):
        # appending the last predicted character to sequence
        X[0, i, :][ix[-1]] = 1
        print(ix_to_char[ix[-1]], end="")
        ix = np.argmax(model.predict(X[:, :i+1, :])[0], 1)
        y_char.append(ix_to_char[ix[-1]])
    return ('').join(y_char)

In [None]:
generate_text_seeded(model,normalize('NFKD',"le roi est mort ".decode('latin1')), 1000, VOCAB_SIZE, indices_char)

In [None]:
%%time
out = save_text(model, 1500, VOCAB_SIZE, indices_char)