# LSTM

- ## Preliminaries

- ### Imports

In [1]:
from __future__ import print_function
from keras.models import Sequential
from keras.layers import Dense, Activation,Dropout,TimeDistributed
from keras.layers import LSTM,SimpleRNN
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
import numpy as np
import random
import sys
import os, os.path
from os import listdir
from os.path import isfile, join
from unicodedata import normalize
import re

Using TensorFlow backend.


- ### Check GPU usage

In [2]:
from tensorflow.python.client import device_lib

def get_available_gpus():
    local_device_protos = device_lib.list_local_devices()
    return [x.name for x in local_device_protos if x.device_type == 'GPU']

In [3]:
get_available_gpus()

[u'/gpu:0']

----------

- ### Check and set Twitter's API

In [4]:
import twitter
api = twitter.Api(consumer_key='LyNVanTEQEOEGKfXAMeLv6AKG',
                    consumer_secret='0lJvhaaOP5cRZWm6rxwyBIAypd1P7eiDx9f74KBDlLrSldNuBQ',
                    access_token_key='855852332034265088-geTEVmA7xIsOD3WCZyfBNnqjRdS1MhW',
                    access_token_secret='kJMwMl67e3nYrqaGWzIizxzQpRZhtBfOnwPflO1fk3cOt')

In [5]:
print(api.VerifyCredentials())

{"created_at": "Sat Apr 22 18:34:31 +0000 2017", "default_profile": true, "description": "Learning how to be creative", "followers_count": 2, "friends_count": 3, "id": 855852332034265088, "lang": "en", "location": "Somewhere in the cloud", "name": "ArtistBot", "profile_background_color": "F5F8FA", "profile_banner_url": "https://pbs.twimg.com/profile_banners/855852332034265088/1492892354", "profile_image_url": "http://pbs.twimg.com/profile_images/855878764143804417/r55Z2Js5_normal.jpg", "profile_link_color": "1DA1F2", "profile_sidebar_fill_color": "DDEEF6", "profile_text_color": "333333", "screen_name": "TheTalkativeBot", "status": {"created_at": "Mon Apr 24 13:46:57 +0000 2017", "id": 856504737352601601, "id_str": "856504737352601601", "in_reply_to_screen_name": "dvp_tran", "in_reply_to_status_id": 856504560373911552, "in_reply_to_user_id": 747074580754403328, "lang": "en", "source": "<a href=\"http://www.google.com\" rel=\"nofollow\">TheScenarioBot</a>", "text": "@dvp_tran Automate an

-------------------

# I. Learning from corpus

** 1. Load and convert data**

'''Example script to generate text from Nietzsche's writings.
At least 20 epochs are required before the generated text
starts sounding coherent.
It is recommended to run this script on GPU, as recurrent
networks are quite computationally intensive.
If you try this script on new data, make sure your corpus
has at least ~100k characters. ~1M is better.
'''

In [6]:
#Load and concatenate files:

DIR="../../LSTM/data/Gutenberg/ebooks-unzipped/English/"
all_files = [f for f in listdir(DIR) if isfile(join(DIR, f))]

#choose how many files to concatenate:
nb_files=10
if nb_files>len(all_files):
    nb_files=len(all_files)
    
    
out_path="english/data/"
if not os.path.exists(out_path):
    os.makedirs(out_path)
if not os.path.exists(out_path+"input/"):
    os.makedirs(out_path+"input/")
    
with open(out_path+'input/english.txt', 'w') as outfile:
    for fname in all_files[0:nb_files]:
        with open(DIR+fname) as infile:
            i=0
            for line in infile:
                if i>=50:
                    outfile.write(line)
                i=i+1
        print ("Done concatenating file : %s" %fname)

Done concatenating file : 21-0.txt
Done concatenating file : 28.txt
Done concatenating file : 18.txt
Done concatenating file : 13-0.txt
Done concatenating file : 16-0.txt
Done concatenating file : 51-0.txt
Done concatenating file : 30.txt
Done concatenating file : 20.txt
Done concatenating file : 46-8.txt
Done concatenating file : 50.txt


In [7]:
"../../LSTM/data/Gutenberg/ebooks-unzipped/English/"

'../../LSTM/data/Gutenberg/ebooks-unzipped/English/'

In [8]:
#load file
file_name=out_path+'input/english.txt'
text = open(file_name).read()
text=normalize('NFKD',text.decode('latin1')).encode('ASCII', 'ignore')

In [9]:
#text = text.replace(to_delete,"").replace('Digitized by',"").replace('Google',"") 
text = re.sub("\n\n+" , "\n", text)

In [10]:
print('corpus length:', len(text))

chars = sorted(list(set(text)))
VOCAB_SIZE = len(chars)
print('total chars:',VOCAB_SIZE)

corpus length: 9670571
total chars: 87


**Warning:** The RNN takes in input numerical data hence the necessity to convert strings into numerical values.

In [11]:
#creating mapping between indexes and characters
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

We’re gonna use Keras to create and train our Network, so we must convert the data into this form: (number_of_sequences, length_of_sequence, number_of_features).
- nb of features = length of the char array
- length of sequence = batch size
- nb of sequence = len(data) divided by batch size.

**Warning : ** target sequence is setted by shifting the source/input sequence by one character with both having the same length.

In [12]:
%%time

SEQ_LENGTH=100
#Build three dimensional arrays
X = np.zeros((len(text)/SEQ_LENGTH, SEQ_LENGTH, VOCAB_SIZE)) #input
y = np.zeros((len(text)/SEQ_LENGTH, SEQ_LENGTH, VOCAB_SIZE)) #target

#Build sequences
for i in range(0, len(text)/SEQ_LENGTH):
    X_sequence = text[i*SEQ_LENGTH:(i+1)*SEQ_LENGTH]
    X_sequence_ix = [char_indices[value] for value in X_sequence]
    input_sequence = np.zeros((SEQ_LENGTH, VOCAB_SIZE))
    for j in range(SEQ_LENGTH):
        input_sequence[j][X_sequence_ix[j]] = 1.
    X[i] = input_sequence

    y_sequence = text[i*SEQ_LENGTH+1:(i+1)*SEQ_LENGTH+1]
    y_sequence_ix = [char_indices[value] for value in y_sequence]
    target_sequence = np.zeros((SEQ_LENGTH, VOCAB_SIZE))
    for j in range(SEQ_LENGTH):
        target_sequence[j][y_sequence_ix[j]] = 1.
    y[i] = target_sequence

CPU times: user 11.6 s, sys: 3.04 s, total: 14.7 s
Wall time: 14.7 s


** 2. Build the network**

In [13]:
HIDDEN_DIM= 500 #500
LAYER_NUM = 2


model = Sequential()
model.add(LSTM(HIDDEN_DIM, input_shape=(None, VOCAB_SIZE), return_sequences=True))
for i in range(LAYER_NUM - 1):
    model.add(LSTM(HIDDEN_DIM, return_sequences=True))
model.add(TimeDistributed(Dense(VOCAB_SIZE)))
model.add(Activation('softmax'))
model.compile(loss="categorical_crossentropy", optimizer="rmsprop")

In [14]:
def generate_text(model, length, vocab_size, ix_to_char):
    # starting with random character
    ix = [np.random.randint(vocab_size)]
    y_char = [ix_to_char[ix[-1]]]
    X = np.zeros((1, length, vocab_size))
    for i in range(length):
        # appending the last predicted character to sequence
        X[0, i, :][ix[-1]] = 1
        print(ix_to_char[ix[-1]], end="")
        ix = np.argmax(model.predict(X[:, :i+1, :])[0], 1)
        y_char.append(ix_to_char[ix[-1]])
    return ('').join(y_char)

In [15]:
# Generate some sample before training to know how bad it is!
bla = generate_text(model, 100, VOCAB_SIZE, indices_char)
#api.PostUpdate(status=bla[0:123])

(.ff
QQDDDD:::ttttttffQQDDDDDRR:::tttttffWW[[[XXXXXXBBBBBB)))))RRR!!vvYY!GG99G)))(((SS...G&&&::OOOO#

**3. Train network**

In [None]:
def get_iternb(string):
    return re.findall(r'checkpoint_500_epoch_(.*).hdf5', string)[0]

In [None]:
#batch size equals to seq length here
BATCH_SIZE=100
#len of desired output
GENERATE_LENGTH=140
DIR=out_path+"weights/weight_attempt_s02/"
flag=True

try:
    onlyfiles = [f for f in listdir(DIR) if isfile(join(DIR, f))]
    iteration=[]
    for files in onlyfiles:
        iteration.append(int(get_iternb(files)))
    iteration=max(iteration)

    last_checkpoint=DIR+onlyfiles[0][0:21]+str(iteration)+'.hdf5'
except Exception as e:
    print(e)
    onlyfiles=[]
    if not os.path.exists(DIR):
        os.makedirs(DIR)
    nb_files=0

if nb_files>0:
    model.load_weights(last_checkpoint)
else:
    iteration=0
    
print("Starting at iteration : %s" %iteration)
while flag==True:
    print('\n')
    print('-'*20)
    model.fit(X, y, batch_size=BATCH_SIZE, verbose=2, nb_epoch=1)
    iteration += 1
    bla=generate_text(model, GENERATE_LENGTH,VOCAB_SIZE, indices_char)
    if iteration % 10 == 0:
        print("\n\nIteration nb : %s" %iteration)
        #api.PostUpdate(status=bla[0:123])
        model.save_weights(DIR+'checkpoint_{}_epoch_{}.hdf5'.format(HIDDEN_DIM, iteration))
        #remove unecessary files:
        for files in onlyfiles:
            try:
                if files:
                    os.remove(DIR+files)
            except:
                pass
        onlyfiles = [f for f in listdir(DIR) if isfile(join(DIR, f))]

    if iteration>=600:
        print("Stopping...")
        flag=False

[Errno 2] No such file or directory: 'english/data/weights/weight_attempt_s02/'
Starting at iteration : 0


--------------------




Epoch 1/1
407s - loss: 1.9727
#                                                                                                                                           

--------------------
Epoch 1/1
407s - loss: 1.3521
01:004:004 And they shall be a temple of the land, and the first that he tenth
           the house of the LORD, and the first that he tent

--------------------
Epoch 1/1
407s - loss: 1.2467
9 1817330222 3373383322
1033288874 3773380774 3773300774 3733300224 3333322222
2333228274 3773380774 3773300224 3333380224 3333302222
333

--------------------
Epoch 1/1
407s - loss: 1.1958
e the god of the LORD their God of the LORD their God of the LORD their God of the
           people of the congregation of the children of

--------------------
Epoch 1/1
407s - loss: 1.1628
d the LORD said unto the LORD: and the LORD said unto the LORD, I will set the
           families of the LORD.

03:018:010 And the LORD 

--------------------
Epoch 1/1
407s - loss: 1.1375
/
           

           Korathites: and of the sons of Jehoshaphat, the son of Shimrothaish,
           the son o

--------------------
Epoch 1/1
409s - loss: 0.9003
re they not written in the book of the children
           of Ammon, and say unto them, They shall not fall upon us, and
           they s

--------------------
Epoch 1/1
408s - loss: 0.8986
6 5402467731 3338833338 2351073341 1433944470
4809044339 5777343324 3418433339 3433337322 5937439375
8440943433 7467073444 7463333433 9447

--------------------
Epoch 1/1
410s - loss: 0.8971
/pplied within 90 daughters of Parliamened unto the LORD
           hath not pitied unto him all the words of the enemy, that they
       

--------------------
Epoch 1/1
409s - loss: 0.8954
be revealed.

42:008:015 And when her masters saw that the third part of the earth
           was in the days of Nathan, he went to the p

--------------------
Epoch 1/1
409s - loss: 0.8938

           the seed of Abraham, and the seed of Israel that sinneth
           sait

409s - loss: 0.8617
proportion of the most
cannot be denied that the representatives of the people replied,
will, perhaps, be accomplished, in the confederacy

--------------------
Epoch 1/1
409s - loss: 0.8611
.

42:013:022 And the second bear is like a weaver and a voice, and have
           forgotten the LORD their God.

26:005:007 And they 

--------------------
Epoch 1/1
409s - loss: 0.8604
?

19:141:004 I am as a mother of all men for my sake, that thou mayest
           see thy name for ever: I am the LORD your God, and ye 

--------------------
Epoch 1/1
409s - loss: 0.8600
judged that in the national government to ingreferant in
the same face, of considering the progress of the most
government of the Union, w

Iteration nb : 90


--------------------
Epoch 1/1
408s - loss: 0.8595
ce of a majority of such an enemy, and will be
able under the national legislatures, the subordination is to an
equal violation on the sub

--------------------
Epoch 1/1
408s - loss: 0.8588
xpedity i

           and Jehiel, and Karmah.

13:00

--------------------
Epoch 1/1
408s - loss: 0.8436
was the son of Zeruiah, and said unto him, Tarry and
           upon the earth, that I may know that thou art called
           any more t

Iteration nb : 130


--------------------
Epoch 1/1
408s - loss: 0.8436
e the seventh day there was not any thing unto his servant
           Job.

19:038:001 Why are ye not as a father and a servant, if any m

--------------------
Epoch 1/1
408s - loss: 0.8441
Ye shall be given to the people, and to be a man of
           flesh, and fine flour, or increase, with the bread, and the
           capt

--------------------
Epoch 1/1
408s - loss: 0.8431
4 9333800867 4119190018
5964442774 2711872756 8846991329 5577266660 5024777792
4477004274 4710666678 7469241204 7167776922 7274722212
717

--------------------
Epoch 1/1
408s - loss: 0.8427
le the sound of the trumpet, and the street of the city, and
           the priest shall make an atonement for him before t

**4. Generate text**

In [None]:
def save_text(model, length, vocab_size, ix_to_char):
    # starting with random character
    ix = [np.random.randint(vocab_size)]
    y_char = [ix_to_char[ix[-1]]]
    X = np.zeros((1, length, vocab_size))
    for i in range(length):
        # appending the last predicted character to sequence
        X[0, i, :][ix[-1]] = 1
        print(ix_to_char[ix[-1]], end="")
        ix = np.argmax(model.predict(X[:, :i+1, :])[0], 1)
        y_char.append(ix_to_char[ix[-1]])
    if not os.path.exists(out_path+"generate/"):
        os.makedirs(out_path+"generate/")
    with open(out_path+"generate/output.txt","w") as f:
        f.write(('').join(y_char))
    return ('').join(y_char)


In [None]:
#seed with particular text:
def generate_text_seeded(model,seed,length, vocab_size, ix_to_char):
    # starting with random character
    # char_indices
    ix = [char_indices[x] for x in seed]
    y_char = [x for x in seed]
    X = np.zeros((1, length, vocab_size))
    for i in range(len(ix)) :
        X[0, i, :][ix[i]] = 1
        print(ix_to_char[ix[i]], end="")
    to_substract = len(ix)
    for i in range(length-to_substract):
        # appending the last predicted character to sequence
        X[0, i, :][ix[-1]] = 1
        print(ix_to_char[ix[-1]], end="")
        ix = np.argmax(model.predict(X[:, :i+1, :])[0], 1)
        y_char.append(ix_to_char[ix[-1]])
    return ('').join(y_char)

In [None]:
generate_text_seeded(model,normalize('NFKD',"I love ".decode('latin1')), 100, VOCAB_SIZE, indices_char)

In [None]:
%%time
out = save_text(model, 1500, VOCAB_SIZE, indices_char)