# LSTM

- ## Preliminaries

- ### Imports

In [18]:
from __future__ import print_function
from keras.models import Sequential
from keras.layers import Dense, Activation,Dropout,TimeDistributed
from keras.layers import LSTM,SimpleRNN
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
import numpy as np
import random
import sys
import os, os.path
from os import listdir
from os.path import isfile, join
from unicodedata import normalize
import re

- ### Check GPU usage

In [19]:
from tensorflow.python.client import device_lib

def get_available_gpus():
    local_device_protos = device_lib.list_local_devices()
    return [x.name for x in local_device_protos if x.device_type == 'GPU']

In [20]:
get_available_gpus()

[u'/gpu:0']

----------

- ### Check and set Twitter's API

In [21]:
import twitter
api = twitter.Api(consumer_key='LyNVanTEQEOEGKfXAMeLv6AKG',
                    consumer_secret='0lJvhaaOP5cRZWm6rxwyBIAypd1P7eiDx9f74KBDlLrSldNuBQ',
                    access_token_key='855852332034265088-geTEVmA7xIsOD3WCZyfBNnqjRdS1MhW',
                    access_token_secret='kJMwMl67e3nYrqaGWzIizxzQpRZhtBfOnwPflO1fk3cOt')

In [22]:
print(api.VerifyCredentials())

{"created_at": "Sat Apr 22 18:34:31 +0000 2017", "default_profile": true, "description": "Learning how to be creative", "followers_count": 2, "friends_count": 1, "id": 855852332034265088, "lang": "en", "location": "Somewhere in the cloud", "name": "ArtistBot", "profile_background_color": "F5F8FA", "profile_banner_url": "https://pbs.twimg.com/profile_banners/855852332034265088/1492892354", "profile_image_url": "http://pbs.twimg.com/profile_images/855878764143804417/r55Z2Js5_normal.jpg", "profile_link_color": "1DA1F2", "profile_sidebar_fill_color": "DDEEF6", "profile_text_color": "333333", "screen_name": "TheTalkativeBot", "status": {"created_at": "Fri Apr 28 00:17:34 +0000 2017", "id": 857750603602300928, "id_str": "857750603602300928", "in_reply_to_screen_name": "dvp_tran", "in_reply_to_status_id": 857750118941982720, "in_reply_to_user_id": 747074580754403328, "lang": "en", "media": [{"display_url": "pic.twitter.com/pOxEzRXkrx", "expanded_url": "https://twitter.com/TheTalkativeBot/stat

-------------------

# I. Learning from corpus

** 1. Load and convert data**

'''Example script to generate text from Nietzsche's writings.
At least 20 epochs are required before the generated text
starts sounding coherent.
It is recommended to run this script on GPU, as recurrent
networks are quite computationally intensive.
If you try this script on new data, make sure your corpus
has at least ~100k characters. ~1M is better.
'''

In [23]:
#Load and concatenate files:

DIR="../../LSTM/data/Gutenberg/ebooks-unzipped/English/"
all_files = [f for f in listdir(DIR) if isfile(join(DIR, f))]

#choose how many files to concatenate:
nb_files=10
if nb_files>len(all_files):
    nb_files=len(all_files)
    
    
out_path="english/data/"
if not os.path.exists(out_path):
    os.makedirs(out_path)
if not os.path.exists(out_path+"input/"):
    os.makedirs(out_path+"input/")
    
with open(out_path+'input/english.txt', 'w') as outfile:
    for fname in all_files[0:nb_files]:
        with open(DIR+fname) as infile:
            i=0
            for line in infile:
                if i>=50:
                    outfile.write(line)
                i=i+1
        print ("Done concatenating file : %s" %fname)

Done concatenating file : 21-0.txt
Done concatenating file : 28.txt
Done concatenating file : 18.txt
Done concatenating file : 13-0.txt
Done concatenating file : 16-0.txt
Done concatenating file : 51-0.txt
Done concatenating file : 30.txt
Done concatenating file : 20.txt
Done concatenating file : 46-8.txt
Done concatenating file : 50.txt


In [24]:
"../../LSTM/data/Gutenberg/ebooks-unzipped/English/"

'../../LSTM/data/Gutenberg/ebooks-unzipped/English/'

In [25]:
#load file
file_name=out_path+'input/english.txt'
text = open(file_name).read()
text=normalize('NFKD',text.decode('latin1')).encode('ASCII', 'ignore')

In [26]:
#text = text.replace(to_delete,"").replace('Digitized by',"").replace('Google',"") 
text = re.sub("\n\n+" , "\n", text)

In [27]:
print('corpus length:', len(text))

chars = sorted(list(set(text)))
VOCAB_SIZE = len(chars)
print('total chars:',VOCAB_SIZE)

corpus length: 9670571
total chars: 87


**Warning:** The RNN takes in input numerical data hence the necessity to convert strings into numerical values.

In [28]:
#creating mapping between indexes and characters
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

We’re gonna use Keras to create and train our Network, so we must convert the data into this form: (number_of_sequences, length_of_sequence, number_of_features).
- nb of features = length of the char array
- length of sequence = batch size
- nb of sequence = len(data) divided by batch size.

**Warning : ** target sequence is setted by shifting the source/input sequence by one character with both having the same length.

In [29]:
%%time

#original sequence length : 100

SEQ_LENGTH=50
#Build three dimensional arrays
X = np.zeros((len(text)/SEQ_LENGTH, SEQ_LENGTH, VOCAB_SIZE)) #input
y = np.zeros((len(text)/SEQ_LENGTH, SEQ_LENGTH, VOCAB_SIZE)) #target

#Build sequences
for i in range(0, len(text)/SEQ_LENGTH):
    X_sequence = text[i*SEQ_LENGTH:(i+1)*SEQ_LENGTH]
    X_sequence_ix = [char_indices[value] for value in X_sequence]
    input_sequence = np.zeros((SEQ_LENGTH, VOCAB_SIZE))
    for j in range(SEQ_LENGTH):
        input_sequence[j][X_sequence_ix[j]] = 1.
    X[i] = input_sequence

    y_sequence = text[i*SEQ_LENGTH+1:(i+1)*SEQ_LENGTH+1]
    y_sequence_ix = [char_indices[value] for value in y_sequence]
    target_sequence = np.zeros((SEQ_LENGTH, VOCAB_SIZE))
    for j in range(SEQ_LENGTH):
        target_sequence[j][y_sequence_ix[j]] = 1.
    y[i] = target_sequence

CPU times: user 10.7 s, sys: 3.08 s, total: 13.8 s
Wall time: 13.7 s


** 2. Build the network**

In [30]:
HIDDEN_DIM= 500 #500
LAYER_NUM = 2


model = Sequential()
model.add(LSTM(HIDDEN_DIM, input_shape=(None, VOCAB_SIZE), return_sequences=True))
for i in range(LAYER_NUM - 1):
    model.add(LSTM(HIDDEN_DIM, return_sequences=True))
model.add(TimeDistributed(Dense(VOCAB_SIZE)))
model.add(Activation('softmax'))
model.compile(loss="categorical_crossentropy", optimizer="rmsprop")

In [31]:
def generate_text(model, length, vocab_size, ix_to_char):
    # starting with random character
    ix = [np.random.randint(vocab_size)]
    y_char = [ix_to_char[ix[-1]]]
    X = np.zeros((1, length, vocab_size))
    for i in range(length):
        # appending the last predicted character to sequence
        X[0, i, :][ix[-1]] = 1
        print(ix_to_char[ix[-1]], end="")
        ix = np.argmax(model.predict(X[:, :i+1, :])[0], 1)
        y_char.append(ix_to_char[ix[-1]])
    return ('').join(y_char)

In [32]:
# Generate some sample before training to know how bad it is!
bla = generate_text(model, 100, VOCAB_SIZE, indices_char)
#api.PostUpdate(status=bla[0:123])

?&((AQQILccTTT((QQTcTrQccTrpppppssppfff0008880088hhhh((((Htt]]c]]aa]a]a]a122]aae1//////$$$VVV$$VVE

**3. Train network**

** Note :**
- batch_size of 400 combined with a seq_len of 500 gets OOM
- batch_size of 400 combined with a seq_len of 400 pass epoch at : 299s
- batch_size of 100 combined with a seq_len of 100 pass epoch at : 400s
- batch_size of 400 combined with a seq_len of 200 pass epoch at : 267s
- batch_size of 400 combined with a seq_len of 100 pass epoch at : 262s
- batch_size of 500 combined with a seq_len of 100 pass epoch at : 251s

here is an interesting post : https://stats.stackexchange.com/questions/164876/tradeoff-batch-size-vs-number-of-iterations-to-train-a-neural-network

In [33]:
def get_iternb(string):
    return re.findall(r'checkpoint_500_epoch_(.*).hdf5', string)[0]

In [34]:
#batch size equals to seq length here
BATCH_SIZE=500 #100 slower and #>450 gets OOM
#len of desired output
GENERATE_LENGTH=140
DIR=out_path+"weights/weight_attempt_s02/"
flag=True

try:
    onlyfiles = [f for f in listdir(DIR) if isfile(join(DIR, f))]
    nb_files = len(onlyfiles)
    print("Checkpoints : %s" %onlyfiles)
    iteration=[]
    for files in onlyfiles:
        iteration.append(int(get_iternb(files)))
    iteration=max(iteration)

    last_checkpoint=DIR+onlyfiles[0][0:21]+str(iteration)+'.hdf5'
except Exception as e:
    print(e)
    onlyfiles=[]
    if not os.path.exists(DIR):
        os.makedirs(DIR)
    nb_files=0

    
print(nb_files)
if nb_files>0:
    model.load_weights(last_checkpoint)
    print("Checkpoint %s loaded successfuly!" % last_checkpoint)
else:
    iteration=0
    
print("Starting at iteration : %s" %iteration)
while flag==True:
    print('\n')
    print('-'*20)
    model.fit(X, y, batch_size=BATCH_SIZE, verbose=2, nb_epoch=1)
    iteration += 1
    bla=generate_text(model, GENERATE_LENGTH,VOCAB_SIZE, indices_char)
    if iteration % 10 == 0:
        print("\n\nIteration nb : %s" %iteration)
        #api.PostUpdate(status=bla[0:123])
        model.save_weights(DIR+'checkpoint_{}_epoch_{}.hdf5'.format(HIDDEN_DIM, iteration))
        #remove unecessary files:
        for files in onlyfiles:
            try:
                if files:
                    os.remove(DIR+files)
            except:
                pass
        onlyfiles = [f for f in listdir(DIR) if isfile(join(DIR, f))]

    if iteration>=600:
        print("Stopping...")
        flag=False

Checkpoints : ['checkpoint_500_epoch_530.hdf5']
1
Checkpoint english/data/weights/weight_attempt_s02/checkpoint_500_epoch_530.hdf5 loaded successfuly!
Starting at iteration : 530


--------------------
Epoch 1/1
249s - loss: 1.0658
RD hath sworn to me;

02:022:003 And swallowed their eyes, and take and gathered themselves together
           into the city, and that w

--------------------
Epoch 1/1
249s - loss: 1.0155
7 7633862516 0261977745 8029788818 7937778663
1786689435 5927357211 6181698721 6941703210 8901036241 16366666753
8437904800 9902558476 693

--------------------
Epoch 1/1
249s - loss: 0.9849
and the men of Israel were all that go away the cart it out through the deserts
           that was with him in the temple, crucify side th

--------------------
Epoch 1/1
249s - loss: 0.9634
Ye shall not eat of the house of Jacob, and will bring upon them the treasures of
           the kings of the earth.

24:044:006 And the 

--------------------
Epoch 1/1
249s - loss: 0.9470
)

JA

**4. Generate text**

In [35]:
def save_text(model, length, vocab_size, ix_to_char):
    # starting with random character
    ix = [np.random.randint(vocab_size)]
    y_char = [ix_to_char[ix[-1]]]
    X = np.zeros((1, length, vocab_size))
    for i in range(length):
        # appending the last predicted character to sequence
        X[0, i, :][ix[-1]] = 1
        print(ix_to_char[ix[-1]], end="")
        ix = np.argmax(model.predict(X[:, :i+1, :])[0], 1)
        y_char.append(ix_to_char[ix[-1]])
    if not os.path.exists(out_path+"generate/"):
        os.makedirs(out_path+"generate/")
    with open(out_path+"generate/output.txt","w") as f:
        f.write(('').join(y_char))
    return ('').join(y_char)


In [36]:
#seed with particular text:
def generate_text_seeded(model,seed,length, vocab_size, ix_to_char):
    # starting with random character
    # char_indices
    ix = [char_indices[x] for x in seed]
    y_char = [x for x in seed]
    X = np.zeros((1, length, vocab_size))
    for i in range(len(ix)) :
        X[0, i, :][ix[i]] = 1
        print(ix_to_char[ix[i]], end="")
    to_substract = len(ix)
    for i in range(length-to_substract):
        # appending the last predicted character to sequence
        X[0, i, :][ix[-1]] = 1
        print(ix_to_char[ix[-1]], end="")
        ix = np.argmax(model.predict(X[:, :i+1, :])[0], 1)
        y_char.append(ix_to_char[ix[-1]])
    return ('').join(y_char)

In [37]:
generate_text_seeded(model,normalize('NFKD',"Who is god? ".decode('latin1')), 1000, VOCAB_SIZE, indices_char)

Who is god?  herne earlashall be a man, that ye may know how that
           God well with thee, come thou? And he said, Unto Hesaboah: and offer it at the first
           cities, and upon his heads, and was buried in Samaria into the temple of Ramshronia, and
           the Levites after him Gera, and Shimei.

01:014:014 And the sons of Seir, and Caleb's brother, which were born the people that was in the
           coupling of the court of the countries: and the priest shall take all the wise men, behold, am
           nothing in his sight.

14:025:018 And as he passed by, his father Eliasaphar, and Nergaim might not be three enter in out of
           the land of Egypt.

02:007:001 And the LORD said unto Joshua, Fear not: for I am with thee, saith the Lord GOD.

26:032:001 Be thou plain the brighterX of life and plents of gold in the house of the LORD, and
           that day shall be a praise, and be thou laid with him.

02:025:016 And thou shalt put it before the

u"Who is god? herne earlashall be a man, that ye may know how that\r\n           God well with thee, come thou? And he said, Unto Hesaboah: and offer it at the first\r\n           cities, and upon his heads, and was buried in Samaria into the temple of Ramshronia, and\r\n           the Levites after him Gera, and Shimei.\r\n\r\n01:014:014 And the sons of Seir, and Caleb's brother, which were born the people that was in the\r\n           coupling of the court of the countries: and the priest shall take all the wise men, behold, am\r\n           nothing in his sight.\r\n\r\n14:025:018 And as he passed by, his father Eliasaphar, and Nergaim might not be three enter in out of\r\n           the land of Egypt.\r\n\r\n02:007:001 And the LORD said unto Joshua, Fear not: for I am with thee, saith the Lord GOD.\r\n\r\n26:032:001 Be thou plain the brighterX of life and plents of gold in the house of the LORD, and\r\n           that day shall be a praise, and be thou laid with him.\r\n\r\n02:025:0

In [38]:
%%time
out = save_text(model, 1500, VOCAB_SIZE, indices_char)

# Six hundred thousand chosen men, whom the LORD called unto him, and said,
           Belteshazzan, Manasseh, and Aria, and Manasseh, and Shimeah, and Zerah. And the sons of
           Shephatiah the son of Asael, the son of Amariah,

13:024:004 They shall be a tine of the river Aethrard, and from the mountains of Ammon, and from the desert of the
           wicked; and the spirit of a man hath not been entered into the sea.

46:015:050 But if a man have an end, or of a shearing, and that with a noise with the bow
           shall be forgiven unto men: but the first day of the week shall be burnt in the
           steps.

23:031:011 Then shall the land tempts the sea, than that soul shall be cut off from the
           face of the earth.

19:037:031 Among my people were confing, that thou mayest call my people Israel, and they
           shall deal treacherously with thee in all things.

42:012:020 And they that hated me her womb-four chamber and with the sword shall be blessed: for h