# LSTM

- ## Preliminaries

- ### Imports

In [1]:
from __future__ import print_function
from keras.models import Sequential
from keras.layers import Dense, Activation,Dropout,TimeDistributed
from keras.layers import LSTM,SimpleRNN
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
import numpy as np
import random
import sys
import os, os.path
from os import listdir
from os.path import isfile, join

Using TensorFlow backend.


- ### Check CPU usage

In [2]:
from tensorflow.python.client import device_lib

def get_available_gpus():
    local_device_protos = device_lib.list_local_devices()
    return [x.name for x in local_device_protos if x.device_type == 'GPU']

In [3]:
get_available_gpus()

[u'/gpu:0']

----------

# I. Toy examples

- ### Test

** 1. Load and convert data**

'''Example script to generate text from Nietzsche's writings.
At least 20 epochs are required before the generated text
starts sounding coherent.
It is recommended to run this script on GPU, as recurrent
networks are quite computationally intensive.
If you try this script on new data, make sure your corpus
has at least ~100k characters. ~1M is better.
'''

In [4]:
#Load and concatenate files:

DIR="../../LSTM/data/all_script/action/raw/"
all_files = [f for f in listdir(DIR) if isfile(join(DIR, f))]

#choose how many files to concatenate:
nb_files=15
if nb_files>len(all_files):
    nb_files=len(all_files)
    
    
out_path="hollywood/data/"
if not os.path.exists(out_path):
    os.makedirs(out_path)
if not os.path.exists(out_path+"input/"):
    os.makedirs(out_path+"input/")
    
with open(out_path+'input/input_action.txt', 'w') as outfile:
    for fname in all_files[0:nb_files]:
        with open(DIR+fname) as infile:
            for line in infile:
                outfile.write(line)
        print ("Done concatenating file : %s" %fname)

Done concatenating file : httpwww.imsdb.comscriptsA-Most-Violent-Year.html.txt
Done concatenating file : httpwww.imsdb.comscriptsAlien-3.html.txt
Done concatenating file : httpwww.imsdb.comscripts48-Hrs..html.txt
Done concatenating file : httpwww.imsdb.comscriptsAlien.html.txt
Done concatenating file : httpwww.imsdb.comscriptsAmerican-Shaolin-King-of-Kickboxers-II.html.txt
Done concatenating file : httpwww.imsdb.comscriptsAustin-Powers---International-Man-of-Mystery.html.txt
Done concatenating file : httpwww.imsdb.comscriptsAustin-Powers---The-Spy-Who-Shagged-Me.html.txt
Done concatenating file : httpwww.imsdb.comscriptsArmy-of-Darkness.html.txt
Done concatenating file : httpwww.imsdb.comscriptsArctic-Blue.html.txt
Done concatenating file : httpwww.imsdb.comscriptsArmageddon.html.txt
Done concatenating file : httpwww.imsdb.comscriptsStar-Wars-The-Force-Awakens.html.txt
Done concatenating file : httpwww.imsdb.comscriptsJackie-Brown.html.txt
Done concatenating file : httpwww.imsdb.comscr

In [5]:
#load file
file_name=out_path+'input/input_action.txt'
text = open(file_name).read()
print('corpus length:', len(text))

chars = sorted(list(set(text)))
VOCAB_SIZE = len(chars)
print('total chars:',VOCAB_SIZE)

corpus length: 2627888
total chars: 100


**Warning:** The RNN takes in input numerical data hence the necessity to convert strings into numerical values.

In [6]:
#creating mapping between indexes and characters
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

We’re gonna use Keras to create and train our Network, so we must convert the data into this form: (number_of_sequences, length_of_sequence, number_of_features).
- nb of features = length of the char array
- length of sequence = batch size
- nb of sequence = len(data) divided by batch size.

**Warning : ** target sequence is setted by shifting the source/input sequence by one character with both having the same length.

In [7]:
%%time

SEQ_LENGTH=100
#Build three dimensional arrays
X = np.zeros((len(text)/SEQ_LENGTH, SEQ_LENGTH, VOCAB_SIZE)) #input
y = np.zeros((len(text)/SEQ_LENGTH, SEQ_LENGTH, VOCAB_SIZE)) #target

#Build sequences
for i in range(0, len(text)/SEQ_LENGTH):
    X_sequence = text[i*SEQ_LENGTH:(i+1)*SEQ_LENGTH]
    X_sequence_ix = [char_indices[value] for value in X_sequence]
    input_sequence = np.zeros((SEQ_LENGTH, VOCAB_SIZE))
    for j in range(SEQ_LENGTH):
        input_sequence[j][X_sequence_ix[j]] = 1.
    X[i] = input_sequence

    y_sequence = text[i*SEQ_LENGTH+1:(i+1)*SEQ_LENGTH+1]
    y_sequence_ix = [char_indices[value] for value in y_sequence]
    target_sequence = np.zeros((SEQ_LENGTH, VOCAB_SIZE))
    for j in range(SEQ_LENGTH):
        target_sequence[j][y_sequence_ix[j]] = 1.
    y[i] = target_sequence

CPU times: user 1.96 s, sys: 220 ms, total: 2.18 s
Wall time: 2.17 s


** 2. Build the network**

In [8]:
HIDDEN_DIM= 500 #500
LAYER_NUM = 2


model = Sequential()
model.add(LSTM(HIDDEN_DIM, input_shape=(None, VOCAB_SIZE), return_sequences=True))
for i in range(LAYER_NUM - 1):
    model.add(LSTM(HIDDEN_DIM, return_sequences=True))
model.add(TimeDistributed(Dense(VOCAB_SIZE)))
model.add(Activation('softmax'))
model.compile(loss="categorical_crossentropy", optimizer="rmsprop")

In [9]:
def generate_text(model, length, vocab_size, ix_to_char):
    # starting with random character
    ix = [np.random.randint(vocab_size)]
    y_char = [ix_to_char[ix[-1]]]
    X = np.zeros((1, length, vocab_size))
    for i in range(length):
        # appending the last predicted character to sequence
        X[0, i, :][ix[-1]] = 1
        print(ix_to_char[ix[-1]], end="")
        ix = np.argmax(model.predict(X[:, :i+1, :])[0], 1)
        y_char.append(ix_to_char[ix[-1]])
    return ('').join(y_char)

In [10]:
# Generate some sample before training to know how bad it is!
generate_text(model, 100, VOCAB_SIZE, indices_char)

9x::::llll::lllm����������(BB(BMtWWMWWW�RRP{{{{nn{{		OOO		%55O%+++ssss1111pwwpp1}LLLzzzz66�������Rmm

'9x::::llll::lllm\xa9\xa9\xa9\xa9\xa9\xa9\xa2\xa2\xa2\xa2(BB(BMtWWMWWW\x80RRP{{{{nn{{\t\tOOO\t\t%55O%+++ssss1111pwwpp1}LLLzzzz66\xc2\xc2\x80\x80\x80\x80\x80Rmm\x80'

**3. Train network**

In [11]:
#batch size equals to seq length here
BATCH_SIZE=100
#len of desired output
GENERATE_LENGTH=100
DIR=out_path+"weights/weight_attempt_s01/"

try:
    onlyfiles = [f for f in listdir(DIR) if isfile(join(DIR, f))]
    nb_files = len(onlyfiles)
    name_template = onlyfiles[0][0:15]
except Exception as e:
    print(e)
    if not os.path.exists(DIR):
        os.makedirs(DIR)
    nb_files=0

if nb_files>0:
    nb_iteration=nb_files*20
    model.load_weights(DIR+onlyfiles[0][0:21]+str(nb_iteration)+'.hdf5')
else:
    nb_iteration=0
    
print("Starting at iteration : %s" %nb_iteration)
while True:
    print('\n')
    print('-'*20)
    model.fit(X, y, batch_size=BATCH_SIZE, verbose=2, nb_epoch=1)
    nb_iteration += 1
    generate_text(model, GENERATE_LENGTH,VOCAB_SIZE, indices_char)
    if nb_iteration % 20 == 0:
        print("\n\nIteration nb : %s" %nb_iteration)
        model.save_weights(DIR+'checkpoint_{}_epoch_{}.hdf5'.format(HIDDEN_DIM, nb_iteration))

list index out of range
Starting at iteration : 0


--------------------




Epoch 1/1
108s - loss: 2.4128
N                                                                                                   

--------------------
Epoch 1/1
107s - loss: 1.5932
S                                                                                                   

--------------------
Epoch 1/1
113s - loss: 1.3537
he the stare and the stare and the 
the stare and the stare and the stare and the 
the stare and the

--------------------
Epoch 1/1
100s - loss: 1.2177
S the door and the stare and the stares and the 
                         the door and the man and t

--------------------
Epoch 1/1
99s - loss: 1.1290
(                                                                                                   

--------------------
Epoch 1/1
110s - loss: 1.0659
<b>                                                                                                 

--------------------
Epoch 1/1
107s - loss: 1.0174
1
</b>                         I have a lot of my mon

KeyboardInterrupt: 

**4. Generate text**

In [12]:
def save_text(model, length, vocab_size, ix_to_char):
    # starting with random character
    ix = [np.random.randint(vocab_size)]
    y_char = [ix_to_char[ix[-1]]]
    X = np.zeros((1, length, vocab_size))
    for i in range(length):
        # appending the last predicted character to sequence
        X[0, i, :][ix[-1]] = 1
        print(ix_to_char[ix[-1]], end="")
        ix = np.argmax(model.predict(X[:, :i+1, :])[0], 1)
        y_char.append(ix_to_char[ix[-1]])
    if not os.path.exists(out_path+"generate/"):
        os.makedirs(out_path+"generate/")
    with open(out_path+"generate/output.txt","w") as f:
        f.write(('').join(y_char))
    return ('').join(y_char)


In [13]:
%%time
out = save_text(model, 1500, VOCAB_SIZE, indices_char)

f the first time, and the door opens and weapons into the
     corner.

     He looks at her.

<b>                             RATH
</b>               Yes.

<b>                             RATH
</b>               I was the continue.

<b>                             RATH
</b>               I was the continue.

<b>                             RATH
</b>               I was the continue.

<b>                             RATH
</b>               I was the continue.

<b>                             RATH
</b>               I was the continue.

<b>                             RATH
</b>               I was the continue.

<b>                             RATH
</b>               I was the continue.

<b>                             RATH
</b>               I was the continue.

<b>                             RATH
</b>               I was the continue.

<b>                             RATH
</b>               I was the continue.

<b>                             RATH
</b>               I was the continu