In [9]:
import nltk
import numpy as np
import os
import random
import sys
import tensorflow as tf

In [4]:
# Download NLTK model data (you need to do this once)
#DONE
#nltk.download("book")

In [5]:
corpora_dir = "/Users/char8060/datascience/nltk_data/corpora/state_union"

# Read all file paths in corpora directory
file_list = []
for root, _ , files in os.walk(corpora_dir):  
    for filename in files:
        file_list.append(os.path.join(root, filename))
        
print("Read ", len(file_list), " files..." )

# Extract text from all documents
docs = []

for files in file_list:
    with open(files, 'r') as fin:
        try:
            str_form = fin.read().lower().replace('\n', '')
            docs.append(str_form)
        except UnicodeDecodeError: 
            # Some sentences have wierd characters. Ignore them for now
            pass
# Combine them all into a string of text
text = ' '.join(docs)

print('corpus length:', len(text))

Read  66  files...
corpus length: 1915949


In [6]:
chars = sorted(list(set(text)))
print('Total Number of Unique Characters:', len(chars))
char_indices = dict((c, i) for i, c in enumerate(chars)) # Character to index
indices_char = dict((i, c) for i, c in enumerate(chars)) # Index to Character

Total Number of Unique Characters: 57


In [29]:
"""
Recommended to run this script on GPU, as recurrent
networks are quite computationally intensive.
If you try this script on new data, make sure your corpus
has at least ~100k characters. ~1M is better.
"""

# cut the text in semi-redundant sequences of maxlen characters
maxlen = 40 # Number of characters considered
step = 3 # Stide of our window
sentences = []
next_chars = []

# Reading the text in terms of sequence of characters
# Extract only 'maxlen' characters every time
for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i: i + maxlen])
    # The character just after the sequence is the label
    next_chars.append(text[i + maxlen]) 
print('nb sequences:', len(sentences))

print('Vectorization...')
# Initializing Tensor (training data)
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool) 
# Initializing Output that holds next character (label)
y = np.zeros((len(sentences), len(chars)), dtype=np.bool) 
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        # Populate Tensor Input
        x[i, t, char_indices[char]] = 1 
    # Populate y with the character just after the sequence
    y[i, char_indices[next_chars[i]]] = 1


def sample(preds, temperature=1.0):
    """Perform Temperature Sampling"""
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature 
    exp_preds = np.exp(preds)
    # Softmax of predictions
    preds = exp_preds / np.sum(exp_preds) 
    # Sample a single characters, with probabilities defined in `preds`
    probas = np.random.multinomial(1, preds, 1) 
    return np.argmax(probas)


def on_epoch_end(epoch, _):
    """Function invoked at end of each epoch. Prints generated text"""
    print()
    print('----- Generating text after Epoch: %d' % epoch)

    start_index = random.randint(0, len(text) - maxlen - 1)
    for diversity in [0.2, 0.5, 1.0, 1.2]:
        print('----- Diversity:', diversity)

        generated = ''
        sentence = text[start_index: start_index + maxlen]
        generated += sentence
        print('----- Generating with seed: "' + sentence + '"')
        sys.stdout.write(generated)

        #generate text of 400 chars
        for i in range(400):
            #initalize single input with all zeros
            x_pred = np.zeros((1, maxlen, len(chars)))
            
            #generate the encoded version of the sentence seed
            for t, char in enumerate(sentence):
                x_pred[0, t, char_indices[char]] = 1.

            preds = model.predict(x_pred, verbose=0)[0]
            # Generate next character
            next_index = sample(preds, diversity) 
            next_char = indices_char[next_index]
            
            # Append character to generated sequence
            generated += next_char 
            sentence = sentence[1:] + next_char

            sys.stdout.write(next_char)
            sys.stdout.flush()
        print()
    
    # Save model weights into file
    model.save_weights('saved_weights.hdf5', overwrite=True)
        

# After every single epoch, we are going to call the function on_epoch_end
# to generate some text.
print_callback = tf.keras.callbacks.LambdaCallback(on_epoch_end=on_epoch_end)
checkpointer = tf.keras.callbacks.ModelCheckpoint(filepath='weights_v2.hdf5', verbose=1, save_best_only=True)

nb sequences: 638637
Vectorization...


In [22]:
print('Building model...')
# Size of vector in the hidden layer.
hidden_size = 128 
# Initialize Sequential Model
model = tf.keras.Sequential()
model.add(tf.keras.layers.LSTM(hidden_size, input_shape=(maxlen, len(chars))))
# Add the output layer that is a softmax of the number of characters
model.add(tf.keras.layers.Dense(len(chars), activation='softmax')) 
# Optimization through RMSprop
optimizer_new = tf.keras.optimizers.RMSprop() 
# Consider cross Entropy loss. Why? MLE of P(D | theta)
model.compile(loss='categorical_crossentropy', optimizer=optimizer_new) 

# Train this for 30 epochs. Size of output from LSTM i.e. hidden layer vector shape=128
model.fit(x, y,
          batch_size=128,
          epochs=30,
          callbacks=[print_callback, checkpointer])

Building model...
Epoch 1/30
----- Generating text after Epoch: 0
----- Diversity: 0.2
----- Generating with seed: "rtunities.but there is much more to be d"
rtunities.but there is much more to be demented the propose to sear the seare the secure the program the for the secure to sect and the secure the program of the sears of the for the states of the congress and the propose the some the prople the sear a section and the sare of the for and the peace of the propod of the sears and the prosice of the program of the sears of the sare of the contress to and the program to state the prosed the
----- Diversity: 0.5
----- Generating with seed: "rtunities.but there is much more to be d"
rtunities.but there is much more to be desporment of our haver be for for and the predure the from hell and of the ear and the stare to the state. the enomory the prome to make in oun our peace in all a sere are couttry and the progres the sourg the for the eares for a seal and the sourg thas a the progress 

american people. to strengthen the families of encouraged the world in the endens of and the new services of the united states of the tax communities and the first of the surplise of our demands of the needs of our state of americans who have to the some of a short should be freedom of existing the fact of the relater of the future. the community of a steps in peace and our state of the people of our finance to american poor of minds an
----- Diversity: 1.0
----- Generating with seed: "american people. to strengthen the famil"
american people. to strengthen the family choods through core and even or his vight beefer. the federal betirels of plac power while here the veap far comple and porsults, we'sen forter government dastas. that all quality of an and first program policains, and its fund is nle works of keep and more reathoust and impricans of the admyers what wastoute of our freedembed essent as upinity, and the undicties waild workers. we cankment with 
----- Diversity: 1.2
-----

g national differences must and will be any hat harrividnust our tax 2'sse unly at goancbate.all, noting alive unwarters of thesropwe. our oncomply for persus. this.ly akelless than peacefaller umportance are ameracnamboas.and export tot milo2, in a nexti0n ementioripher that thing, and murry supply. we hase no2, there soo, you receve the nat8 and chanding humprialy, and the last common for us, and we should strengt contuct vatuent, the
Epoch 8/30
----- Generating text after Epoch: 7
----- Diversity: 0.2
----- Generating with seed: "g shot up 8 percent.  that's far more th"
g shot up 8 percent.  that's far more than the congress of the congress on the congress to provide the congress on the state of the united states of the federal government to the congress on the state of the union of the congress and the production of the congress on the security of the congress on the strength of the congress and the congress to the congress on the congress to the congress on the strength of a serv

f-government be extended and the right of the united states of the american people of the congress to act of the union of the united states of the american people of the united states of the problems of the united states of the united states of the first to the congress to defend the program of the american people with the united states of the united states of the united states of the family of the american people who have been all the 
----- Diversity: 0.5
----- Generating with seed: "f-government be extended and the right o"
f-government be extended and the right of the congress and the proposal of the next year in the union is determined the congress to propose possible and state of the servants and power of the progress of the family effort of the congress that is the congress all our economy and citizens and businesses for the fear in our country to control as a home and the confenexing to emphaie that is many terrorism in the fiscal year of the free of 
----- Diversity: 1.0
-----

en the optimistic analyst will realize that on congress to reveral an america is had deepess for being abroad, and clear educated forld controlck foundation to lies has carriapited to tax cutresn equal colon'al and movewie hasters we to rely to copination and the challenge now maintain the somety america and to both in communities must be dien, freedom. the force of bick, and pronoped stope comentive deserve the vilifance world from our
----- Diversity: 1.2
----- Generating with seed: "en the optimistic analyst will realize t"
en the optimistic analyst will realize the othery war haddrak leaders. so i refiemes are flexibl5 and which fearnody, yet us beew depends on required outs ahout.any world. we stand be leaders, enactings age of faich. north, 6s who have americans who load in midqulnue up of time.but worse cannot ded togethorty. it must no into fimaning saps vielnteners have have pursen.it will be probeducas made divade. we famiry wast, we well be man, ye
Epoch 15/30
----- Generati

Epoch 18/30
----- Generating text after Epoch: 17
----- Diversity: 0.2
----- Generating with seed: " of revolution in aggression.as the assa"
 of revolution in aggression.as the assa and the state of the united states and the strength of the state of the united states and the present that we can all that the fact of our country to the state of the united states and the state of the union of the american people with the congress to continue the problems of the congress on the world war is the congress on the state of the united states and the people of the congress and the con
----- Diversity: 0.5
----- Generating with seed: " of revolution in aggression.as the assa"
 of revolution in aggression.as the assame the challenge of our exports of the people of a strong and local formed that we must also read and in the end of the tax pressarial period the american patient and the house they can also do a work it may be no sense.our market and workers will be a strong and and to act that we sh

oes to secure for themselves the full blecause is the world of the congress and the congress sasess them the change that make sure that i shall need to the congress and the service of the united states are starts. for the congress for the congress to reduce the lives of the iraqi states have been the state of the union is our own people for the state of the vietnam of the war in the world in the united states for all of the federal gove
----- Diversity: 1.0
----- Generating with seed: "oes to secure for themselves the full bl"
oes to secure for themselves the full bload in reward, we can boking upperd this places act our american -- the nert, employment policies, becausa easic common great productivity will make america we have all working had to call for those who have takentry have been new way. the earnance of the alsomation cismuctional instruter and ensurity in state of the charter celenter 7f also's desiractions down and economic programs i have to is n
----- Diversity: 1.2
-----

s multibillion-dollar package that will be reward within better.than year our national problems, afford with the invalities of accomplished.it is.more fiscal, which, and irensu-allials, all outising revolution happeg recession. we must afford the strong security of our power. distritution areas irnegs an 1nhpopporcwwity i urge a bundar, do. delige a chill hact, wrightingleted hembcirs america, crys mosch our greaty cataitles, i can kent
Epoch 25/30
----- Generating text after Epoch: 24
----- Diversity: 0.2
----- Generating with seed: "r enormous engine of production. our pro"
r enormous engine of production. our programs of the congress and the congress to the people of the congress on the congress to the congress to the congress to continue to all of you the strengthened of the present strengthened the street of our economy and strengthen the world is the street of the american people with the congress and the street of the union additional american to the congress to give the state o

ity of the marketplace in the service of the congress to the people of the united states of the first and the world that the state of the union is a state of the union is the democratic and the state of the union is a strong and community of the community of the states of the people of the world in the state of the union is the department of the united states and the state of the union is a strong and community programs of the state of 
----- Diversity: 0.5
----- Generating with seed: "ity of the marketplace in the service of"
ity of the marketplace in the service of the american people and the truly program in the last that to defense program will be a community in the hope of the world effort of the last the national activities that expect and economic and strong and competitive supporting the great optimical defense of the last year for the negotial of the state of the united states, and we seek to make the seavine, the things to the state of the union is 
----- Diversity: 1.0
-----

<tensorflow.python.keras.callbacks.History at 0x7fa4ad8286d0>

In [26]:
model.summary()

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_5 (LSTM)                (None, 128)               95232     
_________________________________________________________________
dense_3 (Dense)              (None, 57)                7353      
Total params: 102,585
Trainable params: 102,585
Non-trainable params: 0
_________________________________________________________________


In [28]:
test_list = list('s multibillion-dollar package that will be a congress and the social security program and providing services. the past of the economy of freedom of the congress for all our competitive and controls on the soviet union can only any medicare on the world. we must continue the artalically and good to the sate community of the security of our communities are a peace in the united states are on the expenditures of congress and happened in an')
len(test_list)

440

In [None]:
# To continue training...
model.load_weights("saved_weights.hdf5")

model.fit(x, y,
          batch_size=128,
          epochs=30,
          callbacks=[print_callback, checkpointer])