### Based on: https://machinelearningmastery.com/use-word-embedding-layers-deep-learning-keras/ and https://machinelearningmastery.com/how-to-develop-a-word-level-neural-language-model-in-keras/ and https://github.com/stanfordnlp/GloVe

In [4]:
#import time
import datetime
import string
from pickle import dump
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding

import numpy as np

from random import randint
from keras.preprocessing.sequence import pad_sequences

def get_time( output=True ):
    """Return the current time as seconds since the epoch (a float).

    When `output` is true, the current local date and time are also printed
    in "YYYY.MM.DD HH:MM" format.
    """
    # imported locally because the file-level `import time` is commented out,
    # which otherwise makes this function raise NameError in a fresh session
    import time
    
    temp = time.time()
    if output:
        now = datetime.datetime.now()
        print( now.strftime( "%Y.%m.%d %H:%M" ) )
        
    return temp

# smoke test: prints the current timestamp and keeps the returned start time
foo = get_time()

def print_time( start_time, end_time, interval="seconds" ):
    """Print the elapsed time between `start_time` and `end_time`.

    Both arguments are expected to be in seconds. With interval="hours" the
    difference is divided by 60 twice before printing; any other value of
    `interval` reports the raw difference as seconds.
    """
    elapsed = end_time - start_time
    
    # guard clause: everything except "hours" is reported in seconds
    if interval != "hours":
        print ( "Time to process: [%s] seconds" % ( str( elapsed ) ) )
        return
    
    print ( "Time to process: [%s] hours" % ( str( elapsed / 60 / 60 ) ) )

# smoke test with a fake one-second interval
print_time( 0, 1 )

2018.05.21 10:47
Time to process: [1] seconds


In [5]:
# load doc into memory
def load_doc( filename, encoding=None ):
    """Read the whole text file at `filename` and return it as one string.

    Args:
        filename: path of the file to read.
        encoding: optional text encoding passed to open(). The default (None)
            keeps the platform default, exactly as before. Passing
            "utf-8-sig" strips a leading byte-order mark if the file has one.

    Returns:
        The complete file contents.
    """
    # the context manager closes the file even if read() raises
    with open( filename, 'r', encoding=encoding ) as file:
        return file.read()

# load the raw corpus; the commented-out paths are alternative corpora
#in_filename = "../texts/alice-in-wonderland.txt"
#in_filename = "../texts/dr-zeuss-compilation.txt"
in_filename = "../texts/trump-speeches.txt"
doc = load_doc( in_filename )
# peek at the first 200 characters to confirm the right file was read
print( doc[ :200 ] )

﻿SPEECH 1


...Thank you so much.  That's so nice.  Isn't he a great guy.  He doesn't get a fair press; he doesn't get it.  It's just not fair.  And I have to tell you I'm here, and very strongly here


In [13]:
# my_punctuation = string.punctuation
# print( type( my_punctuation ) )
# print( my_punctuation )
# string.punctuation with '.', '!' and '?' left out
# NOTE(review): not referenced by any active code visible in this notebook
my_punctuation = '"#$%&\'()*+,-/:;<=>@[\\]^_`{|}~'
my_punctuation

'"#$%&\'()*+,-/:;<=>@[\\]^_`{|}~'

In [18]:
# turn a doc into clean tokens
def clean_doc( doc, to_lower=True ):
    """Turn raw text into a list of cleaned word tokens.

    Steps, in order: '--' becomes a space; sentence and pause punctuation that
    is followed by a space is replaced with a marker word (for example '. '
    becomes 'endperiod'); the text is split on whitespace; all remaining
    punctuation is stripped from each token; tokens that are not purely
    alphabetic are dropped; finally, when `to_lower` is true, tokens are
    lower-cased.

    Args:
        doc: the raw text.
        to_lower: lower-case the tokens when true (the default).

    Returns:
        A list of string tokens.
    """
    # replace '--' with a space ' '
    doc = doc.replace( '--', ' ' )
    
    # replace simple sentence/pause boundaries w/ unique marker tokens; only
    # punctuation followed by a space matches, so a mark directly before a
    # newline or at the very end of the text gets no marker
    boundary_markers = (
        ( '. ', ' endperiod ' ),
        ( '! ', ' endexclamation ' ),
        ( '? ', ' endquestion ' ),
        ( ', ', ' pausecomma ' ),
        ( ': ', ' pausecolon ' ),
        ( '; ', ' pausesemicolon ' ),
    )
    for boundary, marker in boundary_markers:
        doc = doc.replace( boundary, marker )
    
    # split into tokens by white space
    tokens = doc.split()
    
    # remove punctuation from each token; the full string.punctuation set is
    # used so any . ? ! , : ; not caught by the replacements above is stripped
    table = str.maketrans( '', '', string.punctuation )
    tokens = [ w.translate( table ) for w in tokens ]
    
    # remove remaining tokens that are not alphabetic (numbers, empty strings)
    tokens = [ word for word in tokens if word.isalpha() ]
    
    # make lower case only when asked to; previously the flag guarded the
    # alphabetic filter instead and lower-casing always happened
    if to_lower:
        tokens = [ word.lower() for word in tokens ]
    
    return tokens

In [19]:
# clean document
tokens = clean_doc( doc )
# de-duplicated vocabulary of the corpus (order is not preserved by set())
tokens_unique = list( set( tokens ) )
# preview the first 200 tokens, then report corpus and vocabulary sizes
print( tokens[ :200 ] )
print( 'Total Tokens: %d' % len( tokens ) )
print( 'Unique Tokens: %d' % len( tokens_unique ) )

['thank', 'you', 'so', 'much', 'endperiod', 'thats', 'so', 'nice', 'endperiod', 'isnt', 'he', 'a', 'great', 'guy', 'endperiod', 'he', 'doesnt', 'get', 'a', 'fair', 'press', 'pausesemicolon', 'he', 'doesnt', 'get', 'it', 'endperiod', 'its', 'just', 'not', 'fair', 'endperiod', 'and', 'i', 'have', 'to', 'tell', 'you', 'im', 'here', 'pausecomma', 'and', 'very', 'strongly', 'here', 'pausecomma', 'because', 'i', 'have', 'great', 'respect', 'for', 'steve', 'king', 'and', 'have', 'great', 'respect', 'likewise', 'for', 'citizens', 'united', 'pausecomma', 'david', 'and', 'everybody', 'pausecomma', 'and', 'tremendous', 'resect', 'for', 'the', 'tea', 'party', 'endperiod', 'also', 'pausecomma', 'also', 'the', 'people', 'of', 'iowa', 'endperiod', 'they', 'have', 'something', 'in', 'common', 'endperiod', 'hardworking', 'people', 'endperiod', 'they', 'want', 'to', 'work', 'pausecomma', 'they', 'want', 'to', 'make', 'the', 'country', 'great', 'endperiod', 'i', 'love', 'the', 'people', 'of', 'iowa', 'en

In [20]:
# organize into sequences of tokens: 50 input words plus 1 target word
sequence_len = 50 + 1

# slide a window of sequence_len tokens over the corpus and store each window
# as a single space-joined line
# NOTE(review): the range stops at len( tokens ) - 1, so the window ending on
# the very last token is never emitted -- confirm whether that is intended
sequences = [
    ' '.join( tokens[ window_end - sequence_len:window_end ] )
    for window_end in range( sequence_len, len( tokens ) )
]

print( 'Total Sequences: %d' % len( sequences ) )


Total Sequences: 176937


In [21]:
# save sequences to file, one sequence per line
def save_doc( lines, filename ):
    """Write `lines` to `filename`, separated by newlines.

    Args:
        lines: iterable of strings; joined with a newline, with no trailing
            newline after the last one.
        filename: path of the file to create or overwrite.
    """
    # the context manager flushes and closes the file even if write() raises
    with open( filename, 'w' ) as file:
        file.write( '\n'.join( lines ) )

In [22]:
# save sequences to file so later cells can reload them without re-cleaning the corpus
#out_filename = "../texts/dr-zeuss-compilation-sequences.txt"
out_filename = "../texts/trump-speeches-sequences-02.txt"
save_doc( sequences, out_filename )

In [23]:
# reload the saved sequences; each line is one training sequence of words
#in_filename = "../texts/dr-zeuss-compilation-sequences.txt"
in_filename = "../texts/trump-speeches-sequences-02.txt"
doc = load_doc( in_filename )
lines = doc.split( '\n' )
# preview the first ten sequences
lines[ 0:10 ]

['thank you so much endperiod thats so nice endperiod isnt he a great guy endperiod he doesnt get a fair press pausesemicolon he doesnt get it endperiod its just not fair endperiod and i have to tell you im here pausecomma and very strongly here pausecomma because i have great respect',
 'you so much endperiod thats so nice endperiod isnt he a great guy endperiod he doesnt get a fair press pausesemicolon he doesnt get it endperiod its just not fair endperiod and i have to tell you im here pausecomma and very strongly here pausecomma because i have great respect for',
 'so much endperiod thats so nice endperiod isnt he a great guy endperiod he doesnt get a fair press pausesemicolon he doesnt get it endperiod its just not fair endperiod and i have to tell you im here pausecomma and very strongly here pausecomma because i have great respect for steve',
 'much endperiod thats so nice endperiod isnt he a great guy endperiod he doesnt get a fair press pausesemicolon he doesnt get it endperio

## Convert Words to Index Values

In [24]:
# integer encode sequences of words
tokenizer = Tokenizer()
# build the word -> integer index mapping from the training lines
tokenizer.fit_on_texts( lines )
# NOTE: rebinds `sequences` from the list of text lines to lists of word indices
sequences = tokenizer.texts_to_sequences( lines )

In [25]:
# sanity check: the first encoded sequence should still hold sequence_len words
print( len( sequences[ 0 ] ) == sequence_len )
# number of training sequences
print( len( sequences ) )

True
176937


In [26]:
# inspect the tokenizer's vocabulary: size, container type, and one sample index
print( len( tokenizer.word_index ) )
print( type( tokenizer.word_index ) )
print( tokenizer.word_index[ "pausecomma" ] )

5848
<class 'dict'>
2


In [27]:
# vocabulary size: one more than the number of known words
# (presumably because Keras word indices start at 1 and index 0 is reserved -- TODO confirm)
vocab_size = len( tokenizer.word_index ) + 1
vocab_size

5849

In [28]:
# separate into input and output: for now it's 50 words input and 1 word output
sequences = np.array( sequences )
X = sequences[ :,:-1 ] # all rows, from word 0 up to, but not including, the last word
y = sequences[ :,-1 ]  # all rows, last word only
# one-hot encode the target word index across vocab_size classes
y = to_categorical( y, num_classes=vocab_size )
# number of input words per sample (columns of X)
seq_length = X.shape[ 1 ]
seq_length

50

## Load and Filter GloVe Data

In [29]:
# load the GloVe vectors, keeping only the words that occur in our corpus
embeddings_index = dict()
embeddings_dimension = 300 #must be 50, 100, 200, 300

# a set gives O(1) membership tests; testing against the tokens_unique list
# scanned the whole vocabulary for every line of the GloVe file
corpus_vocabulary = set( tokens_unique )

# be explicit about the encoding so the platform default cannot break the
# read, and let the context manager close the file even on error
with open( "../glove/glove.6B." + str( embeddings_dimension ) + "d.txt", encoding="utf-8" ) as glove:
    
    for line in glove:
        
        values = line.split()
        
        # skip blank lines instead of failing on values[ 0 ]
        if not values:
            continue
        
        # 1st string is word...
        word = values[ 0 ]
        
        if word in corpus_vocabulary:
            
            # ...the rest are coefficients
            coefs = np.asarray( values[ 1: ], dtype='float32' )
            embeddings_index[ word ] = coefs
            # one progress mark per matched word
            print( "*", end="" )
    
print( '\nLoaded %s word vectors.' % len( embeddings_index ) )
print( '\nWords not found %d.' % ( len( tokenizer.word_index ) - len( embeddings_index ) ) )

****************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************

### Transform into Matrix That Maps Coefs by Index

In [30]:
# create a weight matrix for words in training docs: row i holds the GloVe
# vector of the word whose tokenizer index is i
embedding_matrix = np.zeros( ( vocab_size, embeddings_dimension ) )
missing_words = []

# shape of one coefficient vector; derived from the configured dimension
# rather than from embeddings_index[ "the" ], which raised KeyError whenever
# "the" had no loaded vector
dummy_shape = ( embeddings_dimension, )

for word, i in tokenizer.word_index.items():
    
    embedding_vector = embeddings_index.get( word )
    
    # not all words in our token list are in the wikipedia 400K set!
    if embedding_vector is None:
        
        # report it; the row was zero-initialised above, so it stays all zeros
        missing_words.append( word )
        continue
        
    embedding_matrix[ i ] = embedding_vector
    
print( len( missing_words ) )
missing_words

204


['endperiod',
 'pausecomma',
 'endquestion',
 'pausesemicolon',
 'pausecolon',
 'endexclamation',
 'selffunding',
 'theyve',
 'youve',
 'beada',
 'selfinspect',
 'theyll',
 'twoway',
 'onethird',
 'reince',
 'lowenergy',
 'bluecollar',
 'ayeyeye',
 'fbomb',
 'obamaclinton',
 'maralago',
 'bigly',
 'sixyear',
 'escavators',
 'nationbuilding',
 'africanamerican',
 'byebye',
 'selfinspection',
 'offmike',
 'antiwoman',
 'intelligencegathering',
 'thatand',
 'africanamericans',
 'donaldjtrumpcom',
 'mexicanamerican',
 'hardhitting',
 'braggadocious',
 'selfpolice',
 'komatsus',
 'nobodys',
 'taxexempt',
 'indianas',
 'fiveforone',
 'oreilly',
 'resect',
 'everyones',
 'theyd',
 'cetain',
 'romneycare',
 'oneyard',
 'threefooter',
 'werent',
 'clearsighted',
 'nationstate',
 'goodsized',
 'airconditioner',
 'disastertrump',
 'nogood',
 'peopletrump',
 'exampletrump',
 'brandnew',
 'oldfashioned',
 'wellover',
 'trilliontrump',
 'jobproducer',
 'truthteller',
 'expresident',
 'middleincome',

In [31]:
# confirm visually that row 0 has the expected width and sums to zero
# (presumably row 0 is never assigned because word indices start at 1 -- TODO confirm)
print( len( embedding_matrix[ 0 ] ) )
print( sum( embedding_matrix[ 0 ] ) )

# count rows whose coefficients are ALL zero; testing the row sum instead
# would also count a non-zero row whose values happen to cancel out
empty_coefficients_count = int( np.count_nonzero( ~embedding_matrix.any( axis=1 ) ) )
        
empty_coefficients_count

300
0.0


205

## Define Model

In [17]:
# report library versions so results can be tied to a specific environment
import keras
print( keras.__version__ )

import tensorflow as tf
print( tf.__version__ )

2.1.3
1.4.1


In [32]:
# define model
model = Sequential()

# embedding layer initialised from the pre-trained GloVe matrix; trainable=True
# means these weights are updated during training rather than frozen
model.add( Embedding( vocab_size, embeddings_dimension, weights=[embedding_matrix], input_length=seq_length, trainable=True ) )
# two stacked LSTMs; the first returns the full sequence so the second can consume it
model.add( LSTM( seq_length * 2, return_sequences=True ) )
model.add( LSTM( seq_length * 2 ) )
model.add( Dense( seq_length * 2, activation='relu' ) )

# fixed TypeError below, downgraded keras from 2.1.5 to 2.1.3: https://github.com/keras-team/keras/issues/9621
# TypeError: softmax() got an unexpected keyword argument 'axis'
# one output unit per vocabulary entry
model.add( Dense( vocab_size, activation='softmax' ) )

# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 50, 300)           1754700   
_________________________________________________________________
lstm_1 (LSTM)                (None, 50, 100)           160400    
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_1 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_2 (Dense)              (None, 5849)              590749    
Total params: 2,596,349
Trainable params: 2,596,349
Non-trainable params: 0
_________________________________________________________________
None


## Fit the Model

In [34]:
# calc batch size: show how many batches per epoch each candidate size gives
print( len( sequences ) / 128 )
print( len( sequences ) / 1028 )
# Was:
#batch_size = 128
# NOTE(review): 1028 may have been meant as 1024 -- confirm
batch_size = 1028


1382.3203125
172.11770428015564


In [None]:
# time the whole compile + fit run
start_time = get_time()
# compile model
model.compile( loss='categorical_crossentropy', optimizer='adam', metrics=[ 'accuracy' ] )
# fit model
model.fit( X, y, batch_size=batch_size, epochs=100 )
end_time = get_time()
# report elapsed training time in seconds
print_time( start_time, end_time )

2018.05.21 11:22
Epoch 1/100
Epoch 2/100
Epoch 3/100
 16448/176937 [=>............................] - ETA: 24s - loss: 5.4769 - acc: 0.0891

In [42]:
# save the model to file
model.save( "models/trump-speeches-03.keras" )

# save the tokenizer; the context manager flushes and closes the file, which
# the previous bare open() passed straight to dump() never did
with open( "tokenizers/trump-speeches-03.pkl", 'wb' ) as tokenizer_file:
    dump( tokenizer, tokenizer_file )

# save embedding_matrix based on wiki embeddings, complete w/ missing coefficients array dummies
with open( "embeddings/trump-speeches-03.glove", 'wb' ) as embeddings_file:
    dump( embedding_matrix, embeddings_file )


## Use The Model to Generate Text

In [43]:
# model input length: words in one saved line minus the final (target) word
seq_length = len( lines[ 0 ].split() ) - 1
seq_length

50

In [44]:
def generate_seq( model, tokenizer, seq_length, seed_text, n_words ):
    """Generate `n_words` words that continue `seed_text`.

    Each step encodes the text so far, keeps the last `seq_length` word
    indices, asks the model for the next word index, and appends the matching
    word to the text before the next step.

    Args:
        model: object exposing predict_classes( encoded, verbose=0 ).
        tokenizer: object exposing texts_to_sequences() and word_index.
        seq_length: number of word indices fed to the model per prediction.
        seed_text: text to start from.
        n_words: number of words to generate.

    Returns:
        The generated words joined by single spaces (seed text not included).
        A predicted index with no entry in word_index contributes an empty
        string.
    """
    # invert word -> index once, so each prediction is a dict lookup instead
    # of a scan over the whole vocabulary
    index_to_word = { index: word for word, index in tokenizer.word_index.items() }
    
    result = list()
    in_text = seed_text
    
    # generate a fixed number of words
    for _ in range( n_words ):
        
        # encode the text as integer
        encoded = tokenizer.texts_to_sequences( [ in_text ] )[ 0 ]
        
        # truncate sequences to a fixed length
        encoded = pad_sequences( [ encoded ], maxlen=seq_length, truncating='pre' )
        
        # predict the index of the next word; a single row goes in, so the
        # first element of the result is the prediction
        yhat = model.predict_classes( encoded, verbose=0 )
        
        # map predicted word index to word
        out_word = index_to_word.get( int( yhat[ 0 ] ), '' )
        
        # append to input
        in_text += ' ' + out_word
        
        result.append( out_word )
        
    return ' '.join( result )

In [45]:
# select a seed text; randint() includes BOTH bounds, so the upper bound must
# be the last valid index or the lookup can raise IndexError
seed_text = lines[ randint( 0, len( lines ) - 1 ) ]
print( seed_text + '...\n' )
#print( len( seed_text.split( " " ) ) )

# generate new text
generated = generate_seq( model, tokenizer, seq_length, seed_text, 50 )
print( "..." + generated )

and saying thank you very much sucker its really really crazy so we have to rebuild quickly our infrastructure of this country if we dont the other day in ohio a bridge collapsed bridges are collapsing all over the country the reports on bridges and the like are unbelievable whats happening...

...with our infrastructure i go to saudi arabia i go to dubai i am doing big jobs in dubai i go to various different places i go to china i can do something in first place you go and help from hell of a lot of money worth a lot


In [46]:
def generate_seq_word_by_word( model, tokenizer, seq_length, seed_text, n_words ):
    """Generate `n_words` words that continue `seed_text`, printing each one as it is produced.

    Prints "..." first, then every generated word followed by a space, all on
    one line. Nothing is returned.

    Args:
        model: object exposing predict_classes( encoded, verbose=0 ).
        tokenizer: object exposing texts_to_sequences() and word_index.
        seq_length: number of word indices fed to the model per prediction.
        seed_text: text to start from.
        n_words: number of words to generate.
    """
    # invert word -> index once, so each prediction is a dict lookup instead
    # of a scan over the whole vocabulary
    index_to_word = { index: word for word, index in tokenizer.word_index.items() }
    
    print( "...", end='' )
    in_text = seed_text
    
    # generate a fixed number of words
    for _ in range( n_words ):
        
        # encode the text as integer
        encoded = tokenizer.texts_to_sequences( [ in_text ] )[ 0 ]
        
        # truncate sequences to a fixed length
        encoded = pad_sequences( [ encoded ], maxlen=seq_length, truncating='pre' )
        
        # predict the index of the next word; a single row goes in, so the
        # first element of the result is the prediction
        predicted = int( model.predict_classes( encoded, verbose=0 )[ 0 ] )
        
        # map predicted word index to word; an unknown index prints nothing
        # and contributes an empty word, as before
        out_word = ''
        if predicted in index_to_word:
            out_word = index_to_word[ predicted ]
            print( out_word, end=' ' )
                
        # append to input for next iteration
        in_text += ' ' + out_word

In [47]:
# select a seed text; randint() includes BOTH bounds, so the upper bound must
# be the last valid index or the lookup can raise IndexError
seed_text = lines[ randint( 0, len( lines ) - 1 ) ]
print( seed_text + '...\n' )

# generate new text
generate_seq_word_by_word( model, tokenizer, seq_length, seed_text, 25 )

may be wrong but who i am a very honest person if somebody is going to say a little bit negative or a lot negative about me and if they happen to be republican i may choose to hit them back not always but i may choose to hit them back...

...not a fan of mitt romney mitt romney lost an election he should have won and if you maybe even going to go in me 

In [50]:
# read a seed phrase typed by the user
my_input = input()

# generate new text
generate_seq_word_by_word( model, tokenizer, seq_length, my_input, 25 )

trump walked down
...a fortune up your company so trump is so you sit right and i just want to skip iowa i was and he said what 