### Based on: https://machinelearningmastery.com/use-word-embedding-layers-deep-learning-keras/ and https://machinelearningmastery.com/how-to-develop-a-word-level-neural-language-model-in-keras/ and https://github.com/stanfordnlp/GloVe

In [1]:
import string
from pickle import dump
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding

import numpy as np

from random import randint
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [2]:
# load doc into memory
def load_doc( filename ):
    
    # open the file as read only
    file = open( filename, 'r' )
    # read all text
    text = file.read()
    # close the file
    file.close()
    
    return text

# load document
#in_filename = "../texts/alice-in-wonderland.txt"
#in_filename = "../texts/dr-zeuss-compilation.txt"
in_filename = "../texts/trump-speeches.txt"
doc = load_doc( in_filename )
print( doc[ :200 ] )

﻿SPEECH 1


...Thank you so much.  That's so nice.  Isn't he a great guy.  He doesn't get a fair press; he doesn't get it.  It's just not fair.  And I have to tell you I'm here, and very strongly here


In [3]:
# turn a doc into clean tokens
def clean_doc( doc, to_lower=True ):
    
    # replace '--' with a space ' '
    doc = doc.replace( '--', ' ' )
    
    # split into tokens by white space
    tokens = doc.split()
    
    # remove punctuation from each token
    table = str.maketrans( '', '', string.punctuation )
    tokens = [ w.translate( table ) for w in tokens ]
    
    # remove remaining tokens that are not alphabetic
    if to_lower:
        tokens = [ word for word in tokens if word.isalpha() ]
    
    # make lower case
    tokens = [ word.lower() for word in tokens ] 
    
    return tokens

In [4]:
# clean document
tokens = clean_doc( doc )
tokens_unique = list( set( tokens ) )
print( tokens[ :200 ] )
print( 'Total Tokens: %d' % len( tokens ) )
print( 'Unique Tokens: %d' % len( tokens_unique ) )

['thank', 'you', 'so', 'much', 'thats', 'so', 'nice', 'isnt', 'he', 'a', 'great', 'guy', 'he', 'doesnt', 'get', 'a', 'fair', 'press', 'he', 'doesnt', 'get', 'it', 'its', 'just', 'not', 'fair', 'and', 'i', 'have', 'to', 'tell', 'you', 'im', 'here', 'and', 'very', 'strongly', 'here', 'because', 'i', 'have', 'great', 'respect', 'for', 'steve', 'king', 'and', 'have', 'great', 'respect', 'likewise', 'for', 'citizens', 'united', 'david', 'and', 'everybody', 'and', 'tremendous', 'resect', 'for', 'the', 'tea', 'party', 'also', 'also', 'the', 'people', 'of', 'iowa', 'they', 'have', 'something', 'in', 'common', 'hardworking', 'people', 'they', 'want', 'to', 'work', 'they', 'want', 'to', 'make', 'the', 'country', 'great', 'i', 'love', 'the', 'people', 'of', 'iowa', 'so', 'thats', 'the', 'way', 'it', 'is', 'very', 'simple', 'with', 'that', 'said', 'our', 'country', 'is', 'really', 'headed', 'in', 'the', 'wrong', 'direction', 'with', 'a', 'president', 'who', 'is', 'doing', 'an', 'absolutely', 'terr

In [5]:
# organize into sequences of tokens
sequence_len = 50 + 1
sequences = list()

for i in range( sequence_len, len( tokens ) ):
    
    # select sequence of tokens
    seq = tokens[ i - sequence_len:i ]
    
    # convert into a line
    line = ' '.join( seq )
    
    # store
    sequences.append( line )
    
print( 'Total Sequences: %d' % len( sequences ) )


Total Sequences: 156111


In [6]:
# save tokens to file, one dialog per line
def save_doc( lines, filename ):
    
    data = '\n'.join( lines )
    file = open( filename, 'w' )
    file.write( data )
    file.close()

In [7]:
# save sequences to file
#out_filename = "../texts/dr-zeuss-compilation-sequences.txt"
out_filename = "../texts/trump-speeches-sequences.txt"
save_doc( sequences, out_filename )

In [8]:
#in_filename = "../texts/dr-zeuss-compilation-sequences.txt"
in_filename = "../texts/trump-speeches-sequences.txt"
doc = load_doc( in_filename )
lines = doc.split( '\n' )
lines[ 0:10 ]

['thank you so much thats so nice isnt he a great guy he doesnt get a fair press he doesnt get it its just not fair and i have to tell you im here and very strongly here because i have great respect for steve king and have great respect likewise',
 'you so much thats so nice isnt he a great guy he doesnt get a fair press he doesnt get it its just not fair and i have to tell you im here and very strongly here because i have great respect for steve king and have great respect likewise for',
 'so much thats so nice isnt he a great guy he doesnt get a fair press he doesnt get it its just not fair and i have to tell you im here and very strongly here because i have great respect for steve king and have great respect likewise for citizens',
 'much thats so nice isnt he a great guy he doesnt get a fair press he doesnt get it its just not fair and i have to tell you im here and very strongly here because i have great respect for steve king and have great respect likewise for citizens united',


## Convert Words to Index Values

In [9]:
# integer encode sequences of words
tokenizer = Tokenizer()
tokenizer.fit_on_texts( lines )
sequences = tokenizer.texts_to_sequences( lines )

In [10]:
print( len( sequences[ 0 ] ) == sequence_len )
print( len( sequences ) )

True
156111


In [11]:
print( len( tokenizer.word_index ) )
print( type( tokenizer.word_index ) )
print( tokenizer.word_index[ "terrible" ] )

5842
<class 'dict'>
368


In [12]:
# vocabulary size
vocab_size = len( tokenizer.word_index ) + 1
vocab_size

5843

In [13]:
# separate into input and output: for now it's 50 words input and 1 word output
sequences = np.array( sequences )
X = sequences[ :,:-1 ] # all rows, from word 0 up to, but not including, the last word
y = sequences[ :,-1 ]  # all rows, last word only
y = to_categorical( y, num_classes=vocab_size )
seq_length = X.shape[ 1 ]
seq_length

50

## Load and Filter GloVe Data

In [14]:
# load the whole embedding into memory
embeddings_index = dict()
embeddings_dimension = 300 #must be 50, 100, 200, 300
glove = open( "../glove/glove.6B." + str( embeddings_dimension ) + "d.txt" )

for line in glove:
    
    values = line.split()
    # 1st string is word...
    word = values[ 0 ]
    
    if word in tokens_unique:
        
        # ...the rest are coefficients
        coefs = np.asarray( values[ 1: ], dtype='float32' )
        embeddings_index[ word ] = coefs
        print( "*", end="" )
    
glove.close()
print( '\nLoaded %s word vectors.' % len( embeddings_index ) )
print( '\nWords not found %d.' % ( len( tokenizer.word_index ) - len( embeddings_index ) ) )

****************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************

### Transform into Matrix Which Maps Coefs by Index

In [15]:
# create a weight matrix for words in training docs
embedding_matrix = np.zeros( ( vocab_size, embeddings_dimension ) )
missing_words = []

# we need this to create empty coefficients array
dummy_shape = embeddings_index[ "the" ].shape

for word, i in tokenizer.word_index.items():
    
    embedding_vector = embeddings_index.get( word )
    
    # not all words in our token list are in the wikipedia 400K set!
    if embedding_vector is None:
        
        # report and create empty coefficients array
        missing_words.append( word )
        embedding_vector = np.zeros( dummy_shape )
        
    embedding_matrix[ i ] = embedding_vector
    
print( len( missing_words ) )
missing_words

198


['selffunding',
 'theyve',
 'youve',
 'beada',
 'selfinspect',
 'theyll',
 'twoway',
 'onethird',
 'reince',
 'lowenergy',
 'bluecollar',
 'ayeyeye',
 'fbomb',
 'obamaclinton',
 'maralago',
 'bigly',
 'sixyear',
 'escavators',
 'nationbuilding',
 'africanamerican',
 'byebye',
 'selfinspection',
 'offmike',
 'antiwoman',
 'intelligencegathering',
 'thatand',
 'africanamericans',
 'donaldjtrumpcom',
 'mexicanamerican',
 'hardhitting',
 'braggadocious',
 'selfpolice',
 'komatsus',
 'nobodys',
 'taxexempt',
 'indianas',
 'fiveforone',
 'oreilly',
 'resect',
 'everyones',
 'theyd',
 'cetain',
 'romneycare',
 'oneyard',
 'threefooter',
 'werent',
 'clearsighted',
 'nationstate',
 'goodsized',
 'airconditioner',
 'disastertrump',
 'nogood',
 'peopletrump',
 'exampletrump',
 'brandnew',
 'oldfashioned',
 'wellover',
 'trilliontrump',
 'jobproducer',
 'truthteller',
 'expresident',
 'middleincome',
 'miniversion',
 'lowlevel',
 'smarttough',
 'stronglooking',
 'resonants',
 'exofficials',
 'inc

In [16]:
# confirm visually that 
print( len( embedding_matrix[ 0 ] ) )
print( sum( embedding_matrix[ 0 ] ) )
empty_coefficients_count = 0

for i in range( len( embedding_matrix ) ):
    if sum( embedding_matrix[ i ] ) == 0:
        empty_coefficients_count += 1
        
empty_coefficients_count

300
0.0


199

## Define Model

In [17]:
import keras
print( keras.__version__ )

import tensorflow as tf
print( tf.__version__ )

2.1.3
1.4.1


In [18]:
# define model
model = Sequential()

# now using a pre-trained, non-trainable embedding from glove's wiki analysis
model.add( Embedding( vocab_size, embeddings_dimension, weights=[embedding_matrix], input_length=seq_length, trainable=True ) )
model.add( LSTM( seq_length * 2, return_sequences=True ) )
model.add( LSTM( seq_length * 2 ) )
model.add( Dense( seq_length * 2, activation='relu' ) )

# fixed TypeError below, downgraded keras from 2.1.5 to 2.1.3: https://github.com/keras-team/keras/issues/9621
# TypeError: softmax() got an unexpected keyword argument 'axis'
model.add( Dense( vocab_size, activation='softmax' ) )

print( model.summary() )

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 50, 300)           1752900   
_________________________________________________________________
lstm_1 (LSTM)                (None, 50, 100)           160400    
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_1 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_2 (Dense)              (None, 5843)              590143    
Total params: 2,593,943
Trainable params: 2,593,943
Non-trainable params: 0
_________________________________________________________________
None


## Fit the Model

In [19]:
# calc batch size
print( len( sequences ) / 128 )
print( len( sequences ) / 1028 )
# Was:
#batch_size = 128
batch_size = 1028


1219.6171875
151.85894941634243


In [41]:
# compile model
model.compile( loss='categorical_crossentropy', optimizer='adam', metrics=[ 'accuracy' ] )
# fit model
model.fit( X, y, batch_size=batch_size, epochs=200 )

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

Epoch 79/200
Epoch 80/200
Epoch 81/200
Epoch 82/200
Epoch 83/200
Epoch 84/200
Epoch 85/200
Epoch 86/200
Epoch 87/200
Epoch 88/200
Epoch 89/200
Epoch 90/200
Epoch 91/200
Epoch 92/200
Epoch 93/200
Epoch 94/200
Epoch 95/200
Epoch 96/200
Epoch 97/200
Epoch 98/200
Epoch 99/200
Epoch 100/200
Epoch 101/200
Epoch 102/200
Epoch 103/200
Epoch 104/200
Epoch 105/200
Epoch 106/200
Epoch 107/200
Epoch 108/200
Epoch 109/200
Epoch 110/200
Epoch 111/200
Epoch 112/200
Epoch 113/200
Epoch 114/200
Epoch 115/200
Epoch 116/200
Epoch 117/200
Epoch 118/200
Epoch 119/200
Epoch 120/200
Epoch 121/200
Epoch 122/200
Epoch 123/200
Epoch 124/200
Epoch 125/200
Epoch 126/200
Epoch 127/200
Epoch 128/200
Epoch 129/200
Epoch 130/200
Epoch 131/200
Epoch 132/200
Epoch 133/200
Epoch 134/200
Epoch 135/200
Epoch 136/200
Epoch 137/200
Epoch 138/200
Epoch 139/200
Epoch 140/200
Epoch 141/200
Epoch 142/200
Epoch 143/200
Epoch 144/200
Epoch 145/200
Epoch 146/200
Epoch 147/200
Epoch 148/200
Epoch 149/200
Epoch 150/200
Epoch 151/200

Epoch 155/200
Epoch 156/200
Epoch 157/200
Epoch 158/200
Epoch 159/200
Epoch 160/200
Epoch 161/200
Epoch 162/200
Epoch 163/200
Epoch 164/200
Epoch 165/200
Epoch 166/200
Epoch 167/200
Epoch 168/200
Epoch 169/200
Epoch 170/200
Epoch 171/200
Epoch 172/200
Epoch 173/200
Epoch 174/200
Epoch 175/200
Epoch 176/200
Epoch 177/200
Epoch 178/200
Epoch 179/200
Epoch 180/200
Epoch 181/200
Epoch 182/200
Epoch 183/200
Epoch 184/200
Epoch 185/200
Epoch 186/200
Epoch 187/200
Epoch 188/200
Epoch 189/200
Epoch 190/200
Epoch 191/200
Epoch 192/200
Epoch 193/200
Epoch 194/200
Epoch 195/200
Epoch 196/200
Epoch 197/200
Epoch 198/200
Epoch 199/200
Epoch 200/200


<keras.callbacks.History at 0x7f90d2e84358>

In [42]:
# save the model to file
model.save( "models/trump-speeches-02.keras" )

# save the tokenizer
dump( tokenizer, open( "tokenizers/trump-speeches-02.pkl", 'wb' ) )

# save embedding_matrix based on wiki embeddings, complete w/ missing coefficients array dummies
dump( embedding_matrix, open( "embeddings/trump-speeches-02.glove", 'wb' ) )


## Use The Model to Generate Text

In [43]:
seq_length = len( lines[ 0 ].split() ) - 1
seq_length

50

In [44]:
def generate_seq( model, tokenizer, seq_length, seed_text, n_words ):
    
    result = list()
    in_text = seed_text
    
    # generate a fixed number of words
    for _ in range( n_words ):
        
        # encode the text as integer
        encoded = tokenizer.texts_to_sequences( [ in_text ] )[ 0 ] 
        
        # truncate sequences to a fixed length
        encoded = pad_sequences( [ encoded ], maxlen=seq_length, truncating='pre' ) 
        
        # predict probabilities for each word
        yhat = model.predict_classes( encoded, verbose=0 )
        
        # map predicted word index to word
        out_word = ''
        for word, index in tokenizer.word_index.items():
            if index == yhat:
                out_word = word
                break 
                
        # append to input
        in_text += ' ' + out_word
        
        result.append( out_word )
        
    return ' '.join( result )

In [45]:
# select a seed text
seed_text = lines[ randint( 0, len( lines ) ) ]
print( seed_text + '...\n' )
#print( len( seed_text.split( " " ) ) )

# generate new text
generated = generate_seq( model, tokenizer, seq_length, seed_text, 50 )
print( "..." + generated )

and saying thank you very much sucker its really really crazy so we have to rebuild quickly our infrastructure of this country if we dont the other day in ohio a bridge collapsed bridges are collapsing all over the country the reports on bridges and the like are unbelievable whats happening...

...with our infrastructure i go to saudi arabia i go to dubai i am doing big jobs in dubai i go to various different places i go to china i can do something in first place you go and help from hell of a lot of money worth a lot


In [46]:
def generate_seq_word_by_word( model, tokenizer, seq_length, seed_text, n_words ):
    
    print( "...", end='' )
    #result = list()
    in_text = seed_text
    
    # generate a fixed number of words
    for _ in range( n_words ):
        
        # encode the text as integer
        encoded = tokenizer.texts_to_sequences( [ in_text ] )[ 0 ] 
        
        # truncate sequences to a fixed length
        encoded = pad_sequences( [ encoded ], maxlen=seq_length, truncating='pre' ) 
        
        # predict probabilities for each word
        yhat = model.predict_classes( encoded, verbose=0 )
        
        # map predicted word index to word
        out_word = ''
        for word, index in tokenizer.word_index.items():
            if index == yhat:
                out_word = word
                print( word, end=' ' )
                break 
                
        # append to input for next iteration
        in_text += ' ' + out_word

In [47]:
# select a seed text
seed_text = lines[ randint( 0, len( lines ) ) ]
print( seed_text + '...\n' )

# generate new text
generate_seq_word_by_word( model, tokenizer, seq_length, seed_text, 25 )

may be wrong but who i am a very honest person if somebody is going to say a little bit negative or a lot negative about me and if they happen to be republican i may choose to hit them back not always but i may choose to hit them back...

...not a fan of mitt romney mitt romney lost an election he should have won and if you maybe even going to go in me 

In [50]:
my_input = input()

# generate new text
generate_seq_word_by_word( model, tokenizer, seq_length, my_input, 25 )

trump walked down
...a fortune up your company so trump is so you sit right and i just want to skip iowa i was and he said what 