### Based on: https://machinelearningmastery.com/use-word-embedding-layers-deep-learning-keras/ and https://machinelearningmastery.com/how-to-develop-a-word-level-neural-language-model-in-keras/ and https://github.com/stanfordnlp/GloVe

## Choose GPU to Run

In [1]:
# From: https://github.com/keras-team/keras/issues/6031
import os
gpu_id = input( "Select GPU [0 or 1]: " )

if gpu_id in [ "0", "1" ]:
    os.environ[ "CUDA_VISIBLE_DEVICES" ] = gpu_id
else:
    print( "Invalid GPU id.  Defaulting to '0,1'" )

Select GPU [0 or 1]: 1


### Choose CPU Cores

In [2]:
cores = 12
share_cores = input( "Share CPU cores w/ other models? [y/n]: " )

if share_cores == "y":
    
    cores = int( cores / 2 )

print( "Allocating %d cores to this notebook" % cores )

# From: https://stackoverflow.com/questions/46421258/limit-number-of-cores-used-in-keras

from keras import backend as K
K.set_session(
    K.tf.Session(
        config=K.tf.ConfigProto(
            intra_op_parallelism_threads=cores, inter_op_parallelism_threads=cores 
        )
    )
)

Share CPU cores w/ other models? [y/n]: y
Allocating 6 cores to this notebook


Using TensorFlow backend.


In [3]:
import gc
import re
import time
import datetime
import string
from pickle import dump
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Bidirectional
from keras.layers import Dropout
from keras.layers import Embedding
from keras.models import load_model
import collections
from wordsegment import load, segment
import numpy as np

from random import randint
from keras.preprocessing.sequence import pad_sequences

def get_time( output=True ):
    
    temp = time.time()
    if output:
        now = datetime.datetime.now()
        print( now.strftime( "%Y.%m.%d %H:%M" ) )
        
    return temp

start_time = get_time()

def print_time( start_time, end_time, interval="seconds" ):
    
    if interval == "hours":
        print ( "Time to process: [%s] hours" % ( str( ( end_time - start_time ) / 60 / 60 ) ) )
    else:
        print ( "Time to process: [%s] seconds" % ( str( end_time - start_time ) ) )

in_filename = "../texts/trump-speeches.txt"

# load segmentation dictionary: http://www.grantjenks.com/docs/wordsegment/
load()

print_time( start_time, get_time() )

2018.06.27 11:01
2018.06.27 11:01
Time to process: [0.31827831268310547] seconds


## Load Doc, Line by Line

In [5]:
# http://cmdlinetips.com/2011/08/three-ways-to-read-a-text-file-line-by-line-in-python/
def load_doc_by_line( filename ):
    
    # Open the file with read only permit
    file = open( filename, "r" )
    
    # use readlines to read all lines in the file
    # The variable "lines" is a list containing all lines in the file
    lines = file.readlines()
    
    # close the file after reading the lines.
    file.close()
    
    return lines

start_time = get_time()
lines = load_doc_by_line( in_filename )
doc = " ".join( lines )

print_time( start_time, get_time() )

doc[ :400 ]
# tweets = None
# gc.collect()

2018.06.27 11:02
2018.06.27 11:02
Time to process: [0.003679990768432617] seconds


"\ufeffSPEECH 1\n \n \n ...Thank you so much.  That's so nice.  Isn't he a great guy.  He doesn't get a fair press; he doesn't get it.  It's just not fair.  And I have to tell you I'm here, and very strongly here, because I have great respect for Steve King and have great respect likewise for Citizens United, David and everybody, and tremendous resect for the Tea Party.  Also, also the people of Iowa.  The"

## Build Encoded Punctuation to Punctuation Dictionary

In [6]:
punctuation_dict = {}
punctuation_dict[ "endperiod" ] = "."
punctuation_dict[ "endquestion" ] = "?"
punctuation_dict[ "endexclamation" ] = "!"
punctuation_dict[ "pausecomma" ] = ","
punctuation_dict[ "pausecolon" ] = ":"
punctuation_dict[ "pausesemicolon" ] = ";"
punctuation_dict[ "pauseemdash" ] = "-"
punctuation_dict[ "pausedash" ] = "-"
punctuation_dict[ "smartquoteopen" ] = '“'
punctuation_dict[ "smartquoteclose" ] = '”'
punctuation_dict[ "quoteopen" ] = '"'
punctuation_dict[ "quoteclose" ] = '"'
punctuation_dict[ "attweetat" ] = '@'
punctuation_dict[ "tweetlink" ] = "[link]"
punctuation_dict[ "hashtweethash" ] = '#'
punctuation_dict[ "opentweetopen" ] = '[start]'
punctuation_dict[ "closetweetclose" ] = '[end]'
punctuation_dict[ "ampersand" ] = '&'
punctuation_dict[ "tweetelipsis" ] = "..."

punctuation_dict[ "contractionopen" ] = "contractionopen"
punctuation_dict[ "contractionclose" ] = "contractionclose"


In [28]:
# turn a doc into clean tokens
punctuation_string = '!‘"\'#$%&()*+,-./:;<=>?@[\\]^_`{|}~'

def clean_doc( doc ):
    
    # multiple kinds of emdash!
    doc = doc.replace( '--', ' pauseemdash ' )
    doc = doc.replace( '—', ' pauseemdash ' )
    doc = doc.replace( '–', ' pauseemdash ' )
    doc = doc.replace( 'u.s.', ' US ' )
    doc = doc.replace( 'U.S.', ' US ' )
    doc = doc.replace( "…", " tweetelipsis " )
    doc = doc.replace( "...", " tweetelipsis " )
    doc = doc.replace( '. ', ' endperiod ' )
    doc = doc.replace( '! ', ' endexclamation ' )
    doc = doc.replace( '? ', ' endquestion ' )
    doc = doc.replace( '•', ' endbullet ' )
    doc = doc.replace( ', ', ' pausecomma ' )
    doc = doc.replace( ': ', ' pausecolon ' )
    doc = doc.replace( '; ', ' pausesemicolon ' )
    doc = doc.replace( ' - ', ' pausedash ' )
    doc = doc.replace( '“', ' smartquoteopen ' )
    doc = doc.replace( '”', ' smartquoteclose ' )
    doc = doc.replace( ' "', ' quoteopen ' )
    doc = doc.replace( '" ', ' quoteclose ' )
#     doc = doc.replace( "@ ", " " ) # remove trailing @'s first...
#     doc = doc.replace( "@", "attweetat" ) # ...then prefix 1st char @ as word
#     doc = doc.replace( "# ", " " ) # remove trailing #'s first...
#     doc = doc.replace( "#", "hashtweethash" ) # ...then prefix 1st char # as word
    doc = doc.replace( "&amp;", " ampersand " )
    doc = doc.replace( "\ufeff", "" )
    
    # From: https://stackoverflow.com/questions/33113338/how-to-replace-dash-between-characters-with-space-using-regex
    # replace hyphenated words w/ spaces
    doc = re.sub( r"([a-z])\-([a-z])", r"\1 \2", doc , 0, re.IGNORECASE )
    
    # replace comma separated words w/0 spaces
    doc = re.sub( r"([a-z]),([a-z])", r"\1 \2", doc , 0, re.IGNORECASE )
    
#     # replace links w/ "tweetlink"
#     # basic regex here: https://bytes.com/topic/python/answers/741677-find-replace-hyperlinks-string
#     http_pattern = r'http[^\s\n\r]+'
#     doc = re.sub( http_pattern , "tweetlink", doc )
    
#     # this overgenerates texttexttexttweetlink, so insert space where it occurs
#     doc = re.sub( r"([a-z])(tweetlink)", r"\1 \2", doc , 0, re.IGNORECASE )
    
#     # above isn't catching all (wuh?!?), so just bruteforce it
#     doc = doc.replace( "tweetlink", " tweetlink" )
    
#     # overgeneration of foooooooooooooooohashtweethash, so insert space where it occurs
#     doc = re.sub( r"([a-z])(hashtweethash)", r"\1 \2", doc , 0, re.IGNORECASE )
#     # above isn't catching all (wuh?!?), so just bruteforce it
#     doc = doc.replace( "hashtweethash", " hashtweethash" )
    
#     # overgeneration of fooooooooooooooooattweetat, so insert space where it occurs
#     doc = re.sub( r"([a-z])(attweetat)", r"\1 \2", doc , 0, re.IGNORECASE )
#     # above isn't catching all (wuh?!?), so just bruteforce it
#     doc = doc.replace( "attweetat", " attweetat" )
    
#     # tag all hyphenated words to protect from deletion
#     doc = re.sub( r"([a-z])-([a-z])", r"\1hyphentweethyphen\2", doc , 0, re.IGNORECASE )
    
    # do big ad-hoc global replacement, instead of using maketrans and string.punctuation
    # single quote is special case: don't leave a space, so that contractions are collapsed
    doc = re.sub( r"([a-z]+)'([a-z]+)", r"contractionopen \1\2 contractionclose", doc , 0, re.IGNORECASE )
    # some fool's using a DIFFERENT apostrophe... ’
    doc = re.sub( r"([a-z]+)’([a-z]+)", r"contractionopen \1\2 contractionclose", doc , 0, re.IGNORECASE )
    # replaced by above to wrap contractions
    # doc = re.sub( r"([a-z])'([a-z])", r"\1\2", doc , 0, re.IGNORECASE )
    # # some fool's using a DIFFERENT apostrophe... ’
    # doc = re.sub( r"([a-z])’([a-z])", r"\1\2", doc , 0, re.IGNORECASE )
    
    # trailing deletion: goin' to goin
    doc = re.sub( r"([a-z])’ ", r"\1 ", doc , 0, re.IGNORECASE )
    # not working?!?
    #doc = re.sub( r"’([a-z])", r"\1", doc , 0, re.IGNORECASE )
    
    # otherwise, delete all chars not already tagged as having semantic interest
    # moved up!
    #punctuation_string = '!‘"\'#$%&()*+,-./:;<=>?@[\\]^_`{|}~'
    for punctuation_char in punctuation_string:
        doc = doc.replace( punctuation_char, ' ' )
    
    # ...now that global deletion of stragging dashes has been performed, replaced hyphenated words
    #doc = doc.replace( "hyphentweethyphen", "-" )
    
    # finally, reduce duplicate spaces to just one: https://stackoverflow.com/questions/1546226/simple-way-to-remove-multiple-spaces-in-a-string/15913564
    # doc = doc.replace( "  ", ' ' )
    doc = re.sub( ' +', ' ', doc )
    
    # split into tokens by white space
    tokens = doc.split()
    
    return tokens

In [30]:
clean_doc( "\ufeffFeels like I'm goin’ crazy, this isn’t some fool's errand, wouldn't you agree?" )

['Feels',
 'like',
 'contractionopen',
 'Im',
 'contractionclose',
 'goin',
 'crazy',
 'pausecomma',
 'this',
 'contractionopen',
 'isnt',
 'contractionclose',
 'some',
 'contractionopen',
 'fools',
 'contractionclose',
 'errand',
 'pausecomma',
 'contractionopen',
 'wouldnt',
 'contractionclose',
 'you',
 'agree']

## Load Dictionary and Segment Doc Tokens

In [10]:
# load dictionary
start_time = get_time()

embeddings_dimension = 300 #must be 50, 100, 200, 300
with open( "output/vocabulary-glove.6B." + str( embeddings_dimension ) + "d.txt", 'r' ) as vocabulary_file:
    
    # omit newline char: https://stackoverflow.com/questions/12330522/reading-a-file-without-newlines
    vocabulary_list = vocabulary_file.read().splitlines()

print_time( start_time, get_time() )

vocabulary_dict = dict.fromkeys( vocabulary_list )
print( "vocabulary_list", len( vocabulary_list ) )
print( "vocabulary_dict", len( vocabulary_dict ) )

print( "1234123412341234" in vocabulary_dict )
print( "earth" in vocabulary_dict )
print( "earth" in vocabulary_list )
print( vocabulary_dict[ "earth" ] )

2018.06.27 11:06
2018.06.27 11:06
Time to process: [0.03485250473022461] seconds
vocabulary_list 400000
vocabulary_dict 400000
False
True
True
None


In [31]:
start_time = get_time()

# clean document
tokens_raw = clean_doc( doc )

 # 'compress' into list of unique tokens
tokens_unique = list( set( tokens_raw ) )
tokens_unique_lowercase = [ token.lower() for token in tokens_unique ]

print()
print( 'Total tokens_raw: %d' % len( tokens_raw ) )
print( 'Unique tokens: %d' % len( tokens_unique ) )
print( 'Unique tokens_unique_lowercase: %d' % len( tokens_unique_lowercase ) )

print_time( start_time, get_time() )

2018.06.27 11:17

Total tokens_raw: 209834
Unique tokens: 7668
Unique tokens_unique_lowercase: 7668
2018.06.27 11:17
Time to process: [0.45456862449645996] seconds


## Look at Token Frequencies

In [32]:
word_counts = collections.Counter( tokens_raw )
pairs = { k: word_counts[ k ] for k in list( word_counts )[ :10 ] }
pairs

{'1': 84,
 'SPEECH': 11,
 'Thank': 148,
 'Thats': 209,
 'contractionopen': 9313,
 'endperiod': 11080,
 'much': 307,
 'so': 781,
 'tweetelipsis': 318,
 'you': 2198}

In [33]:
for word, count in word_counts.most_common( 30 ):
    print( '%s: %7d' % ( word, count ) )

endperiod:   11080
contractionopen:    9313
contractionclose:    9313
pausecomma:    8572
the:    5187
to:    5171
I:    4873
and:    3531
a:    3351
of:    2803
you:    2198
have:    2121
that:    2109
it:    2071
pauseemdash:    1931
going:    1922
in:    1882
And:    1643
is:    1422
we:    1405
they:    1364
know:    1278
people:    1209
be:    1127
are:     987
for:     977
We:     975
was:     932
endquestion:     932
But:     919


In [34]:
print( "i", word_counts[ "i" ] )
print( "I", word_counts[ "I" ] )
print( "me", word_counts[ "me" ] )

print( "you", word_counts[ "you" ] )

print( "we", word_counts[ "we" ] )
print( "us", word_counts[ "us" ] )
print( "US", word_counts[ "US" ] )

print( "they", word_counts[ "they" ] )
print( "them", word_counts[ "them" ] )

i 0
I 4873
me 718
you 2198
we 1405
us 309
US 34
they 1364
them 548


In [35]:
start_time = get_time()

# organize into sequences of tokens
sequence_len = 50 + 1
sequences = []

for i in range( sequence_len, len( tokens_raw ) ):
    
    # select sequence of tokens
    seq = tokens_raw[ i - sequence_len:i ]
    
    # convert into a line
    line = ' '.join( seq )
    
    # store
    sequences.append( line )
    
print( 'Total Sequences: %d' % len( sequences ) )
print_time( start_time, get_time() )


2018.06.27 11:18
Total Sequences: 209783
2018.06.27 11:18
Time to process: [0.171112060546875] seconds


In [36]:
# save tokens to file, one dialog per line
def save_doc( lines, filename ):
    
    data = '\n'.join( lines )
    file = open( filename, 'w' )
    file.write( data )
    file.close()

In [37]:
# save sequences to file
out_filename = "sequences/trump-speech-sequences-take-III.txt"
save_doc( sequences, out_filename )

In [38]:
def load_doc( filename ):
    
    # open the file as read only
    file = open( filename, 'r' )
    # read all text
    text = file.read()
    # close the file
    file.close()
    
    return text

In [39]:
in_filename = "sequences/trump-speech-sequences-take-III.txt"
#doc = load_doc( in_filename )
lines = load_doc( in_filename ).split( '\n' )
lines[ 0:10 ]

['SPEECH 1 tweetelipsis Thank you so much endperiod contractionopen Thats contractionclose so nice endperiod contractionopen Isnt contractionclose he a great guy endperiod He contractionopen doesnt contractionclose get a fair press pausesemicolon he contractionopen doesnt contractionclose get it endperiod contractionopen Its contractionclose just not fair endperiod And I have to tell you',
 '1 tweetelipsis Thank you so much endperiod contractionopen Thats contractionclose so nice endperiod contractionopen Isnt contractionclose he a great guy endperiod He contractionopen doesnt contractionclose get a fair press pausesemicolon he contractionopen doesnt contractionclose get it endperiod contractionopen Its contractionclose just not fair endperiod And I have to tell you contractionopen',
 'tweetelipsis Thank you so much endperiod contractionopen Thats contractionclose so nice endperiod contractionopen Isnt contractionclose he a great guy endperiod He contractionopen doesnt contractionclose

In [40]:
seq_len_sum = 0;
line_len_dict = {}

for line in lines:
    
    token_count = len( line.split( " " ) )
    seq_len_sum += token_count
    
    if token_count in line_len_dict:
        line_len_dict[ token_count ] += 1
    else:
        line_len_dict[ token_count ] = 1

print( seq_len_sum / len( lines ) )
print( line_len_dict )

51.0
{51: 209783}


## Convert Words to Index Values

In [42]:
start_time = get_time()

# integer encode sequences of words
# tokenizer = Tokenizer( lower=False, filters=punctuation_string )
tokenizer = Tokenizer( lower=False, filters="" )

tokenizer.fit_on_texts( lines )
print( "sequences len *before* keras:", len( sequences ) )
sequences = tokenizer.texts_to_sequences( lines )
print( "sequences len *after* keras:", len( sequences ) )

# elegant! https://stackoverflow.com/questions/41971587/how-to-convert-predicted-sequence-back-to-text-in-keras
words_by_id = dict( map( reversed, tokenizer.word_index.items() ) ) 

# Check to and from of words and idx
print( tokenizer.word_index[ "contractionopen" ] )
print( tokenizer.word_index[ "contractionclose" ] )

print( words_by_id[ 2] )
print( words_by_id[ 3 ] )

# vocabulary size
# discrepancy between these two lengths, of by a few words...
#vocab_size = len( tokens_unique ) + 1
vocab_size = len( tokenizer.word_index ) + 1
print( "vocab_size", vocab_size )

print_time( start_time, get_time() )

2018.06.27 11:19
sequences len *before* keras: 209783
sequences len *after* keras: 209783
3
2
endperiod
is
pausecomma
vocab_size 7669
2018.06.27 11:19
Time to process: [7.721929311752319] seconds


## This KLUDGE Works, But Is No Longer Needed
*(Using an empty filter string in the tokenizer obviates the need for this workaround)*

In [216]:
# start_time = get_time()

# sequences_foo = np.zeros( ( len( sequences ), len( sequences[ 0 ] ) ), dtype=int )
# print( sequences_foo.shape )

# row_count = 0

# for row_idx, row in enumerate( sequences ):
    
#     for col_idx, col in enumerate( row ):
        
#         if col_idx > 26:
#             print( row_idx, col_idx )
#         else:
#             sequences_foo[ row_idx, col_idx ] = col
        
        
# #     if row_count == 30:
# #         break
# #     row_count += 1

# print( sequences_foo.shape )
# print( sequences_foo[ 0 ])

# print_time( start_time, get_time() )

## We Need to Convert List of Lists into Array of Arrays
_(Tokenizer's output is different when asked to leave case as is!?!)_

In [45]:
start_time = get_time()

sequences_np = np.array( sequences )

for i in range( len( sequences ) ):
    sequences_np[ i ] = np.array( sequences[ i ] )

print( type( sequences ) )
print( type( sequences_np ) )
print( sequences_np.shape )
print( type( sequences_np[ 0 ] ) )
print( sequences_np[ 0 ] )

print_time( start_time, get_time() )

# sequences = None
# gc.collect()

2018.06.27 11:21
<class 'list'>
<class 'numpy.ndarray'>
(209783, 51)
<class 'numpy.ndarray'>
[1472  301   93  185   11   40  101    1    3  144    2   40  193    1    3
 1691    2   52    9   53  168    1  110    3  216    2   56    9  641  312
  215   52    3  216    2   56   14    1    3   61    2   66   35  641    1
   18    7   12    6  123   11]
2018.06.27 11:21
Time to process: [1.0177383422851562] seconds


In [48]:
print( len( tokenizer.word_index ) + 1 )
print( len( tokens_unique ) + 1 )
print( len( set( tokens_unique ) ) + 1 )

7669
7669
7669


In [49]:
print( type( sequences_np ) )
print( sequences_np.shape )

print( sequences_np[ 0 ] )
print( type( sequences_np[ 0 ] ) )

# separate into input and output: for now it's 50 words input and 1 word output
#sequences = np.array( sequences )
X = sequences_np[ :,:-1 ] # all rows, from word 0 up to, but not including, the last word
y = sequences_np[ :,-1 ]  # all rows, last word only

# Throws MemoryError
# https://stackoverflow.com/questions/46293734/memoryerror-in-keras-utils-np-utils-to-categorical
#y = to_categorical( y, num_classes=vocab_size )
print( "X.shape", X.shape )
print( "y.shape", y.shape )

seq_length = len( X[ 0 ] )
seq_length

<class 'numpy.ndarray'>
(209783, 51)
[1472  301   93  185   11   40  101    1    3  144    2   40  193    1    3
 1691    2   52    9   53  168    1  110    3  216    2   56    9  641  312
  215   52    3  216    2   56   14    1    3   61    2   66   35  641    1
   18    7   12    6  123   11]
<class 'numpy.ndarray'>
X.shape (209783, 50)
y.shape (209783,)


50

## Load and Filter GloVe Data

In [50]:
import itertools

start_time = get_time()

# verify that mixed and lower case lists are the same length
print( "¿", len( tokens_unique ), "==", len( tokens_unique_lowercase ), "?" )

# load the whole embedding into memory
embeddings_index = dict()

# moved up!
#embeddings_dimension = 300 #must be 50, 100, 200, 300
glove = open( "../glove/glove.6B." + str( embeddings_dimension ) + "d.txt" )

# ASSUME: that 1st item in list is lowercase word, vectors are 2nd item
for line in glove:
    
    values = line.split()
    # 1st string is word...
    word = values[ 0 ]
    
    # we're now searching w/in lowercase version to allow both mixed and lower case words ("WorD" and "word") to 
    # inherit same vectors
    if word in tokens_unique_lowercase:
        
        # ...the rest are coefficients
        coefs = np.asarray( values[ 1: ], dtype='float32' )
        
        # get indices for all occurences in lower_case list
        indices = [ i for i, word_lower in enumerate( tokens_unique_lowercase ) if word_lower == word ]
        
        if len( indices ) > 1:
            
            print( "[%d] entries found for [%s]" % ( len( indices ), word ) )
        
            # iterate indices for this word
            for i in indices:
                word_temp = tokens_unique[ i ]
                embeddings_index[ word_temp ] = coefs
        else:
            
            embeddings_index[ word ] = coefs 
            
    
glove.close()
print( '\nLoaded %s word vectors.' % len( embeddings_index ) )
#print( '\nWords not found %d.' % ( len( tokenizer.word_index ) - len( embeddings_index ) ) )
print_time( start_time, get_time(), interval="minute" )


2018.06.27 11:23
¿ 7668 == 7668 ?
[3] entries found for [the]
[3] entries found for [of]
[3] entries found for [to]
[3] entries found for [and]
[3] entries found for [in]
[2] entries found for [a]
[3] entries found for [for]
[3] entries found for [that]
[3] entries found for [on]
[3] entries found for [is]
[3] entries found for [was]
[3] entries found for [said]
[3] entries found for [with]
[3] entries found for [he]
[3] entries found for [as]
[3] entries found for [it]
[3] entries found for [by]
[3] entries found for [at]
[3] entries found for [from]
[3] entries found for [his]
[3] entries found for [an]
[3] entries found for [be]
[3] entries found for [has]
[3] entries found for [are]
[3] entries found for [have]
[3] entries found for [but]
[3] entries found for [were]
[3] entries found for [not]
[3] entries found for [this]
[3] entries found for [who]
[3] entries found for [they]
[2] entries found for [had]
[3] entries found for [which]
[3] entries found for [will]
[3] entries found

[2] entries found for [agreement]
[3] entries found for [started]
[2] entries found for [growth]
[3] entries found for [yet]
[2] entries found for [western]
[3] entries found for [special]
[3] entries found for [interest]
[3] entries found for [strong]
[2] entries found for [england]
[2] entries found for [named]
[3] entries found for [real]
[2] entries found for [rate]
[2] entries found for [race]
[2] entries found for [nearly]
[2] entries found for [enough]
[3] entries found for [keep]
[3] entries found for [call]
[3] entries found for [taking]
[3] entries found for [outside]
[3] entries found for [really]
[3] entries found for [almost]
[2] entries found for [single]
[3] entries found for [leading]
[2] entries found for [trying]
[2] entries found for [find]
[2] entries found for [minutes]
[3] entries found for [together]
[3] entries found for [hard]
[2] entries found for [hours]
[2] entries found for [san]
[2] entries found for [executive]
[2] entries found for [areas]
[2] entries fo

[2] entries found for [fast]
[2] entries found for [boy]
[3] entries found for [worst]
[2] entries found for [weather]
[2] entries found for [bridge]
[2] entries found for [cancer]
[2] entries found for [crimes]
[2] entries found for [sun]
[2] entries found for [becoming]
[2] entries found for [gulf]
[3] entries found for [gets]
[2] entries found for [corporate]
[2] entries found for [easy]
[2] entries found for [choice]
[2] entries found for [add]
[2] entries found for [reduce]
[2] entries found for [crowd]
[2] entries found for [fine]
[2] entries found for [environmental]
[2] entries found for [subject]
[2] entries found for [mean]
[2] entries found for [ten]
[3] entries found for [ask]
[3] entries found for [watch]
[2] entries found for [else]
[3] entries found for [rich]
[3] entries found for [happy]
[3] entries found for [save]
[2] entries found for [article]
[2] entries found for [standing]
[2] entries found for [doctors]
[2] entries found for [sanctions]
[2] entries found for [b

[2] entries found for [crazy]
[2] entries found for [discipline]
[3] entries found for [hell]
[2] entries found for [shocked]
[2] entries found for [afterwards]
[2] entries found for [rely]
[2] entries found for [endorsed]
[2] entries found for [jihad]
[2] entries found for [anymore]
[2] entries found for [legally]
[3] entries found for [amazing]
[2] entries found for [surprising]
[2] entries found for [cheese]
[2] entries found for [hire]
[2] entries found for [negotiated]
[2] entries found for [negotiators]
[2] entries found for [literally]
[2] entries found for [weakness]
[3] entries found for [tremendous]
[2] entries found for [rebuild]
[2] entries found for [businessmen]
[2] entries found for [somehow]
[2] entries found for [stopping]
[2] entries found for [hopefully]
[3] entries found for [folks]
[2] entries found for [honored]
[2] entries found for [whenever]
[2] entries found for [taxpayers]
[3] entries found for [sorry]
[2] entries found for [mr]
[2] entries found for [indiana

In [51]:
indices = [ i for i, word_lower in enumerate( tokens_unique_lowercase ) if word_lower == "president" ]

for i in indices:
    
    print( "index [%d] word [%s] frequencey [%d]" % ( i, tokens_unique[ i ], word_counts[ tokens_unique[ i ] ] ) )
  

index [2185] word [PRESIDENT] frequencey [7]
index [3220] word [President] frequencey [80]
index [3686] word [president] frequencey [117]


In [52]:
# what words appear in upper case?
words_uppercase = []

for word in tokens_unique:
    
    if word.isupper():
        words_uppercase.append( word )
        
print( "len( words_uppercase )", len( words_uppercase ) )

len( words_uppercase ) 1157


In [53]:
for word in words_uppercase:
    
    print( "word [%s], count[%d]" % ( word, word_counts[ word ] ) ) 

word [TO], count[265]
word [CONSEQUENCE], count[1]
word [DOUBLE], count[1]
word [ECONOMICALLY], count[1]
word [FACE], count[2]
word [LIFETIME], count[2]
word [RAISING], count[1]
word [EAST], count[3]
word [HES], count[13]
word [LEGALLY], count[2]
word [AAA], count[4]
word [CHARACTERISTICS], count[1]
word [JOBS], count[7]
word [WHEN], count[26]
word [DEAD], count[1]
word [TINY], count[2]
word [VITAL], count[3]
word [RULES], count[1]
word [LACE], count[1]
word [TEN], count[1]
word [HEAR], count[6]
word [YEARS], count[17]
word [GSA], count[4]
word [LAUGHTER], count[1]
word [AGENTS], count[3]
word [LOST], count[3]
word [KILL], count[1]
word [FOREVER], count[1]
word [CHEERS], count[2]
word [STATES], count[3]
word [SMALL], count[3]
word [PENNSYLVANIA], count[2]
word [PPP], count[5]
word [ISOLATIONIST], count[3]
word [CRIME], count[1]
word [DEBATE], count[1]
word [AGAIN], count[6]
word [IMBALANCE], count[2]
word [IVE], count[4]
word [COULDNT], count[1]
word [THIS], count[43]
word [RESPECT], c

In [54]:
print( "len( tokens_unique ) == len( tokens_unique_lowercase )", len( tokens_unique ) == len( tokens_unique_lowercase ) )
print( "Words not in embeddings_index:", len( tokens_unique_lowercase ) - len( embeddings_index ) )
#print( embeddings_index[ "the" ] )
print( "len( tokenizer.word_index )", len( tokenizer.word_index ) )
print( "len( embeddings_index )", len( embeddings_index ) )
print( "vocab_size", vocab_size )

len( tokens_unique ) == len( tokens_unique_lowercase ) True
Words not in embeddings_index: 131
len( tokenizer.word_index ) 7668
len( embeddings_index ) 7537
vocab_size 7669


### Transform into Matrix That Maps Coefs by Index

In [55]:
# create a weight matrix for words in training docs
embedding_matrix = np.zeros( ( vocab_size, embeddings_dimension ) )
missing_words = []

# we need this to create empty coefficients array
dummy_shape = embeddings_index[ "the" ].shape

# use the mixed case list: tokens_unique
#for i, word in enumerate( tokens_unique ):
for word, i in tokenizer.word_index.items():
    
    embedding_vector = embeddings_index.get( word )
    
    # not all words in our token list are in the wikipedia 400K set!
    if embedding_vector is None:
        
        # 1st time, get the lowercase vector
        embedding_vector = embeddings_index.get( word.lower() )
        
    # 2nd test: If not found as original or lower case, then assign it an empty vector
    if embedding_vector is None:  
        
        # report and create empty coefficients array
        missing_words.append( word )
        embedding_vector = np.zeros( dummy_shape )
     
    #print( "i", i, "word", word )
    embedding_matrix[ i ] = embedding_vector
    
print( "Missing words:", len( missing_words ) )

# before hashtag segmentation Missing words: 5640
# after hashtag segmentation: 5060

Missing words: 131


In [56]:
missing_words.sort()
missing_words#[ :20 ]

['1stand',
 '22Kill',
 'Alumisource',
 'Ayeyeye',
 'Beada',
 'Brexits',
 'CAMERAMANS',
 'CNBCs',
 'Cruzs',
 'DSHTIDNT',
 'DonaldJTrump',
 'Everybodys',
 'Everyones',
 'Everythings',
 'ISISs',
 'Indianas',
 'Iowas',
 'Itll',
 'Ivankas',
 'Jebs',
 'KPZCOMPANIES',
 'Kasichs',
 'Komatsus',
 'Kristols',
 'MAEPGESSENGER',
 'NASHLTIONALLY',
 'Nabiscos',
 'Nabsico',
 'Nobodys',
 'OMalley',
 'OREILLY',
 'OReilly',
 'Obamacares',
 'Ordierno',
 'Orlandos',
 'PLIBLG',
 'Pences',
 'Pfizers',
 'REINCE',
 'RealDonaldTrump',
 'Reince',
 'Romneycare',
 'Rubios',
 'Sanderss',
 'Sarahs',
 'Schusters',
 'Sharons',
 'Shouldnt',
 'Swenden',
 'THETORY',
 'THEYLL',
 'THEYVE',
 'Therell',
 'Theyd',
 'Theyll',
 'Theyve',
 'Universitys',
 'Vetdogs',
 'Wisconsins',
 'YOUVE',
 'Youve',
 'anybodys',
 'ayeyayay',
 'ayeyeye',
 'baby’',
 'bigly',
 'bleh',
 'braggadocious',
 'braggingly',
 'cetain',
 'clearsighted',
 'contractionclose',
 'contractionopen',
 'corruptness',
 'couldve',
 'deductability',
 'embargoing',
 '

In [57]:
print( embedding_matrix.shape )
print( X.shape )
print( y.shape )

(7669, 300)
(209783, 50)
(209783,)


In [233]:
# write missing words to file
with open( "output/missing-words-speeches.txt", "w" ) as out_file:
    
    for word in missing_words:

        out_file.write( "%s\n" % word )


In [58]:
# confirm visually that 
print( len( embedding_matrix[ 0 ] ) )
print( sum( embedding_matrix[ 0 ] ) )
empty_coefficients_count = 0

for i in range( len( embedding_matrix ) ):
    if sum( embedding_matrix[ i ] ) == 0:
        empty_coefficients_count += 1
        
empty_coefficients_count

300
0.0


132

## Define Model

In [59]:
import keras
print( keras.__version__ )

import tensorflow as tf
print( tf.__version__ )

2.1.3
1.4.1


In [60]:
# define model
model = Sequential()

# now using a pre-trained, non-trainable embedding from glove's wiki analysis
model.add( Embedding( vocab_size, embeddings_dimension, weights=[embedding_matrix], input_length=seq_length, trainable=True ) )
model.add( Bidirectional( LSTM( seq_length * 2, return_sequences=True ) ) )
#model.add( Dropout( 0.90 ) )
model.add( Bidirectional( LSTM( seq_length * 2 ) ) )
#model.add( Dropout( 0.90 ) )
model.add( Dense( seq_length * 2, activation='relu' ) )

# fixed TypeError below, downgraded keras from 2.1.5 to 2.1.3: https://github.com/keras-team/keras/issues/9621
# TypeError: softmax() got an unexpected keyword argument 'axis'
model.add( Dense( vocab_size, activation='softmax' ) )

print( model.summary() )

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 50, 300)           2300700   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 50, 200)           320800    
_________________________________________________________________
bidirectional_2 (Bidirection (None, 200)               240800    
_________________________________________________________________
dense_1 (Dense)              (None, 100)               20100     
_________________________________________________________________
dense_2 (Dense)              (None, 7669)              774569    
Total params: 3,656,969
Trainable params: 3,656,969
Non-trainable params: 0
_________________________________________________________________
None


## Fit the Model

In [61]:
# calc batch size
print( len( sequences ) / 128 )
print( len( sequences ) / 1028 )
# Was:
#batch_size = 124
batch_size = 1024

# can't remember where I read that batch sizes larger than 512 cause erratic convergence patterns.
# TODO: find that article!
#batch_size = 512


1638.9296875
204.06906614785993


## Load Model?

In [None]:
load = input( "Load model? [y/n]" )

if load == "y":
    
    model_name = "models/trump-speeches-take-III.h5"
    print( "Loading model %s" % model_name )
    model = load_model( model_name )
    
else:
    
    print( "NOT loading model, using default untrained model" )

In [257]:
start_time = get_time()

# Per comment here: https://stackoverflow.com/questions/46293734/memoryerror-in-keras-utils-np-utils-to-categoricalhttps://stackoverflow.com/questions/46293734/memoryerror-in-keras-utils-np-utils-to-categorical
model.compile( loss='sparse_categorical_crossentropy', optimizer='adam', metrics=[ 'accuracy' ] )
# model.compile( loss='categorical_crossentropy', optimizer='adam', metrics=[ 'accuracy' ] )

model.fit( X, y, batch_size=batch_size, epochs=100 )
end_time = get_time()
print_time( start_time, end_time, interval="hours" )


2018.06.26 13:43
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoc

In [258]:
# save the whole model to file
model.save( "models/trump-speeches-take-III.h5" )

# save the tokenizer
dump( tokenizer, open( "tokenizers/trump-speeches-take-III.dump", 'wb' ) )

# save embedding_matrix based on wiki embeddings, complete w/ missing coefficients array dummies
dump( embedding_matrix, open( "embeddings/trump-speeches-take-III.glove", 'wb' ) )


## Use The Model to Generate Text

In [244]:
seq_length = len( lines[ 0 ].split() ) - 1
seq_length

25

In [245]:
punctuation_dict.get( "smartquoteclose", "bar" )

'”'

In [313]:
def sample_yhats( preds, temperature=1.0 ):
    
    # helper function to sample an index from a probability array
    preds = np.asarray( preds ).astype( 'float64' )
    preds = np.log( preds ) / temperature
    exp_preds = np.exp( preds )
    preds = exp_preds / np.sum( exp_preds )
    probas = np.random.multinomial( 1, preds, 1 )
    return np.argmax( probas )

In [321]:
def generate_seq( model, tokenizer, seq_length, seed_text, n_words, temperature=1.0 ):
    
    result = list()
    result_literal = list()
    in_text = seed_text
    yhat = [ 0.1 ]
    
    # generate a fixed number of words
    for _ in range( n_words ):
        
        # encode the text as integer
        encoded = tokenizer.texts_to_sequences( [ in_text ] )[ 0 ] 
        
        # truncate sequences to a fixed length
        encoded = pad_sequences( [ encoded ], maxlen=seq_length, truncating='pre' ) 
        
        # predict probabilities for each word
        yhat = model.predict_classes( encoded, verbose=0 )
        # this returns list of predictions
        yhats = model.predict( encoded, verbose=0 )[ 0 ]
        
        # map predicted word index to word
        print( "yhat", yhat, words_by_id[ yhat[ 0 ] ] )
        #print( "len( yhats )", len( yhats ) )
        #print( "type( yhats )", type( yhats ) )
        #print( "argmax( yhats )", np.argmax( yhats ) )
        #print( "yhats", yhats )
        
        out_word_id = sample_yhats( yhats, temperature )
        out_word = words_by_id[ out_word_id ]
        # out_word = words_by_id[ yhat[ 0 ] ]
                
        # append to input
        in_text += ' ' + out_word
        
        #result.append( out_word )
        # substitute punctuation tags for actual punctuation
        result.append( punctuation_dict.get( out_word, out_word ) )
        
#         if out_word == "closetweetclose":
#             #print( "Tweet end detected" )
#             break
            
    return ' '.join( result )

In [318]:
type( words_by_id[ 1 ] )

str

In [311]:
def reformat_punctuation( doc ):
    
    doc = doc.replace( ' . ', '. ' )
    doc = doc.replace( ' ! ', '! ' )
    doc = doc.replace( ' ? ', '? ' )
    doc = doc.replace( ' , ', ', ' )
    doc = doc.replace( ' : ', ': ' )
    doc = doc.replace( ' ; ', '; ' )
    
    doc = doc.replace( '“ ', '“' )
    doc = doc.replace( ' ”', '”' )
    doc = doc.replace( "attweetat", '@' )
    #doc = doc.replace( "hashtweethash", '#' )
    doc = doc.replace( " amp; ", '&' )
    
    return doc

In [322]:
# select a seed text
seed_text = lines[ randint( 0, len( lines ) ) ]
print( seed_text, "\n" )

# # substitute the seed words
# raw_text = seed_text.split( " " )

# clean_text = [ punctuation_dict.get( word, word ) for word in raw_text ]
# clean_text = ' '.join( clean_text )

# print( reformat_punctuation( clean_text ) + '... \n' )
# #print( len( seed_text.split( " " ) ) )

# generate new text
generated = generate_seq( model, tokenizer, seq_length, seed_text, 5, 1.5 )

print( "... " + generated )
print()
print( "\n\n... " + reformat_punctuation( generated ) )

what contractionopen youre contractionclose doing closetweetclose opentweetopen hashtagopen 2 hashtagclose endperiod Be totally focused endperiod Being successful requires nothing less than 100 of your concentrated effort 

yhat [2] closetweetclose
yhat [1] opentweetopen
yhat [2994] Whether
yhat [2507] Todays
yhat [11] contractionclose
... [end] [start] contractionopen Ill contractionclose



... [end] [start] contractionopen Ill contractionclose


  """


In [269]:
my_input = input()
"opentweetopen " + my_input
generated = generate_seq( model, tokenizer, seq_length, my_input, 50 )
print( "... " + reformat_punctuation( generated ) )

Get tough and smart US or we contractionopen wont contractionclose have a country anymore [end] [start] Big announcement in Ames Iowa on Tuesday! You
... [end] [start] Illegal education talks about employees could never have to begin with the historic First Amendment - states will never get out this years. Such more candidates [end] [start] Remember ObamaCare is getting smart borders is going to take Evangelical govt which NO action. Would be what


In [None]:
#initiate sentences
seed_sentences = "Nobody has better respect for intelligence than Donald Trump ."
generated = ''
sentence = []

for i in range( seq_length ):
    sentence.append( "a" )

seed = seed_sentences.split()

for i in range( len( seed ) ):
    sentence[ seq_length - i - 1 ]= seed[ len( seed ) - i - 1 ]

generated += ' '.join(sentence)
print('Generating text with the following seed: "' + ' '.join(sentence) + '"')

print ()

In [None]:
words_number = 100
#generate the text
for i in range(words_number):
    #create the vector
    x = np.zeros((1, seq_length, vocab_size))
    for t, word in enumerate(sentence):
        x[0, t, vocab[word]] = 1.
    #print(x.shape)

    #calculate next word
    preds = model.predict(x, verbose=0)[0]
    next_index = sample(preds, 0.34)
    next_word = vocabulary_inv[next_index]

    #add the next word to the text
    generated += " " + next_word
    # shift the sentence by one, and and the next word at its end
    sentence = sentence[1:] + [next_word]

print(generated)
