### Based on: https://machinelearningmastery.com/use-word-embedding-layers-deep-learning-keras/ and https://machinelearningmastery.com/how-to-develop-a-word-level-neural-language-model-in-keras/ and https://github.com/stanfordnlp/GloVe

In [1]:
import re
import time
import datetime
import string
from pickle import dump
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding
from keras.models import load_model
import collections

import numpy as np

from random import randint
from keras.preprocessing.sequence import pad_sequences

def get_time( output=True ):
    
    temp = time.time()
    if output:
        now = datetime.datetime.now()
        print( now.strftime( "%Y.%m.%d %H:%M" ) )
        
    return temp

foo = get_time()

def print_time( start_time, end_time, interval="seconds" ):
    
    if interval == "hours":
        print ( "Time to process: [%s] hours" % ( str( ( end_time - start_time ) / 60 / 60 ) ) )
    else:
        print ( "Time to process: [%s] seconds" % ( str( end_time - start_time ) ) )

print_time( 0, 1 )

Using TensorFlow backend.


2018.05.23 15:42
Time to process: [1] seconds


In [7]:
# load doc into memory
def load_doc( filename ):
    
    # open the file as read only
    file = open( filename, 'r' )
    # read all text
    text = file.read()
    # close the file
    file.close()
    
    return text

# load document
#in_filename = "../texts/alice-in-wonderland.txt"
#in_filename = "../texts/dr-zeuss-compilation.txt"
in_filename = "../texts/trump-tweets.txt"
doc = load_doc( in_filename )
print( doc[ :200 ] )

Just met with UN Secretary-General António Guterres who is working hard to “Make the United Nations Great Again.” When the UN does more to solve conflicts around the world it means the U.S. has less t


In [8]:
print( doc[ :400 ] )

Just met with UN Secretary-General António Guterres who is working hard to “Make the United Nations Great Again.” When the UN does more to solve conflicts around the world it means the U.S. has less to do and we save money. @NikkiHaley is doing a fantastic job! https://t.co/pqUv6cyH2z
America is a Nation that believes in the power of redemption. America is a Nation that believes in second chances 


In [9]:
# my_punctuation = string.punctuation
# print( type( my_punctuation ) )
# print( my_punctuation )
my_punctuation = '"#$%&\'()*+,-/:;<=>@[\\]^_`{|}~'
my_punctuation

'"#$%&\'()*+,-/:;<=>@[\\]^_`{|}~'

## Build Encoded Punctuation to Punctuation Dictionary

In [10]:
punctuation_dict = {}
punctuation_dict[ "endperiod" ] = "."
punctuation_dict[ "endquestion" ] = "?"
punctuation_dict[ "endexclamation" ] = "!"
punctuation_dict[ "pausecomma" ] = ","
punctuation_dict[ "pausecolon" ] = ":"
punctuation_dict[ "pausesemicolon" ] = ";"
punctuation_dict[ "smartquoteopen" ] = '“'
punctuation_dict[ "smartquoteclose" ] = '”'
punctuation_dict[ "attweetat" ] = '@'
punctuation_dict[ "tweetlink" ] = "[link]"
punctuation_dict[ "hashtweethash" ] = '#'


In [11]:
# turn a doc into clean tokens
def clean_doc( doc, to_lower=True ):
    
    # replace '--' with a space ' '
    doc = doc.replace( '--', ' ' )
    # replace sentence simple sentence boundaries w/ unique token/markers
    doc = doc.replace( '. ', ' endperiod ' )
    doc = doc.replace( '! ', ' endexclamation ' )
    doc = doc.replace( '? ', ' endquestion ' )
    doc = doc.replace( ', ', ' pausecomma ' )
    doc = doc.replace( ': ', ' pausecolon ' )
    doc = doc.replace( '; ', ' pausesemicolon ' )
    doc = doc.replace( '“', 'smartquoteopen ' )
    doc = doc.replace( '”', ' smartquoteclose' )
    doc = doc.replace( "@ ", " " ) # remove trailing @'s first...
    doc = doc.replace( " @", " attweetat" ) # ...then encode 1st char @'s
    doc = doc.replace( "# ", " " ) # remove trailing #'s first...
    doc = doc.replace( " #", " hashtweethash" ) # ...then encode 1st char #'s
    
    # replace links w/ "tweetlink"
    # basic regex here: https://bytes.com/topic/python/answers/741677-find-replace-hyperlinks-string
    http_pattern = r'http[^\s\n\r]+'
    doc = re.sub( http_pattern , "tweetlink", doc )
    
    # split into tokens by white space
    tokens = doc.split()
    
    # remove punctuation from each token
    table = str.maketrans( '', '', string.punctuation ) # will strip all .?!,:; that don't fit replace expr above.
    #table = str.maketrans( '', '', my_punctuation )
    tokens = [ w.translate( table ) for w in tokens ]
    
    # remove remaining tokens that are not alphabetic
    if to_lower:
        tokens = [ word for word in tokens if word.isalpha() ]
    
    # make lower case
    tokens = [ word.lower() for word in tokens ] 
    
    return tokens

## Load and Clean Doc

In [12]:
start_time = get_time()

# clean document
tokens = clean_doc( doc )
tokens_unique = list( set( tokens ) )
print( tokens[ :100 ] )
print( 'Total Tokens: %d' % len( tokens ) )
print( 'Unique Tokens: %d' % len( tokens_unique ) )

print_time( start_time, get_time() )

2018.05.23 15:45
['just', 'met', 'with', 'un', 'secretarygeneral', 'antónio', 'guterres', 'who', 'is', 'working', 'hard', 'to', 'smartquoteopen', 'make', 'the', 'united', 'nations', 'great', 'again', 'smartquoteclose', 'when', 'the', 'un', 'does', 'more', 'to', 'solve', 'conflicts', 'around', 'the', 'world', 'it', 'means', 'the', 'us', 'endperiod', 'has', 'less', 'to', 'do', 'and', 'we', 'save', 'money', 'endperiod', 'attweetatnikkihaley', 'is', 'doing', 'a', 'fantastic', 'job', 'endexclamation', 'tweetlink', 'america', 'is', 'a', 'nation', 'that', 'believes', 'in', 'the', 'power', 'of', 'redemption', 'endperiod', 'america', 'is', 'a', 'nation', 'that', 'believes', 'in', 'second', 'chances', 'and', 'america', 'is', 'a', 'nation', 'that', 'believes', 'that', 'the', 'best', 'is', 'always', 'yet', 'to', 'come', 'endexclamation', 'hashtweethashprisonreform', 'tweetlink', 'we', 'grieve', 'for', 'the', 'terrible', 'loss', 'of', 'life']
Total Tokens: 399137
Unique Tokens: 21148
2018.05.23 15:

In [13]:
word_counts = collections.Counter( tokens )
word_counts

Counter({'just': 1334,
         'met': 51,
         'with': 2401,
         'un': 37,
         'secretarygeneral': 1,
         'antónio': 1,
         'guterres': 1,
         'who': 1028,
         'is': 6207,
         'working': 202,
         'hard': 324,
         'to': 9223,
         'smartquoteopen': 1300,
         'make': 788,
         'the': 15056,
         'united': 196,
         'nations': 66,
         'great': 3051,
         'again': 635,
         'smartquoteclose': 1283,
         'when': 625,
         'does': 236,
         'more': 812,
         'solve': 34,
         'conflicts': 5,
         'around': 93,
         'world': 376,
         'it': 2446,
         'means': 47,
         'us': 1091,
         'endperiod': 14523,
         'has': 1370,
         'less': 86,
         'do': 936,
         'and': 6876,
         'we': 1986,
         'save': 60,
         'money': 337,
         'attweetatnikkihaley': 3,
         'doing': 458,
         'a': 6911,
         'fantastic': 218,
         'j

In [16]:
for word, count in word_counts.most_common( 100 ):
    print( '%s: %7d' % ( word, count ) )

the:   15056
endperiod:   14523
to:    9223
a:    6911
and:    6876
tweetlink:    6742
is:    6207
of:    6100
in:    5728
on:    4075
for:    4062
i:    4061
you:    3687
be:    3470
will:    3460
endexclamation:    3153
great:    3051
that:    2463
it:    2446
at:    2413
with:    2401
are:    2397
pausesemicolon:    2246
amp:    2187
have:    2009
we:    1986
our:    1955
pausecolon:    1947
my:    1816
trump:    1777
he:    1684
by:    1651
not:    1648
was:    1489
all:    1448
has:    1370
thank:    1341
this:    1339
just:    1334
they:    1330
smartquoteopen:    1300
smartquoteclose:    1283
people:    1207
me:    1173
so:    1162
your:    1146
from:    1131
obama:    1092
us:    1091
as:    1074
new:    1074
very:    1062
thanks:    1048
who:    1028
his:    1015
now:     977
do:     936
out:     935
no:     919
but:     912
about:     910
get:     896
what:     870
should:     867
big:     856
if:     842
more:     812
america:     807
make:     788
time:     783
endquestion:

In [8]:
start_time = get_time()

# organize into sequences of tokens
sequence_len = 50 + 1
sequences = list()

for i in range( sequence_len, len( tokens ) ):
    
    # select sequence of tokens
    seq = tokens[ i - sequence_len:i ]
    
    # convert into a line
    line = ' '.join( seq )
    
    # store
    sequences.append( line )
    
print( 'Total Sequences: %d' % len( sequences ) )
print_time( start_time, get_time() )


2018.05.22 09:33
Total Sequences: 399086
2018.05.22 09:33
Time to process: [0.2942061424255371] seconds


In [9]:
# save tokens to file, one dialog per line
def save_doc( lines, filename ):
    
    data = '\n'.join( lines )
    file = open( filename, 'w' )
    file.write( data )
    file.close()

In [10]:
# save sequences to file
out_filename = "../texts/trump-tweets-sequences-01.txt"
save_doc( sequences, out_filename )

In [11]:
in_filename = "../texts/trump-tweets-sequences-01.txt"
doc = load_doc( in_filename )
lines = doc.split( '\n' )
lines[ 0:10 ]

['just met with un secretarygeneral antónio guterres who is working hard to smartquoteopen make the united nations great again smartquoteclose when the un does more to solve conflicts around the world it means the us endperiod has less to do and we save money endperiod attweetatnikkihaley is doing a fantastic job',
 'met with un secretarygeneral antónio guterres who is working hard to smartquoteopen make the united nations great again smartquoteclose when the un does more to solve conflicts around the world it means the us endperiod has less to do and we save money endperiod attweetatnikkihaley is doing a fantastic job endexclamation',
 'with un secretarygeneral antónio guterres who is working hard to smartquoteopen make the united nations great again smartquoteclose when the un does more to solve conflicts around the world it means the us endperiod has less to do and we save money endperiod attweetatnikkihaley is doing a fantastic job endexclamation tweetlink',
 'un secretarygeneral a

## Convert Words to Index Values

In [12]:
# integer encode sequences of words
tokenizer = Tokenizer()
tokenizer.fit_on_texts( lines )
sequences = tokenizer.texts_to_sequences( lines )

In [17]:
print( sequences[ 0 ][ 0 ] )
line[ 0 ][ 0 ]

39


'w'

In [18]:
# elegant! https://stackoverflow.com/questions/41971587/how-to-convert-predicted-sequence-back-to-text-in-keras
sequences_to_texts = dict( map( reversed, tokenizer.word_index.items() ) ) 

In [19]:
sequences_to_texts[ 39 ]

'just'

In [20]:
print( len( sequences[ 0 ] ) == sequence_len )
print( len( sequences ) )

True
399086


In [21]:
print( len( tokenizer.word_index ) )
print( type( tokenizer.word_index ) )
print( tokenizer.word_index[ "just" ] )

21148
<class 'dict'>
39


In [22]:
# vocabulary size
vocab_size = len( tokenizer.word_index ) + 1
vocab_size

21149

In [23]:
# separate into input and output: for now it's 50 words input and 1 word output
sequences = np.array( sequences )
X = sequences[ :,:-1 ] # all rows, from word 0 up to, but not including, the last word
y = sequences[ :,-1 ]  # all rows, last word only
y = to_categorical( y, num_classes=vocab_size )
seq_length = X.shape[ 1 ]
seq_length

50

## Load and Filter GloVe Data

In [24]:
start_time = get_time()

# load the whole embedding into memory
embeddings_index = dict()
embeddings_dimension = 300 #must be 50, 100, 200, 300
glove = open( "../glove/glove.6B." + str( embeddings_dimension ) + "d.txt" )

for line in glove:
    
    values = line.split()
    # 1st string is word...
    word = values[ 0 ]
    
    if word in tokens_unique:
        
        # ...the rest are coefficients
        coefs = np.asarray( values[ 1: ], dtype='float32' )
        embeddings_index[ word ] = coefs
        #print( "*", end="" )
    
glove.close()
print( '\nLoaded %s word vectors.' % len( embeddings_index ) )
print( '\nWords not found %d.' % ( len( tokenizer.word_index ) - len( embeddings_index ) ) )
print_time( start_time, get_time() )

2018.05.22 09:35

Loaded 13766 word vectors.

Words not found 7382.
2018.05.22 09:37
Time to process: [109.39622044563293] seconds


### Transform into Matrix That Maps Coefs by Index

In [25]:
# create a weight matrix for words in training docs
embedding_matrix = np.zeros( ( vocab_size, embeddings_dimension ) )
missing_words = []

# we need this to create empty coefficients array
dummy_shape = embeddings_index[ "the" ].shape

for word, i in tokenizer.word_index.items():
    
    embedding_vector = embeddings_index.get( word )
    
    # not all words in our token list are in the wikipedia 400K set!
    if embedding_vector is None:
        
        # report and create empty coefficients array
        missing_words.append( word )
        embedding_vector = np.zeros( dummy_shape )
        
    embedding_matrix[ i ] = embedding_vector
    
print( len( missing_words ) )
missing_words

7382


['endperiod',
 'tweetlink',
 'endexclamation',
 'pausesemicolon',
 'pausecolon',
 'smartquoteopen',
 'smartquoteclose',
 'endquestion',
 'attweetatbarackobama',
 'attweetatfoxnews',
 'attweetatmittromney',
 'attweetatfoxandfriends',
 'hashtweethashmakeamericagreatagain',
 'attweetatapprenticenbc',
 'attweetatcnn',
 'hashtweethashcelebapprentice',
 'attweetatbarackobamas',
 'attweetatnytimes',
 'attweetatcelebapprentice',
 'hashtweethashmaga',
 'hashtweethashtimetogettough',
 'attweetatnbc',
 'twitlonger',
 'attweetatgretawire',
 'attweetativankatrump',
 'attweetatnewsmaxmedia',
 'realdonaldtrump',
 'attweetatseanhannity',
 'attweetatbillmaher',
 'attweetatmacys',
 'attweetatrealdonaldtrump',
 'attweetatoreillyfactor',
 'hashtweethashamericafirst',
 'barackobama',
 'attweetatwhitehouse',
 'hashtweethashdraintheswamp',
 'attweetattrumpdoral',
 'arod',
 'attweetatyankees',
 'daca',
 'makeamericagreatagain',
 'hashtweethashtrumpvlog',
 'attweetatgop',
 'attweetatbreitbartnews',
 'attweetat

In [26]:
# confirm visually that 
print( len( embedding_matrix[ 0 ] ) )
print( sum( embedding_matrix[ 0 ] ) )
empty_coefficients_count = 0

for i in range( len( embedding_matrix ) ):
    if sum( embedding_matrix[ i ] ) == 0:
        empty_coefficients_count += 1
        
empty_coefficients_count

300
0.0


7383

## Define Model

In [27]:
import keras
print( keras.__version__ )

import tensorflow as tf
print( tf.__version__ )

2.1.3
1.4.1


In [30]:
# define model
model = Sequential()

# now using a pre-trained, non-trainable embedding from glove's wiki analysis
model.add( Embedding( vocab_size, embeddings_dimension, weights=[embedding_matrix], input_length=seq_length, trainable=True ) )
model.add( LSTM( seq_length * 2, return_sequences=True ) )
model.add( LSTM( seq_length * 2 ) )
model.add( Dense( seq_length * 2, activation='relu' ) )

# fixed TypeError below, downgraded keras from 2.1.5 to 2.1.3: https://github.com/keras-team/keras/issues/9621
# TypeError: softmax() got an unexpected keyword argument 'axis'
model.add( Dense( vocab_size, activation='softmax' ) )

print( model.summary() )

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 50, 300)           6344700   
_________________________________________________________________
lstm_1 (LSTM)                (None, 50, 100)           160400    
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_1 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_2 (Dense)              (None, 21149)             2136049   
Total params: 8,731,649
Trainable params: 8,731,649
Non-trainable params: 0
_________________________________________________________________
None


## Fit the Model

In [31]:
# calc batch size
print( len( sequences ) / 128 )
print( len( sequences ) / 1028 )
# Was:
#batch_size = 124
#batch_size = 1024

# can't remember where I read that batch sizes larger than 512 cause erratic convergence patterns.
# TODO: find that article!
batch_size = 512


3117.859375
388.215953307393


## Load Model?

In [40]:
load = input( "Load model? [y/n]" )

if load == "y":
    
    model_name = "models/trump-tweets-w-links-n-ats-02.h5"
    print( "Loading model %s" % model_name )
    model = load_model( model_name )
    
else:
    
    print( "NOT loading model" )

Load model? [y/n]y
Loading model models/trump-tweets-w-links-n-ats-02.h5


In [74]:
start_time = get_time()
# compile model
model.compile( loss='categorical_crossentropy', optimizer='adam', metrics=[ 'accuracy' ] )
# fit model: recent version takes ~1.5 hrs for 50 epochs = ~33% accuracy
model.fit( X, y, batch_size=batch_size, epochs=50 )
end_time = get_time()
print_time( start_time, end_time, interval="hours" )

2018.05.22 17:19
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
2018.05.22 18:55
Time to process: [1.6028001518381967] hours


In [75]:
# save the model to file
model.save( "models/trump-tweets-w-links-n-ats-02.h5" )

# save the tokenizer
dump( tokenizer, open( "tokenizers/trump-tweets-w-links-n-ats-02.dump", 'wb' ) )

# save embedding_matrix based on wiki embeddings, complete w/ missing coefficients array dummies
dump( embedding_matrix, open( "embeddings/trump-tweats-w-links-n-ats-02.glove", 'wb' ) )


## Use The Model to Generate Text

In [62]:
seq_length = len( lines[ 0 ].split() ) - 1
seq_length

50

In [63]:
punctuation_dict.get( "smartquoteclose", "bar" )

'”'

In [64]:
def generate_seq( model, tokenizer, seq_length, seed_text, n_words ):
    
    result = list()
    in_text = seed_text
    yhat = [ 0.1 ]
    
    # generate a fixed number of words
    for _ in range( n_words ):
        
        # encode the text as integer
        encoded = tokenizer.texts_to_sequences( [ in_text ] )[ 0 ] 
        
        # truncate sequences to a fixed length
        encoded = pad_sequences( [ encoded ], maxlen=seq_length, truncating='pre' ) 
        
        # predict probabilities for each word
        yhat = model.predict_classes( encoded, verbose=0 )
        
        # map predicted word index to word
        out_word = sequences_to_texts[ yhat[ 0 ] ]
                
        # append to input
        in_text += ' ' + out_word
        
        #result.append( out_word )
        # substitute punctuation tags for actual punctuation
        result.append( punctuation_dict.get( out_word, out_word ) )
        
    print( yhat )
    return ' '.join( result )

In [65]:
type( sequences_to_texts[ 1 ] )

str

In [66]:
def reformat_punctuation( doc ):
    
    doc = doc.replace( ' . ', '. ' )
    doc = doc.replace( ' ! ', '! ' )
    doc = doc.replace( ' ? ', '? ' )
    doc = doc.replace( ' , ', ', ' )
    doc = doc.replace( ' : ', ': ' )
    doc = doc.replace( ' ; ', '; ' )
    
    doc = doc.replace( '“ ', '“' )
    doc = doc.replace( ' ”', '”' )
    doc = doc.replace( "attweetat", '@' )
    doc = doc.replace( "hashtweethash", '#' )
    
    return doc

In [76]:
# select a seed text
seed_text = lines[ randint( 0, len( lines ) ) ]
# substitute the seed words
raw_text = seed_text.split( " " )

clean_text = [ punctuation_dict.get( word, word ) for word in raw_text ]
clean_text = ' '.join( clean_text )

print( reformat_punctuation( clean_text ) + '... \n' )
#print( len( seed_text.split( " " ) ) )

# generate new text
generated = generate_seq( model, tokenizer, seq_length, seed_text, 50 )
print( "... " + generated )
print( "\n\n... " + reformat_punctuation( generated ) )

[link] congratulations to tom brady on yet another great victory tom is my friend and a total winner the deal with iran will go down as one of the most incompetent ever made. the us. lost on virtually every point. we just dont win anymore brithume i am... 

[438]
... in first place and a major election of my administration has the fbi . he strongly refused to look into the first days of with surveillance . who would invest the opportunity to the end they had a product . not to participate ! unbelievable evening live watch the lead


... in first place and a major election of my administration has the fbi. he strongly refused to look into the first days of with surveillance. who would invest the opportunity to the end they had a product. not to participate! unbelievable evening live watch the lead


In [77]:
my_input = input()

generated = generate_seq( model, tokenizer, seq_length, my_input, 50 )
print( "... " + reformat_punctuation( generated ) )

mexico
[77]
... agents dont jump until trump according to michigan first in history dept. [link] pennsylvania. send build wall smart [link] my @foxnews interview on @teamcavuto @seanhannity @todayshow [link] just arrived to quantico! [link] in the park hyatt in washington dc. on january! #americafirst [link] past am


In [None]:
# def generate_seq_word_by_word( model, tokenizer, seq_length, seed_text, n_words ):
    
#     print( "...", end='' )
#     #result = list()
#     in_text = seed_text
    
#     # generate a fixed number of words
#     for _ in range( n_words ):
        
#         # encode the text as integer
#         encoded = tokenizer.texts_to_sequences( [ in_text ] )[ 0 ] 
        
#         # truncate sequences to a fixed length
#         encoded = pad_sequences( [ encoded ], maxlen=seq_length, truncating='pre' ) 
        
#         # predict probabilities for each word
#         yhat = model.predict_classes( encoded, verbose=0 )
        
#         # map predicted word index to word
#         out_word = ''
#         for word, index in tokenizer.word_index.items():
#             if index == yhat:
#                 out_word = word
#                 print( word, end=' ' )
#                 break 
                
#         # append to input for next iteration
#         in_text += ' ' + out_word

In [None]:
# select a seed text
seed_text = lines[ randint( 0, len( lines ) ) ]
print( seed_text + '...\n' )

# generate new text
generate_seq_word_by_word( model, tokenizer, seq_length, seed_text, 25 )

In [58]:
my_input = input()

# generate new text
generate_seq_word_by_word( model, tokenizer, seq_length, my_input, 25 )

america is


NameError: name 'generate_seq_word_by_word' is not defined