### Based on: https://machinelearningmastery.com/use-word-embedding-layers-deep-learning-keras/ and https://machinelearningmastery.com/how-to-develop-a-word-level-neural-language-model-in-keras/ and https://github.com/stanfordnlp/GloVe

## Choose GPU to Run

In [1]:
# From: https://github.com/keras-team/keras/issues/6031
import os
gpu_id = input( "Select GPU [0 or 1]: " )

if gpu_id in [ "0", "1" ]:
    os.environ[ "CUDA_VISIBLE_DEVICES" ] = gpu_id
else:
    print( "Invalid GPU id.  Defaulting to '0,1'" )

Select GPU [0 or 1]: 1


## Choose CPU Cores

In [2]:
cores = 12
share_cores = input( "Share CPU cores w/ other models? [y/n]: " )

if share_cores == "y":
    
    cores = int( cores / 2 )

print( "Allocating %d cores to this notebook" % cores )

# From: https://stackoverflow.com/questions/46421258/limit-number-of-cores-used-in-keras

from keras import backend as K
K.set_session(
    K.tf.Session(
        config=K.tf.ConfigProto(
            intra_op_parallelism_threads=cores, inter_op_parallelism_threads=cores 
        )
    )
)

Share CPU cores w/ other models? [y/n]: y
Allocating 6 cores to this notebook


Using TensorFlow backend.


In [3]:
import gc
import re
import time
import datetime
import string
from pickle import dump
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding
from keras.models import load_model
import collections

import numpy as np

from random import randint
from keras.preprocessing.sequence import pad_sequences

def get_time( output=True ):
    
    temp = time.time()
    if output:
        now = datetime.datetime.now()
        print( now.strftime( "%Y.%m.%d %H:%M" ) )
        
    return temp

foo = get_time()

def print_time( start_time, end_time, interval="seconds" ):
    
    if interval == "hours":
        print ( "Time to process: [%s] hours" % ( str( ( end_time - start_time ) / 60 / 60 ) ) )
    else:
        print ( "Time to process: [%s] seconds" % ( str( end_time - start_time ) ) )

print_time( 0, 1 )


#in_filename = "../texts/alice-in-wonderland.txt"
#in_filename = "../texts/dr-zeuss-compilation.txt"
in_filename = "../texts/trump-tweets.txt"

2018.06.25 11:56
Time to process: [1] seconds


## Load Doc, Line by Line

In [4]:
# http://cmdlinetips.com/2011/08/three-ways-to-read-a-text-file-line-by-line-in-python/
def load_doc_by_line( filename ):
    
    # Open the file with read only permit
    file = open( filename, "r" )
    
    # use readlines to read all lines in the file
    # The variable "lines" is a list containing all lines in the file
    lines = file.readlines()
    
    # close the file after reading the lines.
    file.close()
    
    return lines

start_time = get_time()
tweets = load_doc_by_line( in_filename )
print_time( start_time, get_time() )

2018.06.25 11:56
2018.06.25 11:56
Time to process: [0.01025390625] seconds


## Look at Tweet Stats

In [5]:
# approximate words per tweet
word_count = 0
tweet_lens = 0
max_words = 0
max_idx = 0
min_idx = 0
min_words = 100
for i, tweet in enumerate( tweets ):
    
    tweet_lens += len( tweet )
    words = len( tweet.split( " " ) )
    word_count += words
    if words > max_words:
        max_words = words
        max_id = i
    if words < min_words:
        min_words = words
        min_idx = i

words_per_tweet = word_count / len( tweets )
chars_pre_tweet = tweet_lens / len( tweets )

print( "Tweets [%d], total words [%d], (mean words & chars)/tweet [%.2f] words, [%.2f] chars" % ( len( tweets ), word_count, words_per_tweet, chars_pre_tweet ) )
print( "Max/Min words [%d/%d] per tweet" % ( max_words, min_words ) )
print( "Max tweet:", tweets[ max_idx ] )
print( "Min tweet:", tweets[ min_idx ] )


Tweets [22322], total words [392403], (mean words & chars)/tweet [17.58] words, [112.10] chars
Max/Min words [62/1] per tweet
Max tweet: Just met with UN Secretary-General António Guterres who is working hard to “Make the United Nations Great Again.” When the UN does more to solve conflicts around the world it means the U.S. has less to do and we save money. @NikkiHaley is doing a fantastic job! https://t.co/pqUv6cyH2z

Min tweet: https://t.co/6VLQYAlcto



In [6]:
# add tweet oepn/close tags
tweets = [ "opentweetopen {} closetweetclose".format( tweet ) for tweet in tweets ]
tweets[ 0 ]          
          

'opentweetopen Just met with UN Secretary-General António Guterres who is working hard to “Make the United Nations Great Again.” When the UN does more to solve conflicts around the world it means the U.S. has less to do and we save money. @NikkiHaley is doing a fantastic job! https://t.co/pqUv6cyH2z\n closetweetclose'

In [7]:
# create doc from individual tweets
doc = " ".join( tweets )
tweets = None
gc.collect()

0

In [8]:
def load_doc( filename ):
    
    # open the file as read only
    file = open( filename, 'r' )
    # read all text
    text = file.read()
    # close the file
    file.close()
    
    return text

# # load document
# doc = load_doc( in_filename )
# print( doc[ :200 ] )


In [9]:
print( doc[ :800 ] )

opentweetopen Just met with UN Secretary-General António Guterres who is working hard to “Make the United Nations Great Again.” When the UN does more to solve conflicts around the world it means the U.S. has less to do and we save money. @NikkiHaley is doing a fantastic job! https://t.co/pqUv6cyH2z
 closetweetclose opentweetopen America is a Nation that believes in the power of redemption. America is a Nation that believes in second chances - and America is a Nation that believes that the best is always yet to come! #PrisonReform https://t.co/Yk5UJUYgHN
 closetweetclose opentweetopen We grieve for the terrible loss of life and send our support and love to everyone affected by this horrible attack in Texas. To the students families teachers and personnel at Santa Fe High School – we are wit


In [10]:
# my_punctuation = string.punctuation
# print( type( my_punctuation ) )
# print( my_punctuation )
my_punctuation = '"#$%&\'()*+,-/:;<=>@[\\]^_`{|}~'
my_punctuation

'"#$%&\'()*+,-/:;<=>@[\\]^_`{|}~'

## Build Encoded Punctuation to Punctuation Dictionary

In [11]:
punctuation_dict = {}
punctuation_dict[ "endperiod" ] = "."
punctuation_dict[ "endquestion" ] = "?"
punctuation_dict[ "endexclamation" ] = "!"
punctuation_dict[ "pausecomma" ] = ","
punctuation_dict[ "pausecolon" ] = ":"
punctuation_dict[ "pausesemicolon" ] = ";"
punctuation_dict[ "smartquoteopen" ] = '“'
punctuation_dict[ "smartquoteclose" ] = '”'
punctuation_dict[ "attweetat" ] = '@'
punctuation_dict[ "tweetlink" ] = "[link]"
punctuation_dict[ "hashtweethash" ] = '#'
punctuation_dict[ "opentweetopen" ] = '[start]'
punctuation_dict[ "closetweetclose" ] = '[end]'


In [12]:
# turn a doc into clean tokens
def clean_doc( doc, to_lower=True ):
    
    # replace '--' with a space ' '
    doc = doc.replace( '--', ' ' )
    # replace sentence simple sentence boundaries w/ unique token/markers
    doc = doc.replace( '. ', ' endperiod ' )
    doc = doc.replace( '! ', ' endexclamation ' )
    doc = doc.replace( '? ', ' endquestion ' )
    doc = doc.replace( ', ', ' pausecomma ' )
    doc = doc.replace( ': ', ' pausecolon ' )
    doc = doc.replace( '; ', ' pausesemicolon ' )
    doc = doc.replace( '“', 'smartquoteopen ' )
    doc = doc.replace( '”', ' smartquoteclose' )
    doc = doc.replace( "@ ", " " ) # remove trailing @'s first...
    doc = doc.replace( " @", " attweetat" ) # ...then encode 1st char @'s
    doc = doc.replace( "# ", " " ) # remove trailing #'s first...
    doc = doc.replace( " #", " hashtweethash" ) # ...then encode 1st char #'s
    
    # replace links w/ "tweetlink"
    # basic regex here: https://bytes.com/topic/python/answers/741677-find-replace-hyperlinks-string
    http_pattern = r'http[^\s\n\r]+'
    doc = re.sub( http_pattern , "tweetlink", doc )
    
    # split into tokens by white space
    tokens = doc.split()
    
    # remove punctuation from each token
    table = str.maketrans( '', '', string.punctuation ) # will strip all .?!,:; that don't fit replace expr above.
    #table = str.maketrans( '', '', my_punctuation )
    tokens = [ w.translate( table ) for w in tokens ]
    
    # remove remaining tokens that are not alphabetic
    if to_lower:
        tokens = [ word for word in tokens if word.isalpha() ]
    
    # make lower case
    tokens = [ word.lower() for word in tokens ] 
    
    return tokens

## Load and Clean Doc

In [13]:
start_time = get_time()

# clean document
tokens = clean_doc( doc )
tokens_unique = list( set( tokens ) )
print( tokens[ :100 ] )
print( 'Total Tokens: %d' % len( tokens ) )
print( 'Unique Tokens: %d' % len( tokens_unique ) )

print_time( start_time, get_time() )

# 2018.06.01 10:14
# Total Tokens: 399137
# Unique Tokens: 21148
# Time to process: [0.31341099739074707] seconds

2018.06.25 11:56
['opentweetopen', 'just', 'met', 'with', 'un', 'secretarygeneral', 'antónio', 'guterres', 'who', 'is', 'working', 'hard', 'to', 'smartquoteopen', 'make', 'the', 'united', 'nations', 'great', 'again', 'smartquoteclose', 'when', 'the', 'un', 'does', 'more', 'to', 'solve', 'conflicts', 'around', 'the', 'world', 'it', 'means', 'the', 'us', 'endperiod', 'has', 'less', 'to', 'do', 'and', 'we', 'save', 'money', 'endperiod', 'attweetatnikkihaley', 'is', 'doing', 'a', 'fantastic', 'job', 'endexclamation', 'tweetlink', 'closetweetclose', 'opentweetopen', 'america', 'is', 'a', 'nation', 'that', 'believes', 'in', 'the', 'power', 'of', 'redemption', 'endperiod', 'america', 'is', 'a', 'nation', 'that', 'believes', 'in', 'second', 'chances', 'and', 'america', 'is', 'a', 'nation', 'that', 'believes', 'that', 'the', 'best', 'is', 'always', 'yet', 'to', 'come', 'endexclamation', 'hashtweethashprisonreform', 'tweetlink', 'closetweetclose', 'opentweetopen', 'we', 'grieve', 'for']
Total To

## Look at Token Frequencies

In [14]:
word_counts = collections.Counter( tokens )
first2pairs = { k: word_counts[ k ] for k in list( word_counts )[ :10 ] }
first2pairs

{'antónio': 1,
 'guterres': 1,
 'is': 6207,
 'just': 1334,
 'met': 51,
 'opentweetopen': 22322,
 'secretarygeneral': 1,
 'un': 37,
 'who': 1028,
 'with': 2401}

In [15]:
for word, count in word_counts.most_common( 25 ):
    print( '%s: %7d' % ( word, count ) )

opentweetopen:   22322
closetweetclose:   22322
the:   15056
endperiod:   14523
to:    9223
a:    6911
and:    6876
tweetlink:    6742
is:    6207
of:    6100
in:    5728
on:    4075
for:    4062
i:    4061
you:    3687
be:    3470
will:    3460
endexclamation:    3153
great:    3051
that:    2463
it:    2446
at:    2413
with:    2401
are:    2397
pausesemicolon:    2246


In [16]:
start_time = get_time()

# organize into sequences of tokens
sequence_len = 50 + 1
sequences = list()

for i in range( sequence_len, len( tokens ) ):
    
    # select sequence of tokens
    seq = tokens[ i - sequence_len:i ]
    
    # convert into a line
    line = ' '.join( seq )
    
    # store
    sequences.append( line )
    
print( 'Total Sequences: %d' % len( sequences ) )
print_time( start_time, get_time() )


2018.06.25 11:56
Total Sequences: 443730
2018.06.25 11:56
Time to process: [0.37273311614990234] seconds


In [17]:
# save tokens to file, one dialog per line
def save_doc( lines, filename ):
    
    data = '\n'.join( lines )
    file = open( filename, 'w' )
    file.write( data )
    file.close()

In [18]:
# save sequences to file
out_filename = "../texts/trump-tweets-sequences-02.txt"
save_doc( sequences, out_filename )

In [51]:
in_filename = "../texts/trump-tweets-sequences-02.txt"
#doc = load_doc( in_filename )
lines = load_doc( in_filename ).split( '\n' )
lines[ 0:10 ]

['opentweetopen just met with un secretarygeneral antónio guterres who is working hard to smartquoteopen make the united nations great again smartquoteclose when the un does more to solve conflicts around the world it means the us endperiod has less to do and we save money endperiod attweetatnikkihaley is doing a fantastic',
 'just met with un secretarygeneral antónio guterres who is working hard to smartquoteopen make the united nations great again smartquoteclose when the un does more to solve conflicts around the world it means the us endperiod has less to do and we save money endperiod attweetatnikkihaley is doing a fantastic job',
 'met with un secretarygeneral antónio guterres who is working hard to smartquoteopen make the united nations great again smartquoteclose when the un does more to solve conflicts around the world it means the us endperiod has less to do and we save money endperiod attweetatnikkihaley is doing a fantastic job endexclamation',
 'with un secretarygeneral an

## Convert Words to Index Values

In [52]:
# integer encode sequences of words
tokenizer = Tokenizer()
tokenizer.fit_on_texts( lines )
sequences = tokenizer.texts_to_sequences( lines )

In [54]:
# iterate lists of lists, and get lens
seq_len_sum = 0
seq_len_dict = {}

for seq in sequences:
    
    seq_len_sum += len( seq )
    if len( seq ) in seq_len_dict:
        seq_len_dict[ len( seq ) ] += 1
    else:
        seq_len_dict[ len( seq ) ] = 1
        
print( seq_len_sum / len( sequences ) ) 
print( seq_len_dict )

51.0
{51: 443730}


In [48]:
print( type( sequences ) )
print( type( sequences[ 0 ] ) )
print( type( sequences[ 0 ][ 0 ] ) )
print()
print( type( sequences ) )
print( type( sequences[ 0:1 ] ) )
print( type( sequences[ 0:1 ][ 0 ] ) )
print( sequences[ 0:1 ][ 0 ].shape )


<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.int64'>

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(51,)


In [46]:
print( type( sequences[ 0:1 ][ 0 ] ) )
print( sequences[ 0:1 ][ 0 ] )

<class 'numpy.ndarray'>
[    2    41   937    23  1241 21062 21061 21060    56     9   259   160
     5    43    71     3   267   742    19    87    44    89     3  1241
   212    69     5  1303  5362   540     3   141    21   998     3    51
     4    38   572     5    59     7    28   818   154     4  7587     9
   119     6   235]


In [43]:
print( type( sequences ) )
print( type( sequences[ 0:1 ] ) )
print( type( sequences[ 0:1 ][ 0 ] ) )
print()
print( sequences.shape )
print( type( sequences[ 0:1 ][ 0 ] ) )
print( sequences[ 0:1 ][ 0 ] )

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>

(443730, 51)
<class 'numpy.ndarray'>
[    2    41   937    23  1241 21062 21061 21060    56     9   259   160
     5    43    71     3   267   742    19    87    44    89     3  1241
   212    69     5  1303  5362   540     3   141    21   998     3    51
     4    38   572     5    59     7    28   818   154     4  7587     9
   119     6   235]


In [22]:
# elegant! https://stackoverflow.com/questions/41971587/how-to-convert-predicted-sequence-back-to-text-in-keras
sequences_to_texts = dict( map( reversed, tokenizer.word_index.items() ) ) 

In [23]:
sequences_to_texts[ 39 ]

'thank'

In [24]:
print( len( sequences[ 0 ] ) == sequence_len )
print( len( sequences ) )

True
443730


In [25]:
print( len( tokenizer.word_index ) )
print( type( tokenizer.word_index ) )
print( tokenizer.word_index[ "thank" ] )

21062
<class 'dict'>
39


In [26]:
# vocabulary size
vocab_size = len( tokenizer.word_index ) + 1
vocab_size

21063

In [38]:
print( type( sequences ))
print( sequences.shape )

print( sequences[ 0 ] )
print( type( sequences[ 0 ] ) )

# separate into input and output: for now it's 50 words input and 1 word output
sequences = np.array( sequences )
X = sequences[ :,:-1 ] # all rows, from word 0 up to, but not including, the last word
y = sequences[ :,-1 ]  # all rows, last word only

# Throws MemoryError
# https://stackoverflow.com/questions/46293734/memoryerror-in-keras-utils-np-utils-to-categorical
#y = to_categorical( y, num_classes=vocab_size )

seq_length = X.shape[ 1 ]
seq_length

<class 'numpy.ndarray'>
(443730, 51)
[    2    41   937    23  1241 21062 21061 21060    56     9   259   160
     5    43    71     3   267   742    19    87    44    89     3  1241
   212    69     5  1303  5362   540     3   141    21   998     3    51
     4    38   572     5    59     7    28   818   154     4  7587     9
   119     6   235]
<class 'numpy.ndarray'>


50

## Load and Filter GloVe Data

In [28]:
start_time = get_time()

# load the whole embedding into memory
embeddings_index = dict()
embeddings_dimension = 300 #must be 50, 100, 200, 300
glove = open( "../glove/glove.6B." + str( embeddings_dimension ) + "d.txt" )

for line in glove:
    
    values = line.split()
    # 1st string is word...
    word = values[ 0 ]
    
    if word in tokens_unique:
        
        # ...the rest are coefficients
        coefs = np.asarray( values[ 1: ], dtype='float32' )
        embeddings_index[ word ] = coefs
        #print( "*", end="" )
    
glove.close()
print( '\nLoaded %s word vectors.' % len( embeddings_index ) )
print( '\nWords not found %d.' % ( len( tokenizer.word_index ) - len( embeddings_index ) ) )
print_time( start_time, get_time() )

2018.06.25 11:56

Loaded 13764 word vectors.

Words not found 7298.
2018.06.25 11:58
Time to process: [112.34948635101318] seconds


### Transform into Matrix That Maps Coefs by Index

In [29]:
# create a weight matrix for words in training docs
embedding_matrix = np.zeros( ( vocab_size, embeddings_dimension ) )
missing_words = []

# we need this to create empty coefficients array
dummy_shape = embeddings_index[ "the" ].shape

for word, i in tokenizer.word_index.items():
    
    embedding_vector = embeddings_index.get( word )
    
    # not all words in our token list are in the wikipedia 400K set!
    if embedding_vector is None:
        
        # report and create empty coefficients array
        missing_words.append( word )
        embedding_vector = np.zeros( dummy_shape )
        
    embedding_matrix[ i ] = embedding_vector
    
print( len( missing_words ) )
missing_words

7298


['closetweetclose',
 'opentweetopen',
 'endperiod',
 'tweetlink',
 'endexclamation',
 'pausesemicolon',
 'pausecolon',
 'smartquoteopen',
 'smartquoteclose',
 'endquestion',
 'attweetatbarackobama',
 'attweetatfoxnews',
 'hashtweethashmakeamericagreatagain',
 'attweetatmittromney',
 'attweetatfoxandfriends',
 'attweetatapprenticenbc',
 'hashtweethashcelebapprentice',
 'attweetatcnn',
 'attweetatbarackobamas',
 'attweetatnytimes',
 'attweetatcelebapprentice',
 'hashtweethashmaga',
 'hashtweethashtimetogettough',
 'hashtweethashtrumpvlog',
 'attweetatnbc',
 'twitlonger',
 'attweetatgretawire',
 'attweetativankatrump',
 'attweetatnewsmaxmedia',
 'attweetatseanhannity',
 'attweetatrealdonaldtrump',
 'attweetatbillmaher',
 'attweetatmacys',
 'hashtweethashamericafirst',
 'attweetatoreillyfactor',
 'realdonaldtrump',
 'hashtweethashdraintheswamp',
 'attweetattrumpdoral',
 'attweetatwhitehouse',
 'arod',
 'attweetatyankees',
 'daca',
 'attweetatgop',
 'attweetatbreitbartnews',
 'hashtweethash

In [30]:
# confirm visually that 
print( len( embedding_matrix[ 0 ] ) )
print( sum( embedding_matrix[ 0 ] ) )
empty_coefficients_count = 0

for i in range( len( embedding_matrix ) ):
    if sum( embedding_matrix[ i ] ) == 0:
        empty_coefficients_count += 1
        
empty_coefficients_count

300
0.0


7299

In [36]:
print( "sequences[ 0 ]", sequences[ 0 ] )
print()
print( "X[ 0:1 ]", X[ 0:1 ] )
print()
print( "y[ 0:3 ]", y[ 0:3 ] )
print( "X.shape", X.shape )
seq_length = len( X[ 0 ] )
print( "seq_length", seq_length )
print( "type( X )", type( X ) )
print( "type( y )", type( y ) )

sequences[ 0 ] [    2    41   937    23  1241 21062 21061 21060    56     9   259   160
     5    43    71     3   267   742    19    87    44    89     3  1241
   212    69     5  1303  5362   540     3   141    21   998     3    51
     4    38   572     5    59     7    28   818   154     4  7587     9
   119     6   235]

X[ 0:1 ] [[    2    41   937    23  1241 21062 21061 21060    56     9   259   160
      5    43    71     3   267   742    19    87    44    89     3  1241
    212    69     5  1303  5362   540     3   141    21   998     3    51
      4    38   572     5    59     7    28   818   154     4  7587     9
    119     6]]

y[ 0:3 ] [235 124  18]
X.shape (443730, 50)
seq_length 50
type( X ) <class 'numpy.ndarray'>
type( y ) <class 'numpy.ndarray'>


## Define Model

In [31]:
import keras
print( keras.__version__ )

import tensorflow as tf
print( tf.__version__ )

2.1.3
1.4.1


In [32]:
# define model
model = Sequential()

# now using a pre-trained, non-trainable embedding from glove's wiki analysis
model.add( Embedding( vocab_size, embeddings_dimension, weights=[embedding_matrix], input_length=seq_length, trainable=True ) )
model.add( LSTM( seq_length * 2, return_sequences=True ) )
model.add( LSTM( seq_length * 2 ) )
model.add( Dense( seq_length * 2, activation='relu' ) )

# fixed TypeError below, downgraded keras from 2.1.5 to 2.1.3: https://github.com/keras-team/keras/issues/9621
# TypeError: softmax() got an unexpected keyword argument 'axis'
model.add( Dense( vocab_size, activation='softmax' ) )

print( model.summary() )

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 50, 300)           6318900   
_________________________________________________________________
lstm_1 (LSTM)                (None, 50, 100)           160400    
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_1 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_2 (Dense)              (None, 21063)             2127363   
Total params: 8,697,163
Trainable params: 8,697,163
Non-trainable params: 0
_________________________________________________________________
None


## Fit the Model

In [32]:
# calc batch size
print( len( sequences ) / 128 )
print( len( sequences ) / 1028 )
# Was:
#batch_size = 124
batch_size = 1024

# can't remember where I read that batch sizes larger than 512 cause erratic convergence patterns.
# TODO: find that article!
#batch_size = 512


3466.640625
431.64396887159535


## Load Model?

In [33]:
load = input( "Load model? [y/n]" )

if load == "y":
    
    model_name = "models/trump-tweets-w-links-n-ats-take-II.h5"
    print( "Loading model %s" % model_name )
    model = load_model( model_name )
    
else:
    
    print( "NOT loading model, using default untrained model" )

Load model? [y/n]y
Loading model models/trump-tweets-w-links-n-ats-03.h5


In [34]:
start_time = get_time()
# compile model

# Per comment here: https://stackoverflow.com/questions/46293734/memoryerror-in-keras-utils-np-utils-to-categoricalhttps://stackoverflow.com/questions/46293734/memoryerror-in-keras-utils-np-utils-to-categorical
model.compile( loss='sparse_categorical_crossentropy', optimizer='adam', metrics=[ 'accuracy' ] )
# model.compile( loss='categorical_crossentropy', optimizer='adam', metrics=[ 'accuracy' ] )
# fit model: recent version takes ~1.5 hrs for 50 epochs = ~33% accuracy
model.fit( X, y, batch_size=batch_size, epochs=200 )
end_time = get_time()
print_time( start_time, end_time, interval="hours" )

# was 115s/epoch, before GTX 1080 card, 94s/epoch after
# Now 81s when batch size doubled to 1,024... And it converges faster than 512 batch.  Wuh?!?

# 2018.06.05 18:21
# Time to process: [1.338104132546319] hours for 100 epochs

2018.06.06 10:54
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoc

Epoch 155/200
Epoch 156/200
Epoch 157/200
Epoch 158/200
Epoch 159/200
Epoch 160/200
Epoch 161/200
Epoch 162/200
Epoch 163/200
Epoch 164/200
Epoch 165/200
Epoch 166/200
Epoch 167/200
Epoch 168/200
Epoch 169/200
Epoch 170/200
Epoch 171/200
Epoch 172/200
Epoch 173/200
Epoch 174/200
Epoch 175/200
Epoch 176/200
Epoch 177/200
Epoch 178/200
Epoch 179/200
Epoch 180/200
Epoch 181/200
Epoch 182/200
Epoch 183/200
Epoch 184/200
Epoch 185/200
Epoch 186/200
Epoch 187/200
Epoch 188/200
Epoch 189/200
Epoch 190/200
Epoch 191/200
Epoch 192/200
Epoch 193/200
Epoch 194/200
Epoch 195/200
Epoch 196/200
Epoch 197/200
Epoch 198/200
Epoch 199/200
Epoch 200/200
2018.06.06 13:36
Time to process: [2.706995360387696] hours


In [51]:
# save the whole model to file
model.save( "models/trump-tweets-w-links-n-ats-take-II.h5" )

# save the tokenizer
dump( tokenizer, open( "tokenizers/trump-tweets-w-links-n-ats-take-II.dump", 'wb' ) )

# save embedding_matrix based on wiki embeddings, complete w/ missing coefficients array dummies
dump( embedding_matrix, open( "embeddings/trump-tweats-w-links-n-ats-take-II.glove", 'wb' ) )


## Use The Model to Generate Text

In [52]:
seq_length = len( lines[ 0 ].split() ) - 1
seq_length

50

In [53]:
punctuation_dict.get( "smartquoteclose", "bar" )

'”'

In [54]:
def generate_seq( model, tokenizer, seq_length, seed_text, n_words ):
    
    result = list()
    in_text = seed_text
    yhat = [ 0.1 ]
    
    # generate a fixed number of words
    for _ in range( n_words ):
        
        # encode the text as integer
        encoded = tokenizer.texts_to_sequences( [ in_text ] )[ 0 ] 
        
        # truncate sequences to a fixed length
        encoded = pad_sequences( [ encoded ], maxlen=seq_length, truncating='pre' ) 
        
        # predict probabilities for each word
        yhat = model.predict_classes( encoded, verbose=0 )
        
        # map predicted word index to word
        out_word = sequences_to_texts[ yhat[ 0 ] ]
                
        # append to input
        in_text += ' ' + out_word
        
        #result.append( out_word )
        # substitute punctuation tags for actual punctuation
        result.append( punctuation_dict.get( out_word, out_word ) )
        
        if out_word == "closetweetclose":
            #print( "Tweet end detected" )
            break
            
    print( yhat )
    return ' '.join( result )

In [43]:
type( sequences_to_texts[ 1 ] )

str

In [44]:
def reformat_punctuation( doc ):
    
    doc = doc.replace( ' . ', '. ' )
    doc = doc.replace( ' ! ', '! ' )
    doc = doc.replace( ' ? ', '? ' )
    doc = doc.replace( ' , ', ', ' )
    doc = doc.replace( ' : ', ': ' )
    doc = doc.replace( ' ; ', '; ' )
    
    doc = doc.replace( '“ ', '“' )
    doc = doc.replace( ' ”', '”' )
    doc = doc.replace( "attweetat", '@' )
    doc = doc.replace( "hashtweethash", '#' )
    
    return doc

In [49]:
# select a seed text
seed_text = lines[ randint( 0, len( lines ) ) ]
# substitute the seed words
raw_text = seed_text.split( " " )

clean_text = [ punctuation_dict.get( word, word ) for word in raw_text ]
clean_text = ' '.join( clean_text )

print( reformat_punctuation( clean_text ) + '... \n' )
#print( len( seed_text.split( " " ) ) )

# generate new text
generated = generate_seq( model, tokenizer, seq_length, seed_text, 50 )
# print( "... " + generated )
print( "\n\n... " + reformat_punctuation( generated ) )

to others but know the final decision is yours” think like a champion [end] [start] along with a soaring bar of skybound gold pool deck overlooks the city of lights [link] [end] [start] an architectural landmark @trumptowerny offers sweeping panoramic views of fifth avenue [link] [end] [start] “age is... 

[1]


... whatever you think your socalled art of the deal on my life. know your and work amp; very. really badly really bravelythankavet rally with the #superbowl and its time. we said and do nothing not increase our country [end]


In [50]:
my_input = input()
"opentweetopen " + my_input
generated = generate_seq( model, tokenizer, seq_length, my_input, 50 )
print( "... " + reformat_punctuation( generated ) )

“age is
[1]
... big [link] [link] [end]
