### Based on: https://machinelearningmastery.com/use-word-embedding-layers-deep-learning-keras/ and https://machinelearningmastery.com/how-to-develop-a-word-level-neural-language-model-in-keras/ and https://github.com/stanfordnlp/GloVe

## Choose GPU to Run

In [1]:
# From: https://github.com/keras-team/keras/issues/6031
import os
gpu_id = input( "Select GPU [0 or 1]: " )

if gpu_id in [ "0", "1" ]:
    os.environ[ "CUDA_VISIBLE_DEVICES" ] = gpu_id
else:
    print( "Invalid GPU id.  Defaulting to '0,1'" )

Select GPU [0 or 1]: 0


## Choose CPU Cores

In [2]:
cores = 12
share_cores = input( "Share CPU cores w/ other models? [y/n]: " )

if share_cores == "y":
    
    cores = int( cores / 2 )

print( "Allocating %d cores to this notebook" % cores )

# From: https://stackoverflow.com/questions/46421258/limit-number-of-cores-used-in-keras

from keras import backend as K
K.set_session(
    K.tf.Session(
        config=K.tf.ConfigProto(
            intra_op_parallelism_threads=cores, inter_op_parallelism_threads=cores 
        )
    )
)

Share CPU cores w/ other models? [y/n]: y
Allocating 6 cores to this notebook


Using TensorFlow backend.


In [6]:
import gc
import re
import time
import datetime
import string
import pickle
import tqdm
from pickle import dump
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Bidirectional
from keras.layers import Dropout
from keras.layers import Embedding
from keras.models import load_model
import collections
from wordsegment import load, segment
import numpy as np
from spacy.vectors import Vectors
from random import randint
from keras.preprocessing.sequence import pad_sequences

def get_time( output=True ):
    
    temp = time.time()
    if output:
        now = datetime.datetime.now()
        print( now.strftime( "%Y.%m.%d %H:%M" ) )
        
    return temp

start_time = get_time()

def print_time( start_time, end_time, interval="seconds" ):
    
    if interval == "hours":
        print ( "Time to process: [%s] hours" % ( str( ( end_time - start_time ) / 60 / 60 ) ) )
    else:
        print ( "Time to process: [%s] seconds" % ( str( end_time - start_time ) ) )

in_filename = "../texts/trump-tweets.txt"
#in_filename = "../texts/trump-speeches.txt"

# load segmentation dictionary: http://www.grantjenks.com/docs/wordsegment/
load()

print_time( start_time, get_time() )

# Global paths
training_X_path = "data/training-X-trump-tweets-w-links-n-ats-take-III.dump"
training_y_path = "data/training-y-trump-tweets-w-links-n-ats-take-III.dump"
model_path = "models/trump-tweets-w-links-n-ats-take-III.h5"
tokenizer_path = "tokenizers/trump-tweets-w-links-n-ats-take-III.dump"
embedding_path = "embeddings/trump-tweats-w-links-n-ats-take-III.glove"
embedding_keys_path = "embeddings/trump-tweats-w-links-n-ats-take-III.keys"
#embeddings_spacy_numpy_path = "embeddings/trump-tweats-w-links-n-ats-take-III-spacy.np"


2018.07.09 14:27
2018.07.09 14:27
Time to process: [0.33530759811401367] seconds


## Load Doc, Line by Line

In [2]:
# http://cmdlinetips.com/2011/08/three-ways-to-read-a-text-file-line-by-line-in-python/
def load_doc_by_line( filename ):
    
    # Open the file with read only permit
    file = open( filename, "r" )
    
    # use readlines to read all lines in the file
    # The variable "lines" is a list containing all lines in the file
    lines = file.readlines()
    
    # close the file after reading the lines.
    file.close()
    
    return lines

start_time = get_time()
tweets = load_doc_by_line( in_filename )
print_time( start_time, get_time() )

2018.07.03 20:23
2018.07.03 20:23
Time to process: [0.017048120498657227] seconds


## Look at Tweet Stats

In [3]:
# approximate words per tweet
word_count = 0
tweet_lens = 0
max_words = 0
max_idx = 0
min_idx = 0
min_words = 100

for i, tweet in enumerate( tweets ):
    
    tweet_lens += len( tweet )
    words = len( tweet.split( " " ) )
    word_count += words
    
    if words > max_words:
        max_words = words
        max_id = i
    if words < min_words:
        min_words = words
        min_idx = i

words_per_tweet = word_count / len( tweets )
chars_pre_tweet = tweet_lens / len( tweets )

print( "Tweets [%d], total words [%d], (mean words & chars)/tweet [%.2f] words, [%.2f] chars" % ( len( tweets ), word_count, words_per_tweet, chars_pre_tweet ) )
print( "Max/Min words [%d/%d] per tweet" % ( max_words, min_words ) )
print( "Max tweet:", tweets[ max_idx ] )
print( "Min tweet:", tweets[ min_idx ] )


Tweets [22322], total words [392403], (mean words & chars)/tweet [17.58] words, [112.10] chars
Max/Min words [62/1] per tweet
Max tweet: Just met with UN Secretary-General António Guterres who is working hard to “Make the United Nations Great Again.” When the UN does more to solve conflicts around the world it means the U.S. has less to do and we save money. @NikkiHaley is doing a fantastic job! https://t.co/pqUv6cyH2z

Min tweet: https://t.co/6VLQYAlcto



## Add Tweet OPEN/CLOSE Tags

In [4]:
# add tweet oepn/close tags
tweets = [ "opentweetopen {} closetweetclose".format( tweet ) for tweet in tweets ]
print( tweets[ 0 ] )
          
# create doc from individual tweets
doc = " ".join( tweets )
doc[ :400 ]
# tweets = None
# gc.collect()

opentweetopen Just met with UN Secretary-General António Guterres who is working hard to “Make the United Nations Great Again.” When the UN does more to solve conflicts around the world it means the U.S. has less to do and we save money. @NikkiHaley is doing a fantastic job! https://t.co/pqUv6cyH2z
 closetweetclose


'opentweetopen Just met with UN Secretary-General António Guterres who is working hard to “Make the United Nations Great Again.” When the UN does more to solve conflicts around the world it means the U.S. has less to do and we save money. @NikkiHaley is doing a fantastic job! https://t.co/pqUv6cyH2z\n closetweetclose opentweetopen America is a Nation that believes in the power of redemption. America'

## Build Encoded Punctuation to Punctuation Dictionary

In [5]:
punctuation_dict = {}
punctuation_dict[ "endperiod" ] = "."
punctuation_dict[ "endquestion" ] = "?"
punctuation_dict[ "endexclamation" ] = "!"
punctuation_dict[ "pausecomma" ] = ","
punctuation_dict[ "pausecolon" ] = ":"
punctuation_dict[ "pausesemicolon" ] = ";"
punctuation_dict[ "pauseemdash" ] = "-"
punctuation_dict[ "pausedash" ] = "-"
punctuation_dict[ "smartquoteopen" ] = '“'
punctuation_dict[ "smartquoteclose" ] = '”'
punctuation_dict[ "quoteopen" ] = '"'
punctuation_dict[ "quoteclose" ] = '"'
punctuation_dict[ "attweetat" ] = '@'
punctuation_dict[ "tweetlink" ] = "[link]"
punctuation_dict[ "hashtweethash" ] = '#'
punctuation_dict[ "opentweetopen" ] = '[start]'
punctuation_dict[ "closetweetclose" ] = '[end]'
punctuation_dict[ "ampersand" ] = '&'
punctuation_dict[ "tweetelipsis" ] = "..."

punctuation_dict[ "contractionopen" ] = "contractionopen"
punctuation_dict[ "contractionclose" ] = "contractionclose"


In [6]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [7]:
print( re.sub( r"([a-z])'([a-z])", r"\1\2", "DIDN'T" , 0, re.IGNORECASE ) )
print( re.sub( r"([a-z])-([a-z])", r"\1 hyphentweethyphen \2", "italiaN-aAmerican-FOOD" , 0, re.IGNORECASE ) )
print( re.sub( ' +', ' ', "How's this     for a   lot  of space?" ) )

DIDNT
italiaN hyphentweethyphen aAmerican hyphentweethyphen FOOD
How's this for a lot of space?


In [8]:
print( re.sub( r"([a-z])’([a-z])", r"\1\2", "don’t" , 0, re.IGNORECASE ) )
print( re.sub( r"([a-z])’ ", r"\1 ", "goin’ " , 0, re.IGNORECASE ) )
print( re.sub( r"’([a-z])", r"\1", "’till" , 0, re.IGNORECASE ) )
print( "–" == "—" )


dont
goin 
till
False


In [9]:
# turn a doc into clean tokens
punctuation_string = '!‘"\'#$%&()*+,-./:;<=>?@[\\]^_`{|}~'

def clean_doc( doc ):
    
    # multiple kinds of emdash!
    doc = doc.replace( '--', ' pauseemdash ' )
    doc = doc.replace( '—', ' pauseemdash ' )
    doc = doc.replace( '–', ' pauseemdash ' )
    doc = doc.replace( 'u.s.', ' US ' )
    doc = doc.replace( 'U.S.', ' US ' )
    doc = doc.replace( "…", " tweetelipsis " )
    doc = doc.replace( "...", " tweetelipsis " )
    doc = doc.replace( '. ', ' endperiod ' )
    doc = doc.replace( '! ', ' endexclamation ' )
    doc = doc.replace( '? ', ' endquestion ' )
    doc = doc.replace( '•', ' endbullet ' )
    doc = doc.replace( ', ', ' pausecomma ' )
    doc = doc.replace( ': ', ' pausecolon ' )
    doc = doc.replace( '; ', ' pausesemicolon ' )
    doc = doc.replace( ' - ', ' pausedash ' )
    doc = doc.replace( '“', ' smartquoteopen ' )
    doc = doc.replace( '”', ' smartquoteclose ' )
    doc = doc.replace( ' "', ' quoteopen ' )
    doc = doc.replace( '" ', ' quoteclose ' )
    doc = doc.replace( "@ ", " " ) # remove trailing @'s first...
    doc = doc.replace( "@", "attweetat" ) # ...then prefix 1st char @ as word
    doc = doc.replace( "# ", " " ) # remove trailing #'s first...
    doc = doc.replace( "#", "hashtweethash" ) # ...then prefix 1st char # as word
    doc = doc.replace( "&amp;", " ampersand " )
    
    # From: https://stackoverflow.com/questions/33113338/how-to-replace-dash-between-characters-with-space-using-regex
    # replace hyphenated words w/ spaces
    doc = re.sub( r"([a-z])\-([a-z])", r"\1 \2", doc , 0, re.IGNORECASE )
    
    # replace comma separated words w/0 spaces
    doc = re.sub( r"([a-z]),([a-z])", r"\1 \2", doc , 0, re.IGNORECASE )
    
    # replace links w/ "tweetlink"
    # basic regex here: https://bytes.com/topic/python/answers/741677-find-replace-hyperlinks-string
    http_pattern = r'http[^\s\n\r]+'
    doc = re.sub( http_pattern , "tweetlink", doc )
    
    # this overgenerates texttexttexttweetlink, so insert space where it occurs
    doc = re.sub( r"([a-z])(tweetlink)", r"\1 \2", doc , 0, re.IGNORECASE )
    
    # above isn't catching all (wuh?!?), so just bruteforce it
    doc = doc.replace( "tweetlink", " tweetlink" )
    
    # overgeneration of foooooooooooooooohashtweethash, so insert space where it occurs
    doc = re.sub( r"([a-z])(hashtweethash)", r"\1 \2", doc , 0, re.IGNORECASE )
    # above isn't catching all (wuh?!?), so just bruteforce it
    doc = doc.replace( "hashtweethash", " hashtweethash" )
    
    # overgeneration of fooooooooooooooooattweetat, so insert space where it occurs
    doc = re.sub( r"([a-z])(attweetat)", r"\1 \2", doc , 0, re.IGNORECASE )
    # above isn't catching all (wuh?!?), so just bruteforce it
    doc = doc.replace( "attweetat", " attweetat" )
    
    # tag all hyphenated words to protect from deletion
    doc = re.sub( r"([a-z])-([a-z])", r"\1hyphentweethyphen\2", doc , 0, re.IGNORECASE )
    
    # do big ad-hoc global replacement, instead of using maketrans and string.punctuation
    # single quote is special case: don't leave a space, so that contractions are collapsed
    doc = re.sub( r"([a-z]+)'([a-z]+)", r"contractionopen \1\2 contractionclose", doc , 0, re.IGNORECASE )
    # some fool's using a DIFFERENT apostrophe... ’
    doc = re.sub( r"([a-z]+)’([a-z]+)", r"contractionopen \1\2 contractionclose", doc , 0, re.IGNORECASE )
    # replaced by above to wrap contractions
    # doc = re.sub( r"([a-z])'([a-z])", r"\1\2", doc , 0, re.IGNORECASE )
    # # some fool's using a DIFFERENT apostrophe... ’
    # doc = re.sub( r"([a-z])’([a-z])", r"\1\2", doc , 0, re.IGNORECASE )
    
    # trailing deletion: goin' to goin
    doc = re.sub( r"([a-z])’ ", r"\1 ", doc , 0, re.IGNORECASE )
    # not working?!?
    #doc = re.sub( r"’([a-z])", r"\1", doc , 0, re.IGNORECASE )
    
    # otherwise, delete all chars not already tagged as having semantic interest
    # moved up!
    #punctuation_string = '!‘"\'#$%&()*+,-./:;<=>?@[\\]^_`{|}~'
    for punctuation_char in punctuation_string:
        doc = doc.replace( punctuation_char, ' ' )
    
    # ...now that global deletion of stragging dashes has been performed, replaced hyphenated words
    doc = doc.replace( "hyphentweethyphen", "-" )
    
    # finally, reduce duplicate spaces to just one: https://stackoverflow.com/questions/1546226/simple-way-to-remove-multiple-spaces-in-a-string/15913564
    # doc = doc.replace( "  ", ' ' )
    doc = re.sub( ' +', ' ', doc )
    
    # split into tokens by white space
    tokens = doc.split()
    
#     # remove punctuation from each token
#     table = str.maketrans( '', '', string.punctuation ) # will strip all .?!,:; that don't fit replace expr above.
#     #table = str.maketrans( '', '', my_punctuation )
#     tokens = [ w.translate( table ) for w in tokens ]
    
    # remove remaining tokens that are not alphabetic
    # if to_lower:
    #tokens = [ word for word in tokens if word.isalpha() ]
    
    # make lower case
    #tokens = [ word.lower() for word in tokens ] 
    
    return tokens

In [10]:
clean_doc( "Feels like I'm goin’ crazy, this isn’t some fool's errand, wouldn't you agree?" )

['Feels',
 'like',
 'contractionopen',
 'Im',
 'contractionclose',
 'goin',
 'crazy',
 'pausecomma',
 'this',
 'contractionopen',
 'isnt',
 'contractionclose',
 'some',
 'contractionopen',
 'fools',
 'contractionclose',
 'errand',
 'pausecomma',
 'contractionopen',
 'wouldnt',
 'contractionclose',
 'you',
 'agree']

## Load Vocabulary and Segment Doc Tokens

In [11]:
# load dictionary
start_time = get_time()

embeddings_dimension = 300 #must be 50, 100, 200, 300
with open( "output/vocabulary-glove.6B." + str( embeddings_dimension ) + "d.txt", 'r' ) as vocabulary_file:
    
    # omit newline char: https://stackoverflow.com/questions/12330522/reading-a-file-without-newlines
    vocabulary_list = vocabulary_file.read().splitlines()

print_time( start_time, get_time() )

vocabulary_dict = dict.fromkeys( vocabulary_list )
print( "vocabulary_list", len( vocabulary_list ) )
print( "vocabulary_dict", len( vocabulary_dict ) )

print( "1234123412341234" in vocabulary_dict )
print( "earth" in vocabulary_dict )
print( "earth" in vocabulary_list )
print( vocabulary_dict[ "earth" ] )

2018.07.03 20:24
2018.07.03 20:24
Time to process: [0.03897261619567871] seconds
vocabulary_list 400000
vocabulary_dict 400000
False
True
True
None


In [12]:
start_time = get_time()

# clean document
tokens_raw = clean_doc( doc )

print_time( start_time, get_time() )

2018.07.03 20:24
2018.07.03 20:24
Time to process: [1.9162797927856445] seconds


In [13]:
start_time = get_time()

tokens_segmented = []
tokens_novel = []
digits = []
trumpisms = []

disposable_digits_len = len( "467821512981233666" )

# iterate raw tokens and segment hashtags
hashtag_prefix_len = len( "hashtweethash" )
attag_prefix_len = len( "attweetat" )

digits_dropped = 0

for token_raw in tokens_raw:
    
    if token_raw.startswith( "attweetat" ):
        
        # segment what comes after "attweetat"
        #print( token_raw, token_raw[ attag_prefix_len: ] )
        attag_segments = segment( token_raw[ attag_prefix_len: ] )
        #print( "attag_segments", attag_segments )
        
        # iterate identified segments
        tokens_segmented.append( "attagopen" )
        for attag_segment in attag_segments:
            tokens_segmented.append( attag_segment.capitalize() )
        tokens_segmented.append( "attagclose" )
        
    elif token_raw.startswith( "hashtweethash" ):
        
        # segment what comes after "hashtweethash"
        hashtag_segments = segment( token_raw[ hashtag_prefix_len: ] )
        #print( "hashtag_segments", hashtag_segments )
        
        # iterate identified segments
        tokens_segmented.append( "hashtagopen" )
        for hashtag_segment in hashtag_segments:
            tokens_segmented.append( hashtag_segment.capitalize() )
        tokens_segmented.append( "hashtagclose" )
        
    else:
        
        if token_raw in punctuation_dict:
        
            #print( "Punctuation RESERVED word:", token_raw )
            tokens_segmented.append( token_raw )
            
        else:
            
            if token_raw.lower() in vocabulary_dict:
            
                #print( "IN dictionary:", token_raw )
                tokens_segmented.append( token_raw )
                
            else:
                
                # there's some timestamp digit leakage, and they're almost all the same len.  Drop them!
                if token_raw.isdigit() and len( token_raw ) == disposable_digits_len:
                    
                    digits_dropped += 1
                    print( "-", end="" )
                    
                # keep the other ones
                elif token_raw.isdigit():
                    
                    digits.append( token_raw )
                    tokens_segmented.append( "digitsopen" )
                    tokens_segmented.append( token_raw )
                    tokens_segmented.append( "digitsclose" )
                    tokens_novel.append( token_raw )
                
                else:
                    
                    # just tag it for now, and get on with it...
                    # ...later we'll add emoji, segmentation and mispelling handling
                    tokens_segmented.append( "trumpismopen" )
                    tokens_segmented.append( token_raw )
                    tokens_segmented.append( "trumpismclose" )
                    
                    tokens_novel.append( token_raw )
                
#                     trumpism_segments = segment( token_raw )
#                     trumpisms.append( trumpism_segments )
#                   
#                     print( "\ntrumpism_segments: ", trumpism_segments, "\n" )
#    
#                     tokens_segmented.append( "trumpismopen" )
#                     for segment in trumpism_segments:
#                         tokens_segmented.append( segment.capitalize() )
#                     tokens_segmented.append( "trumpismclose" )

# Don't lower case, it's now relevant
# tokens_segmented = [ word.lower() for word in tokens_segmented ] 
            
# # 'compress' into list of unique tokens
tokens_unique = list( set( tokens_segmented ) )
tokens_unique_lowercase = [ token.lower() for token in tokens_unique ]

print()
#print( "tokens_raw", tokens_raw[ :100 ] )
#print( "tokens_segmented", tokens_segmented[ :100 ] )
print( 'Total tokens_raw: %d' % len( tokens_raw ) )
print( 'Total tokens_segmented: %d' % len( tokens_segmented ) )
print( 'Unique Tokens: %d' % len( tokens_unique ) )
print( 'Novel Tokens: %d' % len( tokens_novel ) )
print( 'Unique Novel Tokens: %d' % len( set( tokens_novel ) ) )
print( 'trumpisms: %d' % len( trumpisms ) )
print( 'digits: %d' % len( digits ) )
print( 'Unique digits: %d' % len( set( digits ) ) )
print( "digits_dropped", digits_dropped )

print_time( start_time, get_time() )

# 2018.06.07 09:32
# Total Tokens: 448210
# Unique Tokens: 19564
# Time to process: [0.35286879539489746] seconds

# 2018.06.18 15:01
# Total tokens_raw: 448583
# Total tokens_segmented: 457501
# Unique Tokens: 18911
# Time to process: [3.0585978031158447] seconds

# after processing @'s
# 2018.06.18 16:28
# Total tokens_raw: 448583
# Total tokens_segmented: 493688
# Unique Tokens: 17673
# Time to process: [9.280703783035278] seconds

# after processing _____
# 2018.06.19 17:50
# Total tokens_raw: 472558
# Total tokens_segmented: 522310
# Unique Tokens: 20747
# Time to process: [9.574301481246948] seconds

# 2018.06.21 12:43
# Total tokens_raw: 488581
# Total tokens_segmented: 541479
# Unique Tokens: 24495
# Novel Tokens: 2054
# Unique Novel Tokens: 1186
# trumpisms: 0
# digits: 194
# Unique digits: 141
# digits_dropped 552
# Time to process: [7.744369983673096] seconds

2018.07.03 20:24
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Total tokens_raw: 488581
Total tokens_segmented: 541867
Unique Tokens: 24497
Novel Tokens: 2054
Unique Novel Tokens: 1186
trumpisms: 0
digits: 194
Unique digits: 141
digits_dropped 552
2018.07.03 20:24
Time to process: [7.839493751525879] seconds


In [22]:
from emoji import UNICODE_EMOJI

def has_emoji( s ):
    count = 0
    for emoji in UNICODE_EMOJI:
        count += s.count( emoji )
        if count > 1:
            return False 
        
    return bool( count )

In [23]:
import emoji
# From: https://stackoverflow.com/questions/43146528/how-to-extract-all-the-emojis-from-text
# Create the function to extract the emojis
def extract_emojis( a_list ):
    
    emojis_list = map(lambda x: ''.join(x.split()), emoji.UNICODE_EMOJI.keys())
    r = re.compile('|'.join(re.escape(p) for p in emojis_list))
    aux=[ ' '.join(r.findall(s)) for s in a_list ]
    return(aux)

## Execute the function
extract_emojis( "Carolina✅Ohio" )

['', '', '', '', '', '', '', '', '✅', '', '', '', '']

In [24]:
extract_emojis( "🇺🇸🇪🇸" )

['', '', '', '']

In [25]:
# load()
# print( 'isnt' in vocabulary_dict )
# print( 'isnt' in vocabulary_dict )
# print( 'McCabes' in vocabulary_dict )

# foo = "NikkiHaley".lower()
# print( foo )
print( segment( "Carolina✅Ohio" ) )
print( segment( "NikkiHaley" ) ) 
print( segment( "OBAMATRADE" ) )
print( segment( "PRODUCTIONUp📈7" ) )

print( has_emoji( "Carolina✅Ohio" ) )
print( has_emoji( "🇺🇸🇪🇸" ) )



['carolina', 'ohio']
['nikki', 'haley']
['obama', 'trade']
['production', 'up7']
True
False


In [26]:
tokens_novel_set = set( tokens_novel )

for token in tokens_novel_set:
    
    print( "token", token, "segment(s)", segment( token ) )

token 8838 segment(s) ['8838']
token Frankenstien segment(s) ['franken', 'stien']
token thinkking segment(s) ['think', 'king']
token KatherineWebb segment(s) ['katherine', 'webb']
token Spolier segment(s) ['spo', 'lier']
token EWErickson segment(s) ['ew', 'erickson']
token Sharyl segment(s) ['sharyl']
token rolemodeland segment(s) ['role', 'model', 'and']
token failurecountries segment(s) ['failure', 'countries']
token tonights segment(s) ['to', 'nights']
token Jeters segment(s) ['jeter', 's']
token JCena segment(s) ['j', 'cena']
token Bergdhal segment(s) ['berg', 'dhal']
token everyones segment(s) ['everyones']
token ’79 segment(s) ['79']
token 96K segment(s) ['96k']
token 2yrs segment(s) ['2yrs']
token erictrump segment(s) ['eric', 'trump']
token Leachs segment(s) ['leach', 's']
token Pepp segment(s) ['pepp']
token haterseven segment(s) ['haters', 'even']
token CubaMemorandum segment(s) ['cuba', 'memorandum']
token freedomsit segment(s) ['freedoms', 'it']
token oddsin segment(s) ['od

token Oghene segment(s) ['og', 'hene']
token Moneynews segment(s) ['money', 'news']
token Gasparinos segment(s) ['gaspari', 'nos']
token NYers segment(s) ['ny', 'ers']
token Rowanne segment(s) ['row', 'anne']
token Garnetts segment(s) ['garnett', 's']
token OReilly segment(s) ['oreilly']
token MAN’ segment(s) ['man']
token 5000000 segment(s) ['5000000']
token SoHi segment(s) ['so', 'hi']
token leakin segment(s) ['leak', 'in']
token roarwatch segment(s) ['roar', 'watch']
token ⚾️ segment(s) []
token Sanduskys segment(s) ['sandusky', 's']
token Ocare segment(s) ['o', 'care']
token 790k segment(s) ['790k']
token Geronimos segment(s) ['geronimo', 's']
token wouldve segment(s) ['wouldve']
token 18290 segment(s) ['18290']
token 33000 segment(s) ['33000']
token richs segment(s) ['rich', 's']
token ✅Lower segment(s) ['lower']
token baaaaack segment(s) ['b', 'aaaa', 'ack']
token Carolina✅Ohio segment(s) ['carolina', 'ohio']
token 4992343 segment(s) ['4992343']
token 300ft segment(s) ['300ft']
t

token Habberman segment(s) ['hab', 'berman']
token Afghanistant segment(s) ['afghanistan', 't']
token Podestas segment(s) ['podesta', 's']
token 19000 segment(s) ['19000']
token CNNs segment(s) ['cnn', 's']
token POLLTrump segment(s) ['poll', 'trump']
token Wiliem98 segment(s) ['wiliem98']
token welchs segment(s) ['welch', 's']
token 10000000 segment(s) ['10000000']
token SHOULDNT segment(s) ['shouldnt']
token upcoming13th segment(s) ['upcoming', '13th']
token 951957 segment(s) ['951957']
token 📸 segment(s) []
token TrumpCollections segment(s) ['trump', 'collections']
token Jaynie segment(s) ['jay', 'nie']
token marshallx segment(s) ['marshall', 'x']
token unsexiest segment(s) ['un', 'sexiest']
token 315000 segment(s) ['315000']
token Kesslers segment(s) ['kessler', 's']
token America🇺🇸 segment(s) ['america']
token 250g segment(s) ['250g']
token Melania® segment(s) ['melania']
token behal segment(s) ['be', 'hal']
token triky segment(s) ['tri', 'ky']
token TheTrumpNetwork segment(s) ['t

## Look at Token Frequencies

In [14]:
word_counts = collections.Counter( tokens_segmented )
pairs = { k: word_counts[ k ] for k in list( word_counts )[ :10 ] }
pairs

{'António': 1,
 'General': 82,
 'Guterres': 3,
 'Just': 392,
 'Secretary': 61,
 'UN': 23,
 'met': 43,
 'opentweetopen': 22322,
 'who': 955,
 'with': 2218}

In [15]:
for word, count in word_counts.most_common( 30 ):
    print( '%s: %7d' % ( word, count ) )

opentweetopen:   22322
closetweetclose:   22322
endperiod:   13920
attagopen:   12889
attagclose:   12889
the:   12635
to:    9047
tweetlink:    7295
and:    6807
contractionopen:    6402
contractionclose:    6400
a:    6332
is:    6067
of:    6028
in:    5458
I:    4267
on:    3949
for:    3935
hashtagopen:    3470
hashtagclose:    3470
you:    3271
be:    3262
endexclamation:    3181
will:    3027
Trump:    2947
The:    2892
pauseemdash:    2863
quoteopen:    2418
that:    2398
at:    2351


In [29]:
print( "i", word_counts[ "i" ] )
print( "I", word_counts[ "I" ] )
print( "me", word_counts[ "me" ] )

print( "you", word_counts[ "you" ] )

print( "we", word_counts[ "we" ] )
print( "us", word_counts[ "us" ] )
print( "US", word_counts[ "US" ] )

print( "they", word_counts[ "they" ] )
print( "them", word_counts[ "them" ] )

i 6
I 4267
me 1178
you 3271
we 1063
us 414
US 746
they 940
them 482


In [16]:
start_time = get_time()

# organize into sequences of tokens
sequence_len = 25 + 1
sequences = []

for i in range( sequence_len, len( tokens_segmented ) ):
    
    # select sequence of tokens
    seq = tokens_segmented[ i - sequence_len:i ]
    
    # convert into a line
    line = ' '.join( seq )
    
    # store
    sequences.append( line )
    
print( 'Total Sequences: %d' % len( sequences ) )
print_time( start_time, get_time() )

# 2018.06.18 12:35
# Total Sequences: 448557
# 2018.06.18 12:35
# Time to process: [0.24109315872192383] seconds

# 2018.06.18 15:11
# Total Sequences: 457475
# 2018.06.18 15:11
# Time to process: [0.27341151237487793] seconds

# 2018.06.19 14:23
# Total Sequences: 501061
# 2018.06.19 14:23
# Time to process: [0.28377485275268555] seconds

2018.07.03 20:25
Total Sequences: 541841
2018.07.03 20:25
Time to process: [0.29225802421569824] seconds


In [17]:
# save tokens to file, one dialog per line
def save_doc( lines, filename ):
    
    data = '\n'.join( lines )
    file = open( filename, 'w' )
    file.write( data )
    file.close()

In [18]:
# save sequences to file
out_filename = "sequences/trump-tweets-sequences-take-III.txt"
save_doc( sequences, out_filename )

In [19]:
def load_doc( filename ):
    
    # open the file as read only
    file = open( filename, 'r' )
    # read all text
    text = file.read()
    # close the file
    file.close()
    
    return text

In [20]:
in_filename = "sequences/trump-tweets-sequences-take-III.txt"
#doc = load_doc( in_filename )
lines = load_doc( in_filename ).split( '\n' )
lines[ 0:10 ]

['opentweetopen Just met with UN Secretary General António Guterres who is working hard to smartquoteopen Make the United Nations Great Again smartquoteclose When the UN does',
 'Just met with UN Secretary General António Guterres who is working hard to smartquoteopen Make the United Nations Great Again smartquoteclose When the UN does more',
 'met with UN Secretary General António Guterres who is working hard to smartquoteopen Make the United Nations Great Again smartquoteclose When the UN does more to',
 'with UN Secretary General António Guterres who is working hard to smartquoteopen Make the United Nations Great Again smartquoteclose When the UN does more to solve',
 'UN Secretary General António Guterres who is working hard to smartquoteopen Make the United Nations Great Again smartquoteclose When the UN does more to solve conflicts',
 'Secretary General António Guterres who is working hard to smartquoteopen Make the United Nations Great Again smartquoteclose When the UN does more

In [35]:
seq_len_sum = 0;
line_len_dict = {}

for line in lines:
    
    token_count = len( line.split( " " ) )
    seq_len_sum += token_count
    
    if token_count in line_len_dict:
        line_len_dict[ token_count ] += 1
    else:
        line_len_dict[ token_count ] = 1

print( seq_len_sum / len( lines ) )
print( line_len_dict )

26.0
{26: 541841}


## Convert Words to Index Values

In [21]:
start_time = get_time()

# integer encode sequences of words
# tokenizer = Tokenizer( lower=False, filters=punctuation_string )
tokenizer = Tokenizer( lower=False, filters="" )

tokenizer.fit_on_texts( lines )
print( "sequences len *before* keras:", len( sequences ) )
sequences = tokenizer.texts_to_sequences( lines )
print( "sequences len *after* keras:", len( sequences ) )

# elegant! https://stackoverflow.com/questions/41971587/how-to-convert-predicted-sequence-back-to-text-in-keras
words_by_id = dict( map( reversed, tokenizer.word_index.items() ) ) 

# Check to and from of words and idx
print( tokenizer.word_index[ "opentweetopen" ] )
print( tokenizer.word_index[ "hashtagopen" ] )
print( tokenizer.word_index[ "attagopen" ] )

print( words_by_id[ 1 ] )
print( words_by_id[ 19 ] )
print( words_by_id[ 4 ] )

# vocabulary size
# discrepancy between these two lengths, of by a few words...
#vocab_size = len( tokens_unique ) + 1
vocab_size = len( tokenizer.word_index ) + 1
print( "vocab_size", vocab_size )

print_time( start_time, get_time() )


2018.07.03 20:26
sequences len *before* keras: 541841
sequences len *after* keras: 541841
1
19
4
opentweetopen
hashtagopen
attagopen
vocab_size 24498
2018.07.03 20:26
Time to process: [11.739763498306274] seconds


In [22]:
# print( type( sequences ) )
# print( type( sequences[ 0 ] ) )
# print( type( sequences[ 0 ][ 0 ] ) )
# print()
# print( type( sequences ) )
# print( type( sequences[ 0:1 ] ) )
# print( type( sequences[ 0:1 ][ 0 ] ) )
# print( type( np.array( sequences[ 0:1 ][ 0 ] ) ) )

# print( np.array( sequences[ 0:1 ][ 0 ] ).shape )

In [23]:
# # iterate lists of lists, and get lens
# seq_len_sum = 0
# seq_lens = []
# seq_len_dict = {}

# for seq in sequences:
    
#     seq_len_sum += len( seq )
#     seq_lens.append( len( seq ) )
    
#     if len( seq ) in seq_len_dict:
#         seq_len_dict[ len( seq ) ] += 1
#     else:
#         seq_len_dict[ len( seq ) ] = 1
    
# print( seq_len_sum / len( sequences ) ) 
# print( seq_len_dict )

# # Was:
# # 26.002879073381305
# # {26: 540281, 27: 1560}


## This KLUDGE Works, But Is No Longer Needed
*(Using an empty filter string in the tokenizer obviates the need for this workaround)*

In [216]:
# start_time = get_time()

# sequences_foo = np.zeros( ( len( sequences ), len( sequences[ 0 ] ) ), dtype=int )
# print( sequences_foo.shape )

# row_count = 0

# for row_idx, row in enumerate( sequences ):
    
#     for col_idx, col in enumerate( row ):
        
#         if col_idx > 26:
#             print( row_idx, col_idx )
#         else:
#             sequences_foo[ row_idx, col_idx ] = col
        
        
# #     if row_count == 30:
# #         break
# #     row_count += 1

# print( sequences_foo.shape )
# print( sequences_foo[ 0 ])

# print_time( start_time, get_time() )

## KLUDGE: We Need to Convert List of Lists into Array of Arrays
_(Tokenizer's output is different when asked to leave case as is!?!)_

In [24]:
start_time = get_time()

sequences_np = np.array( sequences )

for i in range( len( sequences ) ):
    sequences_np[ i ] = np.array( sequences[ i ] )

print( type( sequences ) )
print( type( sequences_np ) )
print( sequences_np.shape )
print( type( sequences_np[ 0 ] ) )
print( sequences_np[ 0 ] )

print_time( start_time, get_time() )

sequences = None
gc.collect()

2018.07.03 20:26
<class 'list'>
<class 'numpy.ndarray'>
(541841, 26)
<class 'numpy.ndarray'>
[    1   158  1342    33  2270   966   725 24497  9724    67    13   334
   195     7    48   106     6   328  1776    55   117    50   300     6
  2270   335]
2018.07.03 20:26
Time to process: [1.5164201259613037] seconds


22

## KLUDGE: np.array slicing WAS/is fucked up!?!
*(Was only b0rk3d when assuming texts_to_sequences(...) would return the same object types...)*
It doesn't! That's a bug.

In [25]:
print( type( sequences_np ))
print( sequences_np.shape )

print( sequences_np[ 0 ] )
print( type( sequences_np[ 0 ] ) )

# separate into input and output: for now it's 50 words input and 1 word output
#sequences = np.array( sequences )
X = sequences_np[ :,:-1 ] # all rows, from word 0 up to, but not including, the last word
y = sequences_np[ :,-1 ]  # all rows, last word only

# Throws MemoryError
# https://stackoverflow.com/questions/46293734/memoryerror-in-keras-utils-np-utils-to-categorical
#y = to_categorical( y, num_classes=vocab_size )
print( "X.shape", X.shape )
print( "y.shape", y.shape )

seq_length = len( X[ 0 ] )
seq_length


<class 'numpy.ndarray'>
(541841, 26)
[    1   158  1342    33  2270   966   725 24497  9724    67    13   334
   195     7    48   106     6   328  1776    55   117    50   300     6
  2270   335]
<class 'numpy.ndarray'>
X.shape (541841, 25)
y.shape (541841,)


25

In [42]:
## Write X's and y's to Local File System
dump( X, open( training_X_path, 'wb' ) )
dump( y, open( training_y_path, 'wb' ) )

## Load and Filter GloVe Data

In [35]:
import itertools

start_time = get_time()

# verify that mixed and lower case lists are the same length
# print( "¿", len( tokens_unique ), "==", len( tokens_unique_lowercase ), "?" )

# load the whole embedding into memory and iterate it, keeping only those word=embedding pairs present in tweets
embeddings_index = dict()

# moved up!
#embeddings_dimension = 300 #must be 50, 100, 200, 300
glove = open( "../glove/glove.6B." + str( embeddings_dimension ) + "d.txt" )

# for stats keeping
words_plural = 0
words_singular = 0

# ASSUME: that 1st item in list is lowercase word, vectors are 2nd item
# for line in tqdm.tqdm( lines, leave=False ):
for line in glove:
    
    values = line.split()
    # 1st string is word...
    word = values[ 0 ]
    
    # we're now searching w/in lowercase version to allow both mixed and lower case words ("WorD" and "word") to 
    # inherit same vectors
    if word in tokens_unique_lowercase:
        
        # ...the rest are coefficients
        coefs = np.asarray( values[ 1: ], dtype='float32' )
        
        # get indices for all occurences in lower_case list
        indices = [ i for i, word_lower in enumerate( tokens_unique_lowercase ) if word_lower == word ]
        
        if len( indices ) > 1:
            
            #print( "[%d] entries found for [%s]" % ( len( indices ), word ) )
            print( "+", end="" )
            words_plural += 1
            # iterate indices for this word
            for i in indices:
                word_temp = tokens_unique[ i ]
                embeddings_index[ word_temp ] = coefs
        else:
            
            print( ".", end="" )
            words_singular += 1
            embeddings_index[ word ] = coefs 
            
    
glove.close()
print( '\nLoaded %s word vectors.' % len( embeddings_index ) )
print( '\nWords not found %d.' % ( len( tokenizer.word_index ) - len( embeddings_index ) ) )
print( "words_singular", words_singular )
print( "words_plural", words_plural )

print_time( start_time, get_time(), interval="minute" )

2018.07.03 20:41


+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++.+++++++++++++++++++++++++++++++..++++++.+++++++++++++++.++++.++++++++++++++++++++++++.+++++++.++++++.++++++++..++++++++++++++++++++++++++.+.+.+.+.+.+++++++++++.++++.+++++++++.++.+++++++++++++.++.+++.+++++++.++++.++++++.++++++++++++++++++.+++.+.++.+++++++.+..+.++++++++.++..++++++++.+++++++++++++++++++++++++++++.+++.+++++.+++.++++++.+.++++++..+++++++++++++..+++.+..+.++.+..+++.++++++++++.+++.+++.+++++++.+.++++.++++.++++++++++.++++..+++++++++++.+++++.++++++++++.++.+.+++.+++.+++.++.+.++.+++.++++..+.++...+++++++.++.+++++++..+..+++..+++.++.+...+.++++.++++....+++++.+.+++.++.+.+++.++++++++++++++++++++++..+.++++++.+.++..++.+++.+..+++....++..+++.++.+++++++.+++++++.++.+..+++++..++++++++..+..++++++++++++.+++++..++.+++.+.+.++++++++.+++.++++++++++.++++..+.+++++.++.+.++++.+..++++++.++.+++....+..++.+++++++.+++...+++++.+++..+++.++++++++++++.++.++++++++....++++..+.++++.+.+++.+++

In [226]:
indices = [ i for i, word_lower in enumerate( tokens_unique_lowercase ) if word_lower == "president" ]

for i in indices:
    
    print( "index [%d] word [%s] frequencey [%d]" % ( i, tokens_unique[ i ], word_counts[ tokens_unique[ i ] ] ) )
  

index [13234] word [PRESIDENT] frequencey [9]
index [15233] word [president] frequencey [130]
index [22296] word [President] frequencey [550]


In [227]:
# what words appear in upper case?
words_uppercase = []

for word in tokens_unique:
    
    if word.isupper():
        words_uppercase.append( word )
        
print( "len( words_uppercase )", len( words_uppercase ) )

len( words_uppercase ) 2067


In [228]:
for word in words_uppercase:
    
    print( "word [%s], count[%d]" % ( word, word_counts[ word ] ) ) 

word [BLUE], count[4]
word [FENCE], count[2]
word [MATTER], count[2]
word [MICROSOFT], count[2]
word [LEAVE], count[1]
word [ALABAMA], count[3]
word [90M], count[1]
word [SMARTER], count[1]
word [GUILT], count[1]
word [RENOVATION], count[1]
word [JVF], count[1]
word [COS], count[2]
word [YEAR], count[7]
word [FBI], count[100]
word [PILOTS], count[1]
word [INFLUENCE], count[1]
word [NEGOTIATES], count[1]
word [CNN], count[95]
word [INNOVATION], count[2]
word [MESS], count[4]
word [MY], count[4]
word [SOUTHERN], count[3]
word [11PM], count[3]
word [ONLY], count[6]
word [BBC], count[2]
word [IRS], count[12]
word [WEST], count[2]
word [REPORT], count[9]
word [MERIT], count[2]
word [MOVEMENT], count[39]
word [EVENING], count[2]
word [LATER], count[1]
word [CHANGING], count[1]
word [COACH], count[1]
word [PLEASED], count[1]
word [LIED], count[4]
word [DEAL], count[10]
word [TRUST], count[2]
word [WTC], count[4]
word [KY], count[4]
word [V01], count[1]
word [CA], count[10]
word [THEIR], count

In [229]:
print( "len( tokens_unique ) == len( tokens_unique_lowercase )", len( tokens_unique ) == len( tokens_unique_lowercase ) )
print( "Words not in embeddings_index:", len( tokens_unique_lowercase ) - len( embeddings_index ) )
#print( embeddings_index[ "the" ] )
print( "len( tokenizer.word_index )", len( tokenizer.word_index ) )
print( "len( embeddings_index )", len( embeddings_index ) )
print( "vocab_size", vocab_size )

len( tokens_unique ) == len( tokens_unique_lowercase ) True
Words not in embeddings_index: 2058
len( tokenizer.word_index ) 24497
len( embeddings_index ) 22439
vocab_size 24498


### Transform into Matrix That Maps Coefs by Index

In [41]:
# create a weight matrix for words in training docs
embedding_matrix = np.zeros( ( vocab_size, embeddings_dimension ) )
embedding_matrix_words = [ "DUMMY_KLUDGE_DUMMY_KLUDGE_DUMMY_KLUDGE" ] # first word in list should be dummy
missing_words = []

# we need this to create empty coefficients array
dummy_shape = embeddings_index[ "the" ].shape

# use the mixed case list: tokens_unique
#for i, word in enumerate( tokens_unique ):
for word, i in tokenizer.word_index.items():
    
    embedding_vector = embeddings_index.get( word )
    
    # not all words in our token list are in the wikipedia 400K set!
    if embedding_vector is None:
        
        # 1st time, get the lowercase vector
        embedding_vector = embeddings_index.get( word.lower() )
        
    # 2nd test: If not found as original or lower case, then assign it an empty vector
    if embedding_vector is None:  
        
        # report and create empty coefficients array
        missing_words.append( word )
        embedding_vector = np.zeros( dummy_shape )
     
    #print( "i", i, "word", word )
    embedding_matrix[ i ] = embedding_vector
    embedding_matrix_words.append( word )
    
print( "Missing words:", len( missing_words ) )
print( "embedding_matrix_words", len( embedding_matrix_words ) )
print( "embedding_matrix.shape", embedding_matrix.shape )
print( "embedding_matrix[ 0 ]", embedding_matrix[ 0 ] )
print( "embedding_matrix_words[ 0 ]", embedding_matrix_words[ 0 ] )

# before hashtag segmentation Missing words: 5640
# after hashtag segmentation: 5060

Missing words: 2058
embedding_matrix_words 24498
embedding_matrix.shape (24498, 300)
embedding_matrix[ 0 ] [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0. 

## Write Tokenizer and Embeddings Matrix to Local Storage

In [None]:
# save the tokenizer
dump( tokenizer, open( tokenizer_path, 'wb' ) )

# save embedding_matrix based on wiki embeddings, complete w/ missing coefficients array dummies
dump( embedding_matrix, open( embedding_path, 'wb' ) )

In [231]:
missing_words.sort()
missing_words#[ :20 ]

['0017',
 '00A',
 '00AM',
 '00PM',
 '00am',
 '00amE',
 '00pm',
 '00pmE',
 '00pmEST',
 '01AM',
 '068',
 '071067',
 '1000000',
 '10000000',
 '100yrs',
 '100′',
 '10543',
 '109B',
 '10T',
 '10aj',
 '10pE',
 '10pmE',
 '113826',
 '118M',
 '119000',
 '11am6',
 '11pmE',
 '120000',
 '12000000',
 '120K',
 '12400000',
 '1250000',
 '12T',
 '12months',
 '130000',
 '13164',
 '132000',
 '13375',
 '136260',
 '13K',
 '14076',
 '14789',
 '147M',
 '150000',
 '150000000',
 '15T',
 '15Trillion',
 '16500',
 '16977',
 '169B',
 '16T',
 '16’s',
 '173K',
 '17500',
 '175000',
 '179249',
 '17T',
 '180000',
 '18000000',
 '18142',
 '18290',
 '185000',
 '18T',
 '19000',
 '190000',
 '19017',
 '1950’s',
 '1967porky',
 '1970’s',
 '197M',
 '1986fed',
 '1Trillion',
 '1davidkim',
 '1fares',
 '1fos1',
 '1mcd',
 '1pmE',
 '200000',
 '2000000',
 '200000000',
 '20000📈21000📈22000📈',
 '2004📈',
 '2013juniorpga',
 '2016Scranton',
 '2017jambo',
 '20PM',
 '20T',
 '21456',
 '217Billion',
 '21T',
 '21points',
 '2200000',
 '225000',
 

In [232]:
print( embedding_matrix.shape )
print( X.shape )
print( y.shape )

(24498, 300)
(541841, 25)
(541841,)


In [233]:
# write missing words to file
with open( "output/missing-words-tweets.txt", "w" ) as out_file:
    
    for word in missing_words:

        out_file.write( "%s\n" % word )


In [234]:
# confirm visually that 
print( len( embedding_matrix[ 0 ] ) )
print( sum( embedding_matrix[ 0 ] ) )
empty_coefficients_count = 0

for i in range( len( embedding_matrix ) ):
    if sum( embedding_matrix[ i ] ) == 0:
        empty_coefficients_count += 1
        
empty_coefficients_count

300
0.0


2059

## Define Model

In [235]:
import keras
print( keras.__version__ )

import tensorflow as tf
print( tf.__version__ )

2.1.3
1.4.1


In [240]:
# define model
model = Sequential()

# now using a pre-trained, non-trainable embedding from glove's wiki analysis
model.add( Embedding( vocab_size, embeddings_dimension, weights=[embedding_matrix], input_length=seq_length, trainable=True ) )
model.add( Bidirectional( LSTM( seq_length * 2, return_sequences=True ) ) )
#model.add( Dropout( 0.90 ) )
model.add( Bidirectional( LSTM( seq_length * 2 ) ) )
#model.add( Dropout( 0.90 ) )
model.add( Dense( seq_length * 2, activation='relu' ) )

# fixed TypeError below, downgraded keras from 2.1.5 to 2.1.3: https://github.com/keras-team/keras/issues/9621
# TypeError: softmax() got an unexpected keyword argument 'axis'
model.add( Dense( vocab_size, activation='softmax' ) )

print( model.summary() )

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 25, 300)           7349400   
_________________________________________________________________
bidirectional_3 (Bidirection (None, 25, 100)           140400    
_________________________________________________________________
bidirectional_4 (Bidirection (None, 100)               60400     
_________________________________________________________________
dense_3 (Dense)              (None, 50)                5050      
_________________________________________________________________
dense_4 (Dense)              (None, 24498)             1249398   
Total params: 8,804,648
Trainable params: 8,804,648
Non-trainable params: 0
_________________________________________________________________
None


## Fit the Model

## Load Model, Tokenizer, Training Data & Embeddings?

In [7]:
load = input( "Load model, tokenizer & embeddings? [y/n]" )

if load == "y":
    
    #model_name = "models/trump-tweets-w-links-n-ats-take-III.h5"
    print( "Loading model %s" % model_path )
    model = load_model( model_path )
    
    print( "Loading tokenizer %s" % tokenizer_path )
    tokenizer = pickle.load( open( tokenizer_path, "rb" ) )
    
    print( "Loading embeddings %s" % embedding_path )
    embedding_matrix = pickle.load( open( embedding_path, "rb" ) )
    
    print( "Loading training data X's %s" % training_X_path )
    X = pickle.load( open( training_X_path, "rb" ) )
    
    print( "Loading training data y's %s" % training_y_path )
    y = pickle.load( open( training_y_path, "rb" ) )
    
    seq_length = len( X[ 0 ] )
    
else:
    
    print( "NOT loading model, tokenizer, training data & embeddings, using defaults" )

Load model, tokenizer & embeddings? [y/n]y
Loading model models/trump-tweets-w-links-n-ats-take-III.h5
Loading tokenizer tokenizers/trump-tweets-w-links-n-ats-take-III.dump
Loading embeddings embeddings/trump-tweats-w-links-n-ats-take-III.glove
Loading training data X's data/training-X-trump-tweets-w-links-n-ats-take-III.dump
Loading training data y's data/training-y-trump-tweets-w-links-n-ats-take-III.dump


In [8]:
batch_size = 1024
# can't remember where I read that batch sizes larger than 512 cause erratic convergence patterns.
# TODO: find that article!
#batch_size = 512

start_time = get_time()

# Per comment here: https://stackoverflow.com/questions/46293734/memoryerror-in-keras-utils-np-utils-to-categoricalhttps://stackoverflow.com/questions/46293734/memoryerror-in-keras-utils-np-utils-to-categorical
model.compile( loss='sparse_categorical_crossentropy', optimizer='adam', metrics=[ 'accuracy' ] )
# model.compile( loss='categorical_crossentropy', optimizer='adam', metrics=[ 'accuracy' ] )

model.fit( X, y, batch_size=batch_size, epochs=10 )
end_time = get_time()
print_time( start_time, end_time, interval="hours" )

# 2018.06.06 19:52
# 100 epochs
# Time to process: [1.6088602582613627] hours
# 2018.06.06 21:29

# 2018.06.06 22:28
# 50 epochs
# Time to process: [0.8057563017474281] hours @ 80% accuracy



2018.06.29 13:45
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
2018.06.29 13:55
Time to process: [0.16192274226082698] hours


In [59]:
# save the whole model to file
model.save( model_path )

# save the tokenizer
dump( tokenizer, open( tokenizer_path, 'wb' ) )

# save embedding_matrix based on wiki embeddings, complete w/ missing coefficients array dummies
dump( embedding_matrix, open( embedding_path, 'wb' ) )

# write keys to embedding_matrix
out_file = open( embedding_keys_path, 'w', encoding='utf-8' ) 
for word in embedding_matrix_words:
    out_file.write( "%s\n" % word )
out_file.close()


## Use The Model to Generate Text

In [244]:
seq_length = len( lines[ 0 ].split() ) - 1
seq_length

25

In [245]:
punctuation_dict.get( "smartquoteclose", "bar" )

'”'

In [8]:
# # From: https://medium.com/@david.campion/text-generation-using-bidirectional-lstm-and-doc2vec-models-1-3-8979eb65cb3a
# if temperature = 1.0:
#     the probability for a word to be drawn is similar to the probability for the word to be the next one in the 
#     sequence (the output of the word prediction model), compared to other words in the dictionary,
# if temperature is big (much bigger than 1):
#     the range of probabilities is shorten: the probabilities for all words to be the next one will increase. More 
#         variety of words will be picked-up from the vocabulary, because more words will have high probabilities.
# if temperature is small (close to 0):
#     small probabilities will be avoided (they will be set to a value closed to 0). Less words will be picked-up 
#     from the vocabulary.

def sample_yhats( preds, temperature=1.0 ):
    
    # helper function to sample an index from a probability array
    preds = np.asarray( preds ).astype( 'float64' )
    preds = np.log( preds ) / temperature
    exp_preds = np.exp( preds )
    preds = exp_preds / np.sum( exp_preds )
    probas = np.random.multinomial( 1, preds, 1 )
    return np.argmax( probas )

In [9]:
def generate_seq( model, tokenizer, seq_length, seed_text, n_words, temperature=1.0 ):
    
    result = list()
    result_literal = list()
    in_text = seed_text
    yhat = [ 0.1 ]
    
    # generate a fixed number of words
    for _ in range( n_words ):
        
        # encode the text as integer
        encoded = tokenizer.texts_to_sequences( [ in_text ] )[ 0 ] 
        
        # truncate sequences to a fixed length
        encoded = pad_sequences( [ encoded ], maxlen=seq_length, truncating='pre' ) 
        
        # predict probabilities for each word
        yhat = model.predict_classes( encoded, verbose=0 )
        # this returns list of predictions
        yhats = model.predict( encoded, verbose=0 )[ 0 ]
        
        # map predicted word index to word
        print( "yhat", yhat, words_by_id[ yhat[ 0 ] ] )
        #print( "len( yhats )", len( yhats ) )
        #print( "type( yhats )", type( yhats ) )
        #print( "argmax( yhats )", np.argmax( yhats ) )
        #print( "yhats", yhats )
        
        out_word_id = sample_yhats( yhats, temperature )
        out_word = words_by_id[ out_word_id ]
        # out_word = words_by_id[ yhat[ 0 ] ]
                
        # append to input
        in_text += ' ' + out_word
        
        #result.append( out_word )
        # substitute punctuation tags for actual punctuation
        result.append( punctuation_dict.get( out_word, out_word ) )
        
#         if out_word == "closetweetclose":
#             #print( "Tweet end detected" )
#             break
            
    return ' '.join( result )

In [318]:
type( words_by_id[ 1 ] )

str

In [10]:
def reformat_punctuation( doc ):
    
    doc = doc.replace( ' . ', '. ' )
    doc = doc.replace( ' ! ', '! ' )
    doc = doc.replace( ' ? ', '? ' )
    doc = doc.replace( ' , ', ', ' )
    doc = doc.replace( ' : ', ': ' )
    doc = doc.replace( ' ; ', '; ' )
    
    doc = doc.replace( '“ ', '“' )
    doc = doc.replace( ' ”', '”' )
    doc = doc.replace( "attweetat", '@' )
    #doc = doc.replace( "hashtweethash", '#' )
    doc = doc.replace( " amp; ", '&' )
    
    return doc

In [11]:
# select a seed text
seed_text = lines[ randint( 0, len( lines ) ) ]
print( seed_text, "\n" )

# # substitute the seed words
# raw_text = seed_text.split( " " )

# clean_text = [ punctuation_dict.get( word, word ) for word in raw_text ]
# clean_text = ' '.join( clean_text )

# print( reformat_punctuation( clean_text ) + '... \n' )
# #print( len( seed_text.split( " " ) ) )

# generate new text
generated = generate_seq( model, tokenizer, seq_length, seed_text, 5, 1.5 )

print( "... " + generated )
print()
print( "\n\n... " + reformat_punctuation( generated ) )

NameError: name 'lines' is not defined

In [12]:
my_input = input()
"opentweetopen " + my_input
generated = generate_seq( model, tokenizer, seq_length, my_input, 50 )
print( "... " + reformat_punctuation( generated ) )

“45 year low in illegal immigration this year.” @foxandfriends "Anybody that believes in strong borders and stopping illegal immigration cannot vote for Marco Rubio  READ THIS: https://t.co/Tj85IsBPG8""" The weak illegal immigration policies of the Obama Admin.


NameError: name 'seq_length' is not defined

In [None]:
#initiate sentences
seed_sentences = "Nobody has better respect for intelligence than Donald Trump ."
generated = ''
sentence = []

for i in range( seq_length ):
    sentence.append( "a" )

seed = seed_sentences.split()

for i in range( len( seed ) ):
    sentence[ seq_length - i - 1 ]= seed[ len( seed ) - i - 1 ]

generated += ' '.join(sentence)
print('Generating text with the following seed: "' + ' '.join(sentence) + '"')

print ()

In [None]:
words_number = 100
#generate the text
for i in range(words_number):
    #create the vector
    x = np.zeros((1, seq_length, vocab_size))
    for t, word in enumerate(sentence):
        x[0, t, vocab[word]] = 1.
    #print(x.shape)

    #calculate next word
    preds = model.predict(x, verbose=0)[0]
    next_index = sample(preds, 0.34)
    next_word = vocabulary_inv[next_index]

    #add the next word to the text
    generated += " " + next_word
    # shift the sentence by one, and and the next word at its end
    sentence = sentence[1:] + [next_word]

print(generated)
