In [1]:
# From: https://pypi.org/project/import-ipynb/
import import_ipynb

import deepily.util as dp_util
from gensim import corpora, models, similarities
from pprint import pprint  # pretty-printer
from gensim.summarization import summarize

importing Jupyter notebook from /data/include/my-deep-learning/trump-bot-take-I/deepily/util.ipynb
2018.07.10 12:37


## Global Vars

In [2]:
# Text file that is read into `documents` by dp_util.load_doc_by_lines() further down.
INPUT_FILE_PATH = "../texts/trump-tweets.txt"
# INPUT_FILE_PATH = "../texts/trump-speeches.txt"

# Where the gensim dictionary and the serialized bag-of-words corpus are written.
# NOTE(review): both file names are tweet-specific; rename them if the speeches input above is enabled.
DICTIONARY_PATH = "output/gensim-trump-tweets.dict"
CORPUS_PATH = "output/gensim-trump-tweets.mm"

In [None]:
dp_util.select_gpu()

In [None]:
dp_util.select_cores()

## Load Tweets Verbatim

In [3]:
# Read the input file into `documents` and report how long the load took.
# Judging by the recorded output, each dp_util.get_time() call also echoes a timestamp.
# Presumably `documents` holds one string per input line - confirm against dp_util.load_doc_by_lines().
start_time = dp_util.get_time()
documents = dp_util.load_doc_by_lines( INPUT_FILE_PATH )
dp_util.print_time( start_time, dp_util.get_time() )

2018.07.10 12:37
2018.07.10 12:37
Time to process: [0.0062792301177978516] seconds


## Preprocess Tweets

In [4]:
# Lower-case each tweet, split it on whitespace and discard a handful of very common words.
# Punctuation is left attached to the tokens (see 'texas.' and 'good.' in the sample below).
stoplist = { "for", "a", "of", "the", "and", "to", "in" }
texts = [
    [ token for token in line.lower().split() if token not in stoplist ]
    for line in documents
]

# # remove words that appear only once
# from collections import defaultdict
# frequency = defaultdict(int)
# for text in texts:
#     for token in text:
#         frequency[token] += 1

# texts = [[token for token in text if frequency[token] > 1] for text in texts]

# Spot-check the tokenization on a single tweet.
pprint( texts[ 3 ] )

['school',
 'shooting',
 'texas.',
 'early',
 'reports',
 'not',
 'looking',
 'good.',
 'god',
 'bless',
 'all!']


## Create Dictionary

In [5]:
# Build the gensim dictionary (token <-> integer id) from the tokenized tweets.
dictionary = corpora.Dictionary( texts )
dictionary.save( DICTIONARY_PATH )  # store the dictionary, for future reference
# Printing the dictionary shows the unique-token count and a few sample tokens.
print( dictionary )

Dictionary(47359 unique tokens: ['@nikkihaley', 'again.”', 'antónio', 'around', 'conflicts']...)


In [12]:
# token2id gets dict object
# print( dictionary.token2id )

## Create Corpora

In [6]:
# Convert every tokenized tweet into its sparse bag-of-words vector, then persist the corpus via MmCorpus.
corpus = list( map( dictionary.doc2bow, texts ) )
corpora.MmCorpus.serialize( CORPUS_PATH, corpus )

In [31]:
# tfidf = models.TfidfModel( corpus )

In [32]:
# corpus_tfidf = tfidf[ corpus ]
# corpus_tfidf[ 3 ]

In [7]:
# Fit a 300-topic LSI model directly on the bag-of-words counts (the TF-IDF cells above are commented out).
lsi = models.LsiModel( corpus, id2word=dictionary, num_topics=300 ) # NOTE(review): the gensim tutorial cites 200-500 topics as the usual target range - confirm

In [8]:
# Project the whole corpus through the LSI model and build a similarity index over the result,
# timing how long the build takes.
start_time = dp_util.get_time()
index_lsi = similarities.MatrixSimilarity( lsi[ corpus ] )
dp_util.print_time( start_time, dp_util.get_time() )

2018.07.10 12:37
2018.07.10 12:37
Time to process: [3.5188889503479004] seconds


In [35]:
# Free-text query to look up; swap in one of the commented alternatives to try another.
#search_doc = "mexico build a wall"
search_doc = "illegal immigrants gangs"
#search_doc = "luxury condo"
#search_doc = "basket of deplorables"
#search_doc = "russia collusion"
#search_doc = "racism in america" # MAGA?!?
#search_doc = "black lives matter" # meh!
#search_doc = "trump tower" # meh!

# Lower-case and whitespace-split the query, as was done for the tweets.
# doc2bow() drops tokens that are not in the dictionary (which includes the stop words
# filtered out of the tweets), so the bag-of-words can come back empty.
search_bow = dictionary.doc2bow( search_doc.lower().split() )
print( search_bow )
if not search_bow:
    # Fail with a clear message instead of an IndexError on search_lsi[ 0 ] below.
    raise ValueError( "None of the tokens in query [%s] are in the dictionary" % search_doc )

# Project the query into the same LSI space as the indexed tweets.
search_lsi = lsi[ search_bow ]
# Show the first ( topic id, weight ) pair when the projection produced one.
print( search_lsi[ 0 ] if len( search_lsi ) > 0 else search_lsi )

[(171, 1), (172, 1), (7852, 1)]
(0, 0.010897999038419407)


In [36]:
start_time = dp_util.get_time()

# score the query vector against every indexed tweet
similar_docs = index_lsi[ search_lsi ]
# pair each score with its position in the corpus and rank the pairs, best score first
similar_docs = sorted( enumerate( similar_docs ), key=lambda pair: pair[ 1 ], reverse=True )

dp_util.print_time( start_time, dp_util.get_time() )

2018.07.10 13:24
2018.07.10 13:24
Time to process: [0.015276432037353516] seconds


In [30]:
print( similar_docs[ :7 ] )
print( similar_docs[ -7: ] )


[(12591, 0.55641544), (12701, 0.53337717), (13671, 0.50377685), (8866, 0.49728006), (14461, 0.48256284), (12940, 0.4744817), (12038, 0.46274188)]
[(16777, -0.16397551), (17649, -0.16397752), (16894, -0.16398136), (14267, -0.1639816), (13492, -0.16398442), (18399, -0.16398838), (14015, -0.17466487)]


In [31]:
def print_docs( docs, source_docs=None ):
    """
    Print the similarity score and the stripped text of each ranked document.

    docs        : iterable of ( document index, score ) pairs, e.g. a slice of similar_docs.
    source_docs : optional sequence of raw document strings that the indices refer to;
                  when omitted, the notebook-level `documents` list is used.
    """
    if source_docs is None:
        source_docs = documents

    for doc in docs:

        # "%.4f" rounds the score to four decimals; the previous "%f.4" printed
        # six decimals followed by a literal ".4" (e.g. "0.655812.4").
        print( "Score [%.4f] Tweet [%s]\n" % ( doc[ 1 ], source_docs[ doc[ 0 ] ].strip() ) )

In [38]:
print_docs( similar_docs[ :7 ] )
print( "-----------------------------" )
print_docs( similar_docs[ -7: ] )


Score [0.655812.4] Tweet [The super Liberal Democrat in the Georgia Congressioal race tomorrow wants to protect criminals allow illegal immigration and raise taxes!]

Score [0.576457.4] Tweet [“45 year low in illegal immigration this year.” @foxandfriends]

Score [0.568625.4] Tweet ["Anybody that believes in strong borders and stopping illegal immigration cannot vote for Marco Rubio  READ THIS: https://t.co/Tj85IsBPG8"""]

Score [0.560368.4] Tweet [.@MarcoRubio is weak on illegal immigration and will allow anyone into the country.....]

Score [0.522708.4] Tweet [Obama can sign an illegal executive action anytime for ObamaCare but he can’t fix the illegal loophole.]

Score [0.508659.4] Tweet [The weak illegal immigration policies of the Obama Admin. allowed bad MS 13 gangs to form in cities across U.S. We are removing them fast!]

Score [0.490698.4] Tweet [According to @pewresearch illegal immigrants favor Dems 8:1 http://t.co/ZN4rOIFVzA @GOP pushing amnesty. Do they have death wish]

-

## Summarize Top 7 Similar Docs

In [14]:
similar_docs[ 3 ]
#summarize( similar_docs[ 3 ], ratio=0.5 )

(21843, 0.80679631)

In [39]:
# Look up the raw text of the seven best-scoring tweets and merge them into one space-separated string.
similar_tweets = " ".join( documents[ ranked[ 0 ] ].strip() for ranked in similar_docs[ :7 ] )
print( similar_tweets )
print()
    

The super Liberal Democrat in the Georgia Congressioal race tomorrow wants to protect criminals allow illegal immigration and raise taxes! “45 year low in illegal immigration this year.” @foxandfriends "Anybody that believes in strong borders and stopping illegal immigration cannot vote for Marco Rubio  READ THIS: https://t.co/Tj85IsBPG8""" .@MarcoRubio is weak on illegal immigration and will allow anyone into the country..... Obama can sign an illegal executive action anytime for ObamaCare but he can’t fix the illegal loophole. The weak illegal immigration policies of the Obama Admin. allowed bad MS 13 gangs to form in cities across U.S. We are removing them fast! According to @pewresearch illegal immigrants favor Dems 8:1 http://t.co/ZN4rOIFVzA @GOP pushing amnesty. Do they have death wish



In [40]:
# Summarize the concatenated top tweets with gensim's summarize(), using a word_count budget of 25.
# NOTE(review): the gensim.summarization module appears to be absent from gensim 4.x - confirm the pinned gensim version.
summary = summarize( similar_tweets, word_count=25 )# ratio=0.5 )
print( summary )


The weak illegal immigration policies of the Obama Admin.
