In [15]:
# From: https://pypi.org/project/import-ipynb/
import import_ipynb

import deepily.util as dp_util
from gensim import corpora, models, similarities
from pprint import pprint  # pretty-printer
from gensim.summarization import summarize

## Global Vars

In [2]:
# Artifacts written by the dictionary and corpus cells further down.
DICTIONARY_PATH = "output/gensim-trump-tweets.dict"
CORPUS_PATH = "output/gensim-trump-tweets.mm"

# Text file handed to dp_util.load_doc_by_lines().
# Alternative input: "../texts/trump-speeches.txt"
INPUT_FILE_PATH = "../texts/trump-tweets.txt"

In [None]:
# Project helper from deepily.util; presumably chooses which GPU to use — TODO confirm.
dp_util.select_gpu()

In [None]:
# Project helper from deepily.util; presumably chooses how many CPU cores to use — TODO confirm.
dp_util.select_cores()

## Load Tweets Verbatim

In [3]:
# Load the input file through the project helper and report the elapsed time.
start_time = dp_util.get_time()
# presumably yields one string per line of the file — TODO confirm in deepily.util
documents = dp_util.load_doc_by_lines( INPUT_FILE_PATH )
dp_util.print_time( start_time, dp_util.get_time() )

2018.07.09 13:06
2018.07.09 13:06
Time to process: [0.010229110717773438] seconds


## Preprocess Tweets

In [4]:
# Common words that are discarded from every document.
stoplist = set( "for a of the and to in".split() )


def _tokenize( document ):
    """Lowercase `document`, split it on whitespace and drop stoplist words.

    Splitting is whitespace-only, so punctuation stays attached to tokens.
    """
    return [ word for word in document.lower().split() if word not in stoplist ]


# One token list per input document, in the same order as `documents`.
texts = [ _tokenize( document ) for document in documents ]

# Optional step, currently disabled: remove words that appear only once
# from collections import defaultdict
# frequency = defaultdict(int)
# for text in texts:
#     for token in text:
#         frequency[token] += 1

# texts = [[token for token in text if frequency[token] > 1] for text in texts]

# Spot-check a single tokenized document.
pprint( texts[ 3 ] )

['school',
 'shooting',
 'texas.',
 'early',
 'reports',
 'not',
 'looking',
 'good.',
 'god',
 'bless',
 'all!']


## Create Dictionary

In [5]:
# Build the gensim Dictionary from the tokenized documents.
dictionary = corpora.Dictionary( texts )
# NOTE(review): presumably the directory part of DICTIONARY_PATH must already exist — confirm.
dictionary.save( DICTIONARY_PATH )  # store the dictionary, for future reference
# Printing shows the number of unique tokens and a sample of them.
print( dictionary )

Dictionary(47359 unique tokens: ['@nikkihaley', 'again.”', 'antónio', 'around', 'conflicts']...)


In [12]:
# token2id gets dict object
# print( dictionary.token2id )

## Create Corpus

In [6]:
# Convert every token list to its bag-of-words form via the dictionary.
corpus = list( map( dictionary.doc2bow, texts ) )
# Persist the corpus in Matrix Market format at CORPUS_PATH.
corpora.MmCorpus.serialize( CORPUS_PATH, corpus )

In [31]:
# tfidf = models.TfidfModel( corpus )

In [32]:
# corpus_tfidf = tfidf[ corpus ]
# corpus_tfidf[ 3 ]

In [7]:
# Fit an LSI model directly on the bag-of-words corpus (the TF-IDF cells in this notebook are commented out).
# NOTE(review): the trailing remark asks for a topic count > 300 and < 500, yet exactly 300 is passed — confirm intent.
lsi = models.LsiModel( corpus, id2word=dictionary, num_topics=300 ) # ideal number is > 300 and < 500

In [8]:
# Build the similarity index over the LSI-transformed corpus and report the elapsed time.
start_time = dp_util.get_time()
# lsi[ corpus ] applies the fitted model to every document in the corpus.
index_lsi = similarities.MatrixSimilarity( lsi[ corpus ] )
dp_util.print_time( start_time, dp_util.get_time() )

2018.07.09 13:07
2018.07.09 13:07
Time to process: [3.540031671524048] seconds


In [9]:
# Alternative queries, kept for quick experimentation:
#search_doc = "mexico build a wall"
#search_doc = "luxury condominium"
#search_doc = "basket of deplorables"
search_doc = "russia collusion"

# Tokenize the query the same way the documents were split: lowercase, whitespace.
search_tokens = search_doc.lower().split()
search_bow = dictionary.doc2bow( search_tokens )
print( search_bow )

# Project the bag-of-words query into the LSI topic space and show its first component.
search_lsi = lsi[ search_bow ]
print( search_lsi[ 0 ] )

[(118, 1), (344, 1)]
(0, 0.014219028417728746)


In [10]:
start_time = dp_util.get_time()

# Score the query against every indexed document.
similarity_scores = index_lsi[ search_lsi ]
# Pair each score with its document position, then rank highest score first.
similar_docs = sorted( enumerate( similarity_scores ), key=lambda pair: pair[ 1 ], reverse=True )

dp_util.print_time( start_time, dp_util.get_time() )

2018.07.09 13:07
2018.07.09 13:07
Time to process: [0.01387643814086914] seconds


In [11]:
# Show the seven highest-ranked ( index, score ) pairs, then the seven lowest-ranked.
print( similar_docs[ :7 ] )
print( similar_docs[ -7: ] )


[(961, 0.64740396), (1251, 0.57600957), (664, 0.56409812), (11235, 0.55780679), (2631, 0.54056376), (14318, 0.49824637), (2424, 0.46445632)]
[(11239, -0.1455529), (16332, -0.15065646), (17669, -0.151124), (15243, -0.15285259), (13763, -0.16303205), (18216, -0.1858997), (17167, -0.20957218)]


In [12]:
def print_docs( docs, source_docs=None ):
    """Print the score and the stripped text for each ( index, score ) pair.

    Args:
        docs: iterable of ( index, score ) pairs, e.g. a slice of `similar_docs`.
        source_docs: sequence that the indices refer to. When omitted, the
            notebook-level `documents` list is used.
    """
    if source_docs is None:
        source_docs = documents

    for doc in docs:

        # "%.4f" prints four decimals; the previous "%f.4" printed six decimals
        # followed by a literal ".4".
        print( "Score [%.4f] Tweet [%s]\n" % ( doc[ 1 ], source_docs[ doc[ 0 ] ].strip() ) )

In [13]:
# Print the seven highest-ranked tweets, a separator line, then the seven lowest-ranked.
print_docs( similar_docs[ :7 ] )
print( "-----------------------------" )
print_docs( similar_docs[ -7: ] )


Score [0.647404.4] Tweet [Congratulations to @ABC News for suspending Brian Ross for his horrendously inaccurate and dishonest report on the Russia Russia Russia Witch Hunt. More Networks and “papers” should do the same with their Fake News!]

Score [0.576010.4] Tweet [.@FoxNews Chris Wallace: “More evidence of Dem collusion with Russia than GOP” https://t.co/nACQerJSiu]

Score [0.564098.4] Tweet [Great jobs numbers and finally after many years rising wages- and nobody even talks about them. Only Russia Russia Russia despite the fact that after a year of looking there is No Collusion!]

Score [0.557807.4] Tweet [Twitter for Android,A classic - China just signs massive oil and gas deal with Russia giving Russia plenty of ammo to continue laughing in U.S. face.,05-20-2014 05:45:02,257,184,false,468628444192845825]

Score [0.540564.4] Tweet ["...money to Bill the Hillary Russian ""reset"" praise of Russia by Hillary or Podesta Russian Company. Trump Russia story is a hoax. #MAGA!"]

Score

## Summarize Top 7 Similar Docs

In [21]:
# Inspect one ranked entry; each entry is an ( index, score ) pair, not tweet text.
similar_docs[ 3 ]
#summarize( similar_docs[ 3 ], ratio=0.5 )

(11235, 0.55780679)

In [29]:
# The two initialisations were fused on one line, which is a SyntaxError; they are separate statements.
summaries = []

# Join the stripped text of the matches at rank positions 1 through 6 into one string.
# NOTE(review): the slice starts at 1, so the best match (position 0) is skipped even
# though the section heading says "Top 7" — confirm this is intended.
similar_tweets = " ".join( documents[ doc[ 0 ] ].strip() for doc in similar_docs[ 1:7 ] )
print( similar_tweets )
print()
    

.@FoxNews Chris Wallace: “More evidence of Dem collusion with Russia than GOP” https://t.co/nACQerJSiu Great jobs numbers and finally after many years rising wages- and nobody even talks about them. Only Russia Russia Russia despite the fact that after a year of looking there is No Collusion! Twitter for Android,A classic - China just signs massive oil and gas deal with Russia giving Russia plenty of ammo to continue laughing in U.S. face.,05-20-2014 05:45:02,257,184,false,468628444192845825 "...money to Bill the Hillary Russian ""reset"" praise of Russia by Hillary or Podesta Russian Company. Trump Russia story is a hoax. #MAGA!" Putin says Russia can’t allow a weakening of its nuclear deterrent—U.S. wants to reduce—are we crazy? "Director Clapper reiterated what everybody including the fake media already knows- there is ""no evidence"" of collusion w/ Russia and Trump."




In [36]:
# Summarize the joined tweets, requesting word_count=25 (the ratio-based variant is left commented out).
# NOTE(review): gensim.summarization is reportedly absent from gensim 4.x — confirm the installed gensim version.
summary = summarize( similar_tweets, word_count=25 )# ratio=0.5 )
print( summary )


.@FoxNews Chris Wallace: “More evidence of Dem collusion with Russia than GOP” https://t.co/nACQerJSiu Great jobs numbers and finally after many years rising wages- and nobody even talks about them.
