In [None]:
import pandas as pd
import qgrid
from datetime import datetime
from gensim import corpora, models, similarities
import numpy as np
from nltk.corpus import stopwords
from collections import defaultdict

# English stop-word list from NLTK; consulted by the token filter further down.
stoplist = stopwords.words('english')

# Load the raw tweet export, then in a single chain:
#   * derive the posting hour from the 'created_at' timestamps,
#   * drop the first two and the last character of every tweet text
#     (presumably the b'...' wrapper left by a bytes repr -- TODO confirm),
#   * discard the columns that are no longer needed.
df = pd.read_csv('realDonaldTrump_tweets.csv')
df = (
    df.assign(
        hour=pd.to_datetime(df['created_at']).dt.hour,
        text=df['text'].str[2:-1],
    )
    .drop(columns=['id', 'created_at'])
)

# Plain Python list of tweet texts, in the same order as the frame's rows.
twitts_list = df['text'].tolist()

In [None]:
# TODO: replace mentions and hashtags
def preprocess(twitts_list=twitts_list, frequency=None):
    """Tokenise documents, drop stop words and remove tokens seen only once.

    Every document is lower-cased and split on whitespace, and tokens present
    in the module-level ``stoplist`` are discarded. Afterwards only tokens
    whose count in ``frequency`` is greater than 1 are kept.

    Args:
        twitts_list: Iterable of document strings. The default is the
            module-level ``twitts_list`` as it was when this ``def`` was
            executed (defaults are bound at definition time), so re-run this
            cell after reloading the data.
        frequency: Optional mapping ``token -> count`` used for the rarity
            filter. When ``None``, the counts are computed from the documents
            passed in. Pass the mapping returned by an earlier call to filter
            new documents against the original corpus counts.

    Returns:
        A ``(texts, frequency)`` tuple: ``texts`` is one list of kept tokens
        per input document (same order), ``frequency`` is the count mapping
        that was used (the freshly built ``defaultdict`` or the one passed in).
    """
    # Build the lookup set once: membership tests against the raw list would
    # rescan it for every single token.
    stop_words = set(stoplist)
    texts = [[word for word in document.lower().split() if word not in stop_words]
             for document in twitts_list]

    if frequency is None:
        frequency = defaultdict(int)
        for text in texts:
            for token in text:
                frequency[token] += 1

    # .get() instead of frequency[token]: indexing a defaultdict inserts a
    # zero entry for every unseen token, which would silently grow a
    # caller-supplied corpus frequency table when filtering query documents.
    return [[token for token in text if frequency.get(token, 0) > 1] for text in texts], frequency

from pprint import pprint  # pretty-printer

# Run the preprocessing over the default tweet list; keep both the filtered
# token lists and the token counts so later cells can reuse the counts.
texts, frequency = preprocess()

# Sanity check: show the tokens that survived for the first tweet.
pprint(texts[0])

In [None]:
# `corpora` is already imported in the first cell; this repeat is harmless.
from gensim import corpora
# Build the gensim Dictionary from the preprocessed token lists.
dictionary = corpora.Dictionary(texts)
# NOTE(review): absolute '/tmp' path with a file name that looks carried over
# from the gensim tutorial -- confirm it is the intended location.
dictionary.save('/tmp/deerwester.dict')  # store the dictionary, for future reference
# Print the dictionary's summary representation.
print(dictionary)

In [None]:
# Turn every token list into its bag-of-words vector using the dictionary,
# keeping the documents in their original order.
corpus = list(map(dictionary.doc2bow, texts))

# Write the vectors to 'corpus.mm' in the working directory.
corpora.MmCorpus.serialize('corpus.mm', corpus)  # store to disk, for later use

In [None]:
# `models` is already imported in the first cell; this repeat is harmless.
from gensim import models
# Fit a TF-IDF model on the bag-of-words corpus.
tfidf = models.TfidfModel(corpus)
# Apply the fitted model to that same corpus to obtain its TF-IDF-weighted form.
corpus_tfidf = tfidf[corpus]

In [None]:
# Fit an LSI model on the TF-IDF-weighted corpus, passing the dictionary as
# id2word. No num_topics is given, so the library default applies.
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary) # initialize an LSI transformation

In [None]:
# Show topic number 1 of the LSI model, limited to 20 terms (topn=20).
# As the cell's last expression, the returned value is what the notebook displays.
lsi.print_topic(1, topn=20)

In [None]:
# Two sample query documents; only doc2 is used below.
doc = ['Despite what you hear in the press, healthcare is coming along great. We are talking to many groups and it will end in a beautiful picture!']
doc2 = ['LinkedIn Workforce Report: January and February were the strongest consecutive months for hiring since August and September 2015']

# Filter the query with the corpus token counts so it is tokenised and pruned
# like the indexed tweets. `word` holds one token list per query document;
# the returned count mapping (`temp`) is not needed here.
word, temp = preprocess(doc2, frequency)

# word[0] is already the token list for the single query document. Indexing
# further (word[0][0]) would keep only its first token and would raise
# IndexError when every token has been filtered out.
vec_bow = dictionary.doc2bow(word[0])

# NOTE(review): the LSI model was fitted on corpus_tfidf but receives a raw
# bag-of-words vector here, matching how the similarity index is built from
# lsi[corpus]. If TF-IDF weighting was intended, change both places together.
vec_lsi = lsi[vec_bow] # convert the query to LSI space

In [None]:
# Build the similarity index over the corpus projected through the LSI model.
# NOTE(review): lsi was fitted on corpus_tfidf (see the LsiModel cell) but is
# applied to the raw bag-of-words corpus here, as is the query vector in the
# previous cell -- confirm whether lsi[corpus_tfidf] was intended; if so, the
# query must be passed through tfidf as well.
index = similarities.MatrixSimilarity(lsi[corpus])

In [None]:
# Score the query against every indexed tweet (one score per tweet).
sims = index[vec_lsi] # perform a similarity query against the corpus

# Summary statistics of the raw scores; print() applies str() to each value.
print('max score:', sims.max(), sep='')
print('mean score:', np.mean(sims), sep='')
print('median score: ', np.median(sims), sep='')

# Re-bind `sims` to (tweet position, score) pairs ordered from the highest
# score to the lowest, so the first pair identifies the best match.
sims = sorted(enumerate(sims), key=lambda pair: -pair[1])

print('most similar tweet: ' + twitts_list[sims[0][0]])
print('original: ' + doc2[0])