In [1]:
from gensim.models import Phrases
import data
from utils import go_to_project_root
import spacy
import pandas as pd
import itertools as it

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from gensim.corpora import Dictionary, MmCorpus
from gensim.models.ldamulticore import LdaMulticore

from sklearn.manifold import TSNE 
import pyLDAvis
import pyLDAvis.gensim
import warnings

from bokeh.plotting import figure, show, output_notebook
from bokeh.models import HoverTool, ColumnDataSource, value
# import cPickle as pickle

# Load the spaCy "en_core_web_md" pipeline.
# NOTE(review): `nlp` is not referenced by any of the visible cells — confirm it is still needed.
nlp = spacy.load("en_core_web_md")
# Processed dataset from the project-local `data` module. Later cells iterate it
# with `.values()` and read a `.nosw` attribute on every entry.
auths = data.get_processed_data()
# Route Bokeh output to the notebook so `show(...)` renders inline.
output_notebook()

In [2]:
# One entry per value in `auths`: the list of that entry's `nosw` items, each
# re-joined into a single space-delimited string.
# NOTE(review): presumably `nosw` holds stop-word-free token lists — confirm against the `data` module.
sentences = []
for auth in auths.values():
    sentences.append([" ".join(tw) for tw in auth.nosw])

In [3]:
def trigram_bow_generator(sentences, dictionary=None):
    """
    lazily convert space-delimited sentences into
    bag-of-words representations

    sentences  -- iterable of strings; each one is split on single spaces
    dictionary -- object exposing doc2bow(tokens); when omitted, the
                  notebook-level trigram_dictionary is used

    yields the result of dictionary.doc2bow for each sentence, in order
    """

    if dictionary is None:
        # Looked up when iteration starts (not when the generator is created),
        # so the dictionary only has to exist by the time the corpus is consumed.
        dictionary = trigram_dictionary

    for sentence in sentences:
        yield dictionary.doc2bow(sentence.split(" "))
        
def explore_topic(topic_number, topn=25, lda_model=None):
    """
    accept a user-supplied topic number and
    print out a formatted list of the top terms

    topic_number -- id of the topic to show
    topn         -- how many terms to print (previously ignored: the call
                    below always asked for 25)
    lda_model    -- object exposing show_topic(topic_number, topn=...);
                    when omitted, the notebook-level lda is used
    """

    if lda_model is None:
        lda_model = lda

    for term, frequency in lda_model.show_topic(topic_number, topn=topn):
        print (term, round(frequency, 3))

def get_related_terms(token, topn=10, w2v_model=None):
    """
    look up the topn most similar terms to token
    and print them as a formatted list

    token     -- vocabulary term to query
    topn      -- number of neighbours to print
    w2v_model -- trained model exposing .wv.most_similar; when omitted,
                 the notebook-level model is used
    """

    if w2v_model is None:
        w2v_model = model

    # Query through .wv: calling most_similar directly on the model object is
    # deprecated in gensim.
    neighbours = w2v_model.wv.most_similar(positive=[token], topn=topn)

    for word, similarity in neighbours:
        print("{: >20} {: >20}".format(word, round(similarity, 3)))

def word_algebra(add=None, subtract=None, topn=1, w2v_model=None):
    """
    combine the vectors associated with the words provided
    in add= and subtract=, look up the topn most similar
    terms to the combined vector, and print the result(s)

    add       -- terms contributing positively (default: none)
    subtract  -- terms contributing negatively (default: none)
    topn      -- number of result terms to print
    w2v_model -- trained model exposing .wv.most_similar; when omitted,
                 the notebook-level model is used
    """
    # None sentinels replace the former mutable [] defaults.
    positive_terms = [] if add is None else list(add)
    negative_terms = [] if subtract is None else list(subtract)

    if w2v_model is None:
        w2v_model = model

    # Query through .wv: calling most_similar directly on the model object is
    # deprecated in gensim.
    answers = w2v_model.wv.most_similar(
        positive=positive_terms, negative=negative_terms, topn=topn
    )

    for term, similarity in answers:
        print( term)

In [4]:
%%time
# --- Phrase detection --------------------------------------------------------
# NOTE(review): every inner element of `sentences` is an already space-joined
# string, so this first Phrases pass treats those whole strings as its tokens —
# confirm that is intended. The join/split below is what turns the result back
# into individual word tokens, so it must stay.
bigram_model = Phrases(sentences, min_count=1, threshold=1)

bigram_sentences = [
    " ".join(bigram_model[sentence]).split(" ") for sentence in sentences
]

trigram_model = Phrases(bigram_sentences)

# Kept as space-joined strings: trigram_bow_generator and the later Word2Vec
# cell both split them on " " again.
trigram_sentences = [
    " ".join(trigram_model[bigram_sentence]) for bigram_sentence in bigram_sentences
]

# --- Dictionary and bag-of-words corpus ---------------------------------------
trigram_dictionary = Dictionary([sentence.split(" ") for sentence in trigram_sentences])
# Drop tokens seen in fewer than 10 documents or in more than 40% of them,
# then reassign ids so they are contiguous.
trigram_dictionary.filter_extremes(no_below=10, no_above=0.4)
trigram_dictionary.compactify()

corpus_path = "data/interim/misc/corpus"
MmCorpus.serialize(corpus_path, trigram_bow_generator(trigram_sentences))

# load the finished bag-of-words corpus from disk
trigram_bow_corpus = MmCorpus(corpus_path)

# --- LDA topic model -----------------------------------------------------------
# `warnings` is imported in the first cell; it is not re-imported here.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')

    # workers => sets the parallelism, and should be
    # set to your number of physical cores minus one
    # random_state => fixed seed so topics are comparable between runs
    # (gensim notes multi-worker scheduling can still introduce some variation)
    lda = LdaMulticore(trigram_bow_corpus,
                       num_topics=50,
                       id2word=trigram_dictionary,
                       workers=3,
                       random_state=42)

lda.save("data/interim/misc/lda-model")

CPU times: user 3.86 s, sys: 163 ms, total: 4.02 s
Wall time: 2.93 s


In [5]:
# Reload the persisted topic model, then build the pyLDAvis payload from the
# model together with the bag-of-words corpus and dictionary it was trained on.
lda_model_path = "data/interim/misc/lda-model"
lda = LdaMulticore.load(lda_model_path)
LDAvis_prepared = pyLDAvis.gensim.prepare(
    lda,
    trigram_bow_corpus,
    trigram_dictionary,
)

In [67]:
# Render the prepared topic-model visualisation as this cell's output.
pyLDAvis.display(LDAvis_prepared)

In [199]:
from gensim.models import Word2Vec
# Location where the embedding model is persisted.
word2vec_filepath = "data/interim/misc/word2vec"

# Word2Vec consumes token lists, so break every phrase-joined sentence apart
# on single spaces again.
sentences_split = [joined.split(" ") for joined in trigram_sentences]

# sg=1 selects skip-gram; 80-dimensional vectors, context window of 4,
# and min_count=1 so no token is discarded.
# NOTE(review): per gensim's docs, passing the corpus to the constructor already
# builds the vocabulary and trains with the default epoch count — confirm.
w2v_params = dict(size=80, window=4, min_count=1, sg=1, workers=4)
model = Word2Vec(sentences_split, **w2v_params)
model.save(word2vec_filepath)  # checkpoint before the extra passes

# Ten more passes over the same corpus, then overwrite the checkpoint.
model.train(
    sentences_split,
    total_examples=model.corpus_count,
    epochs=10,
)
model.save(word2vec_filepath)

In [196]:
# (term, row index into the vector matrix, corpus count) for every vocabulary entry.
vocab_entries = [
    (term, voc.index, voc.count)
    for term, voc in model.wv.vocab.items()
]

# Most frequent terms first; the sort is stable, so ties keep vocabulary order.
ordered_vocab = sorted(vocab_entries, key=lambda entry: -entry[2])
ordered_terms, term_indices, term_counts = zip(*ordered_vocab)

# One row per term, in the frequency order established above.
word_vectors = pd.DataFrame(
    model.wv.vectors[term_indices, :],
    index=ordered_terms,
)

# word_vectors

In [197]:
# Print the terms most similar to "donald_trump" (get_related_terms defaults to the top 10).
get_related_terms("donald_trump")

president_elect                0.824
               trump                0.754
                dirk                 0.74
             diggler                0.732
        barack_obama                0.731
            kim_jong                 0.72
        alec_baldwin                 0.72
             skripal                0.718
        william_barr                0.699
                 jab                0.696
  import sys


In [198]:
# Combine the two vectors (nothing subtracted) and print the 5 terms closest to the result.
word_algebra(add=["president_trump", "bernie_sanders"], topn=5)

take_aim
obesity
zelensky
majority
donal
  import sys


In [193]:
# word_vectors is ordered by descending term count, so head(5000) keeps the
# 5,000 most frequent terms.
tsne_input = word_vectors.head(5000)

# t-SNE is stochastic; a fixed seed makes the 2-D layout reproducible between runs.
tsne = TSNE(random_state=42)
tsne_vectors = pd.DataFrame(
    tsne.fit_transform(tsne_input.values),
    index=pd.Index(tsne_input.index),
    columns=[u'x_coord', u'y_coord'],
)

In [194]:
# Wrap the 2-D coordinates so Bokeh glyphs and the hover tool can read them.
plot_data = ColumnDataSource(tsne_vectors)

# 800x800 canvas; scrolling zooms by default.
tsne_plot = figure(
    title="t-SNE Word Embeddings",
    plot_width=800,
    plot_height=800,
    tools="pan, wheel_zoom, box_zoom,box_select, reset",
    active_scroll="wheel_zoom",
)

# Hover shows the source's "index" column.
# NOTE(review): presumably that column carries the term labels from the DataFrame index — confirm.
hover = HoverTool(tooltips="@index")
tsne_plot.add_tools(hover)

# One translucent blue circle per term; the outline turns black on hover.
tsne_plot.circle(
    "x_coord",
    "y_coord",
    source=plot_data,
    color="blue",
    line_alpha=0.2,
    fill_alpha=0.1,
    size=10,
    hover_line_color="black",
)

# Cosmetics: larger title, both axes shown, no grid lines, no outline.
tsne_plot.title.text_font_size = value("16pt")
for axis in (tsne_plot.xaxis, tsne_plot.yaxis):
    axis.visible = True
tsne_plot.grid.grid_line_color = None
tsne_plot.outline_line_color = None

show(tsne_plot)