In [9]:
import features
import data
import numpy as np
import pandas as pd
import itertools as it
import warnings
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim.models import Phrases, Word2Vec
from gensim.corpora import Dictionary, MmCorpus
from gensim.models.ldamulticore import LdaMulticore

In [10]:
def extract_word_embeddings(authors, vector_size=300, window=4, epochs=10, workers=7):
    """Train skip-gram Word2Vec embeddings on the authors' phrase-merged texts.

    Every entry of each author's ``nosw`` is joined into one space-separated
    string, two successive gensim ``Phrases`` passes are applied, and a
    Word2Vec model is fitted on the resulting per-author word lists.

    Parameters
    ----------
    authors : dict
        Mapping whose values expose a ``nosw`` attribute: an iterable of
        string sequences (presumably one stop-word-free token list per
        text -- TODO confirm against ``data.get_processed_data``).
    vector_size : int, optional
        Embedding dimensionality (forwarded to Word2Vec as ``size``).
    window : int, optional
        Word2Vec context window.
    epochs : int, optional
        Epochs for the explicit ``model.train`` call; these run in addition
        to the training performed by the Word2Vec constructor.
    workers : int, optional
        Number of Word2Vec worker threads.

    Returns
    -------
    pandas.DataFrame
        One row per vocabulary term (index = term), ordered by descending
        corpus frequency; the columns are the embedding dimensions.
    """
    # One "sentence" per author; its items are whole texts (tokens re-joined
    # with spaces), not individual words.
    # NOTE(review): the first Phrases pass therefore scores co-occurrence of
    # entire texts rather than of words -- confirm this is intended.
    author_texts = [
        [" ".join(tokens) for tokens in author.nosw]
        for author in authors.values()
    ]

    # First phrase pass. Joining and re-splitting on single spaces flattens
    # each author's texts into one flat list of words.
    bigram_model = Phrases(author_texts, min_count=1, threshold=1)
    bigram_sentences = [
        " ".join(bigram_model[texts]).split(" ") for texts in author_texts
    ]

    # Second phrase pass, this time over word-level sentences.
    trigram_model = Phrases(bigram_sentences)
    trigram_sentences = [
        " ".join(trigram_model[words]).split(" ") for words in bigram_sentences
    ]

    # Passing the corpus to the constructor already builds the vocabulary and
    # trains with gensim's default epoch count; the explicit train() call then
    # continues training for `epochs` further passes.
    model = Word2Vec(
        trigram_sentences,
        size=vector_size,
        window=window,
        min_count=1,
        sg=1,
        workers=workers,
    )
    model.train(trigram_sentences, total_examples=model.corpus_count, epochs=epochs)

    # Most frequent terms first; sorted() is stable, so ties keep vocab order.
    by_frequency = sorted(model.wv.vocab.items(), key=lambda entry: -entry[1].count)
    ordered_terms = [term for term, _ in by_frequency]
    term_indices = [vocab_entry.index for _, vocab_entry in by_frequency]

    # Reorder the embedding matrix rows to match the frequency ordering.
    return pd.DataFrame(model.wv.vectors[term_indices, :], index=ordered_terms)

In [11]:
# Load the processed corpus from the `data` module. extract_word_embeddings()
# consumes it as a mapping whose values expose a `.nosw` attribute.
authors = data.get_processed_data()

In [12]:
# Keep the trained embeddings in a named variable so later cells can reuse
# them without re-running the multi-epoch training; the bare last expression
# still renders the (truncated) DataFrame.
word_vectors = extract_word_embeddings(authors)
word_vectors

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
new,-0.369784,0.141009,0.055778,-0.418144,-0.024401,0.208131,-0.420504,0.324528,-0.173325,-0.149505,...,0.075717,-0.181482,-0.358523,-0.247344,0.030666,0.335571,-0.116987,0.080384,-0.088567,0.037428
say,0.434162,-0.021810,-0.047034,-0.261422,-0.434942,0.450776,-0.017344,0.020008,0.352316,-0.314236,...,0.325750,0.017092,0.082372,0.557990,0.366484,0.282300,0.011835,-0.561864,0.221967,0.215300
trump,-0.303410,-0.322475,0.389426,0.103882,0.138253,-0.019693,-0.091739,0.061032,-0.263646,-1.180605,...,-0.142401,-0.441269,0.045012,0.228855,0.121923,0.135348,-0.327279,-0.379233,0.326957,0.332165
get,-0.506700,0.154490,0.466314,0.307095,-0.203990,-0.127998,-0.027809,-0.329827,0.589091,0.290373,...,-0.202997,0.344854,0.092286,0.025407,-0.043116,-0.051287,0.329831,0.315833,0.116165,0.056847
make,-0.172733,0.263569,-0.477806,0.141618,0.667739,0.193659,0.373511,0.681837,-0.155508,0.134450,...,-0.259661,-0.583516,-0.796639,0.189896,0.280515,0.368113,-0.719639,-0.039583,0.259604,-0.394774
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
looot,-0.033626,-0.027094,0.061696,-0.113536,0.046585,0.067574,-0.011137,-0.069455,0.048118,-0.082641,...,-0.150200,-0.077837,0.010695,0.115523,0.053264,0.089651,0.022475,-0.028893,0.021275,0.003953
waterparks,-0.032632,-0.000278,0.061473,-0.112335,0.052731,0.089424,-0.014229,-0.054276,0.036335,-0.105293,...,-0.133158,-0.069423,0.019447,0.133627,0.009680,0.116290,0.014332,-0.007247,0.027178,0.019263
siren,-0.035896,-0.028629,0.050580,-0.109280,0.054341,0.091093,-0.014018,-0.051028,0.052495,-0.095744,...,-0.131747,-0.075568,-0.004079,0.120672,0.034092,0.105262,0.033650,-0.030997,0.020885,0.000142
decapitated,-0.018493,-0.024038,0.064579,-0.094584,0.046242,0.072547,-0.015697,-0.039569,0.023840,-0.092815,...,-0.127796,-0.073799,0.017202,0.113157,0.047215,0.088760,0.009108,-0.035587,0.034211,0.009424
