In [2]:
import spacy
import pandas as pd
import itertools as it

In [3]:
nlp = spacy.load('en')

In [4]:
from gensim.models import Phrases
from gensim.models.word2vec import LineSentence



In [5]:
def punct_space(token):
    """
    helper function to eliminate tokens
    that are pure punctuation or whitespace
    """
    
    return token.is_punct or token.is_space

def line_review(filename):
    """
    generator function to read in from the file
    and un-escape the original line breaks in the text
    """
    
    with codecs.open(filename, encoding='utf_8') as f:
        for review in f:
            yield review.replace('\\n', '\n')
            
def lemmatized_sentence_corpus(filename):
    """
    generator function to use spaCy to parse,
    lemmatize the text, and yield sentences
    """
    
    for parsed_review in nlp.pipe(line_review(filename),
                                  batch_size=10000, n_threads=4):
        
        for sent in parsed_review.sents:
            yield u' '.join([token.lemma_ for token in sent
                             if not punct_space(token)])

In [6]:
ticker_list = ['AA','ACEYY','ADSK','ARCW','ASUUY','BTX','CAJ','CAT','CGIX','CRL','CSBR','DDD','DE','DVMT','FUJIY','GE',
'HAGE','HPE','HPQ','INCR','JNJ','LJPC','LMT','LNVGY','LOGI','MEDP','MMM','MSFT','MTLS','ONVO','PRAH','PRLB','PRXL','Q',
'QUCPF','RNSHF','SGLB','SIEGY','SSNNF','TKSTF','TOSYY','TRMB','VJET','XONE','ZBH']

total_content = ''

for ticker in ticker_list:
    fh = open('C:/Users/Daniel/Documents/Yewno/company_'+ticker,'rb')
    content = str(fh.read())
    total_content = total_content+content
    fh.close()
    

In [9]:
a = total_content
a_unicode = str(a)
parsed_a = nlp(a_unicode)
print(parsed_a)

b'Ticker:AA\r\nb\'Alcoa Corporation390 Park AvenueNew York, NY 10022United States212-518-5400http://www.alcoa.comSector:\\xc2\\xa0Basic MaterialsIndustry:\\xc2\\xa0AluminumFull Time Employees:\\xc2\\xa014,000Key ExecutivesNameTitlePayExercisedAgeMr. Roy C. HarveyChief Exec. Officer, Pres and Director2.51MN/A42Mr. William F. OplingerChief Financial Officer and Exec. VP1.49MN/A50Mr. T\\xc3\\xb3mas M\\xc3\\xa1r SigurdssonChief Operating Officer and Exec. VP1.12MN/A48Ms. Leigh Ann C. FisherChief Admin. Officer and Exec. VP734.15kN/A50Mr. Jeffrey D. HeeterExec. VP, Gen. Counsel and Sec.575.06kN/A51Amounts are as of December 31, 2016 and compensation values are for the last fiscal year ending on that date. Pay is salary, bonuses, etc. Exercised is the value of options exercised during the fiscal year. Currency in USD.DescriptionAlcoa Corporation produces and sells bauxite, alumina, and aluminum products. It operates through six segments, Bauxite, Alumina, Aluminum, Cast Products, Energy, and

In [10]:
fh = open(r"C:/Users/Daniel/Documents/Yewno/total_content",'w')
fh.write(a_unicode)
fh.close()

In [11]:
for num, sentence in enumerate(parsed_a.sents):
    print('Sentence {}:'.format(num + 1))
    print(sentence) 
    print('') 

Sentence 1:
b'Ticker:AA\r\nb\'Alcoa Corporation390 Park AvenueNew York, NY 10022United States212-518-5400http://www.alcoa.comSector:\\xc2\\xa0Basic MaterialsIndustry:\\xc2\\xa0AluminumFull Time Employees:\\xc2\\xa014,000Key ExecutivesNameTitlePayExercisedAgeMr.

Sentence 2:
Roy C. HarveyChief Exec.

Sentence 3:
Officer, Pres and Director2.51MN/A42Mr.

Sentence 4:
William F. OplingerChief Financial Officer and Exec.

Sentence 5:
VP1.49MN/A50Mr

Sentence 6:
. T\\xc3\\xb3mas M\\xc3\\xa1r SigurdssonChief Operating Officer and Exec.

Sentence 7:
VP1.12MN/A48Ms.

Sentence 8:
Leigh Ann C. FisherChief Admin.

Sentence 9:
Officer and Exec.

Sentence 10:
VP734.15kN/A50Mr.

Sentence 11:
Jeffrey D. HeeterExec.

Sentence 12:
VP, Gen. Counsel and Sec.575.06kN/A51Amounts are as of December 31, 2016 and compensation values are for the last fiscal year ending on that date.

Sentence 13:
Pay is salary, bonuses, etc.

Sentence 14:
Exercised is the value of options exercised during the fiscal year.

Sente

In [12]:
for num, entity in enumerate(parsed_a.ents):
    print('Entity {}:'.format(num + 1), entity, '-', entity.label_)
    print('')

Entity 1: AvenueNew York - GPE

Entity 2: States212-518-5400http://www.alcoa.comSector:\\xc2\\xa0Basic - CARDINAL

Entity 3: MaterialsIndustry:\\xc2\\xa0AluminumFull Time Employees:\\xc2\\xa014,000Key ExecutivesNameTitlePayExercisedAgeMr - ORG

Entity 4: Roy C. HarveyChief Exec - PERSON

Entity 5: Pres - PERSON

Entity 6: A42Mr - PERSON

Entity 7: William F. OplingerChief - PERSON

Entity 8: Exec - ORG

Entity 9: M\\xc3\\xa1r SigurdssonChief - ORG

Entity 10: Exec - ORG

Entity 11: Leigh Ann C. FisherChief Admin - PERSON

Entity 12: Exec - ORG

Entity 13: Jeffrey D. HeeterExec - PERSON

Entity 14: Counsel - PERSON

Entity 15: December 31, 2016 - DATE

Entity 16: the last fiscal year ending on that date - DATE

Entity 17: the fiscal year - DATE

Entity 18: USD.DescriptionAlcoa Corporation - ORG

Entity 19: six - CARDINAL

Entity 20: Bauxite - GPE

Entity 21: Alumina - GPE

Entity 22: Cast Products - ORG

Entity 23: Energy - ORG

Entity 24: Rolled Products - ORG

Entity 25: AA - GPE

Ent

In [13]:
# part of speech
token_text = [token.orth_ for token in parsed_a]
token_pos = [token.pos_ for token in parsed_a]

pd.DataFrame(list(zip(token_text, token_pos)),
             columns=['token_text', 'part_of_speech'])

Unnamed: 0,token_text,part_of_speech
0,b'Ticker,NOUN
1,:,PUNCT
2,AA\r\nb\'Alcoa,PROPN
3,Corporation390,PROPN
4,Park,PROPN
5,AvenueNew,PROPN
6,York,PROPN
7,",",PUNCT
8,NY,PROPN
9,10022United,VERB


In [14]:
token_lemma = [token.lemma_ for token in parsed_a]
token_lemma

["b'ticker",
 ':',
 "aa\\r\\nb\\'alcoa",
 'corporation390',
 'park',
 'avenuenew',
 'york',
 ',',
 'ny',
 '10022unit',
 'states212',
 '-',
 '518',
 '-',
 '5400http://www.alcoa.comsector:\\\\xc2\\\\xa0basic',
 'materialsindustry:\\\\xc2\\\\xa0aluminumfull',
 'time',
 'employees:\\\\xc2\\\\xa014,000key',
 'executivesnametitlepayexercisedagemr',
 '.',
 'roy',
 'c.',
 'harveychief',
 'exec',
 '.',
 'officer',
 ',',
 'pres',
 'and',
 'director2.51mn',
 '/',
 'a42mr',
 '.',
 'william',
 'f.',
 'oplingerchief',
 'financial',
 'officer',
 'and',
 'exec',
 '.',
 'vp1.49mn',
 '/',
 'a50mr',
 '.',
 't\\\\xc3\\\\xb3ma',
 'm\\\\xc3\\\\xa1r',
 'sigurdssonchief',
 'operate',
 'officer',
 'and',
 'exec',
 '.',
 'vp1.12mn',
 '/',
 'a48ms',
 '.',
 'leigh',
 'ann',
 'c.',
 'fisherchief',
 'admin',
 '.',
 'officer',
 'and',
 'exec',
 '.',
 'vp734.15kn',
 '/',
 'a50mr',
 '.',
 'jeffrey',
 'd.',
 'heeterexec',
 '.',
 'vp',
 ',',
 'gen.',
 'counsel',
 'and',
 'sec.575.06kn',
 '/',
 'a51amount',
 'be',
 'as',

In [15]:
token_nopunc = [token.is_punct for token in parsed_a]

In [16]:
words = [token for token in parsed_a if token.is_punct == False]
words

[b'Ticker,
 AA\r\nb\'Alcoa,
 Corporation390,
 Park,
 AvenueNew,
 York,
 NY,
 10022United,
 States212,
 518,
 5400http://www.alcoa.comSector:\\xc2\\xa0Basic,
 MaterialsIndustry:\\xc2\\xa0AluminumFull,
 Time,
 Employees:\\xc2\\xa014,000Key,
 ExecutivesNameTitlePayExercisedAgeMr,
 Roy,
 C.,
 HarveyChief,
 Exec,
 Officer,
 Pres,
 and,
 Director2.51MN,
 A42Mr,
 William,
 F.,
 OplingerChief,
 Financial,
 Officer,
 and,
 Exec,
 VP1.49MN,
 A50Mr,
 T\\xc3\\xb3mas,
 M\\xc3\\xa1r,
 SigurdssonChief,
 Operating,
 Officer,
 and,
 Exec,
 VP1.12MN,
 A48Ms,
 Leigh,
 Ann,
 C.,
 FisherChief,
 Admin,
 Officer,
 and,
 Exec,
 VP734.15kN,
 A50Mr,
 Jeffrey,
 D.,
 HeeterExec,
 VP,
 Gen.,
 Counsel,
 and,
 Sec.575.06kN,
 A51Amounts,
 are,
 as,
 of,
 December,
 31,
 2016,
 and,
 compensation,
 values,
 are,
 for,
 the,
 last,
 fiscal,
 year,
 ending,
 on,
 that,
 date,
 Pay,
 is,
 salary,
 bonuses,
 etc,
 Exercised,
 is,
 the,
 value,
 of,
 options,
 exercised,
 during,
 the,
 fiscal,
 year,
 Currency,
 in,
 USD.

In [17]:
from gensim.models import Phrases
from gensim.models.word2vec import LineSentence

In [18]:
import os
import codecs
unigram_sentences_filepath = os.path.join('./','unigram_sentences_all.txt')

In [19]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to execute data prep yourself.
if 0 == 0:

    with codecs.open(unigram_sentences_filepath, 'w', encoding='utf_8') as f:
        for sentence in lemmatized_sentence_corpus(r"C:/Users/Daniel/Documents/Yewno/total_content"):
            f.write(sentence + '\n')

Wall time: 1.57 s


In [20]:
unigram_sentences = LineSentence(unigram_sentences_filepath)

In [21]:
for sentence in lemmatized_sentence_corpus(r"C:/Users/Daniel/Documents/Yewno/total_content"):
    print(sentence)

b'ticker aa\r b\'alcoa corporation390 park avenuenew york ny 10022unit states212 518 5400http://www.alcoa.comsector:\\xc2\\xa0basic materialsindustry:\\xc2\\xa0aluminumfull time employees:\\xc2\\xa014,000key executivesnametitlepayexercisedagemr
roy c. harveychief exec
officer pres and director2.51mn a42mr
william f. oplingerchief financial officer and exec
vp1.49mn a50mr
t\\xc3\\xb3ma m\\xc3\\xa1r sigurdssonchief operate officer and exec
vp1.12mn a48ms
leigh ann c. fisherchief admin
officer and exec
vp734.15kn a50mr
jeffrey d. heeterexec
vp gen. counsel and sec.575.06kn a51amount be as of december 31 2016 and compensation value be for the last fiscal year end on that date
pay be salary bonus etc
exercise be the value of option exercise during the fiscal year
currency in usd.descriptionalcoa corporation produce and sell bauxite alumina and aluminum product
-PRON- operate through six segment bauxite alumina aluminum cast products energy and rolled products
the aa also offer aluminum cast

In [22]:
for unigram_sentence in unigram_sentences:
    print(u' '.join(unigram_sentence))
    print(u'')

b'ticker aa\r b\'alcoa corporation390 park avenuenew york ny 10022unit states212 518 5400http://www.alcoa.comsector:\\xc2\\xa0basic materialsindustry:\\xc2\\xa0aluminumfull time employees:\\xc2\\xa014,000key executivesnametitlepayexercisedagemr

roy c. harveychief exec

officer pres and director2.51mn a42mr

william f. oplingerchief financial officer and exec

vp1.49mn a50mr

t\\xc3\\xb3ma m\\xc3\\xa1r sigurdssonchief operate officer and exec

vp1.12mn a48ms

leigh ann c. fisherchief admin

officer and exec

vp734.15kn a50mr

jeffrey d. heeterexec

vp gen. counsel and sec.575.06kn a51amount be as of december 31 2016 and compensation value be for the last fiscal year end on that date

pay be salary bonus etc

exercise be the value of option exercise during the fiscal year

currency in usd.descriptionalcoa corporation produce and sell bauxite alumina and aluminum product

-PRON- operate through six segment bauxite alumina aluminum cast products energy and rolled products

the aa also off

In [23]:
bigram_model_filepath = os.path.join('./', 'bigram_model_all')

In [24]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to execute modeling yourself.
if 0 == 0:

    bigram_model = Phrases(unigram_sentences)

    bigram_model.save(bigram_model_filepath)
    
# load the finished model from disk
bigram_model = Phrases.load(bigram_model_filepath)

Wall time: 82.6 ms


In [25]:
bigram_sentences_filepath = os.path.join("./",
                                         'bigram_sentences_all.txt')

In [26]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to execute data prep yourself.
if 0 == 0:

    with codecs.open(bigram_sentences_filepath, 'w', encoding='utf_8') as f:
        
        for unigram_sentence in unigram_sentences:
            
            bigram_sentence = u' '.join(bigram_model[unigram_sentence])
            
            f.write(bigram_sentence + '\n')



Wall time: 111 ms


In [27]:
bigram_sentences = LineSentence(bigram_sentences_filepath)

In [28]:
for bigram_sentence in it.islice(bigram_sentences, 0, 240):
    print(u' '.join(bigram_sentence))
    print(u'')

b'ticker aa\r b\'alcoa corporation390 park avenuenew york ny 10022unit states212 518 5400http://www.alcoa.comsector:\\xc2\\xa0basic materialsindustry:\\xc2\\xa0aluminumfull time employees:\\xc2\\xa014,000key executivesnametitlepayexercisedagemr

roy c. harveychief exec

officer_pres and director2.51mn a42mr

william f. oplingerchief financial_officer and exec

vp1.49mn a50mr

t\\xc3\\xb3ma m\\xc3\\xa1r sigurdssonchief operate officer and exec

vp1.12mn a48ms

leigh ann c. fisherchief admin

officer and exec

vp734.15kn a50mr

jeffrey d. heeterexec

vp_gen. counsel and sec.575.06kn a51amount be as_of december_31 2016_and compensation_value be for the_last fiscal_year end_on that_date

pay_be salary_bonus etc

exercise_be the_value of_option exercise_during the_fiscal year

currency_in usd.descriptionalcoa corporation produce and sell bauxite alumina and aluminum product

-PRON-_operate through six segment bauxite alumina aluminum cast products energy and rolled products

the aa also_off

In [29]:
trigram_model_filepath = os.path.join('./',
                                      'trigram_model_all')

In [30]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to execute modeling yourself.
if 0 == 0:

    trigram_model = Phrases(bigram_sentences)

    trigram_model.save(trigram_model_filepath)
    
# load the finished model from disk
trigram_model = Phrases.load(trigram_model_filepath)

Wall time: 80 ms


In [31]:
trigram_sentences_filepath = os.path.join("./",
                                          'trigram_sentences_all.txt')

In [32]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to execute data prep yourself.
if 0 == 0:

    with codecs.open(trigram_sentences_filepath, 'w', encoding='utf_8') as f:
        
        for bigram_sentence in bigram_sentences:
            
            trigram_sentence = u' '.join(trigram_model[bigram_sentence])
            
            f.write(str(trigram_sentence) + '\n')



Wall time: 110 ms


In [33]:
trigram_sentences = LineSentence(trigram_sentences_filepath)

In [34]:
for trigram_sentence in it.islice(trigram_sentences, 0, 240):
    print(u' '.join(trigram_sentence))
    print(u'')

b'ticker aa\r b\'alcoa corporation390 park avenuenew york ny 10022unit states212 518 5400http://www.alcoa.comsector:\\xc2\\xa0basic materialsindustry:\\xc2\\xa0aluminumfull time employees:\\xc2\\xa014,000key executivesnametitlepayexercisedagemr

roy c. harveychief exec

officer_pres and director2.51mn a42mr

william f. oplingerchief financial_officer and exec

vp1.49mn a50mr

t\\xc3\\xb3ma m\\xc3\\xa1r sigurdssonchief operate officer and exec

vp1.12mn a48ms

leigh ann c. fisherchief admin

officer and exec

vp734.15kn a50mr

jeffrey d. heeterexec

vp_gen._counsel and sec.575.06kn a51amount be_as_of december_31_2016_and compensation_value_be for_the_last fiscal_year_end_on that_date

pay_be_salary_bonus etc

exercise_be_the_value of_option_exercise_during the_fiscal_year

currency_in usd.descriptionalcoa corporation produce and sell bauxite alumina and aluminum product

-PRON-_operate_through six segment bauxite alumina aluminum cast products energy and rolled products

the aa also_off

In [35]:
trigram_reviews_filepath = os.path.join("./",
                                        'trigram_transformed_reviews_all.txt')

In [36]:
# ways to run through all three at once

# this is a bit time consuming - make the if statement True
# if you want to execute data prep yourself.
if 0 == 0:

    with codecs.open(trigram_reviews_filepath, 'w', encoding='utf_8') as f:
        
        for parsed_review in nlp.pipe(line_review(r"C:/Users/Daniel/Documents/Yewno/total_content"),
                                      batch_size=10000, n_threads=4):
            
            # lemmatize the text, removing punctuation and whitespace
            unigram_review = [token.lemma_ for token in parsed_review
                              if not punct_space(token)]
            
            # apply the first-order and second-order phrase models
            bigram_review = bigram_model[unigram_review]
            trigram_review = trigram_model[bigram_review]
            
            # remove any remaining stopwords
            trigram_review = [term for term in trigram_review
                              if term not in spacy.en.language_data.STOP_WORDS]
            
            # write the transformed review as a line in the new file
            trigram_review = u' '.join(trigram_review)
            f.write(str(trigram_review) + '\n')



In [37]:
from gensim.corpora import Dictionary, MmCorpus
from gensim.models.ldamulticore import LdaMulticore

import pyLDAvis
import pyLDAvis.gensim
import warnings
# import cPickle as pickle
import _pickle as pickle

In [38]:
trigram_dictionary_filepath = os.path.join("./",
                                           'trigram_dict_all.dict')

In [39]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to learn the dictionary yourself.
if 0 == 0:

    trigram_reviews = LineSentence(trigram_reviews_filepath)

    # learn the dictionary by iterating over all of the reviews
    trigram_dictionary = Dictionary(trigram_reviews)
    
    # filter tokens that are very rare or too common from
    # the dictionary (filter_extremes) and reassign integer ids (compactify)
#     trigram_dictionary.filter_extremes(no_below=10, no_above=0.4)
#     trigram_dictionary.compactify()

    trigram_dictionary.save(trigram_dictionary_filepath)
    
# load the finished dictionary from disk
trigram_dictionary = Dictionary.load(trigram_dictionary_filepath)

Wall time: 46 ms


In [40]:
trigram_bow_filepath = os.path.join("./",
                                    'trigram_bow_corpus_all.mm')

In [41]:
def trigram_bow_generator(filepath):
    """
    generator function to read reviews from a file
    and yield a bag-of-words representation
    """
    
    for review in LineSentence(filepath):
        yield trigram_dictionary.doc2bow(review)

In [42]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to build the bag-of-words corpus yourself.
if 0 == 0:

    # generate bag-of-words representations for
    # all reviews and save them as a matrix
    MmCorpus.serialize(trigram_bow_filepath,
                       trigram_bow_generator(trigram_reviews_filepath))
    
# load the finished bag-of-words corpus from disk
trigram_bow_corpus = MmCorpus(trigram_bow_filepath)

Wall time: 36 ms


In [43]:
lda_model_filepath = os.path.join("./", 'lda_model_all')

In [44]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to train the LDA model yourself.
if 0 == 0:

    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        
        # workers => sets the parallelism, and should be
        # set to your number of physical cores minus one
        lda = LdaMulticore(trigram_bow_corpus,
                           num_topics=50,
                           id2word=trigram_dictionary,
                           workers=3)
    
    lda.save(lda_model_filepath)
    
# load the finished LDA model from disk
lda = LdaMulticore.load(lda_model_filepath)

Wall time: 11.9 s


In [45]:
def explore_topic(topic_number, topn=25):
    """
    accept a user-supplied topic number and
    print out a formatted list of the top terms
    """
        
    print(u'{:20} {}'.format(u'term', u'frequency') + u'\n')

    for term, frequency in lda.show_topic(topic_number, topn=25):
        print(u'{:20} {:.3f}'.format(term, round(frequency, 3)))

In [46]:
topic_names = {0: u'software',
               1: u'CAD',
               2: u'3D',
               3: u'design',
               4: u'tools',
               5: u'materials',
               6: u'plastic',
               7: u'elastomeric',
               8: u'polymeric',
               9: u'dental',
               10: u'stereolithography',
               11: u'laser',
               12: u'markets',
               13: u'company',
               14: u'multijet',
               15: u'colorjet',
               16: u'laser',
               17: u'markets',
               18: u'haptic',
               19: u'devices',
               20: u'products',
               21: u'design',
               22: u'scan',
               23: u'machining',
               24: u'proprietary',
               25: u'placement',
               26: u'queue'}

In [47]:
def get_sample_review(review_number):
    """
    retrieve a particular review index
    from the reviews file and return it
    """
    
    return list(it.islice(line_review(r"C:/Users/Daniel/Documents/Yewno/total_content"),
                          review_number, review_number+1))[0]

In [48]:
def lda_description(review_text, min_topic_freq=0.05):
    """
    accept the original text of a review and (1) parse it with spaCy,
    (2) apply text pre-proccessing steps, (3) create a bag-of-words
    representation, (4) create an LDA representation, and
    (5) print a sorted list of the top topics in the LDA representation
    """
    
    # parse the review text with spaCy
    parsed_review = nlp(review_text)
    
    # lemmatize the text and remove punctuation and whitespace
    unigram_review = [token.lemma_ for token in parsed_review
                      if not punct_space(token)]
    
    # apply the first-order and secord-order phrase models
    bigram_review = bigram_model[unigram_review]
    trigram_review = trigram_model[bigram_review]
    
    # remove any remaining stopwords
    trigram_review = [term for term in trigram_review
                      if not term in spacy.en.language_data.STOP_WORDS]
    
    # create a bag-of-words representation
    review_bow = trigram_dictionary.doc2bow(trigram_review)
    
    # create an LDA representation
    review_lda = lda[review_bow]
    
    # sort with the most highly related topics first
    review_lda = sorted(review_lda, key=lambda tup: -tup[1])
#     print(review_lda)
    for topic_number, freq in review_lda:
        if freq < min_topic_freq:
            break
#         print(str(topic_number)+':'+topic_names[topic_number])
        # print the most highly related topic names and frequencies
        print('{:25} {}'.format(topic_names[topic_number],round(freq, 3)))

In [49]:
sample_review = get_sample_review(0)

fh = open(r"C:/Users/Daniel/Documents/Yewno/company_HPQ",'rb')
sample_review = str(fh.read())
fh.close()

print(sample_review)

b'Ticker:HPQ\r\nb\'HP Inc.1501 Page Mill RoadPalo Alto, CA 94304United States650-857-1501http://www.hp.comSector:\\xc2\\xa0TechnologyIndustry:\\xc2\\xa0Diversified Computer SystemsFull Time Employees:\\xc2\\xa049,000Key ExecutivesNameTitlePayExercisedAgeMr. Dion J. WeislerChief Exec. Officer, Pres and Director3.64MN/A50Ms. Catherine A. LesjakChief Financial Officer1.9M1.8M58Mr. Jon E. FlaxmanChief Operating Officer1.55MN/A59Ms. Kim M. RiveraChief Legal Officer, Gen. Counsel and Sec.2.2MN/A48Ms. Tracy Suitt  KeoghChief HR Officer1.35M3.78M56Amounts are as of December 31, 2016 and compensation values are for the last fiscal year ending on that date. Pay is salary, bonuses, etc. Exercised is the value of options exercised during the fiscal year. Currency in USD.DescriptionHP Inc. provides products, technologies, software, solutions, and services to individual consumers, small- and medium-sized businesses, and large enterprises, including customers in the government, health, and education 

In [50]:
lda_description(sample_review)



devices                   0.997


In [51]:
LDAvis_data_filepath = os.path.join("./", 'ldavis_prepared')

In [53]:
LDAvis_prepared = pyLDAvis.gensim.prepare(lda, trigram_bow_corpus,
                                          trigram_dictionary)


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  topic_term_dists = topic_term_dists.ix[topic_order]


In [55]:
from gensim.models import Word2Vec

trigram_sentences = LineSentence(trigram_sentences_filepath)
word2vec_filepath = os.path.join("./", 'word2vec_model_all')

In [57]:
token_count = sum([len(sentence) for sentence in trigram_sentences])
token_count

16559

In [58]:
for sentence in trigram_sentences:
    print(sentence)

["b'ticker", 'aa\\r', "b\\'alcoa", 'corporation390', 'park', 'avenuenew', 'york', 'ny', '10022unit', 'states212', '518', '5400http://www.alcoa.comsector:\\\\xc2\\\\xa0basic', 'materialsindustry:\\\\xc2\\\\xa0aluminumfull', 'time', 'employees:\\\\xc2\\\\xa014,000key', 'executivesnametitlepayexercisedagemr']
['roy', 'c.', 'harveychief', 'exec']
['officer_pres', 'and', 'director2.51mn', 'a42mr']
['william', 'f.', 'oplingerchief', 'financial_officer', 'and', 'exec']
['vp1.49mn', 'a50mr']
['t\\\\xc3\\\\xb3ma', 'm\\\\xc3\\\\xa1r', 'sigurdssonchief', 'operate', 'officer', 'and', 'exec']
['vp1.12mn', 'a48ms']
['leigh', 'ann', 'c.', 'fisherchief', 'admin']
['officer', 'and', 'exec']
['vp734.15kn', 'a50mr']
['jeffrey', 'd.', 'heeterexec']
['vp_gen._counsel', 'and', 'sec.575.06kn', 'a51amount', 'be_as_of', 'december_31_2016_and', 'compensation_value_be', 'for_the_last', 'fiscal_year_end_on', 'that_date']
['pay_be_salary_bonus', 'etc']
['exercise_be_the_value', 'of_option_exercise_during', 'the_fi

In [59]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to train the word2vec model yourself.
if 0 == 0:

    # initiate the model and perform the first epoch of training
    word2vec_model = Word2Vec(trigram_sentences, size=100, window=5,
                        min_count=5, sg=1, workers=4)
    
    word2vec_model.save(word2vec_filepath)

    # perform another 11 epochs of training
    for i in range(1,12):

        word2vec_model.train(trigram_sentences, total_examples = token_count, epochs=word2vec_model.iter)
        word2vec_model.save(word2vec_filepath)
        
# load the finished model from disk
word2vec_model = Word2Vec.load(word2vec_filepath)
word2vec_model.init_sims()

print(u'{} training epochs so far.'.format(word2vec_model.train_count))

12 training epochs so far.
Wall time: 1.58 s


In [62]:
 word2vec_model.wv.most_similar(positive=['printer', 'systems','personal'], negative=['aerospace','military'])

[('product_under_the', 0.45313823223114014),
 ('brand', 0.43247243762016296),
 ('printing', 0.4308511018753052),
 ('name', 0.40668952465057373),
 ('office', 0.37219804525375366),
 ('canon', 0.3580116033554077),
 ('laser', 0.34961646795272827),
 ('hardware', 0.34627506136894226),
 ('-PRON-_offer', 0.33907806873321533),
 ('windows', 0.33583319187164307)]

In [64]:
print(u'{:,} terms in the word2vec_model vocabulary.'.format(len(word2vec_model.wv.vocab)))

689 terms in the word2vec_model vocabulary.


In [65]:
ordered_vocab = [(term, voc.index, voc.count)
                 for term, voc in word2vec_model.wv.vocab.items()]
ordered_vocab
sorted(ordered_vocab,key= lambda tup:-tup[2])

[('and', 0, 993),
 ('the', 1, 365),
 ('of', 2, 229),
 ('to', 3, 180),
 ('in', 4, 175),
 ('service', 5, 142),
 ('-PRON-', 6, 140),
 ('a', 7, 137),
 ('for', 8, 134),
 ('product', 9, 125),
 ('system', 10, 89),
 ('be', 11, 86),
 ('software', 12, 76),
 ('solution', 13, 76),
 ('inc.', 14, 70),
 ('technology', 15, 61),
 ('day', 16, 56),
 ('management', 17, 54),
 ('on', 18, 51),
 ('that', 19, 50),
 ('with', 20, 50),
 ('an', 21, 50),
 ('company', 22, 48),
 ('time', 23, 46),
 ('etc', 24, 44),
 ('score_indicate_decile_rank', 25, 44),
 ('relative_to_index_or', 26, 44),
 ('region', 27, 44),
 ('a_decile_score', 28, 44),
 ('of_1_indicate_low', 29, 44),
 ('governance_risk_while_a', 30, 44),
 ('10_indicate_high_governance', 31, 44),
 ('include', 32, 44),
 ('provide', 33, 44),
 ('corporate', 34, 42),
 ('exec', 35, 40),
 ('pay_be_salary_bonus', 36, 40),
 ('exercise_be_the_value', 37, 40),
 ('of_option_exercise_during', 38, 40),
 ('the_fiscal_year', 39, 40),
 ('currency_in', 40, 40),
 ('as_well_as', 41, 4

In [66]:
# build a list of the terms, integer indices,
# and term counts from the food2vec model vocabulary
ordered_vocab = [(term, voc.index, voc.count)
                 for term, voc in word2vec_model.wv.vocab.items()]

# sort by the term counts, so the most common terms appear first
ordered_vocab = sorted(ordered_vocab, key=lambda tup: -tup[2])

# unzip the terms, integer indices, and counts into separate lists
ordered_terms, term_indices, term_counts = zip(*ordered_vocab)

# create a DataFrame with the food2vec vectors as data,
# and the terms as row labels
word_vectors = pd.DataFrame(word2vec_model.wv.syn0norm[term_indices, :],
                            index=ordered_terms)

word_vectors

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
and,-0.023645,-0.038171,0.059789,0.041828,0.011822,-0.107963,-0.049031,-0.050972,0.017772,0.074300,...,-0.104045,-0.105604,-0.014342,-0.055892,-0.086925,-0.067978,-0.208319,-0.054081,0.010501,-0.056453
the,0.026747,0.029485,-0.004793,-0.129301,-0.105599,-0.097232,0.007952,-0.040576,0.020348,0.082273,...,0.002127,-0.111556,-0.200759,0.057812,0.163365,-0.166457,0.019825,-0.086732,-0.071309,-0.049871
of,-0.094481,-0.045131,0.052841,0.134109,-0.182718,0.048849,-0.155351,-0.126259,-0.071666,-0.124926,...,-0.028082,0.025886,-0.116702,-0.061341,-0.051326,-0.027084,0.253679,0.079986,-0.040008,0.055770
to,-0.088491,0.140661,-0.005836,0.158597,0.026034,0.019748,-0.086108,-0.076431,0.070916,-0.014278,...,0.096160,-0.057166,-0.053430,-0.114061,-0.013228,0.002547,0.034243,-0.005560,-0.007879,0.033363
in,-0.063592,-0.042531,0.070607,-0.077441,0.086212,-0.031569,-0.143613,-0.126484,-0.049489,-0.016282,...,0.165220,-0.181171,-0.112174,-0.090869,-0.043964,-0.173300,-0.186948,-0.056557,-0.198267,-0.149049
service,-0.008233,0.019200,-0.102822,-0.189926,-0.069382,0.179039,-0.035749,0.044136,0.299106,0.053393,...,-0.028068,-0.072661,-0.071732,-0.092187,-0.110507,0.021183,-0.255311,0.009922,-0.078562,-0.050558
-PRON-,-0.186677,0.036366,0.076878,-0.039332,-0.057263,-0.012324,-0.088680,-0.121334,0.051457,0.053202,...,0.060441,-0.060583,-0.127860,-0.028532,-0.046317,0.002595,0.028565,0.098720,0.057112,0.046394
a,-0.083587,0.249915,0.135552,-0.103679,-0.043210,0.066708,-0.124653,0.049875,0.047665,-0.029842,...,-0.020380,-0.174839,-0.028402,-0.208854,0.049850,0.135622,0.062706,-0.032716,-0.025759,-0.020507
for,-0.045210,-0.031958,-0.121664,-0.050387,0.019464,0.051082,-0.149423,-0.079136,0.146656,0.011970,...,-0.085231,-0.041872,0.182956,-0.035281,0.056514,0.031408,-0.023310,-0.030582,-0.101479,-0.081657
product,-0.139859,0.028789,-0.077308,-0.163328,-0.252567,-0.008697,0.024674,-0.075519,0.057329,-0.088690,...,-0.003589,0.183670,-0.137474,0.075646,-0.050822,-0.074779,-0.181057,-0.150514,0.050494,0.030095


In [67]:
def get_related_terms(token, topn=10):
    """
    look up the topn most similar terms to token
    and print them as a formatted list
    """

    for word, similarity in word2vec_model.most_similar(positive=[token], topn=topn):

        print(u'{:20} {}'.format(word, round(similarity, 3)))

In [72]:
get_related_terms(u'ddd',topn=10)

etf_with             0.599
earning_analysis     0.552
us                   0.527
xone                 0.522
exposure_to          0.52
q2                   0.513
training             0.502
systems              0.488
produce              0.486
3d_systems           0.485


In [73]:
 word2vec_model.wv.most_similar(positive=['printer', '3d','xone'], negative=['surgical'])

[('laser', 0.5837117433547974),
 ('machine', 0.5807568430900574),
 ('print', 0.5516412854194641),
 ('additive', 0.5265382528305054),
 ('ddd', 0.5244567394256592),
 ('camera', 0.5211118459701538),
 ('enhance', 0.5091466903686523),
 ('production', 0.5071998834609985),
 ('document', 0.5020123720169067),
 ('part', 0.49066609144210815)]