## Step1: Imports

In [1]:
# Standard library.
import sys

# Third-party.
import pandas as pd

# Make the phraseextraction source directory importable before loading the
# project modules that live in it.
sys.path.append("../phraseextraction/")

# Project-local modules (resolved through the path entry added above).
import utility
import ranking
import candidate_generation

# Raise the row cap so result frames of up to 10000 rows render in full.
pd.set_option("display.max_rows", 10000)

[nltk_data] Downloading package punkt to /home/dsp/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/dsp/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Step2: Reading Data

In [2]:
# Load the 10-K Item 1 excerpt and lower-case it so later matching is
# case-insensitive. The encoding is pinned to UTF-8 so the text decodes the
# same way regardless of the platform's locale default.
with open('../dataset/AMZN_0001018724_10K_20191231_Item1_excerpt.txt', encoding='utf-8') as company_description:
    description = company_description.read().lower()

## Step3: Preprocessing: Cleaning the document

In [3]:
#1. Remove punctuation from the lower-cased description
#   (the helper's name suggests numbers are stripped too -- TODO confirm against utility.remove_punct_num)
desc = utility.remove_punct_num(description)
print(desc)

part i item business this annual report on form 10-k and the documents incorporated herein by reference contain forward-looking statements based on expectations estimates and projections as of the date of this filing actual results may differ materially from those expressed in forward-looking statements see item 1a of part i risk factors amazon com inc was incorporated in in the state of washington and reincorporated in in the state of delaware our principal corporate offices are located in seattle washington we completed our initial public offering in may and our common stock is listed on the nasdaq global select market under the symbol amzn as used herein amazon com we our and similar terms include amazon com inc and its subsidiaries unless the context indicates otherwise general we seek to be earth s most customer-centric company we are guided by four principles customer obsession rather than competitor focus passion for invention commitment to operational excellence and long-term t

In [4]:
#2. Remove stop words from the cleaned text
# Corpus-specific filler words passed to the helper as extra stop words. The
# saved output shows common words such as 'this' and 'on' being removed as
# well, so the helper presumably applies its own default list in addition to
# this one -- verify against utility.remove_stopwords.
# 'pursuant' was previously misspelled 'persuant', so the real word was never
# filtered out.
custom_stop_words = [
    'include', 'including', 'provide', 'provides', 'offers', 'related', 'new', 'allowed', 'help', 'overall', 'array', 'without', 'with', 'pursuant',
    'legacy', 'existing', 'exists', 'addition', 'additional', 'ensure', 'used', 'uses', 'process', 'offered', 'described', 'using', 'provided', 'sold',
    'allows', 'providing', 'following', 'follows', 'rate', 'labels', 'come', 'vast', 'covers', 'end', 'late', 'use', 'allowing', 'exiting', 'full', 'many',
    'includes', 'earlier', 'become', 'upon', 'allow', 'among', 'required', 'based', 'higher', 'exciting', 'seen', 'certain', 'see', 'continue', 'various',
    # Intellectual-property boilerplate terms.
    'trademarks', 'patents', 'copyrights', 'intellectual', 'patent', 'trade-secret', 'trademark', 'copyright',
]

desc = utility.remove_stopwords(desc, custom_stopword_list=custom_stop_words)
print(desc)

item business annual report form 10-k documents incorporated reference forward-looking statements expectations estimates projections date filing actual results differ materially expressed forward-looking statements item 1a risk factors amazon incorporated state washington reincorporated state delaware principal corporate offices located seattle washington completed initial public offering common stock listed nasdaq global select market symbol amzn amazon similar terms amazon subsidiaries context general seek earth customer-centric company guided principles customer obsession competitor focus passion invention commitment operational excellence long-term thinking segments serve primary customer sets consisting consumers sellers developers enterprises content creators services advertising sellers vendors publishers authors programs sponsored ads display video advertising organized operations segments north america international amazon web services aws segments reflect company evaluates bu

In [5]:
#3. Remove non-English words from the cleaned text
# NOTE: comparing the saved outputs before and after this step, tokens such as
# 'washington', 'seattle', 'nasdaq', 'amzn' and '1a' are dropped here, so
# proper nouns and tickers do not survive this filter.
desc = utility.remove_non_english(desc)
print(desc)

item business annual report form 10-k documents incorporated reference forward-looking statements expectations estimates projections date filing actual results differ materially expressed forward-looking statements item risk factors amazon incorporated state reincorporated state principal corporate offices located completed initial public offering common stock listed global select market symbol amazon similar terms amazon subsidiaries context general seek earth company guided principles customer obsession competitor focus passion invention commitment operational excellence long-term thinking segments serve primary customer sets consisting consumers sellers developers enterprises content creators services advertising sellers vendors publishers authors programs sponsored ads display video advertising organized operations segments north international amazon web services aws segments reflect company evaluates business performance manages operations information net sales contained item ii f

## Step4: Candidate Generation: NGram Based

In [6]:
# Build candidate phrases from the cleaned text with the n-gram generator.
# With the default constructor arguments the saved output is a list of
# 3-word tuples taken over consecutive words.
ngram_model = candidate_generation.Ngram_Keyphrase()
key_phrases = ngram_model.get_keyphrases(desc)
print(key_phrases)

[('item', 'business', 'annual'), ('business', 'annual', 'report'), ('annual', 'report', 'form'), ('report', 'form', '10-k'), ('form', '10-k', 'documents'), ('10-k', 'documents', 'incorporated'), ('documents', 'incorporated', 'reference'), ('incorporated', 'reference', 'forward-looking'), ('reference', 'forward-looking', 'statements'), ('forward-looking', 'statements', 'expectations'), ('statements', 'expectations', 'estimates'), ('expectations', 'estimates', 'projections'), ('estimates', 'projections', 'date'), ('projections', 'date', 'filing'), ('date', 'filing', 'actual'), ('filing', 'actual', 'results'), ('actual', 'results', 'differ'), ('results', 'differ', 'materially'), ('differ', 'materially', 'expressed'), ('materially', 'expressed', 'forward-looking'), ('expressed', 'forward-looking', 'statements'), ('forward-looking', 'statements', 'item'), ('statements', 'item', 'risk'), ('item', 'risk', 'factors'), ('risk', 'factors', 'amazon'), ('factors', 'amazon', 'incorporated'), ('amaz

## Step5: Frequency Distribution Ranking

In [7]:
# Score the candidate phrases with the frequency-distribution ranker.
# NOTE(review): the ranker is constructed from `description` (the raw,
# lower-cased text) while the candidates were generated from the cleaned
# `desc` -- confirm that ranking against the uncleaned text is intended.
freqDistRank = ranking.FrequencyDistRank(description)
keyword_candidates = freqDistRank.rank_phrases(key_phrases)

In [8]:
# Render the ranked phrases; display.max_rows was raised to 10000 in the
# imports cell, so up to that many rows are shown rather than a truncated view.
display(keyword_candidates)

Unnamed: 0,Phrases,Score
0,vice president business,0.007559
1,vice president amazon,0.007072
2,served vice president,0.006829
3,senior vice president,0.006585
4,services senior vice,0.006585
5,business senior vice,0.006342
6,vice president consumer,0.006097
7,vice president general,0.006097
8,president consumer business,0.006097
9,services served senior,0.005855
