## Step1: Imports

In [1]:
import sys
sys.path.append('../phraseextraction/')

import utility
import ranking
import pandas as pd
from candidate_generation import Rake_Keyphrase


# Let pandas show up to 500 rows before it truncates a displayed frame.
pd.options.display.max_rows = 500

[nltk_data] Downloading package punkt to /home/dsp/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/dsp/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Extra stop words handed to Rake_Keyphrase (custom_stop_words=...) when the
# candidate generator is built later in this notebook.
# Groups, in order: generic filing verbs/adjectives, then intellectual-property
# boilerplate terms.
# Fix: the original list contained the misspelling 'persuant', which can never
# match correctly spelled text; it is now 'pursuant'.
# NOTE(review): 'exiting' may be a typo of 'existing' or 'exciting' (both are
# already listed) - confirm before removing it.
custom_stop_words = [
    'include', 'including', 'provide', 'provides', 'offers', 'related', 'new',
    'allowed', 'help', 'overall', 'array', 'without', 'with', 'pursuant',
    'legacy', 'existing', 'exists', 'addition', 'additional', 'ensure', 'used',
    'uses', 'process', 'offered', 'described', 'using', 'provided', 'sold',
    'allows', 'providing', 'following', 'follows', 'rate', 'labels', 'come',
    'vast', 'covers', 'end', 'late', 'use', 'allowing', 'exiting', 'full', 'many',
    'includes', 'earlier', 'become', 'upon', 'allow', 'among', 'required',
    'based', 'higher', 'exciting', 'seen', 'certain', 'see', 'continue', 'various',
    # Intellectual-property boilerplate
    'trademarks', 'patents', 'copyrights', 'intellectual', 'patent',
    'trade-secret', 'trademark', 'copyright',
]

## Step2: Read Data

In [3]:
# Load the Item 1 (business description) excerpt of the AMZN 10-K filing.
# The encoding is pinned to UTF-8 because the text contains curly quotes and
# em dashes; without it, open() decodes with the platform's locale encoding
# and the read can fail or produce mojibake on non-UTF-8 systems.
with open('../dataset/AMZN_0001018724_10K_20191231_Item1_excerpt.txt', encoding='utf-8') as description_file:
    description = description_file.read()

## Step3: Pre-processing the Data

In [4]:
# 1. Strip named entities from the raw filing text.
# The labels below are the entity categories to drop; they look like
# spaCy/OntoNotes entity types - confirm against utility.remove_named_entities.
ent_list = [
    'DATE', 'GPE', 'PERSON',
    'CARDINAL', 'ORDINAL', 'LAW',
    'LOC', 'PERCENT', 'QUANTITY',
]
# Rebinds `description`; re-running this cell operates on already-cleaned text.
description = utility.remove_named_entities(description, ent_list)
print(description)

PART I

Item .Business

This Annual Report on Form 0-K and the documents incorporated herein by reference contain forward-looking statements based on expectations, estimates, and projections as of the date of this filing. Actual results may differ materially from those expressed in forward-looking statements. See Item  of Part I — “Risk Factors.”

Amazon.com, Inc. was incorporated in  in the state of  and reincorporated in  in the state of . Our principal corporate offices are located in , . We completed our initial public offering in  and our common stock is listed on the Nasdaq Global Select Market under the symbol “AMZN.”

As used herein, “Amazon.com,” “we,” “our,” and similar terms include Amazon.com, Inc. and its subsidiaries, unless the context indicates otherwise.

General

We seek to be ’s most customer-centric company. We are guided by  principles: customer obsession rather than competitor focus, passion for invention, commitment to operational excellence, and long-term thinki

## Step4: Candidate Generation: Stopword-based splitting

In [5]:
# 2. Remove punctuation and numbers.
# utility.remove_punct_num is a project helper; judging by the printed result,
# it flattens the text onto one line and drops punctuation, while hyphenated
# tokens such as "forward-looking" and "0-K" survive - verify in utility.
# Rebinds `description`, so this cell consumes the output of the previous step.
description = utility.remove_punct_num(description)
print(description)

PART I Item Business This Annual Report on Form 0-K and the documents incorporated herein by reference contain forward-looking statements based on expectations estimates and projections as of the date of this filing Actual results may differ materially from those expressed in forward-looking statements See Item of Part I Risk Factors Amazon com Inc was incorporated in in the state of and reincorporated in in the state of Our principal corporate offices are located in We completed our initial public offering in and our common stock is listed on the Nasdaq Global Select Market under the symbol AMZN As used herein Amazon com we our and similar terms include Amazon com Inc and its subsidiaries unless the context indicates otherwise General We seek to be s most customer-centric company We are guided by principles customer obsession rather than competitor focus passion for invention commitment to operational excellence and long-term thinking In each of our segments we serve our primary custo

In [6]:
# 3. Remove non-English words.
# utility.remove_non_english is a project helper; in the printed result tokens
# such as "Nasdaq", "AMZN" and "customer-centric" are gone - presumably a
# dictionary-based filter, verify in utility.
# Rebinds `description`, so this cell consumes the output of the previous step.
description = utility.remove_non_english(description)
print(description)

PART I Item Business This Annual Report on Form 0-K and the documents incorporated herein by reference contain forward-looking statements based on expectations estimates and projections as of the date of this filing Actual results may differ materially from those expressed in forward-looking statements See Item of Part I Risk Factors Amazon com Inc was incorporated in in the state of and reincorporated in in the state of Our principal corporate offices are located in We completed our initial public offering in and our common stock is listed on the Global Select Market under the symbol As used herein Amazon com we our and similar terms include Amazon com Inc and its subsidiaries unless the context indicates otherwise General We seek to be s most company We are guided by principles customer obsession rather than competitor focus passion for invention commitment to operational excellence and long-term thinking In each of our segments we serve our primary customer sets consisting of consum

In [7]:
# Build the RAKE candidate generator with the notebook's extra stop words.
# ngram_=(2, 4) presumably bounds candidate length to 2-4 words (the printed
# phrases are consistent with that) - confirm in candidate_generation.
rake_model = Rake_Keyphrase(
    ngram_=(2, 4),
    custom_stop_words=custom_stop_words,
)

# Extract candidate key phrases from the cleaned description text.
phrases = rake_model.get_keyphrases(description)
print(phrases)

['item business', 'annual report', 'form 0-', 'documents incorporated', 'expectations estimates', 'filing actual results', 'differ materially', 'risk factors amazon', 'principal corporate offices', 'initial public offering', 'common stock', 'global select market', 'similar terms', 'principles customer obsession', 'competitor focus passion', 'invention commitment', 'operational excellence', 'long-term thinking', 'primary customer sets', 'consumers sellers developers', 'content creators', 'sellers vendors publishers', 'sponsored ads display', 'video advertising', 'segments international', 'amazon web services', 'segments reflect', 'company evaluates', 'business performance', 'operations information', 'net sales', 'ii financial statements', 'supplementary data note', 'segment information', 'financial results', 'foods market', 'foods market', 'consolidated financial statements', 'august consumers', 'serve consumers', 'physical stores', 'selection price', 'unique products', 'product categor

## Step5: Rake|Degree Ranking
* Degree
* Rake Score

In [8]:
# Score the candidate phrases with the 'degree' ranking method.
rake_rank = ranking.RakeRank(method='degree')
keyword_candidates = rake_rank.rank_phrases(phrases)

# Rich-render the ranked result (a table with Phrases and Score columns).
display(keyword_candidates)

Unnamed: 0,Phrases,Score
0,senior vice president,76
1,vice president worldwide,66
2,vice president general,66
3,president consumer business,64
4,june vice president,61
5,vice president,58
6,april senior vice,58
7,amazon web services,53
8,global consumer business,46
9,ceo amazon web,42
