## Step 1: Imports

In [2]:
import sys
sys.path.append('../phraseextraction/')

import utility
import ranking
import pandas as pd
from rule import grammar
import candidate_generation

## Settings
# Raise the pandas display limit so DataFrames shown below print up to 500 rows
# before being truncated.
pd.set_option('display.max_rows', 500)

[nltk_data] Downloading package punkt to /home/dsp/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/dsp/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Step 2: Reading Data

In [3]:
# Load the text excerpt (file name indicates Item 1 of the AMZN 10-K for 2019-12-31)
# that the rest of the notebook mines for key phrases.
# The encoding is passed explicitly so the file decodes the same way on every
# platform instead of depending on the locale's default encoding.
with open('../dataset/AMZN_0001018724_10K_20191231_Item1_excerpt.txt', encoding='utf-8') as company_description:
    description = company_description.read()

## Step 3: Candidate Generation: Grammar Rules Based

In [4]:
# Build the grammar-rule based extractor from the `grammar` imported from rule.py,
# then extract candidate key phrases from the raw description text.
grammar_model = candidate_generation.Grammar_Keyphrase(grammar)
key_phrases = grammar_model.get_keyphrases(description)
# Print the raw candidates so they can be compared with the cleaned versions below.
print(key_phrases)

['herein by reference contain forward-looking', 'statements based on expectations', 'expressed in forward-looking statements', 'principal corporate offices are located', 'common stock is listed', 'similar terms include', 's most customer-centric company', 'competitor focus , passion for invention', 'customer sets , consisting', 'consumers , sellers , developers', 'services , such as advertising', 'sellers , vendors , publishers', 'sponsored ads , display', 'segments reflect the way', 'business performance and manages', 'net sales is contained', 'results of Whole Foods', 'online and physical stores', 'focus on selection , price', 'hundreds of millions of unique', 'third parties across dozens', 'websites , mobile apps', 'sell electronic devices', 'produce media content', 'customers low prices', 'delivery , easy-to-use functionality', 'includes unlimited free shipping', 'unlimited streaming of tens', 'movies and TV episodes , including', 'fulfill customer orders', 'number of ways , includ

## Step 4: Post Processing: Cleaning candidate phrases

### Remove Punctuation and Digits

In [5]:
#1. Clean each candidate phrase with utility.remove_punct_num
#   (punctuation and digit removal, per the section heading).
phrases_wo_puncts = list(map(utility.remove_punct_num, key_phrases))
print(phrases_wo_puncts)

['herein by reference contain forward-looking', 'statements based on expectations', 'expressed in forward-looking statements', 'principal corporate offices are located', 'common stock is listed', 'similar terms include', 's most customer-centric company', 'competitor focus passion for invention', 'customer sets consisting', 'consumers sellers developers', 'services such as advertising', 'sellers vendors publishers', 'sponsored ads display', 'segments reflect the way', 'business performance and manages', 'net sales is contained', 'results of Whole Foods', 'online and physical stores', 'focus on selection price', 'hundreds of millions of unique', 'third parties across dozens', 'websites mobile apps', 'sell electronic devices', 'produce media content', 'customers low prices', 'delivery easy-to-use functionality', 'includes unlimited free shipping', 'unlimited streaming of tens', 'movies and TV episodes including', 'fulfill customer orders', 'number of ways including', 'fulfillment and del

### Remove Stopwords

In [6]:
#2. Remove stop words from phrases

# Domain-specific filler words passed to utility.remove_stopwords for every phrase.
# NOTE(review): the helper presumably combines these with a standard stop-word list
# (the printed output also drops words such as 'by' and 'on') — confirm in utility.py.
# 'pursuant' was previously misspelled as 'persuant', so it could never match.
custom_stop_words = ['include','including','provide','provides','offers','related','new', 'allowed','help','overall','array','without','with','pursuant',
                 'legacy','existing','exists','addition','additional','ensure','used','uses','process','offered','described','using','provided','sold',
                 'allows','providing','following','follows','rate','labels','come','vast','covers','end','late','use','allowing','exiting','full','many',
                'includes','earlier','become','upon','allow','among','required','based','higher','exciting','seen','certain','see','continue','various',
                'trademarks','patents','copyrights','intellectual','patent','trade-secret','trademark','copyright']

# Apply the filter to the punctuation-free phrases from the previous step.
phrases_wo_stopwords = [utility.remove_stopwords(ph, custom_stop_words) for ph in phrases_wo_puncts]
print(phrases_wo_stopwords)

['reference forward-looking', 'statements expectations', 'expressed forward-looking statements', 'principal corporate offices located', 'common stock listed', 'similar terms', 'customer-centric company', 'competitor focus passion invention', 'customer sets consisting', 'consumers sellers developers', 'services advertising', 'sellers vendors publishers', 'sponsored ads display', 'segments reflect', 'business performance manages', 'net sales contained', 'results Whole Foods', 'online physical stores', 'focus selection price', 'hundreds millions unique', 'parties dozens', 'websites mobile apps', 'sell electronic devices', 'produce media content', 'customers low prices', 'delivery easy-to-use functionality', 'unlimited free shipping', 'unlimited streaming tens', 'movies TV episodes', 'fulfill customer orders', 'number ways', 'fulfillment delivery networks', 'arrangements countries', 'operate customer service centers', 'supplemented co-sourced arrangements', 'percentage sales per-unit', 'ac

### Remove Non-English Words

In [7]:
#3. Pass every stopword-free phrase through utility.remove_non_english
#   to drop the words it does not recognise as English.
phrases_wo_nonenglish_words = list(map(utility.remove_non_english, phrases_wo_stopwords))
print(phrases_wo_nonenglish_words)

['reference forward-looking', 'statements expectations', 'expressed forward-looking statements', 'principal corporate offices located', 'common stock listed', 'similar terms', 'company', 'competitor focus passion invention', 'customer sets consisting', 'consumers sellers developers', 'services advertising', 'sellers vendors publishers', 'sponsored ads display', 'segments reflect', 'business performance manages', 'net sales contained', 'results Whole Foods', 'online physical stores', 'focus selection price', 'hundreds millions unique', 'parties dozens', 'websites mobile apps', 'sell electronic devices', 'produce media content', 'customers low prices', 'delivery easy-to-use functionality', 'unlimited free shipping', 'unlimited streaming tens', 'movies TV episodes', 'fulfill customer orders', 'number ways', 'fulfillment delivery networks', 'arrangements countries', 'operate customer service centers', 'supplemented co-sourced arrangements', 'percentage sales per-unit', 'activity fees', 'se

### Only keep Trigrams & Quadgrams

In [8]:
#4. Only keep trigrams and quadgrams (bigrams and shorter phrases are dropped).
# NOTE(review): the (3,4) argument appears to be the inclusive word-count range
# passed to get_ngrams — confirm against candidate_generation.py.
final_phrases = candidate_generation.get_ngrams(phrases_wo_nonenglish_words, (3,4))
print(final_phrases)

['expressed forward-looking statements', 'principal corporate offices located', 'common stock listed', 'competitor focus passion invention', 'customer sets consisting', 'consumers sellers developers', 'sellers vendors publishers', 'sponsored ads display', 'business performance manages', 'net sales contained', 'results Whole Foods', 'online physical stores', 'focus selection price', 'hundreds millions unique', 'websites mobile apps', 'sell electronic devices', 'produce media content', 'customers low prices', 'delivery easy-to-use functionality', 'unlimited free shipping', 'unlimited streaming tens', 'movies TV episodes', 'fulfill customer orders', 'fulfillment delivery networks', 'operate customer service centers', 'supplemented co-sourced arrangements', 'percentage sales per-unit', 'serve developers enterprises', 'set global compute', 'authors independent publishers', 'authors publishers choose', 'filmmakers skill app developers', 'large variety product types', 'retailers publishers ve

## Step 5: Degree Ranking

In [9]:
# Score the cleaned candidate phrases with RakeRank configured for the 'degree' method.
# NOTE(review): rank_phrases appears to return a DataFrame of phrases and scores
# (see the table displayed in the next cell) — confirm in ranking.py.
rakeRank = ranking.RakeRank(method='degree')
ranked_df = rakeRank.rank_phrases(final_phrases)

In [10]:
# Show the ranked phrases and their scores using the notebook's rich display.
display(ranked_df)

Unnamed: 0,Phrases,Score
0,information technology services products,34
1,fulfillment logistics services,28
2,business combinations alliances strengthen,28
3,technology infrastructure fulfillment,25
4,customers greater brand recognition,25
5,fulfillment delivery networks,24
6,sellers vendors publishers,24
7,filmmakers skill app developers,22
8,operate customer service centers,22
9,potential customers restrictive,22
