## Step1: Imports

In [1]:
import sys
sys.path.append('../phraseextraction/')

import utility
import ranking
import pandas as pd
from rule import grammar
from ranking import TextRank
from candidate_generation import Rake_Keyphrase



# Notebook display settings: raise the pandas row limit so long phrase
# tables are shown in full rather than truncated.
pd.options.display.max_rows = 500

[nltk_data] Downloading package punkt to /home/dsp/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/dsp/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Extra stop words for this corpus; the list is passed to Rake_Keyphrase as
# `custom_stop_words` when candidate phrases are generated.
# Fix: 'persuant' was a misspelling of 'pursuant', so the correctly spelled
# word was never filtered.
custom_stop_words = [
    # generic verbs / filler
    'include', 'including', 'provide', 'provides', 'offers', 'related', 'new',
    'allowed', 'help', 'overall', 'array', 'without', 'with', 'pursuant',
    'legacy', 'existing', 'exists', 'addition', 'additional', 'ensure', 'used',
    'uses', 'process', 'offered', 'described', 'using', 'provided', 'sold',
    'allows', 'providing', 'following', 'follows', 'rate', 'labels', 'come',
    'vast', 'covers', 'end', 'late', 'use', 'allowing', 'exiting', 'full', 'many',
    'includes', 'earlier', 'become', 'upon', 'allow', 'among', 'required',
    'based', 'higher', 'exciting', 'seen', 'certain', 'see', 'continue', 'various',
    # intellectual-property boilerplate
    'trademarks', 'patents', 'copyrights', 'intellectual', 'patent',
    'trade-secret', 'trademark', 'copyright',
]

## Step2: Read Data

In [3]:
# Read the business-description excerpt (Item 1) of the 10-K filing into a
# single string.
# The encoding is pinned so the text decodes the same way on every platform
# instead of depending on the locale default.
# NOTE(review): assumes the excerpt file is UTF-8 encoded; confirm.
with open('../dataset/AMZN_0001018724_10K_20191231_Item1_excerpt.txt', encoding='utf-8') as company_description:
    description = company_description.read()


## Step3: Pre-processing the Data

In [4]:
# Pre-processing: remove named entities.
# `ent_list` holds the entity labels handed to utility.remove_named_entities.
# Note that `description` is overwritten with the result, so re-running this
# cell operates on already-processed text.
ent_list = [
    'DATE', 'GPE', 'PERSON', 'CARDINAL', 'ORDINAL',
    'LAW', 'LOC', 'PERCENT', 'QUANTITY',
]
description = utility.remove_named_entities(description, ent_list)

## Step4: Candidate Generation: Stopword-based splitting

In [5]:
# Pre-processing (continued): remove punctuation and numbers.
# `description` is overwritten with the cleaned text.
description = utility.remove_punct_num(description)
# Uncomment to inspect the intermediate text:
#print(description)

In [6]:
# Pre-processing (continued): remove non-English words, then print the fully
# cleaned text, which is the input to candidate generation below.
description = utility.remove_non_english(description)
print(description)

PART I Item Business This Annual Report on Form 0-K and the documents incorporated herein by reference contain forward-looking statements based on expectations estimates and projections as of the date of this filing Actual results may differ materially from those expressed in forward-looking statements See Item of Part I Risk Factors Amazon com Inc was incorporated in in the state of and reincorporated in in the state of Our principal corporate offices are located in We completed our initial public offering in and our common stock is listed on the Global Select Market under the symbol As used herein Amazon com we our and similar terms include Amazon com Inc and its subsidiaries unless the context indicates otherwise General We seek to be s most company We are guided by principles customer obsession rather than competitor focus passion for invention commitment to operational excellence and long-term thinking In each of our segments we serve our primary customer sets consisting of consum

In [7]:
# Generate candidate keyphrases from the cleaned description with the RAKE
# generator, using the custom stop-word list defined earlier.
# NOTE(review): ngram_=(2, 4) presumably bounds phrase length to 2-4 words;
# confirm against candidate_generation.Rake_Keyphrase.
rake_model = Rake_Keyphrase(
    ngram_=(2, 4),
    custom_stop_words=custom_stop_words,
)
phrases = rake_model.get_keyphrases(description)
print(phrases)

['item business', 'annual report', 'form 0-', 'documents incorporated', 'expectations estimates', 'filing actual results', 'differ materially', 'risk factors amazon', 'principal corporate offices', 'initial public offering', 'common stock', 'global select market', 'similar terms', 'principles customer obsession', 'competitor focus passion', 'invention commitment', 'operational excellence', 'long-term thinking', 'primary customer sets', 'consumers sellers developers', 'content creators', 'sellers vendors publishers', 'sponsored ads display', 'video advertising', 'segments international', 'amazon web services', 'segments reflect', 'company evaluates', 'business performance', 'operations information', 'net sales', 'ii financial statements', 'supplementary data note', 'segment information', 'financial results', 'foods market', 'foods market', 'consolidated financial statements', 'august consumers', 'serve consumers', 'physical stores', 'selection price', 'unique products', 'product categor

## Step5: TextRank Scoring


### Score based on Word Embeddings Method


In [8]:
# Rank the candidate phrases with TextRank configured for the
# "WordEmbeddings" method. TextRank is the class imported from `ranking`
# at the top of the notebook.
TR_WordEmbedding = TextRank(method="WordEmbeddings")
WE_Phrases = TR_WordEmbedding.rank_phrases(phrases)

In [9]:
# Show the phrases and scores produced by the WordEmbeddings ranking.
display(WE_Phrases)

Unnamed: 0,Phrases,Score
0,product types service,0.005311
1,services companies,0.005279
2,business additionally,0.005222
3,change business practices,0.005179
4,information technology services,0.005135
5,global consumer business,0.005084
6,president consumer business,0.005025
7,product categories customers,0.005025
8,initial public offering,0.005023
9,supply direct consumers,0.00502


### Score based on Window Size Method

In [10]:
# Rank the same candidate phrases with TextRank configured for the
# "WindowSize" method; this variant is also given the cleaned description
# as `original_text`. TextRank is the class imported from `ranking` at the
# top of the notebook.
TR_WindowSize = TextRank(
    original_text=description,
    method="WindowSize",
)
WS_Phrases = TR_WindowSize.rank_phrases(phrases)

In [11]:
# Show the phrases and scores produced by the WindowSize ranking.
display(WS_Phrases)

Unnamed: 0,Phrases,Score
0,operate customer service,0.072652
1,timely customer service,0.071752
2,information technology services,0.070421
3,services companies,0.068658
4,product types service,0.065722
5,online service,0.063816
6,businesses sell,0.063139
7,enterprise services,0.061745
8,service marks,0.060242
9,serve consumers,0.059732
