### Apply Keyword Extraction using Graph-Based Models

In [1]:
import os
import pandas as pd
import numpy as np
import string

import pke

import spacy
from spacy import displacy

from nltk import download
download('stopwords')
from nltk.corpus import stopwords

# Load the spaCy English pipeline used later for entity visualization.
# NOTE(review): 'en' is a model shortcut name; whether it resolves depends on the
# installed spaCy version and linked model - confirm against the environment.
nlp = spacy.load('en')
# Display the current working directory; the relative '../Output/...' paths
# used in later cells are resolved against it.
os.getcwd()

[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


'/mnt/Model'

#### Build stoplist

In [2]:
# Stop list assembled from three sources, in this order: every punctuation
# character, the bracket placeholder tokens, and NLTK's English stop words.
stoplist = [
    *string.punctuation,
    '-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-',
    *stopwords.words('english'),
]

In [3]:
def extract_key_words(text, method, n_word = 10, lda_model = '../Output/lda_model'):
    '''
    Extract the top keyphrases from a text with one of pke's graph-based models.

    input:
        text- string, the document to extract keyphrases from
        method- string key for extractor_dict: 'text rank', 'single rank',
            'position rank', 'topic rank' or 'topical page rank'
        n_word- number of keyphrases to return
        lda_model- path of the LDA model passed to candidate_weighting;
            only used when method is 'topical page rank'
    output: the n_word best keyphrases as returned by get_n_best
        (a list of (keyphrase, score) tuples in the recorded notebook output)
    raises: KeyError if method is not a key of extractor_dict
    '''
    # Map each key to the extractor class rather than to an instance, so that
    # only the requested extractor is constructed on each call.
    extractor_dict = {
        ## graph-based methods:
        'text rank': pke.unsupervised.TextRank,
        'single rank': pke.unsupervised.SingleRank,
        'position rank': pke.unsupervised.PositionRank,
        'topic rank': pke.unsupervised.TopicRank,
        'topical page rank': pke.unsupervised.TopicalPageRank
    }

    extractor = extractor_dict[method]()
    extractor.load_document(input=text, language='en', normalization='stemming')
    # keyphrase candidate selection
    # (per the original note, for TopicRank: sequences of nouns and adjectives, i.e. `(Noun|Adj)*`)
    extractor.candidate_selection()
    # candidate weighting (per the original note, for TopicRank: using a random walk algorithm)
    if method == 'topical page rank':
        # this method is additionally given a window size and the LDA model path
        extractor.candidate_weighting(window=10, lda_model=lda_model)
    else:
        extractor.candidate_weighting()

    # get key phrases
    return extractor.get_n_best(n=n_word)

#### Load Data

In [4]:
# Load the submissions table from the Excel export; later cells read its
# 'Title' and 'Description' columns.
submissions = pd.read_excel('../Output/Submissions_test.xls')

In [5]:
# Suppress all Python warnings from this point on; this affects every cell
# executed afterwards in the same kernel session.
import warnings
warnings.filterwarnings('ignore')

In [7]:
# Length (in characters) of the shortest description in the table.
min(submissions['Description'].map(len))

5

In [11]:
## To run every method, loop over:
## ['text rank', 'single rank', 'position rank', 'topic rank', 'topical page rank']
for key in ['topic rank']:
    print('Keyword extraction using key: ', key, r'...')
    # Descriptions of 20 characters or fewer are skipped and get None.
    submissions[key] = submissions['Description'].map(
        lambda description: None if len(description) <= 20
        else extract_key_words(description, key, n_word=5)
    )
## For topic rank and topical page rank, it may make more sense to build on
## the entire corpus instead of individual descriptions.

Keyword extraction using key:  topic rank ...


KeyboardInterrupt: 

In [7]:
# Write the submissions table, with whatever keyword columns were added to it, to CSV.
submissions.to_csv('../Output/Submissions_with_Keyword_Extraction_Graphical.csv')

#### Pick one for example

In [12]:
i = 15
# Show the title of the chosen submission in title case.
print(submissions.loc[i, 'Title'].title())
text = submissions.loc[i, 'Description']

#### Visualize entities
doc = nlp(text)
displacy.render(doc, style='ent', jupyter=True)

#### Print out the keywords extracted by each method
for key in ('text rank', 'single rank', 'position rank', 'topic rank', 'topical page rank'):
    print('Keyword extraction using key: ', key, r'...')
    print(extract_key_words(text, key, n_word=6), '\n')

Big Data Platform For Sharing Knowledge 


Keyword extraction using key:  text rank ...
[('big data best practices', 0.12880107991853745), ('big data platform', 0.103548256742373), ('big data usage', 0.091499655882469), ('data analytics', 0.07717878721618945), ('big data', 0.07717875721618946), ('innovative ideas relevant', 0.0750941073341677)] 

Keyword extraction using key:  single rank ...
[('big data platform', 0.24227717140470373), ('big data best practices', 0.23452620190409137), ('big data usage', 0.21806314642914218), ('big data', 0.20369496539850632), ('data analytics', 0.13579176926256342), ('fund staff', 0.0767007311381496)] 

Keyword extraction using key:  position rank ...
[('big data platform', 0.2783764464295005), ('big data usage', 0.21924817012354117), ('big data', 0.20818278673612037), ('knowledge sharing platform', 0.1461123726060911), ('data analytics', 0.12958986950394058), ('fund staff', 0.06851656755570432)] 

Keyword extraction using key:  topic rank ...
[('big data best practices', 0.15713289729980567),