### Apply Keyword Extraction using Graph-Based Models

In [1]:
import os
import pandas as pd
import numpy as np
import string

import pke

import spacy
from spacy import displacy

from nltk import download
download('stopwords')
from nltk.corpus import stopwords

# Load the spaCy English pipeline; `nlp` is used further down to render named entities.
# NOTE(review): 'en' looks like a shortcut model name — confirm it is accepted by the installed spaCy version.
nlp = spacy.load('en')
# Display the working directory, since the data paths used below are relative ('../Output/...').
os.getcwd()

[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


'/mnt/Model'

#### Build stoplist

In [2]:
# Tokens to ignore, in order: every punctuation character, bracket placeholder
# tokens (presumably Penn Treebank escapes — confirm), and NLTK's English stopwords.
stoplist = [
    *string.punctuation,
    '-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-',
    *stopwords.words('english'),
]

In [3]:
def extract_key_words(text, method, n_word = 10):
    '''
    Extract the top-ranked keyphrases from a text with one of pke's graph-based models.

    input:
        text   - string, the document to extract keyphrases from
        method - string key selecting the extractor: 'text rank', 'single rank',
                 'position rank', 'topic rank' or 'topical page rank'
        n_word - int, number of best keyphrases requested from the extractor (default 10)
    output: whatever the extractor's get_n_best(n=n_word) returns; in this notebook's
            output that is a list of (keyphrase, score) tuples
    raises: KeyError if method is not one of the supported keys
    '''
    # Map method names to extractor *classes* so that only the requested
    # extractor is instantiated (this function is called once per document).
    extractor_classes = {
        ## graph-based methods:
        'text rank': pke.unsupervised.TextRank,
        'single rank': pke.unsupervised.SingleRank,
        'position rank': pke.unsupervised.PositionRank,
        'topic rank': pke.unsupervised.TopicRank,
        'topical page rank': pke.unsupervised.TopicalPageRank
    }

    try:
        extractor_class = extractor_classes[method]
    except KeyError:
        # Same exception type as a plain dict lookup, but with an actionable message.
        raise KeyError('Unknown method {!r}; choose from {}'.format(
            method, sorted(extractor_classes))) from None

    extractor = extractor_class()
    extractor.load_document(input=text, language='en', normalization='stemming')

    # keyphrase candidate selection (each extractor's default settings)
    extractor.candidate_selection()

    # candidate weighting: topical page rank additionally needs a window size and
    # the path to an LDA model; every other method uses its defaults
    if method == 'topical page rank':
        extractor.candidate_weighting(window=10, lda_model='../Output/lda_model')
    else:
        extractor.candidate_weighting()

    # keep only the n_word best-scoring keyphrases
    return extractor.get_n_best(n=n_word)

#### Load Data

In [4]:
# Read the submissions sheet into a DataFrame; later cells use its 'Description' and 'Title' columns.
submissions = pd.read_excel('../Output/Submissions_test.xls')

In [5]:
import warnings
# Suppress all warnings from here on (presumably to keep the cell output readable).
warnings.filterwarnings('ignore')

In [6]:
# Length, in characters, of the shortest description in the data set.
min(len(description) for description in submissions.Description)

5

In [9]:
# Extraction method to run. It must be a key of `extractor_dict` in `extract_key_words`:
# 'text rank', 'single rank', 'position rank', 'topic rank' or 'topical page rank'.
# The value is also embedded in the name of the CSV file written below.
method = 'single rank'

In [8]:

# `method` may be a single method name or a list/tuple of names. Iterating a
# bare string would loop over its characters ('s', 'i', 'n', ...) and make the
# extractor lookup fail, so a lone name is wrapped in a list first.
methods = [method] if isinstance(method, str) else list(method)

for key in methods:
    print('Keyword extraction using key: ', key, r'...')
    # One new column per method; descriptions of 20 characters or fewer are
    # skipped and get None instead of a keyphrase list.
    submissions[key] = submissions.Description.apply(
        lambda x: extract_key_words(x, key, n_word=5) if len(x) > 20 else None
    )
## For topic rank and topical page rank, it may make more sense to build on the entire corpus instead of individual descriptions.



Keyword extraction using key:  single rank ...




In [10]:
# Write the frame, including the extracted keyword column(s), to CSV; the method name is embedded in the file name.
submissions.to_csv('../Output/Submissions_with_Keyword_Extraction_Graphical_{}.csv'.format(method))

In [11]:
# Preview the first rows; the keyword column added by the extraction loop appears last.
submissions.head()

Unnamed: 0,Webstorm,Webstorm ID,Code,Category,Submitted,Submitter,Submitter email,Title,Description,Status,...,Position,Work History,Expert Skills,Photo,Submitted From,External ID,Internal_External,Title_modified,Team_Submission,single rank
0,IMF Knowledge Sharing Challenge,DC8A2946-5EFF-481B-949B-91D314A100EB,D194,Visualization,11/11/2017 02:20,Team - Damon Hanlan,DHanlan@imf.org,IMF Explained,This project aims to promote wider education a...,Under Review,...,,,,,WebStorm,DHanlan@imf.org,External,explain imf,No,"[(imf knowledge material, 0.07005838508891428)..."
1,IMF Knowledge Sharing Challenge,DC8A2946-5EFF-481B-949B-91D314A100EB,D193,Visualization,11/11/2017 00:30,Reem Disu,RDisu@imf.org,iCreativity Bubble - A visual space to explor...,\tiCreativity Bubble\n\nThis idea is an engagi...,Under Review,...,,,,,WebStorm,RDisu@imf.org,External,bubbl explor icr share space visual,Yes,"[(their own bubbles, 0.18881243038625825), (th..."
2,IMF Knowledge Sharing Challenge,DC8A2946-5EFF-481B-949B-91D314A100EB,D191,Other,11/10/2017 20:26,Team - Sue Hopkins,SHopkins@imf.org,QR Codes for Fund-wide Document Management and...,The Office of the Managing Director is uniquel...,Under Review,...,,,,,WebStorm,SHopkins@imf.org,External,analyt code document fundwid manag qr,No,"[(wide document management, 0.1149812780104402..."
3,IMF Knowledge Sharing Challenge,DC8A2946-5EFF-481B-949B-91D314A100EB,D190,Other,11/10/2017 18:39,Team - Cosette Wong,CWong@imf.org,Knowledge librarians,We propose the creation of a knowledge librari...,Under Review,...,,,,,WebStorm,CWong@imf.org,External,knowledg librarian,No,"[(knowledge management unit, 0.190070333511578..."
4,IMF Knowledge Sharing Challenge,DC8A2946-5EFF-481B-949B-91D314A100EB,D189,Other,11/10/2017 18:35,Chenju Chakravarthy,CChakravarthy@imf.org,Intranet Data portal,Recently worked on a project to mine fungible ...,Under Review,...,,,,,WebStorm,CChakravarthy@imf.org,External,data intranet portal,Yes,"[(fungible economist skills data, 0.1806118050..."


#### Pick one submission as an example

In [None]:
# Row label of the submission used as the worked example
i = 16
text = submissions.loc[i, 'Description']
print(str.title(submissions.loc[i, 'Title']))

#### Visualize named entities
doc = nlp(text)
displacy.render(doc, style='ent', jupyter=True)

#### Print the keywords extracted by each method
for key in ('text rank', 'single rank', 'position rank', 'topic rank', 'topical page rank'):
    print('Keyword extraction using key: ', key, r'...')
    print(extract_key_words(text, key, n_word=6), '\n')