In [None]:
exec('from __future__ import unicode_literals')

import os
import sys
import random
import json

# Make the parent directory and its `onmt` sub-directory importable, adding
# each to sys.path only if it is not already present.
for relative_dir in ('../', '../onmt'):
    module_path = os.path.abspath(relative_dir)
    if module_path not in sys.path:
        sys.path.append(module_path)

from itertools import repeat

from onmt.utils.logging import init_logger
from onmt.utils.misc import split_corpus
import onmt.translate.translator as translator

import onmt.opts as opts
from onmt.utils.parse import ArgumentParser
from kp_gen_eval import _get_parser


from nltk.corpus import stopwords
stoplist = stopwords.words('english')

from string import punctuation
import onmt.keyphrase.pke as pke
from nltk.corpus import stopwords

import onmt.keyphrase.kp_inference as kp_inference

In [None]:
import importlib

# Pick up on-disk edits to the project modules without restarting the kernel.
importlib.reload(kp_inference)
# Look the translator module up by its import path instead of through the
# notebook-level name `translator`: the model-building cell rebinds that name
# to the object returned by build_translator(), after which
# importlib.reload(translator) raises TypeError (argument must be a module).
importlib.reload(sys.modules['onmt.translate.translator'])

### Load a text (assume current directory is OpenNMT-kpg/notebook/)

In [None]:
# The file is read as JSON Lines: every non-empty line is parsed as one document.
data_root_path = '../data/keyphrase/json/duc/duc_test.json'
# Reset first so a failed load cannot leave a previous run's documents behind.
doc_dicts = []
# Decode explicitly as UTF-8 rather than relying on the platform's default encoding.
with open(data_root_path, 'r', encoding='utf-8') as data_file:
    # Skip blank lines (e.g. a trailing newline), which json.loads would reject.
    doc_dicts = [json.loads(line) for line in data_file if line.strip()]

print('Loaded #(docs)=%d' % (len(doc_dicts)))

##### Sample a paragraph

In [None]:
# Pick one document uniformly at random.
# random.randint includes both endpoints, so randint(0, len(doc_dicts)) could
# return len(doc_dicts) and raise IndexError; randrange excludes the upper bound.
doc_id = random.randrange(len(doc_dicts))
doc = doc_dicts[doc_id]
print(doc.keys())
# The 'abstract' field is the text handed to every extractor below.
text_to_extract = doc['abstract']
print(doc_id)
print(text_to_extract)


### Supervised Deep Keyphrase Model

In [18]:
parser = _get_parser()
# Resolve paths relative to the notebook directory (OpenNMT-kpg/notebook/)
# instead of hard-coding one machine's absolute location.
config_path = os.path.abspath('../config/translate/config-rnn-keyphrase.yml')
print(config_path)
print(os.path.exists(config_path))
# Alternative (one2one) checkpoint, kept for reference:
# one2one_ckpt_path = 'models/keyphrase/keyphrase/meng17-one2seq/meng17-one2seq-kp20k/kp20k-meng17-one2one-rnn-BS128-LR0.05-Layer1-Dim150-Emb100-Dropout0.0-Copytrue-Covfalse-Contboth-IF1_step_30000.pt'
one2seq_ckpt_path = os.path.abspath('../models/keyphrase/meng17-one2seq/meng17-one2seq-kp20k/kp20k-meng17-verbatim_append-rnn-BS64-LR0.05-Layer1-Dim150-Emb100-Dropout0.0-Copytrue-Reusetrue-Covtrue-PEfalse-Contboth-IF1_step_50000.pt')
# Pass the arguments as a list so a path containing spaces is not split apart.
opt = parser.parse_args(['-config', config_path])
opt.models = [one2seq_ckpt_path]

# Import the module under a separate alias: this cell binds the name
# `translator` to the built translator object (the next cell relies on that),
# so calling translator.build_translator() directly would fail with
# AttributeError the second time the cell is run.
import onmt.translate.translator as translator_module
translator = translator_module.build_translator(opt, report_score=False)

/zfs1/pbrusilovsky/rum20/kp/OpenNMT-kpg/config/translate/config-rnn-keyphrase.yml
True


In [23]:
# Run the model on the sampled paragraph, passed as a one-element source list.
scores, predictions = translator.translate(
    src=[text_to_extract], tgt=None, src_dir=opt.src_dir,
    batch_size=opt.batch_size, attn_debug=opt.attn_debug, opt=opt,
)
print('Paragraph:\n\t' + text_to_extract)
print('Top predictions:')
# Lower-case and strip every prediction for the first (only) source text, then
# drop stopwords, the literal '<unk>' token and phrases that are empty once
# stripped.
keyphrases = [
    cleaned
    for raw, cleaned in ((pred, pred.lower().strip()) for pred in predictions[0])
    if cleaned not in stoplist and raw != '<unk>' and len(raw.strip()) > 0
]
# Show at most the first 20 surviving phrases, numbered from 1.
for kp_id, kp in enumerate(keyphrases[:20]):
    print('\t%d: %s' % (kp_id + 1, kp))

Translating 10/1
Total translation time (s): 0.613762
Average translation time (s): 0.613762
Tokens per second: 1.629296
Paragraph:
	Elizabeth Taylor will remain in the hospital six more weeks due to complications in her fifth week of treatment for pneumonia, doctors said. The recovery of Miss Taylor, near death two weeks ago with viral pneumonia, was dealt a setback by bacterial pneumonia and a yeast infection, her doctors said Friday. ``This secondary bacterial pneumonia often follows viral pneumonia. Her condition is listed as stable and she is improving significantly,'' they said in a statement released by St. John's Hospital and Health Center. Earlier this week Miss Taylor's New York publicist, Chen Sam, had said the 58-year-old actress was improving and would be released from the hospital this week to recuperate at home. During a news conference last month, Miss Taylor's doctors revealed she was near death on April 22. The Oscar-winning star of ``Who's Afraid of Virginia Woolf?''

### PKE models

#### TF-IDF

In [None]:
# TF-IDF needs document-frequency counts; they are read from
# `<dataset_name>.df.tsv.gz`, located one level above the dataset directory.
dataset_name = 'test'
dataset_path = '../data/%s/' % dataset_name
# The result is bound to `_` so the cell does not echo it as its output.
_ = kp_inference.extract_pke(
    text_to_extract,
    method='tfidf',
    dataset_path=dataset_path,
    df_path=os.path.abspath(dataset_path + '../%s.df.tsv.gz' % dataset_name),
    top_k=20,
)

#### YAKE

In [4]:
# Run YAKE on the sampled paragraph, asking for 20 phrases; the result is
# bound to `_` so the cell does not echo it as its output.
_ = kp_inference.extract_pke(
    text_to_extract,
    method='yake',
    top_k=20,
)

	1: called business risk (0.0331)
	2: financial goals (0.0332)
	3: called business (0.0632)
	4: business risk (0.0830)
	5: risk (0.1075)
	6: company (0.1209)
	7: anything (0.1383)
	8: ability (0.1383)
	9: risks may come (0.1415)
	10: business (0.1657)
	11: may (0.1765)
	12: threatens (0.1793)
	13: meet (0.1793)
	14: target (0.1793)
	15: achieve (0.1793)
	16: financial (0.1793)
	17: goals (0.1793)
	18: called (0.1793)
	19: risk management strategy (0.1844)
	20: anything that threatens (0.1851)


#### TextRank

In [5]:
# Part-of-speech tags passed to the candidate weighting step below.
pos = {'NOUN', 'PROPN', 'ADJ'}

# Step 1: instantiate the TextRank extractor.
extractor = pke.unsupervised.TextRank()

# Step 2: feed it the sampled paragraph.
extractor.load_document(
    input=text_to_extract, language='en_core_web_sm', normalization=None)

# Step 3: weight the candidates with a window of 2, the POS filter above and
# top_percent=0.33.
extractor.candidate_weighting(window=2, pos=pos, top_percent=0.33)

# Step 4: request the 10 best candidates and print each with its score,
# numbered from 1.
keyphrases = extractor.get_n_best(n=10)
for kp_id, kp in enumerate(keyphrases):
    phrase, score = kp[0], kp[1]
    print('\t%d: %s (%.4f)' % (kp_id + 1, phrase, score))



	1: risk management (0.1901)
	2: company head (0.1464)
	3: financial goals (0.1464)
	4: risk (0.0950)
	5: sources (0.0732)
	6: company (0.0732)
