### About
The goal of this script is to process a few common keyphrase datasets, including
 - **Tokenize**: by default using the method from Meng et al. 2017, which is better suited to academic text since it splits strings on hyphens etc., making tokens more fine-grained.
     - keep [_<>,\(\)\.\'%]
     - replace digits with < digit >
     - split by [^a-zA-Z0-9_<>,#&\+\*\(\)\.\'%]
 - **Determine present/absent phrases**: determine whether a phrase appears verbatim in a text. This is believed to be a very important step for the evaluation of keyphrase-related tasks, since in general extraction methods cannot recall any phrases that don't appear in the source text.

In [3]:
import torch
import tensorflow
# Environment check: print the installed PyTorch version and whether CUDA is available.
print(torch.__version__, torch.cuda.is_available())

In [4]:
import os
import sys
import re
import json
import numpy as np
from collections import defaultdict

# Make the parent directory and its `onmt` sub-directory importable (both are
# resolved against the current working directory), without adding duplicates.
for _relative_dir in ('..', '../onmt'):
    module_path = os.path.abspath(os.path.join(_relative_dir))
    if module_path in sys.path:
        continue
    sys.path.append(module_path)

import kp_evaluate
import onmt.keyphrase.utils as utils

In [5]:
import os
from tqdm import tqdm

# Tokenize every available <dataset>_<split>.json file, split its keyphrases
# into present/absent with respect to the source text, and write the result to
# <dataset>_<split>_meng17token.json next to the input (one JSON object per line).

# NOTE(review): the 'openkp' and 'kptimes' field-renaming branches below only
# run when those names are added to this list.
dataset_names = ['inspec', 'krapivin', 'nus', 'semeval', 'kp20k', 'duc', 'stackexchange']

# path to the json folder
json_base_dir = 'UniKP/UniKeyphrase/data'
# store in preprocessed data directory
splits = ['train', 'test', 'valid']

# Matches bracketed spans "(...)", "[...]" and "{...}" inside a keyphrase;
# compiled once because it is applied to every keyphrase of every document.
bracketed_span_re = re.compile(r'\(.*?\)|\[.*?\]|\{.*?\}')

for dataset_name in dataset_names:
    for split in splits:
        print(dataset_name)

        input_json_path = os.path.join(json_base_dir, dataset_name, '%s_%s.json' % (dataset_name, split))
        if not os.path.isfile(input_json_path):
            print('File %s does not exist, skipping..' % os.path.basename(input_json_path))
            continue

        output_json_path = os.path.join(json_base_dir, dataset_name, '%s_%s_meng17token.json' % (dataset_name, split))

        # Per-file statistics reported after the file has been processed.
        doc_count, present_doc_count, absent_doc_count = 0, 0, 0
        tgt_num, present_tgt_num, absent_tgt_num = [], [], []

        with open(input_json_path, 'r') as input_json, open(output_json_path, 'w') as output_json:
            lines = input_json.readlines()
            for json_line in tqdm(lines, total=len(lines)):
                # Tolerate blank lines (e.g. a trailing newline at end of file)
                # instead of failing inside json.loads.
                if not json_line.strip():
                    continue
                json_dict = json.loads(json_line)

                # Map dataset-specific field names onto title/abstract/keywords.
                if dataset_name == 'stackexchange':
                    json_dict['abstract'] = json_dict['question']
                    json_dict['keywords'] = json_dict['tags']
                    del json_dict['question']
                    del json_dict['tags']
                if dataset_name == 'openkp':
                    json_dict['abstract'] = json_dict['text']
                    json_dict['keywords'] = json_dict['KeyPhrases']
                    # no title
                    json_dict['title'] = ''
                    del json_dict['text']
                    del json_dict['KeyPhrases']
                if dataset_name == "kptimes":
                    json_dict["keywords"] = json_dict["keyword"]
                    del json_dict["keyword"]

                title = json_dict['title']
                abstract = json_dict['abstract']
                keywords = json_dict['keywords']

                # Keywords stored as a single ';'-separated string become a list.
                if isinstance(keywords, str):
                    keywords = keywords.split(';')
                    json_dict['keywords'] = keywords
                # remove all the abbreviations/acronyms in parentheses in keyphrases
                # (only for tokenization; json_dict['keywords'] keeps the raw text)
                keywords = [bracketed_span_re.sub('', kw) for kw in keywords]

                # tokenize text
                title_token = utils.meng17_tokenize(title)
                abstract_token = utils.meng17_tokenize(abstract)
                keywords_token = [utils.meng17_tokenize(kw) for kw in keywords]

                # replace numbers
                title_token = utils.replace_numbers_to_DIGIT(title_token, k=2)
                # restrict to maximum 384 tokens for longer datasets (like kptimes, openkp etc)
                abstract_token = utils.replace_numbers_to_DIGIT(abstract_token, k=2)[:384]
                keywords_token = [utils.replace_numbers_to_DIGIT(kw, k=2) for kw in keywords_token]

                # Remember where the title ends inside the concatenated source.
                json_dict['title_len'] = len(title_token)

                # Source sequence: title tokens, a "." separator, then the abstract.
                src_token = title_token + ["."] + abstract_token

                # split tgts by present/absent
                present_tgt_flags, _, _ = utils.if_present_duplicate_phrases(src_token, keywords_token)
                present_tgts = [tgt for tgt, present in zip(keywords_token, present_tgt_flags) if present]
                # Use `not`, never `~`: on a plain Python bool `~` is integer
                # bitwise inversion (~True == -2, ~False == -1), which is always
                # truthy and would label every keyphrase as absent. `not` is
                # correct for both Python bools and NumPy booleans.
                absent_tgts = [tgt for tgt, present in zip(keywords_token, present_tgt_flags) if not present]

                doc_count += 1
                if present_tgts:
                    present_doc_count += 1
                if absent_tgts:
                    absent_doc_count += 1

                tgt_num.append(len(keywords_token))
                present_tgt_num.append(len(present_tgts))
                absent_tgt_num.append(len(absent_tgts))

                # write to output json
                tokenized_dict = {'src': src_token, 'tgt': keywords_token,
                                  'present_tgt': present_tgts, 'absent_tgt': absent_tgts}
                json_dict['meng17_tokenized'] = tokenized_dict
                output_json.write(json.dumps(json_dict) + '\n')

        print('#doc=%d, #present_doc=%d, #absent_doc=%d, #tgt=%d, #present=%d, #absent=%d'
              % (doc_count, present_doc_count, absent_doc_count,
                 sum(tgt_num), sum(present_tgt_num), sum(absent_tgt_num)))
    
    

# Convert Datasets to KPDrop format (output inside ./dataset directory inside KPDrop)

In [4]:
# converting to KPDrop Format
import os
from tqdm import tqdm

# Export each <dataset>_<split>_meng17token.json file as a pair of KPDrop text
# files: <split>_src.txt (one source per line) and <split>_trg.txt (one
# ';'-joined keyphrase list per line).
dataset_names = ["openkp", "kptimes", "stackexchange", "krapivin", "semeval", "inspec", "nus"]
# train dataset names
train_dataset_names = ["openkp", "kptimes", "stackexchange"]

for dataset_name in dataset_names:
    print('dataset name: ', dataset_name)
    # Datasets listed as training sets are exported for all three splits;
    # the others are exported for the test split only.
    splits = ["train", "test", "valid"] if dataset_name in train_dataset_names else ["test"]

    for split in splits:
        print('split: ', split)
        data_dir = f"UniKP/UniKeyphrase/data/{dataset_name}"
        out_data_dir = f"KPDrop/dataset/{dataset_name}"
        if not os.path.exists(out_data_dir):
            os.makedirs(out_data_dir, exist_ok=True)

        in_path = os.path.join(data_dir, f'{dataset_name}_{split}_meng17token.json')
        kpdrop_tgt = os.path.join(out_data_dir, f'{split}_trg.txt')
        kpdrop_src = os.path.join(out_data_dir, f'{split}_src.txt')

        with open(in_path, 'r') as f, open(kpdrop_tgt, 'w') as f1, open(kpdrop_src, 'w') as f2:
            records = f.readlines()
            for record in tqdm(records, total=len(records)):
                entry = json.loads(record)
                src_tokens = entry['meng17_tokenized']['src']
                title_len = entry['title_len']

                # Insert an '<eos>' marker right after the first `title_len` tokens.
                marked_tokens = src_tokens[:title_len] + ['<eos>'] + src_tokens[title_len:]
                source_text = " ".join(marked_tokens)

                # Each keyphrase is space-joined; keyphrases are ';'-separated.
                target_text = ";".join(
                    " ".join(phrase_tokens)
                    for phrase_tokens in entry['meng17_tokenized']['tgt']
                )

                f2.write(source_text.strip().lower() + "\n")
                f1.write(target_text.strip().lower() + "\n")

dataset name:  openkp
split:  train


100%|██████████████████████████████████████████████████████████████████████████████████████████████| 134894/134894 [00:07<00:00, 17016.94it/s]


split:  test


100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 6614/6614 [00:00<00:00, 19568.22it/s]


split:  valid


100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 6616/6616 [00:00<00:00, 20208.38it/s]


dataset name:  kptimes
split:  train


100%|██████████████████████████████████████████████████████████████████████████████████████████████| 259923/259923 [00:17<00:00, 15281.95it/s]


split:  test


100%|████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [00:00<00:00, 15487.47it/s]


split:  valid


100%|████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [00:00<00:00, 15646.81it/s]


dataset name:  stackexchange
split:  train


100%|██████████████████████████████████████████████████████████████████████████████████████████████| 298965/298965 [00:09<00:00, 31895.72it/s]


split:  test


100%|████████████████████████████████████████████████████████████████████████████████████████████████| 16000/16000 [00:00<00:00, 33192.27it/s]


split:  valid


100%|████████████████████████████████████████████████████████████████████████████████████████████████| 16000/16000 [00:00<00:00, 33822.65it/s]


dataset name:  krapivin
split:  test


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 460/460 [00:00<00:00, 5702.18it/s]


dataset name:  semeval
split:  test


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 3898.52it/s]


dataset name:  inspec
split:  test


100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [00:00<00:00, 32686.79it/s]


dataset name:  nus
split:  test


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 211/211 [00:00<00:00, 4711.95it/s]
