In [1]:
import json
from functools import reduce
import numpy as np
import os 
from os import path
import re

root = '../datasets/keyphrase_datasets/Hulth2003'
dirnames = ['Test', 'Training', 'Validation']

# Suffix of the abstract files. The '.contr' / '.uncontr' label files are
# looked up next to each abstract by swapping this suffix.
ABSTR_SUFFIX = '.abstr'


def parseEntities(line, label):
    """Split one ';'-separated line into entity dicts tagged with `label`.

    Every piece is whitespace-stripped; pieces that are empty after
    stripping are dropped.

    Returns a list of {'id': <piece>, 'label': label} dicts, in the order
    the pieces appear on the line.
    """
    pieces = (piece.strip() for piece in line.split(';'))
    return [{'id': piece, 'label': label} for piece in pieces if piece]


def getEntities(file, label):
    """Collect the entities of every line of the text file at path `file`.

    Lines are parsed independently with parseEntities, and the results are
    concatenated in file order. Raises whatever `open` raises when the file
    cannot be read.

    NOTE(review): because parsing is per line, a phrase that wraps onto a
    continuation line would be split into two entities. The abstracts wrap
    with '\\n\\t'; confirm whether the label files do as well.
    """
    entities = []
    with open(file) as f:
        for line in f:
            # extend() avoids re-copying the accumulated list for each line.
            entities.extend(parseEntities(line, label))
    return entities


# One story per '.abstr' file: the abstract text as the body, plus the
# entities read from the sibling '.contr' and '.uncontr' files.
stories = []
for leaf in dirnames:
    dirname = path.join(root, leaf)
    # os.listdir returns names in arbitrary order; sorting keeps the story
    # order (and therefore stories[0] and the saved JSON) reproducible.
    for filename in sorted(os.listdir(dirname)):
        if not filename.endswith(ABSTR_SUFFIX):
            continue
        abstrFile = path.join(dirname, filename)
        with open(abstrFile) as f:
            filecontent = f.read()

        # Same directory and stem as the abstract, different suffix.
        stem = abstrFile[:-len(ABSTR_SUFFIX)]

        contrFile = stem + '.contr'
        contrEntities = getEntities(contrFile, 'contr')

        uncontrFile = stem + '.uncontr'
        uncontrEntities = getEntities(uncontrFile, 'uncontr')

        stories.append({'headline' : '', 'body' : filecontent, 'entities' : contrEntities + uncontrEntities})
                
def getOffsets(surface, content):
    """Locate every literal occurrence of `surface` inside `content`.

    `surface` is escaped before searching, so regex metacharacters in it
    are matched literally. The comparison is case-sensitive, and matches
    are non-overlapping and scanned left to right.

    Returns a list of (start, end) index pairs into `content`; the list is
    empty when `surface` does not occur or is itself empty.
    """
    # An empty pattern matches at every position, which would report
    # len(content) + 1 zero-width "occurrences" instead of none.
    if not surface:
        return []
    return [m.span() for m in re.finditer(re.escape(surface), content)]

def getSurfaceForms(forms, headline, body):
    """Find which of `forms` occur in `headline` or `body`, and where.

    Each form is searched with getOffsets in both texts. A form is kept
    only if it occurs at least once in either text.

    Returns a tuple (surfaceForms, forms[0]):
      * surfaceForms is a list, in the order of `forms`, of dicts
        {'form': ..., 'headOffsets': [...], 'bodyOffsets': [...]};
      * forms[0] is returned regardless of whether it was found.

    Raises IndexError when `forms` is empty.
    """
    located = []
    for candidate in forms:
        inHeadline = getOffsets(candidate, headline)
        inBody = getOffsets(candidate, body)
        # Forms that appear in neither text are left out of the result.
        if not inHeadline and not inBody:
            continue
        located.append({'form': candidate, 'headOffsets' : inHeadline, 'bodyOffsets' : inBody})
    return located, forms[0]
    
def addSurfaceForms(story):
    """Annotate the entities of `story` in place with their occurrences.

    `story` is a dict with 'headline', 'body' and 'entities' keys. For each
    entity whose stripped 'id' is non-empty, two keys are set:
      * 'forms': the surface forms found by getSurfaceForms (an empty list
        when the phrase does not occur in the headline or the body);
      * 'name': the stripped 'id', in its original casing.
    Entities with an empty 'id' are left untouched.

    Matching is case-insensitive: headline, body and the phrase are all
    lowercased before searching. The reported offsets therefore index the
    lowercased texts, and the 'form' value inside 'forms' is the lowercased
    phrase.

    NOTE(review): matching is still whitespace-sensitive, so a multi-word
    phrase that spans a '\\n\\t' line wrap in the body is not found.

    Returns None; `story['entities']` is rebound to a new list holding the
    same (mutated) entity dicts.
    """
    entities = list(story['entities'])
    headline = story['headline'].lower()
    body = story['body'].lower()
    for entity in entities:
        form = entity['id'].strip()
        if not form:
            continue
        # The texts were lowercased above, so the search key must be
        # lowercased as well; otherwise a phrase containing any uppercase
        # letter could never match.
        foundForms, _firstForm = getSurfaceForms([form.lower()], headline, body)
        entity['forms'] = foundForms
        # Keep the phrase's original casing as the display name.
        entity['name'] = form
    story['entities'] = entities

# addSurfaceForms mutates each story in place and returns None, so a plain
# loop is used rather than a comprehension that would only build a throwaway
# list of None values.
for story in stories:
    addSurfaceForms(story)
# Show one annotated story as a quick sanity check.
print(stories[0])

{'headline': '', 'body': 'Uncertainty bounds and their use in the design of interval type-2 fuzzy logic\n\tsystems\nWe derive inner- and outer-bound sets for the type-reduced set of an interval\n\ttype-2 fuzzy logic system (FLS), based on a new mathematical\n\tinterpretation of the Karnik-Mendel iterative procedure for computing\n\tthe type-reduced set. The bound sets can not only provide estimates\n\tabout the uncertainty contained in the output of an interval type-2\n\tFLS, but can also be used to design an interval type-2 FLS. We\n\tdemonstrate, by means of a simulation experiment, that the resulting\n\tsystem can operate without type-reduction and can achieve similar\n\tperformance to one that uses type-reduction. Therefore, our new design\n\tmethod, based on the bound sets, can relieve the computation burden of\n\tan interval type-2 FLS during its operation, which makes an interval\n\ttype-2 FLS useful for real-time applications\n', 'entities': [{'id': 'forecasting theory', 'label

In [2]:
# Persist all annotated stories as a single JSON document.
with open('../datasets/inspec-standard.json', 'w') as f:
    serialized = json.dumps(stories)
    f.write(serialized)