In [5]:
import json
from functools import reduce
import numpy as np

# Raw saliency dataset: the file is read as JSON Lines, one story object per line.
# NOTE(review): this is an absolute, machine-specific path — consider deriving it
# from a configurable data directory so the notebook runs elsewhere.
filename = '/home/george/salience-prediction/dexter-salience/saliency-dataset.json'
# Decode explicitly as UTF-8 instead of relying on the platform's locale default,
# which would mis-decode non-ASCII text on machines with a different locale.
with open(filename, 'r', encoding='utf-8') as f:
    # A blank line (e.g. a trailing newline at end of file) is not a valid JSON
    # document and would make json.loads raise, so such lines are skipped.
    stories = [json.loads(line) for line in f if line.strip()]
    
def transformToStandardFormat(story):
    """Flatten one raw story record into a headline/body/entities dict.

    Parameters
    ----------
    story : dict
        Must provide ``story['document']`` (an iterable of dicts with ``'name'``
        and ``'value'`` keys) and ``story['saliency']`` (an iterable of dicts
        with ``'entityid'`` and ``'score'`` keys).

    Returns
    -------
    dict
        ``{'headline': str, 'body': str, 'entities': list}`` where every entity
        is ``{'id': str(entityid), 'score': score}``.

    Notes
    -----
    * If several parts are named ``'headline'`` the last one wins; with none,
      the headline is ``''``.
    * Body parts are ordered by ``int(name[10:])`` and concatenated with no
      separator between them.
      NOTE(review): this presumes the numeric suffix of a body part name starts
      at character 10 — confirm against the dataset's naming scheme.
    """
    headline = ''
    bodyFields = []
    for field in story['document']:
        fieldName = field['name']
        if fieldName == 'headline':
            headline = field['value']
        elif fieldName.startswith('body'):
            bodyFields.append(field)

    # Order the paragraphs by the integer embedded in their name (stable sort).
    bodyFields.sort(key=lambda field: int(field['name'][10:]))
    body = ''.join(field['value'] for field in bodyFields)

    entities = []
    for annotation in story['saliency']:
        entities.append({'id': str(annotation['entityid']), 'score': annotation['score']})

    return {'headline': headline, 'body': body, 'entities': entities}
        
# Convert every raw record to the headline/body/entities layout and show the
# first converted story as a sanity check.
stories = list(map(transformToStandardFormat, stories))
print(stories[0])

# Median body length, measured in characters, across all stories.
l = list(map(lambda story: len(story['body']), stories))
print(np.median(l))

{'headline': 'Iran close to decision on nuclear program', 'body': "Iranian representatives say negotiations with Europe on its nuclear program are in the final stages.Iran's foreign minister, Kamal Kharazi, told state television Saturday Iranian negotiators have given their final response to a European Union proposal to suspend Iran's uranium enrichment program. He said it is now up to the Europeans to decide whether or not to accept their decision.Iran and the European Union's big three powers &mdash; Britain, Germany, and France &mdash; have been negotiating a deal under which Tehran would agree to freeze sensitive nuclear work to avoid possible U.N. Security Council sanctions.U.S. Secretary of State Colin Powell, says that Iran's nuclear program is intended to make nuclear weapons. Iran authorities have insisted that their nuclear ambitions are limited to generating electricity from atomic energy plants, not making bombs.Critics of the position of the United States point to Israel's

In [6]:
# Build entitiesMap (entity id -> {'surfaceForms': [...], 'entity': name}) from
# the text after the last '#' on each line of the .svm file.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
filename = '/home/george/salience-prediction/dexter-salience/saliency.svm'
entitiesMap = {}
# Decode explicitly as UTF-8 rather than depending on the locale default.
with open(filename, encoding='utf-8') as f:
    for line in f:
        # A blank line carries no metadata and would otherwise fail with an
        # IndexError on elems[1] below.
        if not line.strip():
            continue
        # Everything after the last '#' is treated as tab-separated metadata.
        # If a line has no '#', rfind returns -1 and the whole line is used.
        idx = line.rfind('#')
        comment = line[idx+1:]
        elems = comment.split('\t')
        # Field 1: drop its first and last character (presumably enclosing
        # brackets — TODO confirm) and split the rest on commas.
        surfaceForms = elems[1]
        surfaceForms = [elem.strip() for elem in surfaceForms[1:-1].split(',')]
        # Field 2 is used verbatim as the map key; field 3 is the entity name.
        entityId = elems[2]
        entityName = elems[3].strip()
        # A later line with the same entity id replaces the earlier entry.
        entitiesMap[entityId] = {'surfaceForms' : surfaceForms, 'entity' : entityName}
print(list(entitiesMap.items())[0])

('9350062', {'surfaceForms': ['powell'], 'entity': 'Arthur_William_Baden_Powell'})


In [7]:
import re

def getOffsets(surface, content):
    """Return the spans of every literal occurrence of ``surface`` in ``content``.

    Parameters
    ----------
    surface : str
        Text to look for. It is matched literally: regex metacharacters such as
        ``.``, ``+`` or ``(`` have no special meaning.
    content : str
        Text to search.

    Returns
    -------
    list of (int, int)
        ``(start, end)`` offsets of the non-overlapping matches, in left-to-right
        order. Empty when ``surface`` is empty or does not occur.
    """
    # An empty pattern would match at every position and yield meaningless
    # zero-width spans, so treat it as "no occurrences".
    if not surface:
        return []
    # Escape the surface form: unescaped, 'u.s.' would also match 'uxsx' and a
    # form containing '(' would raise re.error.
    return [m.span() for m in re.finditer(re.escape(surface), content)]

def getSurfaceForms(id, headline, body):
    """Locate the surface forms of entity ``id`` in a headline and a body.

    Parameters
    ----------
    id : hashable
        Key looked up in the module-level ``entitiesMap``.
    headline, body : str
        Texts passed to ``getOffsets`` for every surface form of the entity.

    Returns
    -------
    (list, str)
        A list of ``{'form', 'headOffsets', 'bodyOffsets'}`` dicts, one per
        surface form found in at least one of the two texts, and the entity
        name. For an id missing from ``entitiesMap`` the result is
        ``([], 'NegativeSample')``.
    """
    # Unknown ids have no surface forms to search for.
    if id not in entitiesMap:
        return [], 'NegativeSample'

    record = entitiesMap[id]
    candidateForms = record['surfaceForms']
    entityName = record['entity']

    found = []
    for form in candidateForms:
        inHeadline = getOffsets(form, headline)
        inBody = getOffsets(form, body)
        # Drop forms that occur in neither text.
        if inHeadline or inBody:
            found.append({'form': form, 'headOffsets' : inHeadline, 'bodyOffsets' : inBody})
    return found, entityName
    
def addSurfaceForms(story):
    """Annotate the entities of ``story`` in place with surface forms and a name.

    Only entities whose id is in the module-level ``entitiesMap`` or whose score
    is negative are kept; ``story['entities']`` is replaced by that filtered
    list. Each kept entity dict gains a ``'forms'`` and a ``'name'`` key taken
    from ``getSurfaceForms``. The headline and body are lowercased before the
    search, so the offsets refer to the lowercased texts. Returns ``None``.
    """
    # Phase 1: choose the entities to keep (known id, or negative score).
    kept = []
    for candidate in story['entities']:
        if candidate['id'] in entitiesMap:
            kept.append(candidate)
        elif candidate['score'] < 0:
            kept.append(candidate)

    # Phase 2: search the lowercased texts for each kept entity.
    loweredHeadline = story['headline'].lower()
    loweredBody = story['body'].lower()
    for candidate in kept:
        forms, name = getSurfaceForms(candidate['id'], loweredHeadline, loweredBody)
        candidate['forms'] = forms
        candidate['name'] = name

    story['entities'] = kept

# addSurfaceForms mutates each story in place and returns None, so a plain loop
# is used rather than a comprehension that would build a discarded list of None.
for story in stories:
    addSurfaceForms(story)
# Show the first annotated story as a sanity check.
print(stories[0])

{'headline': 'Iran close to decision on nuclear program', 'body': "Iranian representatives say negotiations with Europe on its nuclear program are in the final stages.Iran's foreign minister, Kamal Kharazi, told state television Saturday Iranian negotiators have given their final response to a European Union proposal to suspend Iran's uranium enrichment program. He said it is now up to the Europeans to decide whether or not to accept their decision.Iran and the European Union's big three powers &mdash; Britain, Germany, and France &mdash; have been negotiating a deal under which Tehran would agree to freeze sensitive nuclear work to avoid possible U.N. Security Council sanctions.U.S. Secretary of State Colin Powell, says that Iran's nuclear program is intended to make nuclear weapons. Iran authorities have insisted that their nuclear ambitions are limited to generating electricity from atomic energy plants, not making bombs.Critics of the position of the United States point to Israel's

In [13]:
# Persist the annotated stories as a single JSON document.
with open('./datasets/wikinews-standard.json', 'w') as outFile:
    serialized = json.dumps(stories)
    outFile.write(serialized)