In [5]:
import json
from functools import reduce
import numpy as np
import os 
from os import path
import re
import stemming.porter2 as porter
from stanfordcorenlp import StanfordCoreNLP

stem = porter.stem
tokenizer = StanfordCoreNLP("http://localhost")

leafs = ['test', 'train', 'trial']
root = '../datasets/keyphrase_datasets/SemEval2010'
stories = {}

for leaf in leafs:
    dirname = path.join(root, leaf)
    for file in os.listdir(dirname):
#         if not file.endswith('.txt.final'):
#             continue
        filename = path.join(dirname, file) 
        with open(filename, 'r') as f:
            filecontent = f.read()
        
        suid = file[:-len('.txt.final')]
        stories[suid] = {'headline' : '', 'body' : filecontent, 'suid' : suid, 'label' : leaf, 'entities' : []}
        
    labels = ['author', 'reader']
    for label in labels:
        stemmed = False
        labelfile = path.join(root, leaf, leaf + '.' + label + '.final')
        if not os.path.isfile(labelfile):
            stemmed = True
            labelfile = path.join(root, leaf, leaf + '.' + label + '.stem.final')
            
        with open(labelfile) as f:
            keyphrases = {}
            for line in f:
                idx = line.find(':')
                suid = line[0:idx].strip()
                rest = line[idx+1:].strip()

                entities = [{'id' : id.strip(), 'source' : label, 'stemmed' : stemmed} for id in rest.split(',') if len(id.strip()) > 0]
                stories[suid]['entities'].extend(entities)
                
stories = list(stories.values())
print(len(stories))
stories = [s for s in stories if len(s['entities']) > 0]
print(len(stories))
print(len([s for s in stories if len(s['entities']) > 0]))

from collections import Counter
print(Counter([len(s['entities']) for s in stories]))

def pre_process(sentence):
    sentence = re.sub('((www\.[^\s]+)|(https?://[^\s]+)|(http?://[^\s]+))',' ',sentence) #replace urls by <url>
    sentence = re.sub('(\@ [^\s]+)',' ',sentence) #replace @user268 by <user>
    return sentence

def getOffsets(surface, content):
    return [m.span() for m in re.finditer(re.escape(surface), content)]

def getSurfaceForms(form, content):
    offsets = [getOffsets(form, content)]
    surfaceForms = [{'form': form, 'bodyOffsets' : offsets}]
    return surfaceForms, form   
    
def addSurfaceForms(story, sidx):
    entities = [entity for entity in story['entities']]
    headline = story['headline']
    headline = pre_process(headline)
    body = story['body']
    body = pre_process(body)
    story['headline'] = headline
    story['body'] = body
    story_stemmed = ' '.join([stem(word).lower() for word in 
                         tokenizer.word_tokenize(headline + '\n' + body)])
    story['stemmed_content'] = story_stemmed
    
    for entity in entities:
        form = entity['id'].strip()
        form = ' '.join([stem(word).lower() for word in 
                         tokenizer.word_tokenize(form)])
        entity['stemmed'] = form
        entity['forms'], entity['name'] = getSurfaceForms(form, story_stemmed)
    story['entities'] = entities
    print('%d/%d' % (sidx, len(stories)))
[addSurfaceForms(story, sidx) for sidx, story in enumerate(stories)]
print(len(stories))
print(len([s for s in stories if len(s['entities']) > 0]))

261
244
244
Counter({14: 41, 15: 37, 16: 32, 13: 26, 18: 26, 17: 18, 20: 16, 19: 13, 21: 9, 12: 8, 10: 4, 11: 3, 22: 2, 25: 2, 23: 2, 24: 1, 28: 1, 31: 1, 36: 1, 38: 1})
0/244
1/244
2/244
3/244
4/244
5/244
6/244
7/244
8/244
9/244
10/244
11/244
12/244
13/244
14/244
15/244
16/244
17/244
18/244
19/244
20/244
21/244
22/244
23/244
24/244
25/244
26/244
27/244
28/244
29/244
30/244
31/244
32/244
33/244
34/244
35/244
36/244
37/244
38/244
39/244
40/244
41/244
42/244
43/244
44/244
45/244
46/244
47/244
48/244
49/244
50/244
51/244
52/244
53/244
54/244
55/244
56/244
57/244
58/244
59/244
60/244
61/244
62/244
63/244
64/244
65/244
66/244
67/244
68/244
69/244
70/244
71/244
72/244
73/244
74/244
75/244
76/244
77/244
78/244
79/244
80/244
81/244
82/244
83/244
84/244
85/244
86/244
87/244
88/244
89/244
90/244
91/244
92/244
93/244
94/244
95/244
96/244
97/244
98/244
99/244
100/244
101/244
102/244
103/244
104/244
105/244
106/244
107/244
108/244
109/244
110/244
111/244
112/244
113/244
114/244
115/244
116/244
117/

In [18]:
with open('../datasets/semeval-2010-standard.json', 'w') as f:
    f.write(json.dumps(stories))