In [None]:
import json
from functools import reduce
import numpy as np
import os 
from os import path
import re
import stemming.porter2 as porter
from stanfordcorenlp import StanfordCoreNLP

# Stemming function from the `stemming.porter2` module, aliased for brevity.
stem = porter.stem
# Directory whose entries are listed by the loading loop in this file.
root = '../datasets/keyphrase_datasets/NUS'
# Accumulates the loaded story dicts ('headline', 'body', 'entities').
stories = []
# CoreNLP client used for word tokenization.
# NOTE(review): no port is passed, so the client's default port applies —
# confirm it matches the running server.
tokenizer = StanfordCoreNLP("http://localhost")

# Load every document under `root` together with its gold keyphrases.
# Each entry <doc>/ is read as <doc>/<doc>.txt (the text) plus every
# <doc>/KEY/*.key file (one keyphrase per line).
# Listings are sorted so the resulting story order is reproducible.
for file in sorted(os.listdir(root)):
    dirname = path.join(root, file)
    if not path.isdir(dirname):
        # Stray files in the dataset root are not documents.
        continue
    filename = path.join(dirname, file + '.txt')

    with open(filename, 'r', encoding='iso-8859-15') as f:
        filecontent = f.read()

    # Gather the keyphrases of *all* key files before building the story, so
    # each document is appended exactly once.  (Appending inside this loop
    # would duplicate the document once per key file.)
    keydir = path.join(dirname, 'KEY')
    labels = []
    key_files_read = 0
    for label in sorted(os.listdir(keydir)):
        if not label.endswith('.key'):
            print('label file invalid format', label)
            continue

        labelfile = path.join(keydir, label)
        with open(labelfile, 'r', encoding='iso-8859-15') as fl:
            labels.extend(line.strip() for line in fl.read().split('\n'))
        key_files_read += 1

    if key_files_read == 0:
        # No usable key file: leave the document out of the dataset.
        continue

    # dict.fromkeys drops duplicates (also across key files) while keeping
    # first-seen order; empty lines are discarded.
    entities = [{'id': phrase} for phrase in dict.fromkeys(labels) if phrase]
    if not entities:
        print(labels)
        # Explicit raise instead of `assert`, which is stripped under -O.
        raise ValueError('no keyphrases found for document %r' % file)
    stories.append({'headline': '', 'body': filecontent, 'entities': entities})
    
from collections import Counter

# Sanity checks on the loaded data: total number of stories, number of
# stories that carry at least one entity, and the distribution of
# entities-per-story.
entity_counts = [len(s['entities']) for s in stories]
print(len(stories))
print(sum(1 for count in entity_counts if count > 0))
print(Counter(entity_counts))

def getOffsets(surface, content):
    """Return the spans of all literal occurrences of `surface` in `content`.

    `surface` is matched verbatim (regex metacharacters are escaped), and
    occurrences are non-overlapping, scanned left to right.

    Args:
        surface: The string to search for.
        content: The text to search in.

    Returns:
        A list of (start, end) tuples, one per occurrence; empty when
        `surface` is empty or does not occur.
    """
    if not surface:
        # An empty pattern would match at every position and produce
        # meaningless zero-width spans.
        return []
    return [m.span() for m in re.finditer(re.escape(surface), content)]

def getSurfaceForms(form, content):
    """Build the surface-form record of `form` for the text `content`.

    Returns:
        A 2-tuple `(forms, name)`: `forms` is a one-element list holding a
        dict with the keys 'form' (the given form) and 'bodyOffsets' (a
        one-element list wrapping the result of `getOffsets`); `name` is
        `form` itself.
    """
    record = {
        'form': form,
        'bodyOffsets': [getOffsets(form, content)],
    }
    return [record], form

# Raw strings keep the regex escapes (\. \s \@) out of Python's own string
# escape processing, which warns about them in non-raw literals.
# The third URL alternative (`http?://`) is kept as in the original pattern;
# note that it also matches the prefix "htt://".
_URL_PATTERN = re.compile(r'((www\.[^\s]+)|(https?://[^\s]+)|(http?://[^\s]+))')
# An "@" followed by exactly one space and then a run of non-space characters.
_MENTION_PATTERN = re.compile(r'(\@ [^\s]+)')


def pre_process(sentence):
    """Blank out URLs and "@ name" mentions in `sentence`.

    Each match is replaced by a single space (not by a placeholder token).

    Args:
        sentence: The text to clean.

    Returns:
        The cleaned text.
    """
    sentence = _URL_PATTERN.sub(' ', sentence)
    sentence = _MENTION_PATTERN.sub(' ', sentence)
    return sentence

def _stem_tokens(text):
    """Tokenize `text` and return its lower-cased stems joined by spaces."""
    return ' '.join([stem(token).lower()
                     for token in tokenizer.word_tokenize(text)])


def addSurfaceForms(story, sidx):
    """Annotate `story` in place with stemmed text and entity surface forms.

    The headline and body are cleaned with `pre_process` and written back,
    their stemmed concatenation is stored under 'stemmed_content', and every
    entity gains the keys 'stemmed', 'forms' and 'name' computed from its
    stemmed 'id'.  Progress is printed as "<sidx>/<number of stories>".

    Args:
        story: Dict with the keys 'headline', 'body' and 'entities'.
        sidx: Index of the story, used only for the progress message.
    """
    annotated = list(story['entities'])

    clean_headline = pre_process(story['headline'])
    clean_body = pre_process(story['body'])
    story['headline'], story['body'] = clean_headline, clean_body

    stemmed_text = _stem_tokens(clean_headline + '\n' + clean_body)
    story['stemmed_content'] = stemmed_text

    for entity in annotated:
        # Entities are matched against the stemmed text, so the keyphrase is
        # normalised the same way as the story content.
        stemmed_form = _stem_tokens(entity['id'].strip())
        entity['stemmed'] = stemmed_form
        entity['forms'], entity['name'] = getSurfaceForms(stemmed_form,
                                                          stemmed_text)

    story['entities'] = annotated
    print('%d/%d' % (sidx, len(stories)))
# Annotate every story in place.  A plain loop is used because
# addSurfaceForms is called for its side effects only; a list comprehension
# would just build a throwaway list of None values.
for sidx, story in enumerate(stories):
    addSurfaceForms(story, sidx)
print(len(stories))
print(len([s for s in stories if len(s['entities']) > 0]))

183
183
Counter({10: 61, 6: 17, 7: 14, 8: 13, 5: 10, 12: 8, 4: 8, 11: 8, 9: 6, 15: 3, 22: 3, 21: 3, 3: 3, 25: 2, 18: 2, 20: 2, 28: 2, 23: 2, 33: 2, 13: 2, 16: 1, 42: 1, 52: 1, 36: 1, 46: 1, 41: 1, 24: 1, 34: 1, 17: 1, 27: 1, 31: 1, 19: 1})
0/183
1/183
2/183
3/183
4/183
5/183
6/183
7/183
8/183
9/183
10/183
11/183
12/183
13/183
14/183
15/183
16/183
17/183
18/183
19/183
20/183
21/183
22/183
23/183
24/183
25/183
26/183
27/183
28/183
29/183
30/183
31/183
32/183
33/183
34/183
35/183
36/183
37/183
38/183
39/183
40/183
41/183
42/183
43/183
44/183
45/183
46/183
47/183
48/183
49/183
50/183
51/183
52/183
53/183
54/183
55/183
56/183
57/183
58/183
59/183
60/183
61/183
62/183
63/183
64/183
65/183
66/183
67/183
68/183
69/183
70/183
71/183
72/183
73/183
74/183
75/183
76/183
77/183
78/183
79/183
80/183
81/183
82/183
83/183
84/183
85/183
86/183
87/183
88/183
89/183
90/183
91/183
92/183
93/183
94/183
95/183
96/183
97/183
98/183
99/183
100/183
101/183
102/183
103/183
104/183
105/183
106/183
107/183
108/18

In [6]:
# Persist the annotated stories as a single JSON document.
with open('../datasets/nus-standard.json', 'w') as out:
    json.dump(stories, out)