## Obtain a seed set of documents using Elasticsearch

In [1]:
import utils

In [2]:
import urllib.parse
import urllib.request
import requests

In [3]:
# Endpoint of the news search service (the trailing '?' starts the query string).
base = 'http://news.fii800.lod.labs.vu.nl/news?'

# Search parameters passed along with the endpoint.
args = {
    # Query terms to look for.
    'q': 'disaster earthquake',
    # Where to look: "title", "content" or "both".
    'in': 'content',
    # Start of the time window (datetime).
    'from': '2015-09-01T00:00:00Z',
    # End of the time window (datetime).
    'to': '2015-10-01T00:00:00Z',
    # Which source to restrict to (left empty here).
    'source': '',
    # Media type: "Blog" or "News".
    'media': 'News',
    # Amount of results to return.
    'size': 1000,
    # Skip the first `offset` results (useful for pagination).
    'offset': 0,
    # Term-matching mode.
    # NOTE(review): presumably requires all query terms to match -- confirm
    # against the service's documentation.
    'match': 'conjunct',
}

In [4]:
# Run the search against the endpoint with the parameters defined above.
all_results = utils.get_all_hits(base, args)

# Keep only the distinct keys of the returned mapping
# (iterating a mapping yields its keys).
news_items_ids = set(all_results)

In [5]:
# Show how many distinct keys were collected from the search results.
print(len(news_items_ids))

401


In [6]:
# Serialise the identifiers as a comma-separated list of quoted values,
# e.g. "'id-1', 'id-2'". Joining the reprs explicitly produces the same text
# as slicing the braces off str(news_items_ids) for a non-empty set, but
# also handles the empty set correctly: str(set()) is 'set()', so the old
# slice produced the garbage string 'et(' instead of an empty string.
ids = ', '.join(repr(item_id) for item_id in news_items_ids)

## SPARQLing earthquakes

In [7]:
# Location names singled out as "boring", plus their converted form as
# returned by the project helper.
boring_locations = [
    "United_States",
    "United_Kingdom",
    "Africa",
    "Europe",
]
boring_URIs = utils.to_dbpedia(boring_locations)

# Time limit, kept both in days and in seconds (24 hours).
limit_days = 1
limit_secs = 60 * 60 * 24  # seconds

In [8]:
# Query for the news items identified by `ids` (one string holding the
# comma-separated, quoted identifiers).
# NOTE(review): the helper's filtering semantics are not visible here --
# confirm in utils.get_news_from_fun_locations.
results=utils.get_news_from_fun_locations(ids)

In [9]:
# Show how many result rows the query returned.
print(len(results))

401


In [10]:
from collections import defaultdict
def hypothesize_coreference(results, days_limit=1, participant_overlap=0.2,
                            location_overlap=0.4):
    """Group news items into hypothesised coreference chains.

    Every result is compared, via ``utils.coreferential``, with all results
    seen before it. For each earlier item it is judged coreferential with,
    the new item's id is added to that item's chain. If it matches nothing,
    it opens a new chain.

    An item that matches earlier items from several different chains is
    listed in each of those chains; its own chain number is the one of the
    last matching earlier item (this mirrors the original behaviour). An id
    is added to a given chain at most once per item, even when the item
    matches several members of that chain.

    Args:
        results: Iterable of rows where ``row[key]['value']`` is available
            for the keys ``'dct'``, ``'participants'``, ``'src'`` and
            ``'locations'``. Participants and locations are ``'|'``-separated
            strings.
        days_limit: Passed through to ``utils.coreferential``.
        participant_overlap: Passed through to ``utils.coreferential``.
        location_overlap: Passed through to ``utils.coreferential``.

    Returns:
        A ``defaultdict(list)`` mapping the chain number as a string
        (``'1'``, ``'2'``, ...) to the list of ``'src'`` values in that chain.
    """
    next_chain = 1
    seen = []
    # The values are lists (ids are appended in encounter order), so the
    # default factory is list as well; a missing chain yields [].
    coreference_sets = defaultdict(list)

    for result in results:
        new_result = {
            'dct': result['dct']['value'],
            'part': result['participants']['value'],
            'id': result['src']['value'],
            'loc': result['locations']['value'],
        }

        # Chains this item has already been added to, so that matching
        # several members of one chain does not duplicate its id there.
        joined_chains = set()

        for past_result in seen:
            if utils.coreferential(
                    new_result['dct'], past_result['dct'],
                    new_result['part'].split('|'), past_result['part'].split('|'),
                    new_result['loc'].split('|'), past_result['loc'].split('|'),
                    days_limit, participant_overlap, location_overlap):
                matched_chain = past_result['chain']
                new_result['chain'] = matched_chain
                if matched_chain not in joined_chains:
                    joined_chains.add(matched_chain)
                    coreference_sets[str(matched_chain)].append(new_result['id'])

        if 'chain' not in new_result:
            # No earlier item matched: start a fresh chain for this item.
            new_result['chain'] = next_chain
            coreference_sets[str(next_chain)] = [new_result['id']]
            next_chain += 1

        seen.append(new_result)

    return coreference_sets

In [11]:
# Build the chain-number -> list-of-ids mapping from the query results.
coreference_sets=hypothesize_coreference(results)

## Please point **path_signalmedia_json** to where the JSONL file is stored
* Only run the cell below if you want to use it (it is a bit slow).

In [12]:
# Location of the Signal Media JSONL file; adjust to your local copy.
path_signalmedia_json = 'signalmedia-1m-2.jsonl'

# Identifiers belonging to chain '3' (chain keys are strings).
my_ids = coreference_sets['3']

# Look up the records for those identifiers in the JSONL file.
results_in_json = utils.obtain_specific_identifiers(
    path_signalmedia_json,
    my_ids,
)

In [13]:
# Dump the retrieved records; the output can be long because each record
# includes the article's full content.
print(results_in_json)

{'eb4edb1a-3063-4b29-9536-916c887788b2': {'media-type': 'News', 'published': '2015-09-01T07:18:29Z', 'id': 'eb4edb1a-3063-4b29-9536-916c887788b2', 'title': 'Japan Holds Annual Disaster Response Drill', 'content': 'About 1.6 million people across Japan took part in an annual emergency drill on Tuesday, with schoolchildren taking cover under desks and top officials meeting in response to a mock natural… \r \nTokyo: About 1.6 million people across Japan took part in an annual emergency drill on Tuesday, with schoolchildren taking cover under desks and top officials meeting in response to a mock natural disaster. \n\nRescue operations and emergency medical exercises were carried out to simulate an emergency response to a strong 7.3 magnitude quake rocking Tokyo in the early morning. \n\nCabinet ministers dressed in blue uniforms held a meeting at the prime minister\'s office, while schoolchildren across the quake-prone nation hid under their desks or donned padded fireproof hoods. \n\nThe 