## Obtain a seed set of documents using Elasticsearch

In [1]:
import utils

In [2]:
import urllib.parse
import urllib.request
import requests

In [3]:
# Base URL of the news search service (ends with '?').
base = 'http://news.fii800.lod.labs.vu.nl/news?'

# Search parameters for the seed query.
args = {
    # Query terms to look for.
    'q': 'disaster earthquake',
    # Where to look; supported values are "title", "content" and "both".
    'in': 'content',
    # Starting datetime point.
    'from': '2015-09-01T00:00:00Z',
    # Ending datetime point.
    'to': '2015-10-01T00:00:00Z',
    # Which source to restrict to (left empty here).
    'source': '',
    # Media type: "Blog" or "News".
    'media': 'News',
    # Amount of results to return.
    'size': 1000,
    # Skip the first `offset` results (useful for pagination).
    'offset': 0,
    # NOTE(review): presumably requires all query terms to match -- confirm
    # against the service documentation.
    'match': 'conjunct',
}

In [4]:
# Retrieve the hits for the query via the project-local utils helper.
all_results = utils.get_all_hits(base, args)

# Keep only the keys of the result mapping, as a set.
news_items_ids = {*all_results.keys()}

In [5]:
# Number of distinct news item ids collected for the seed set.
print(len(news_items_ids))

401


In [6]:
# Render the ids as a comma-separated list of quoted literals ("'a', 'b'").
# Joining explicitly gives the same text as slicing the set's repr, but also
# yields an empty string for an empty set (str(set())[1:-1] would be 'et(').
ids = ', '.join(repr(item_id) for item_id in news_items_ids)

## SPARQLing earthquakes

In [7]:
# Location names that are handed to utils.to_dbpedia to obtain boring_URIs.
boring_locations = [
    "United_States",
    "United_Kingdom",
    "Africa",
    "Europe",
]
boring_URIs = utils.to_dbpedia(boring_locations)

# Time limits, expressed in days and in seconds.
limit_days = 1
limit_secs = 24 * 60 * 60  # seconds

In [8]:
# Fetch the query results for the seed ids via the project-local utils helper.
# The helper appears to print the SPARQL query it runs (see the cell output).
results=utils.get_news_from_fun_locations(ids)

PREFIX dct: <http://purl.org/dc/terms/>
PREFIX nif: <http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#>
PREFIX gaf: <http://groundedannotationframework.org/gaf#>
SELECT ?n1 ?src (group_concat(?location;separator="|") as ?locations) ?dct WHERE {
  GRAPH <http://longtailcorpus.org> {
    ?n1 a nif:Context ;
    dct:source ?src .
    FILTER (?src IN ('4eb2ef55-0bd6-47d4-b2b3-94f72177a526', '4973ff82-f655-4982-8188-4e560ce68206', '126fbb6c-0be0-4499-a03b-76f26da209de', '9cb77ed7-af84-4a12-8de5-28bbda4387b0', 'a04a1644-b0bf-452b-a094-6182bf14cdd0', '1a777720-551b-41bb-8b24-c81d4efbeeb6', '0c5ad1f9-8e2d-4487-bb97-217d381938d5', '7b28a73d-2370-4d96-bb46-fe22ab1399ec', 'b8f21945-8a2d-4a41-a73c-a824357ef5fe', '60a53a8c-42ab-4531-a2f9-9d092ce4d388', 'd1e2143a-f237-4ffd-a9e1-285b5d14904d', 'e790275d-40bd-4771-9ae8-bc8b4a672237', 'd1ae2598-260c-4c78-acce-e054c1118dc8', 'be11b3fb-2a9d-4032-bb88-6d00c3386619', '2a109a88-a3ac-4e0d-a227-3ba99a813930', 'e8756fdf-bf5f-47c1-9bad-ca468cb2ae2

In [None]:
# Number of results returned by utils.get_news_from_fun_locations.
print(len(results))

In [None]:
from collections import defaultdict
def hypothesize_coreference(results):
    """Group query results into hypothesized coreference chains.

    Every result is compared with all previously processed results using
    ``utils.coreferential``. On a match the result's id is added to the chain
    of the matching earlier result; a result that matches nothing opens a new
    chain.

    Args:
        results: iterable of bindings, each providing ``['dct']['value']``,
            ``['participants']['value']``, ``['src']['value']`` and
            ``['locations']['value']``. Participants and locations are
            ``'|'``-separated strings.

    Returns:
        A ``defaultdict(list)`` mapping the chain number (as a string,
        starting at ``'1'``) to the list of ids in that chain.

    Raises:
        KeyError: if a result lacks one of the bindings listed above.
    """
    days_limit = 1
    participant_overlap = 0.2
    location_overlap = 0.4

    chain = 1
    coreference_data = []
    # Values are lists throughout, so the default factory is list as well
    # (a missing key used to yield a set, unlike every real entry).
    coreference_sets = defaultdict(list)
    for result in results:
        # NOTE(review): the SPARQL query printed earlier in this notebook
        # selects ?n1 ?src ?locations ?dct but no ?participants -- confirm that
        # the results really carry a 'participants' binding.
        new_result = {
            'dct': result['dct']['value'],
            'part': result['participants']['value'],
            'id': result['src']['value'],
            'loc': result['locations']['value'],
        }
        # Chains this result has already been added to; prevents the same id
        # from being appended once per matching member of a chain.
        joined_chains = set()
        for past_result in coreference_data:
            if utils.coreferential(new_result['dct'], past_result['dct'],
                                   new_result['part'].split('|'), past_result['part'].split('|'),
                                   new_result['loc'].split('|'), past_result['loc'].split('|'),
                                   days_limit, participant_overlap, location_overlap):
                # As before, the last matching chain wins as the result's own.
                new_result['chain'] = past_result['chain']
                chain_key = str(past_result['chain'])
                if chain_key not in joined_chains:
                    joined_chains.add(chain_key)
                    coreference_sets[chain_key].append(new_result['id'])
        if 'chain' not in new_result:
            # No match: this result starts a new chain.
            new_result['chain'] = chain
            coreference_sets[str(chain)] = [new_result['id']]
            chain += 1
        coreference_data.append(new_result)
    return coreference_sets

In [None]:
# Build the hypothesized coreference chains from the query results.
coreference_sets=hypothesize_coreference(results)

## Please point **path_signalmedia_json** to where the JSONL file is stored
* Uncomment the code below if you want to use it (it is a bit slow)

In [None]:
# Path to the JSONL file; adjust it to where your local copy is stored.
path_signalmedia_json = 'signalmedia-1m-2.jsonl'

# Ids stored under chain '3'.
my_ids = coreference_sets['3']

results_in_json = utils.obtain_specific_identifiers(
    path_signalmedia_json,
    my_ids,
)

In [None]:
# Display what utils.obtain_specific_identifiers returned for the selected ids.
print(results_in_json)