## Obtain seed set of document using Elasticsearch

In [1]:
import utils

In [2]:
import urllib.parse
import urllib.request
import requests

In [3]:
base = 'http://news.fii800.lod.labs.vu.nl/news?'
args = {
    'q' : 'earthquake', # the query terms to look for
    'in' : 'content', # look in title, content or both (supported values are: "title", "content", "both")
    'from' : '2015-09-01T00:00:00Z', # from->starting datetime point
    'to' : '2015-10-01T00:00:00Z', # ending datetime point
    'source' : '', # source -> which source
    'media' : 'News', # media -> media type ("Blog" or "News")
    'size' : 1000, # size -> amount of results to return
    'offset' : 0,  # offset ->skip the first offset results (useful for pagination)
    'match' : 'conjunct'
}

In [4]:
all_results = utils.get_all_hits(base, args)
news_items_ids = set(all_results.keys())

## Please point **path_signalmedia_json** to where the jsonl is stored
* uncomment if you want to use it (is a bit slow)

In [5]:
#path_signalmedia_json = 'SignalMedia/signalmedia-1m.jsonl'
#results_in_json = utils.obtain_specific_identifiers(path_signalmedia_json, news_items_ids)

In [6]:
print(len(news_items_ids))

1913


In [7]:
ids=str(news_items_ids)[1:-1]

## SPARQLing earthquakes

In [8]:
def to_dbpedia(locs):
    uris=set()
    for l in locs:
        uris.add("'http://dbpedia.org/resource/%s'" % l)
    return ",".join(uris)

In [9]:
sparql_endpoint="http://sparql.fii800.lod.labs.vu.nl/sparql"
graph_uri="http://longtailcorpus.org"
boring_locations=["United_States", "United_Kingdom", "Africa", "Europe"]
boring_URIs=to_dbpedia(boring_locations)
limit_secs=60*60*24 # seconds
limit_days=1 

In [10]:
def get_sparql_top():
    return """PREFIX dct: <http://purl.org/dc/terms/>
PREFIX dt: <http://dbpedia.org/datatype/>
PREFIX dc: <http://purl.org/dc/elements/1.1/>
PREFIX longtail: <http://longtailcorpus.org/>
SELECT ?n1 ?src ?location ?dct WHERE {
  GRAPH <http://longtailcorpus.org> {
?n1 a longtail:NewsItem ;
    dct:source ?src .
    FILTER (?src IN (""" + ids + """)) .
    ?n1 dct:created ?dct ;
    dct:publisher ?pub .
    ?pub dct:spatial ?location .
    """

In [11]:
def get_sparql_bottom():
    return " } }"

In [12]:
def get_sparql_middle(factor):
    if factor=='location':
        return "FILTER (str(?location) NOT IN (%s) ) . " % boring_URIs

In [13]:
from SPARQLWrapper import SPARQLWrapper, JSON

def get_sparql_results(query):

    sparql = SPARQLWrapper(sparql_endpoint)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    sparql.method = 'POST'
    results = sparql.query().convert()
    return results["results"]["bindings"]
#    for result in results["results"]["bindings"]:
#        print(result["n1"]["value"])


In [14]:
def get_news_from_fun_locations():
    query = get_sparql_top() + get_sparql_middle('location') + get_sparql_bottom()
    print(query)
#    print(query)
#    import sys
#    sys.exit(0)
    res=get_sparql_results(query)
    return res
results=get_news_from_fun_locations()

PREFIX dct: <http://purl.org/dc/terms/>
PREFIX dt: <http://dbpedia.org/datatype/>
PREFIX dc: <http://purl.org/dc/elements/1.1/>
PREFIX longtail: <http://longtailcorpus.org/>
SELECT ?n1 ?src ?location ?dct WHERE {
  GRAPH <http://longtailcorpus.org> {
?n1 a longtail:NewsItem ;
    dct:source ?src .
    FILTER (?src IN ('da3d4859-0e9b-4ed7-b132-39a1302365cc', '0e5c2c57-e054-410c-8a7f-bbc13918499c', 'ccd90eac-6c2f-49ef-90a7-72c0cf8b98bb', 'ea531313-8e23-4523-b5e0-281eb66506da', 'e6f69446-77f0-4c73-9999-cce2aae6e5a4', 'f48c91c2-3eb1-4a3b-b8b9-bb9dbee85965', '31c854b5-89ea-4dd6-9ddc-d22b6021f689', '5785c538-9d34-478e-b611-b84b3798f7f4', '1fff56c6-8769-4ab3-89b0-7a450c11da93', 'a9a3a9a9-df4c-429c-9817-bc5c80a04b43', '873eb0b1-63a5-441b-bf8f-f0e9a4de183c', '4fc6f7fe-ffcd-44ad-9b23-73e732911061', '44b2a31e-ef23-4bd7-b2d0-a003c7d489d1', 'faae3d87-dd56-4866-b291-c8fac947f924', '82d81fdf-ede6-4eed-b0bd-1054438df9f9', 'fcceebb7-198a-4df6-af93-20c29a3b5be6', 'a31ec972-36be-45d6-85eb-83d922c1ad40', 

In [15]:
#print(results)


In [16]:
print(len(results))

126


In [17]:
import datetime
def time_diff(t1, t2):
    ta=datetime.datetime.strptime(t1, '%Y-%m-%dT%H:%M:%S')
    tb=datetime.datetime.strptime(t2, '%Y-%m-%dT%H:%M:%S')
    print(abs((ta-tb).days), ta, tb)
    return abs((ta-tb).days)

In [18]:
from collections import defaultdict
def hypothesize_coreference(results):
    c=0
    coreference_sets = defaultdict(list)
    for r1 in results:
        for r2 in results:
            src1=r1['src']['value']
            src2=r2['src']['value']
            all([r1!=r2, # not the same
                 r1['location']['value']==r2['location']['value'], # same location
                 time_diff(r1['dct']['value'], r2['dct']['value'])<=limit_days]): # less than a day
                key=r1['location']['value']
                found=False
                for news_set in coreference_sets[key]:
                    if any([src1 in news_set,
                            src2 in news_set]):
                        news_set.update([src1, src2])
                        found=True
                if not found:
                    coreference_sets[key].append(set([src1,src2]))
    print(coreference_sets)
    return coreference_sets

SyntaxError: invalid syntax (<ipython-input-18-dde1fdb20af0>, line 11)

In [None]:
coreference_sets=hypothesize_coreference(results)

In [None]:
loc='http://dbpedia.org/resource/Queensland'
print(coreference_sets[loc])

## Example defaultdict

In [None]:
coreference_sets = defaultdict(set)

In [None]:
key = ('2016-11', 'Baton_Rouge')
value = 'news_item_identifier'

In [None]:
coreference_sets[key].add(value)

In [None]:
coreference_sets