## Obtain seed set of document using Elasticsearch

@Filip: can you check the param values below?

In [1]:
import utils

In [2]:
import urllib.parse
import urllib.request
import requests

In [3]:
base = 'http://news.fii800.lod.labs.vu.nl/news?'
args = {
    'q' : 'earthquake', # the query terms to look for
    'in' : 'content', # look in title, content or both (supported values are: "title", "content", "both")
    'from' : '2015-09-01T00:00:00Z', # from->starting datetime point
    'to' : '2015-10-01T00:00:00Z', # ending datetime point
    'source' : '', # source -> which source
    'media' : 'News', # media -> media type ("Blog" or "News")
    'size' : 1000, # size -> amount of results to return
    'offset' : 0,  # offset ->skip the first offset results (useful for pagination)
    'match' : 'conjunct'
}

In [4]:
all_results = utils.get_all_hits(base, args)
news_items_ids = set(all_results.keys())

In [5]:
#print(news_items_ids)

In [6]:
print(len(news_items_ids))

1914


In [7]:
ids=str(news_items_ids)[1:-1]

## SPARQLing earthquakes

In [76]:
def to_dbpedia(locs):
    uris=set()
    for l in locs:
        uris.add("'http://dbpedia.org/resource/%s'" % l)
    return ",".join(uris)

In [77]:
sparql_endpoint="http://sparql.fii800.lod.labs.vu.nl/sparql"
graph_uri="http://longtailcorpus.org"
boring_locations=["United_States", "United_Kingdom", "Africa", "Europe"]
boring_URIs=to_dbpedia(boring_locations)
limit_secs=60*60*24 # seconds
limit_days=1 

In [78]:
def get_sparql_top():
    return """PREFIX dct: <http://purl.org/dc/terms/>
PREFIX dt: <http://dbpedia.org/datatype/>
PREFIX dc: <http://purl.org/dc/elements/1.1/>
PREFIX longtail: <http://longtailcorpus.org/>
SELECT ?n1 ?src ?location ?dct WHERE {
  GRAPH <http://longtailcorpus.org> {
?n1 a longtail:NewsItem ;
    dct:source ?src .
    FILTER (?src IN (""" + ids + """)) .
    ?n1 dct:created ?dct ;
    dct:publisher ?pub .
    ?pub dct:spatial ?location .
    """

In [79]:
def get_sparql_bottom():
    return " } }"

In [80]:
def get_sparql_middle(factor):
    if factor=='location':
        return "FILTER (str(?location) NOT IN (%s) ) . " % boring_URIs

In [81]:
from SPARQLWrapper import SPARQLWrapper, JSON

def get_sparql_results(query):

    sparql = SPARQLWrapper(sparql_endpoint)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    sparql.method = 'POST'
    results = sparql.query().convert()
    return results["results"]["bindings"]
#    for result in results["results"]["bindings"]:
#        print(result["n1"]["value"])


In [82]:
def get_news_from_fun_locations():
    query = get_sparql_top() + get_sparql_middle('location') + get_sparql_bottom()
#    print(query)
#    import sys
#    sys.exit(0)
    res=get_sparql_results(query)
    return res
results=get_news_from_fun_locations()

In [83]:
#print(results)


In [84]:
print(len(results))

126


In [85]:
import datetime
def time_diff(t1, t2):
    ta=datetime.datetime.strptime(t1, '%Y-%m-%dT%H:%M:%S')
    tb=datetime.datetime.strptime(t2, '%Y-%m-%dT%H:%M:%S')
    print(abs((ta-tb).days), ta, tb)
    return abs((ta-tb).days)

In [86]:
from collections import defaultdict
def hypothesize_coreference(results):
    c=0
    coreference_sets = defaultdict(list)
    for r1 in results:
        for r2 in results:
            src1=r1['src']['value']
            src2=r2['src']['value']
            if r1!=r2 and r1['location']['value']==r2['location']['value'] and time_diff(r1['dct']['value'], r2['dct']['value'])<=limit_days:
                key=r1['location']['value']
                found=False
                for news_set in coreference_sets[key]:
                    if src1 in news_set or src2 in news_set:
                        news_set.add(src1)
                        news_set.add(src2)
                        found=True
                if not found:
                    coreference_sets[key].append(set([src1,src2]))
    print(coreference_sets)
    return coreference_sets

In [87]:
coreference_sets=hypothesize_coreference(results)

22 2015-09-23 19:58:38 2015-09-01 00:03:27
3 2015-09-23 19:58:38 2015-09-20 05:43:23
19 2015-09-23 19:58:38 2015-09-03 21:52:06
7 2015-09-23 19:58:38 2015-09-16 01:45:55
3 2015-09-23 19:58:38 2015-09-20 14:31:47
10 2015-09-23 19:58:38 2015-09-13 05:39:52
6 2015-09-23 19:58:38 2015-09-17 04:54:31
6 2015-09-23 19:58:38 2015-09-17 02:43:40
2 2015-09-23 19:58:38 2015-09-25 06:40:37
6 2015-09-23 19:58:38 2015-09-17 02:33:23
11 2015-09-23 19:58:38 2015-09-11 22:44:16
20 2015-09-23 19:58:38 2015-09-03 01:23:49
3 2015-09-23 19:58:38 2015-09-26 03:36:45
6 2015-09-23 19:58:38 2015-09-17 14:24:26
2 2015-09-23 19:58:38 2015-09-24 23:20:40
5 2015-09-23 19:58:38 2015-09-18 00:00:57
5 2015-09-23 19:58:38 2015-09-18 02:29:50
17 2015-09-23 19:58:38 2015-09-06 04:16:59
10 2015-09-23 19:58:38 2015-09-13 05:46:21
11 2015-09-23 19:58:38 2015-09-11 22:44:16
23 2015-09-01 00:03:27 2015-09-23 19:58:38
20 2015-09-01 00:03:27 2015-09-20 05:43:23
3 2015-09-01 00:03:27 2015-09-03 21:52:06
16 2015-09-01 00:03:27 2

In [88]:
loc='http://dbpedia.org/resource/Queensland'
print(coreference_sets[loc])

[{'bdc0cec7-7392-4fb8-9775-731aef47a1b0', '8a6426f6-57a4-4f16-aecd-525d57e57a7f'}]


## Example defaultdict

In [None]:
coreference_sets = defaultdict(set)

In [None]:
key = ('2016-11', 'Baton_Rouge')
value = 'news_item_identifier'

In [None]:
coreference_sets[key].add(value)

In [None]:
coreference_sets