In [53]:
import utils
import operator
# topics: http://www.newsreader-project.eu/files/2013/01/NWR-2014-1.pdf

In [54]:
# Base URL of the news search endpoint (note the trailing '?').
base = "http://news.fii800.lod.labs.vu.nl/news?"

# Query parameters for the search; one entry per parameter.
args = {
    # the query terms to look for
    "q": "fired worker",
    # where to look: "title", "content" or "both"
    "in": "content",
    # starting datetime point
    "from": "2015-09-01T00:00:00Z",
    # ending datetime point
    "to": "2015-09-30T00:00:00Z",
    # which source (left empty here)
    "source": "",
    # media type ("Blog" or "News")
    "media": "News",
    # amount of results to return
    "size": 1000,
    # skip the first `offset` results (useful for pagination)
    "offset": 0,
    # NOTE(review): presumably requires all query terms to match -- confirm against the service docs
    "match": "conjunct",
}

In [55]:
# Fetch the hits for the query through the project helper. The result is used below as a
# mapping from article key to hit (it is iterated with .items() and indexed by ['_source']).
all_results = utils.get_all_hits(base, args)
# Last expression of the cell, so the number of hits is displayed.
len(all_results)

245

In [11]:
# ignore this
# Scratch call on a hard-coded one-result query URL
# (presumably returns the total hit count for that query -- TODO confirm in utils).
utils.extract_size('http://news.fii800.lod.labs.vu.nl/news?offset=0&to=2015-09-27T00:00:00Z&media=News&size=1&from=2015-09-20T00:00:00Z&in=content&q=crash')

2473

### Which articles are too long/short or have the exact same content

In [12]:
# Collect the keys of articles that should be dropped:
#   * content shorter than min_len or longer than max_len characters, or
#   * content identical to that of another article -- of each group of identical
#     articles only the one with the largest key is kept.
to_remove=set()
min_len=300
max_len=4000

# content -> largest key seen so far among the articles with exactly that content.
# One dict lookup per article replaces comparing every pair of articles.
keeper_by_content={}
for key, hit in all_results.items():
    content=hit['_source']['content']
    if len(content)>max_len or len(content)<min_len:# if too long or short
        to_remove.add(key)
        continue
    if content not in keeper_by_content:
        keeper_by_content[content]=key
        continue
    # Duplicate content: drop the smaller key, keep the larger one as the survivor.
    previous=keeper_by_content[content]
    to_remove.add(min(previous, key))
    keeper_by_content[content]=max(previous, key)

In [13]:
# Number of article keys currently collected in to_remove.
len(to_remove)

170

In [14]:
# Drop the flagged articles from all_results (in place).
# pop(..., None) instead of del: re-running this cell after the keys are already
# gone is a no-op rather than a KeyError.
for k in to_remove:
    all_results.pop(k, None)

In [16]:
# Number of articles left in all_results at this point.
print(len(all_results))

75


### Which articles have similarity higher than a threshold -> create chains

In [17]:
import spacy
from collections import defaultdict

# Load the spaCy model registered under the name 'en'; it is reused for every article.
nlp = spacy.load('en')

# article key -> set of entity strings (ent.text) found in its title + content.
# Articles in which no entity is found get no entry at all.
entities = defaultdict(set)
for key, value in all_results.items():
    data = value['_source']
    doc = nlp('\n'.join([data['title'], data['content']]))
    mentions = {ent.text for ent in doc.ents}
    if mentions:
        entities[key] |= mentions

In [43]:
# Group articles whose entity sets overlap strongly into "chains".
# Two articles are linked when they share more than two thirds of the entities of
# the article that has fewer entities.
c=0  # number of article pairs compared
o=0  # number of pairs whose overlap exceeded the threshold
l=0  # never incremented in this cell; kept only because it is printed below
coref=[]  # list of chains; each chain is a set of article keys
for e1, ents1 in entities.items():
    for e2, ents2 in entities.items():
        if not e1<e2: # this is a trick to avoid checking the same pair twice
            continue
        c+=1
        overlap=ents1 & ents2
        if len(overlap)<=min(len(ents1),len(ents2))*2/3:
            continue
        o+=1
        # Add the pair to every existing chain that already holds one of the two
        # articles. Chains that end up sharing members are left overlapping here.
        found=False
        for chain in coref:
            if e1 in chain or e2 in chain:
                chain.update((e1, e2))
                found=True
        if not found:
            coref.append({e1, e2})
print(l,o,c)

0 48 2775


### Merge chains

In [39]:
# merge chains
for index, chain in enumerate(coref):
    for index2, chain2 in enumerate(coref):
        if index!=index2 and chain & chain2:
            chain |=chain2
        if index!=index2 and chain==chain2:
            chain2.clear()
            print("YO")
#            print(chain, chain2)

YO


### Inspect chains

In [42]:
# Print the size of every chain, one per line, in list order.
# NOTE(review): the bare len(coref) below is not the last expression of the cell,
# so its value is not displayed.
len(coref)
for chain in coref:
    print(len(chain))
# Manual annotation of the chains (position, topic, size):
# 1) Jeremy Clarkson's comeback (2)
# 2) Taco Bell firing an employee (2)
# 3) Taylor Swift's bottom (10)
# 4) Students angry over tax increase (2)
# 5) Worker killed in South Africa (2)
# 6) Mixed reports and promotions (2)
# 7) Bombarding in Aleppo (2)
# 8) Taylor Swift's bottom AGAIN (0)
# 9) Someone won in soccer (fired used here in an entirely different sense) (2)
# 10) Firing a weapon on someone (2)
# 11) Sheriff's deputy shot and killed (4)
2
2
10
2
2
2
2
0
2
2
4


In [30]:
# Print the content of every article in one chain for manual inspection.
# The index is hard-coded: 10 is the last of the eleven chains listed above.
chain=coref[10]
for e in chain:
    data=all_results[e]['_source']['content']
    # print() puts a space between its two arguments, so the text starts with " ".
    print("DOCUMENT\n",data)
    print()

DOCUMENT
 A gunman fatally shot a sheriff's deputy outside a lawyer's office Tuesday and then barricaded himself inside a motel, where he exchanged gunfire with other deputies and was killed, authorities said. 
 
Joel Dixon Smith, 33, was being served a domestic violence restraining order at the lawyer's office and was supposed to be turning over his guns to Okaloosa County Deputy Bill Myers, 64, when he pulled out a concealed weapon and shot the deputy multiple times in the back of the head and back, authorities said. 
 
Sheriff Larry Ashley called Smith "a sick little coward." 
 
Smith, a postal worker, fled in his vehicle, heading to a Comfort Suites about 10 miles away in Niceville, where he had previously booked a room. He barricaded himself inside and deputies fired tear gas into his room. He charged out firing and was shot by deputies, Ashley said. 
 
No one else was hurt. 
 
Smith had been arrested in 2008 for domestic battery, Ashley said, but had no other details. The restrai

### Store the data in Redis

In [52]:
import json  # needed for json.dumps below
import uuid
import redis

def make_redis_key():
    """Return a new key: the prefix "incinitstr:BU" followed by a random UUID4 hex string."""
    return "incinitstr:BU%s" % uuid.uuid4().hex

def make_article(hit):
    """Build the stored article record (body, title, dct) from one hit's '_source' fields."""
    source = hit['_source']
    return {
        "body": source['content'],
        "title": source['title'],
        "dct": source['published'],
    }

def store_incident(client, articles):
    """Store one incident holding `articles` as JSON under a fresh key and return that key.

    The date and location estimates are written as empty strings.
    """
    incident_data = {"estimated_incident_date": "", "estimated_location": "", "articles": articles}
    rkey = make_redis_key()
    client.set(rkey, json.dumps(incident_data))
    return rkey

pool = redis.ConnectionPool(host='localhost', port=6379, db=0)
r = redis.Redis(connection_pool=pool)

# Keys of all articles that have been written so far.
stored = set()

# chained: one incident per chain
for chain in coref:
    if not chain:
        # Chains emptied by the merge step stay in coref as empty sets;
        # skip them so no incident without articles is written.
        continue
    store_incident(r, [make_article(all_results[article]) for article in chain])
    stored.update(chain)
print(len(stored))

# non-chained: one single-article incident for every remaining article
for article, val in all_results.items():
    if article in stored:
        continue
    store_incident(r, [make_article(val)])
    stored.add(article)
print(len(stored))

2015-09-18T16:29:00Z Jeremy Clarkson back on BBC show
2015-09-18T16:29:00Z Jeremy Clarkson back on BBC show
2015-09-24T18:28:54Z Pic shows Taco Bell worker with hands down pants
2015-09-24T21:18:20Z Pic shows Taco Bell worker with hands down pants
2015-09-14T18:25:13Z Colorado DJ sues Taylor Swift
2015-09-13T05:31:11Z Colorado DJ sues Taylor Swift over accusation
2015-09-13T04:03:42Z Colorado DJ sues Taylor Swift over accusation
2015-09-13T02:26:29Z Colorado DJ fired over Taylor Swift accusation sues singer
2015-09-13T08:42:50Z Former radio host files lawsuit against Taylor Swift
2015-09-13T03:55:49Z Fired Colorado DJ sues Taylor Swift over accusation
2015-09-13T18:46:03Z Colorado DJ fired over Taylor Swift accusation sues singer
2015-09-13T16:43:19Z Taylor Swift Sued by Fired DJ Over Groping Claim
2015-09-13T03:55:49Z Fired Colorado DJ sues Taylor Swift over accusation
2015-09-10T10:08:12Z Thousands of Bangladeshi students protest against tax on university fees
2015-09-10T14:08:26Z Th