In [None]:
%matplotlib inline

In [1]:
import json

# read the processed articles; the `with` block closes the file handle
# as soon as parsing is done instead of leaving it open
with open('data/articles_processed.json', 'r') as f:
    articles = json.load(f)
len(articles)

4752

In [None]:
# ==TAKES A LONG TIME!===========================================
# build one text blob per article: its title, a newline, then its text
docs = [a['title'] + '\n' + a['text'] for a in articles]

# we need to tell the computer how to break up a text
# into meaningful pieces ("tokens")
# we'll use spacy, a natural language processing library
# that will take care of the details for us
import spacy
# part-of-speech constants; extract_keywords compares each token's `pos` against them
from spacy.parts_of_speech import VERB, NUM, NOUN
# load the spaCy pipeline registered under the name 'en'
# NOTE(review): newer spaCy releases may require a full model package name
# instead of the 'en' shortcut -- confirm against the installed version
nlp = spacy.load('en')

# now we'll define the function that breaks up text into keywords
# in particular, the only parts we care about are verbs, numbers,
# nouns, and 'named entities' (people, places, organizations, etc)
def extract_keywords(doc):
    """Reduce a piece of text to a list of lowercased keywords.

    The text is run through the module-level spaCy pipeline ``nlp``.
    The result holds the text of every named entity first, followed by
    the text of every token that is not a stop word and is tagged as a
    verb, number or noun. Everything is lowercased.
    """
    parsed = nlp(doc)
    wanted_pos = [VERB, NUM, NOUN]

    # named entities come first, in document order
    keywords = [entity.text.lower() for entity in parsed.ents]

    # then the individual tokens we care about
    for token in parsed:
        if token.is_stop:
            continue
        if token.pos in wanted_pos:
            keywords.append(token.text.lower())
    return keywords

# replace each article's raw text with its keywords, glued together by '||'
docs = ['||'.join(keywords) for keywords in map(extract_keywords, docs)]

In [4]:
# but this takes a while so we'll load precomputed ones
# (the `with` block makes sure the file handle is closed after reading)
with open('data/docs_processed.json', 'r') as f:
    docs = json.load(f)

In [5]:
docs[0]

"scaredy-cat’s investigation into||halloween||chicken||frank farley||temple university||the american psychological association||farley||mount everest||the 1980s||david zald||vanderbilt university||one||zald||margee kerr||’s||kerr||the science times newsletter||nyt newsletters morning briefing||news||opinion today thought-||5||daily||the new york times's||new york times||josh randall||kristjan thor||blackout||thor||blackout||randall||kerr||kerr||this weekend||’s||people||enjoy||fear||photo||means||workers||planted||surprise||spiders||office||invited||haunted||hayride||neighbor||’s||yard||cemetery||rigged||motion||detectors||pop||zombies||livered||start||dreaded||time||year||houses||ghost||tours||horror||film||fests||thing||people||love||having||daylights||scared||escapes||decided||try||understand||friends||lookout||thrills||time||year||turns||reasons||people||person||threshold||experiences||provoke||fear||recipe||blends||nature||nurture||ingredients||vary||person||person||said||psycholo

In [6]:
def tokenize(doc):
    """Split a '||'-delimited keyword string into its list of keywords."""
    separator = '||'
    keywords = doc.split(separator)
    return keywords

In [None]:
# ==TAKES A LONG TIME!===========================================
# we need to convert this text into some numerical representation
# the computer can work with. We'll use "TF-IDF bag-of-words".
# this process of turning text->numbers is called "vectorization"
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(
    tokenizer=tokenize,     # split documents with our own '||' tokenizer
    stop_words='english',   # drop very common English words (e.g. the, a, an)
    lowercase=False,        # leave the case of the input untouched
    strip_accents='ascii',  # strip accents from characters
    use_idf=True,           # weight term counts by inverse document frequency
    smooth_idf=True,        # smoothed IDF (avoids division by 0)
    min_df=1,    # drop terms whose DF is below this (int=count, float=fraction of docs)
    max_df=1.0,  # drop terms whose DF is above this (int=count, float=fraction of docs)
)

# fit the vectorizer on our articles and turn them into vectors
# (this can take a little while)
vecs = vectorizer.fit_transform(docs)

In [7]:
# instead of running that, we're just going to load a pre-trained vectorizer
# (it would take too long & take up too much memory if we all did it)
try:
    # original import path; works on scikit-learn versions that still
    # bundle joblib under sklearn.externals
    from sklearn.externals import joblib
except ImportError:
    # sklearn.externals.joblib was deprecated and later removed from
    # scikit-learn; the standalone joblib package replaces it
    import joblib

# NOTE: joblib.load unpickles the file, and unpickling can run arbitrary
# code -- only load pickles that come from a source you trust.
# presumably the pickle refers to our `tokenize` function by name, so that
# function has to be defined before this cell runs -- TODO confirm
vectorizer = joblib.load('data/vectorizer.pkl')

In [8]:
# and we're going to load precomputed vectors
from scipy import io
# read the Matrix Market file and convert the result to CSR format
# presumably one row per article, in the same order as `articles` -- TODO confirm
vecs = io.mmread('data/vecs.mtx').tocsr()

In [10]:
# the result is that each article is represented by a list of numbers
# (only the first row is converted to a dense array; calling .toarray() on
# the whole sparse matrix would allocate a dense articles x vocabulary array
# just to look at one row)
vecs[0].toarray()[0]

array([ 0.,  0.,  0., ...,  0.,  0.,  0.])

In [11]:
# so now we can apply an algorithm to ~cluster~ these articles
# i.e. group them so that articles talking about the same/similar things are
# in the same group
# we'll use a method called DBSCAN
from sklearn.cluster import DBSCAN

model = DBSCAN(
    metric='cosine',    # "distance" here means cosine distance between vectors
    algorithm='brute',  # required for cosine metric
    eps=0.3,            # two points within this distance share a neighborhood
    min_samples=2,      # number of points needed to define a neighborhood
    n_jobs=-1           # spread the work across all cores
)

# we can tweak the `eps` value to be more lenient (higher) or stricter (lower)
# in how it groups articles

In [12]:
# fit the DBSCAN model on the article vectors and collect its labels
clusters = model.fit_predict(vecs)

# each article is given a label:
# if it's above -1, the label is a cluster id
# if it is -1, then it's "noise"
clusters

array([-1, -1, -1, ..., -1, -1, -1])

In [13]:
# now we'll take our articles and group them according to these labels
# skip those labeled "noise" (-1)
from collections import defaultdict

# bucket every (article, keyword string) pair under its cluster label,
# leaving out anything labelled as noise (-1)
events = defaultdict(list)
for idx, label in enumerate(clusters):
    if label != -1:
        events[str(label)].append((articles[idx], docs[idx]))

In [14]:
# print out the clusters: a header line with the cluster id,
# then the title of every article in that cluster
# (the loop variable is `event_id` rather than `id`, so the builtin id()
# is not shadowed for the rest of the notebook)
for event_id, members in events.items():
    print('-{}-------------'.format(event_id))
    for article, _ in members:
        print('\t{}'.format(article['title']))

-123-------------
	Californians Legalize Marijuana in Vote That Could Echo Nationally
	Marijuana wins big on election night
-36-------------
	Santa Monica considers highly restrictive growth limits. Is L.A. next?
	Santa Monica could pass highly restrictive growth limits. Is L.A. next?
-14-------------
	Campaign 2016 updates: Can the warrant for Clinton's emails upend the race?
	Former Atty. Gen. Eric Holder: Comey must 'correct his mistake'
-9-------------
	Campaign 2016 updates: FBI gets warrant to search newfound emails
	New emails? So far, voters seem mostly unmoved
-137-------------
	California voters reject measure to repeal death penalty, approve plan to expedite it
	Measure to speed up the death penalty leads, and the bid to end executions fails
-178-------------
	Meet the potential Trump Cabinet picks most likely to make liberals squirm
	These possible Trump Cabinet picks could make liberals squirm
-116-------------
	Trump campaign files lawsuit in Nevada aimed at early voting


In [15]:
# so we have "event" clusters
# now we want to cluster _these clusters_ into "story" clusters!
# we basically follow a similar process.
# we tokenize the whole event (we just mash its articles together)

# first we extract the article texts
# (the isinstance guard keeps this cell re-runnable: once `events` has been
# turned into a list it no longer has a .values() method)
if isinstance(events, dict):
    events = list(events.values())
# one document per event: the keyword strings of its articles joined by '||'
docs = ['||'.join(kws for _, kws in e) for e in events]

In [16]:
# we vectorize again
# (only .transform is called here, so the vectorizer is not re-fitted;
# note that this rebinds `vecs` to the event-level vectors)
vecs = vectorizer.transform(docs)

In [17]:
# we create another DBSCAN model
story_model = DBSCAN(
    metric='cosine',    # same distance measure as the article-level model
    algorithm='brute',
    eps=0.6,            # a larger eps than the article-level model's 0.3
    min_samples=2,
    n_jobs=-1
)

In [18]:
# and we cluster again!
# (this rebinds `clusters`: the labels now line up with the `events` list,
# one label per event, with -1 again meaning "noise")
clusters = story_model.fit_predict(vecs)
clusters

array([ 0, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        2, -1,  3, -1, -1,  4, -1, -1,  5, -1,  2, -1, -1, -1,  6,  4, -1,
       -1,  7, -1, -1, -1, -1, -1, -1,  5, -1, -1, -1,  8, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1,  1,  9, -1, 10, -1, -1, -1, -1, -1,
        6, -1, -1, -1,  9, -1, 11, -1, -1, -1, -1,  9, -1, -1, -1, -1, 10,
       -1, -1, -1, 10, -1, -1, -1, -1,  0, -1, -1, -1, -1, -1, -1, -1, 12,
       -1, -1, -1,  9, -1, -1, 13, -1, 10, -1, -1, -1, -1, -1,  1, -1, -1,
       -1, -1, 14, -1, -1, -1, -1, -1,  7, -1, 12, -1, -1, 10, 14, -1, -1,
       -1, -1, -1, -1, -1,  3, -1, -1, -1, -1, -1, 14, 10, -1, -1, -1, -1,
        8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 10, 14, -1, 13, -1,
        1, -1, -1, -1, -1,  9, -1, -1, -1, -1, -1, -1,  0, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1, 11])

In [19]:
# bucket the events by their story label, dropping noise (-1) once more
stories = defaultdict(list)
for idx, label in enumerate(clusters):
    if label != -1:
        stories[str(label)].append(events[idx])

In [21]:
# now we can print out the stories
from datetime import datetime

def ev_created_at(e):
    """Return the earliest ``created_at`` value among an event's articles.

    ``e`` is an iterable of ``(article, keywords)`` pairs. We simply treat
    the publication time of the earliest article as the moment the event
    happened. That isn't necessarily true, but it is good enough here.
    """
    timestamps = (article['created_at'] for article, _ in e)
    return min(timestamps)

# walk through every story and list its events in chronological order
# (the loop variable is `story_id` rather than `id`, so the builtin id()
# is not shadowed for the rest of the notebook)
for story_id, story_events in stories.items():
    print('-{}-------------'.format(story_id))
    # ev_created_at already takes an event, so it can be the sort key directly
    for e in sorted(story_events, key=ev_created_at):
        created_at = ev_created_at(e)
        # header line: event time, title of the event's first article, size
        # note: fromtimestamp() renders the time in the machine's local timezone
        print('\t{} : {} ({} articles)'.format(
            datetime.fromtimestamp(created_at).strftime('%c'),
            e[0][0]['title'],
            len(e)
        ))
        # then one line per article in the event
        for a, _ in e:
            print('\t\t{} ({})'.format(
                a['title'],
                datetime.fromtimestamp(a['created_at']).strftime('%c'),
            ))
        print('\n')
    print('\n')

-6-------------
	Mon Nov  7 00:00:00 2016 : 7 shot in attacks on West, South sides (3 articles)
		7 shot in attacks on West, South sides (Mon Nov  7 00:00:00 2016)
		2 killed, 12 others wounded in Chicago shootings (Mon Nov  7 00:00:00 2016)
		3 killed, 15 wounded in Chicago shootings (Mon Nov  7 00:00:00 2016)


	Wed Nov  9 00:00:00 2016 : 3 teen boys among 10 wounded in city shootings (2 articles)
		3 teen boys among 10 wounded in city shootings (Wed Nov  9 00:00:00 2016)
		14 shot, including 4 teens, over 14 hours in Chicago (Wed Nov  9 00:00:00 2016)




-12-------------
	Fri Oct 28 00:00:00 2016 : Obama to make his first appearance on HBO's 'Real Time With Bill Maher' (3 articles)
		Obama to make his first appearance on HBO's 'Real Time With Bill Maher' (Sat Oct 29 00:00:00 2016)
		World Series cuts into 'Inferno' audience as 'Madea' reclaims top spot at weekend box office (Fri Oct 28 00:00:00 2016)
		'Inferno' burns cold at the box office (Sat Oct 29 00:00:00 2016)


	Tue Nov  1 

In [22]:
# write the story clusters to disk as JSON
with open('data.json', 'w') as out_file:
    json.dump(stories, out_file)