In [None]:
%matplotlib inline

In [None]:
import json

# read the pre-processed articles; the `with` block makes sure the file
# handle is closed as soon as the JSON has been parsed
with open('data/articles_processed.json', 'r') as f:
    articles = json.load(f)

# how many articles did we load?
len(articles)

In [None]:
# to make things a bit quicker, let's just work with a subset of articles
# NOTE: this rebinds `articles` to a slice of itself, so re-running this cell
# without reloading the data slices the already-sliced list again
articles = articles[1000:10000]

In [None]:
# build one document per article: its title, a newline, then its text
docs = []
for article in articles:
    docs.append('\n'.join((article['title'], article['text'])))

In [None]:
# we need to tell the computer how to break up a text
# into meaningful pieces ("tokens")
# we'll use spacy, a natural language processing library
# that will take care of the details for us
import spacy
# part-of-speech constants that `tokenize` uses to decide which tokens to keep
from spacy.parts_of_speech import VERB, NUM, NOUN
# load the English pipeline once; `tokenize` calls `nlp` on every document
# NOTE(review): the 'en' shortcut name appears to have been removed in spaCy v3
# (full package names such as 'en_core_web_sm' are expected there) -- confirm
# against the installed spaCy version
nlp = spacy.load('en')

In [None]:
def tokenize(doc):
    """Break a text up into lowercase tokens for the vectorizer.

    spacy is run over the text to identify named entities (people,
    places, organizations, etc) and to tag parts-of-speech. The returned
    list holds the text of every named entity, followed by the text of
    every token that is not a stop word and is tagged as a verb, a number
    or a noun. Everything is lowercased.
    """
    parsed = nlp(doc)
    kept_pos = [VERB, NUM, NOUN]

    tokens = []
    # named entities come first ...
    for ent in parsed.ents:
        tokens.append(ent.text.lower())
    # ... followed by the individual words we care about
    for tok in parsed:
        if tok.is_stop or tok.pos not in kept_pos:
            continue
        tokens.append(tok.text.lower())
    return tokens

In [None]:
# text has to become numbers before the computer can work with it.
# this text->numbers step is called "vectorization"; the representation
# we use here is "TF-IDF bag-of-words".
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(
    # --- how the text is cleaned and split ---
    tokenizer=tokenize,     # use our tokenization function
    lowercase=False,        # don't lowercase up front, this would mess up the NER step
    strip_accents='ascii',  # remove accents from characters
    stop_words='english',   # remove very common English words (e.g. the, a, an)
    # --- which terms are kept (int=absolute, float=percent) ---
    min_df=1,               # ignore terms w/ DF lower than this
    max_df=1.0,             # ignore terms w/ DF higher than this
    # --- how terms are weighted ---
    use_idf=True,           # we want to use IDF
    smooth_idf=True,        # "smooth" the IDF (avoiding division by 0)
)

In [None]:
# now let's run the vectorizer on our articles
# (this'll take a sec)
# `fit_transform` learns the vocabulary and IDF weights from `docs`
# and returns the vectorized documents in one go
vecs = vectorizer.fit_transform(docs)

In [None]:
# the result is that each article is represented by a list of numbers
# we slice out the first row *before* converting to a dense array:
# `vecs.toarray()` would materialize the entire (articles x tokens)
# matrix in memory just to look at a single row
list(vecs[0].toarray()[0])

In [None]:
# so each article is represented by 75,425 numbers, most of which are 0.0
# each number corresponds to a token encountered in the dataset
# (the 75,425 figure is presumably from an earlier run -- the second entry
# of the shape shown below is the actual count for the current data)
vecs.shape

In [None]:
# we can get an idea of what's happening using a technique called tSNE
from sklearn.manifold import TSNE

# `n_components` is the number of dimensions to reduce to
# `random_state` pins the randomness in tSNE so that the layout
# is the same every time the notebook is run
tsne = TSNE(n_components=2, random_state=0)

# apply the dimensionality reduction
# to our embeddings to get our 2d points
# this doesn't scale well and the graph gets easily crowded,
# so we'll just look at a small amount of articles
# (`.toarray()` gives a plain ndarray; `.todense()` returns an `np.matrix`,
# which recent scikit-learn versions refuse as input)
points = tsne.fit_transform(vecs[:200].toarray())

In [None]:
import matplotlib.pyplot as plt

# plot our results
# make it quite big so we can see everything
fig, ax = plt.subplots(figsize=(40, 20))

# extract x and y values separately
xs = points[:, 0]
ys = points[:, 1]

# plot the points
# we don't actually care about the point markers,
# just want to automatically set the bounds of the plot
ax.scatter(xs, ys, alpha=1)

# annotate each point with the title of its article
# (`points` was computed from the first rows of `vecs`, which line up with
# the first entries of `articles`; `zip` stops at the shorter sequence)
for article, x, y in zip(articles, xs, ys):
    ax.annotate(article['title'], (x, y), fontsize=8)

# render the figure (an argument-less `plt.plot()` draws nothing and
# only echoes an empty list as the cell output)
plt.show()

In [None]:
# next step: ~cluster~ the articles, i.e. put articles that talk about
# the same/similar things into the same group.
# the method we use for this is called DBSCAN
from sklearn.cluster import DBSCAN

# `eps` is the knob to play with: a higher value is more lenient and a
# lower value is stricter about which articles get grouped together
model = DBSCAN(
    metric='cosine',    # how "distance" between two articles is defined
    algorithm='brute',  # required for cosine metric
    eps=0.2,            # max distance for two points to count as the same neighborhood
    min_samples=2,      # how many points are needed to define a neighborhood
    n_jobs=-1,          # parallelize across all cores
)

In [None]:
# run the clustering over the article vectors
clusters = model.fit_predict(vecs)

# each article is given a label:
# if it's above -1, the label is a cluster id
# if it is -1, then it's "noise"
clusters

In [None]:
# group the articles by the label the clustering gave them;
# anything labeled "noise" (-1) is left out
from collections import defaultdict

events = defaultdict(list)
for label, article in zip(clusters, articles):
    if label != -1:
        # keys are the cluster ids as strings
        events[str(label)].append(article)

In [None]:
# show every cluster: a header line with its id,
# then the titles of the articles that belong to it
for event_id, members in events.items():
    header = '-{}-------------'.format(event_id)
    print(header)
    for article in members:
        line = '\t{}'.format(article['title'])
        print(line)

In [None]:
# so we have "event" clusters
# now we want to cluster _these clusters_ into "story" clusters!
# we basically follow a similar process.
# we tokenize the whole event (we just mash its articles together)

# first we extract the article texts
# `events` starts out as a dict keyed by cluster id and is turned into a
# plain list of events here; the guard keeps this cell re-runnable, since
# on a second run `events` is already a list and has no `.values()`
if isinstance(events, dict):
    events = list(events.values())
docs = ['\n'.join([a['text'] for a in e]) for e in events]

In [None]:
# we vectorize again
# NOTE: this re-fits the same `vectorizer` on the event texts and rebinds
# `vecs`, so the article-level vectors from before are no longer available
vecs = vectorizer.fit_transform(docs)

In [None]:
# a second DBSCAN model, this one for grouping events into stories
story_model = DBSCAN(
    metric='cosine',    # distance measure between two events
    algorithm='brute',  # required for cosine metric
    eps=0.2,            # max distance for two events to share a neighborhood
    min_samples=2,      # points needed to define a neighborhood
    n_jobs=-1,          # parallelize across all cores
)

In [None]:
# and we cluster again!
# this time with `story_model`, the model created for the story step,
# over the event vectors
clusters = story_model.fit_predict(vecs)
clusters

In [None]:
# same grouping as before, one level up: collect the events under the
# cluster id they were given, leaving out noise (-1)
stories = defaultdict(list)
for label, event in zip(clusters, events):
    if label != -1:
        stories[str(label)].append(event)

In [None]:
def ev_created_at(e):
    """Give a date and time for an event.

    We simply take the earliest `created_at` among the event's articles,
    i.e. we assume the event happened when its first article was
    published. That isn't necessarily true, but it'll be fine for this.
    """
    timestamps = [article['created_at'] for article in e]
    return min(timestamps)

In [None]:
# print every story: its id, then its events in chronological order,
# and under each event the articles that make it up
from datetime import datetime

for story_id, story_events in stories.items():
    print('-{}-------------'.format(story_id))
    for event in sorted(story_events, key=ev_created_at):
        # an event is dated by its earliest article and titled by its first one
        started = datetime.fromtimestamp(ev_created_at(event)).strftime('%c')
        print('\t{} : {} ({} articles)'.format(started, event[0]['title'], len(event)))
        for article in event:
            published = datetime.fromtimestamp(article['created_at']).strftime('%c')
            print('\t\t{} ({})'.format(article['title'], published))