In [None]:
%matplotlib inline

In [2]:
import json

# load the preprocessed articles
# the `with` block makes sure the file handle is closed
# as soon as the JSON has been parsed
with open('data/articles_processed.json', 'r', encoding='utf-8') as f:
    articles = json.load(f)

# how many articles are we working with?
len(articles)

4752

In [None]:
# build one string per article: the title on the first line,
# followed by the article's text
docs = [article['title'] + '\n' + article['text'] for article in articles]

In [None]:
# we need to tell the computer how to break up a text
# into meaningful pieces ("tokens")
# we'll use spacy, a natural language processing library
# that will take care of the details for us
import spacy
# part-of-speech constants, used further down to decide which tokens to keep
from spacy.parts_of_speech import VERB, NUM, NOUN
# load the English pipeline once; `nlp` is reused for every document
# NOTE(review): 'en' is a shortcut name - newer spacy releases may need the
# full model name instead; confirm against the installed spacy version
nlp = spacy.load('en')

In [None]:
# next comes the function that boils a text down to its keywords
# the only parts we keep are verbs, numbers, nouns,
# and 'named entities' (people, places, organizations, etc)
def extract_keywords(doc):
    """Return the lowercased keywords of `doc` (a string) as a list.

    Named entities come first, followed by every non-stopword token
    tagged as a verb, number or noun.
    """
    # running spacy on the text identifies named entities
    # and tags each token with its part-of-speech
    parsed = nlp(doc)
    wanted_pos = [VERB, NUM, NOUN]

    keywords = []
    for ent in parsed.ents:
        keywords.append(ent.text.lower())
    for tok in parsed:
        # skip stopwords and any part-of-speech we don't care about
        if tok.is_stop or tok.pos not in wanted_pos:
            continue
        keywords.append(tok.text.lower())
    return keywords

In [None]:
# replace each article's raw text with its keywords, joined by '||'
# note: this overwrites `docs`, so running the cell a second time
# would feed the keyword strings back through `extract_keywords`
docs = ['||'.join(kws) for kws in map(extract_keywords, docs)]

In [3]:
# but this takes awhile so we'll load precomputed ones
# (opened in a `with` block so the file handle gets closed afterwards)
with open('data/docs_processed.json', 'r', encoding='utf-8') as f:
    docs = json.load(f)

In [4]:
def tokenize(doc):
    """Split a '||'-separated keyword string into a list of keywords."""
    separator = '||'
    return doc.split(separator)

In [None]:
# computers work with numbers, not text, so the keyword strings have to be
# turned into a numerical representation. We'll use "TF-IDF bag-of-words".
# this text->numbers step is called "vectorization"
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(
    tokenizer=tokenize,    # split documents with our own tokenization function
    stop_words='english',  # drop very common English words (e.g. the, a, an)
    strip_accents='ascii', # remove accents from characters
    lowercase=False,       # leave the case of the input untouched
    use_idf=True,          # weight terms by inverse document frequency
    smooth_idf=True,       # "smooth" the IDF (avoiding division by 0)
    # document-frequency cutoffs (int=absolute count, float=fraction of docs)
    min_df=1,              # ignore terms w/ DF lower than this
    max_df=1.0             # ignore terms w/ DF higher than this
)

In [None]:
# now let's run the vectorizer on our articles
# (this'll take a sec)
# `fit_transform` fits the vectorizer on `docs` and returns
# their vectors in a single step
vecs = vectorizer.fit_transform(docs)

In [5]:
# instead of running that, we're just going to load a pre-trained vectorizer
# (it would take too long & take up too much memory if we all did it)
# `sklearn.externals.joblib` is deprecated and has been removed from newer
# scikit-learn releases, so prefer the standalone `joblib` package and only
# fall back to the bundled copy on old installs
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# NOTE: joblib files are pickles, and unpickling can execute arbitrary code -
# only load files that come from a source you trust
vectorizer = joblib.load('data/vectorizer.pkl')

In [6]:
# the article vectors were precomputed as well, so read them from disk
import scipy.io as io

vecs = io.mmread('data/vecs.mtx')
# switch to CSR format for the steps that follow
vecs = vecs.tocsr()

In [None]:
# the result is that each article is represented by a list of numbers
# (we slice out the first row *before* converting it to a dense array;
# calling `toarray()` on all of `vecs` would allocate a dense copy of
# every article's vector just to look at one of them)
list(vecs[0].toarray()[0])

In [None]:
# so each article is represented by 75,425 numbers, most of which are 0.0
# each number corresponds to a token encountered in the dataset
# (the shape reads as: number of articles, number of tokens)
vecs.shape

In [None]:
# we can get an idea of what's happening using a technique called tSNE
from sklearn.manifold import TSNE

# `n_components` is the number of dimensions to reduce to
# `random_state` fixes the random seed so that re-running
# this cell produces the same layout every time
tsne = TSNE(n_components=2, random_state=42)

# apply the dimensionality reduction
# to our embeddings to get our 2d points
# this doesn't scale well and the graph gets easily crowded,
# so we'll just look at a small amount of articles
# (`toarray()` gives a plain ndarray; `todense()` returns an `np.matrix`,
# which newer scikit-learn versions refuse as input)
points = tsne.fit_transform(vecs[:200].toarray())

In [None]:
import matplotlib.pyplot as plt

# plot our results
# make it quite big so we can see everything
fig, ax = plt.subplots(figsize=(40, 20))

# extract x and y values separately
xs = points[:, 0]
ys = points[:, 1]

# plot the points
# we don't actually care about the point markers,
# just want to automatically set the bounds of the plot
ax.scatter(xs, ys, alpha=1)

# annotate each point with the title of its article
# (row i of `points` belongs to `articles[i]`)
for i, (x, y) in enumerate(zip(xs, ys)):
    ax.annotate(articles[i]['title'],
                (x, y),
                fontsize=8)

# give the figure a title so it can stand on its own
ax.set_title('t-SNE projection of the article vectors')

# render the figure (`plt.plot()` with no arguments draws nothing)
plt.show()

In [13]:
# with the articles vectorized, we can apply an algorithm to ~cluster~ them,
# i.e. group them so that articles talking about the same/similar things
# end up in the same group
# the method we'll use is called DBSCAN
from sklearn.cluster import DBSCAN

model = DBSCAN(
    metric='cosine',    # how "distance" between two articles is defined
    algorithm='brute',  # required for cosine metric
    eps=0.4,            # max distance for two points to count as the same neighborhood
    min_samples=2,      # number of points needed to define a neighborhood
    n_jobs=-1           # parallelize across all cores
)

# `eps` is the knob to tweak: a higher value is more lenient,
# a lower value is stricter in how it groups articles

In [14]:
# run the clustering on the article vectors
clusters = model.fit_predict(vecs)

# each article is given a label,
# if it's above -1, the label is a cluster id
# if it is -1, then it's "noise"
clusters

array([-1, -1, -1, ..., -1, -1, -1])

In [15]:
# group the articles by the cluster label each one received,
# leaving out everything labeled "noise" (-1)
from collections import defaultdict

events = defaultdict(list)
for idx in range(len(clusters)):
    label = clusters[idx]
    if label != -1:
        # keep the article together with its keyword string
        events[str(label)].append((articles[idx], docs[idx]))

In [16]:
# print out the clusters
# (the loop variable is `cluster_id` rather than `id`, so the builtin
# `id()` isn't overwritten in the notebook's namespace)
for cluster_id, mems in events.items():
    print('-{}-------------'.format(cluster_id))
    # each member is an (article, keywords) pair; only the title is shown
    for a, _ in mems:
        print('\t{}'.format(a['title']))

-224-------------
	Antonio Villaraigosa jumps into 2018 California governor's race
	Antonio Villaraigosa, former L.A. mayor, jumps into the California governor's race
-156-------------
	Disciplinary action looms against Marist students after ‘racially charged post’
	Marist students face disciplinary action after ‘racially charged post’
-46-------------
	Essential Politics: Trump tweets his support for Issa, former California schools chief to run for governor
	Kamala Harris picks up the slack for Santa Barbara Democrat who can't make campaign rally
-228-------------
	Principal on leave for alleged anti-Trump comments; student who voiced support for Trump attacked
	Muslim and Latino students in California are targeted following Trump's election
-172-------------
	Americans fight over politics, because that's what freedom looks like
	Vote falls Trump's way, but fighting far from over
-198-------------
	'Almost Christmas' review: Mo'Nique brings the cheer to holiday movie
	Mo'Nique is the 

In [17]:
# so we have "event" clusters
# now we want to cluster _these clusters_ into "story" clusters!
# we basically follow a similar process.
# we tokenize the whole event (we just mash its articles together)

# first we extract the article texts
# `events` is only converted while it is still the cluster-id -> members
# dict, so re-running this cell doesn't fail on the already-converted list
if isinstance(events, dict):
    events = list(events.values())

# one '||'-joined keyword string per event, built from its articles' keywords
docs = ['||'.join([kws for a, kws in e]) for e in events]

In [18]:
# we vectorize again
# this time we only call `transform`, i.e. the vectorizer loaded earlier
# is reused as-is instead of being fitted again
# note: `vecs` now holds one vector per event, replacing the article vectors
vecs = vectorizer.transform(docs)

In [61]:
# a second DBSCAN model, this time for grouping events into stories
# it is configured like the first one except for `eps`, which is 0.6 here
story_model = DBSCAN(
    metric='cosine',
    algorithm='brute',
    eps=0.6,
    min_samples=2,
    n_jobs=-1
)

In [62]:
# and we cluster again!
# note: `clusters` is overwritten here - from this point on it holds
# one label per event rather than one label per article
clusters = story_model.fit_predict(vecs)
clusters

array([-1, -1, -1, -1, -1, -1, -1, -1,  0, -1, -1,  1,  2, -1, -1, -1, -1,
       -1, -1,  3,  1, -1, -1,  1, -1, -1,  2, -1, -1, -1, -1, -1, -1, -1,
       -1, -1,  4, -1,  5, -1,  6, -1, -1, -1, -1,  1,  1, -1,  1, -1,  1,
        1, -1,  7, -1, -1, -1,  1, -1,  1, -1, -1,  1, -1,  8,  1,  4, -1,
        1, -1, -1, -1, -1, -1,  9, 10, -1, -1, -1, -1, 11, -1, -1, -1, -1,
       -1, -1, -1, -1,  1, -1, -1, -1, 11, -1, -1, -1, 12, -1, -1, -1, -1,
       -1,  1, -1, -1, -1, -1, -1, -1,  8, -1, -1, 13, -1, -1, -1, -1, -1,
       -1, -1, 12, -1, -1, -1, -1, -1, -1, -1,  1, -1, -1,  8, -1, -1, -1,
        6,  1, -1, -1, -1, -1,  1, -1, -1,  9, -1, -1, -1, -1,  1, -1, -1,
       -1, -1, -1,  1, -1, -1, -1, -1, -1, -1,  2, -1, -1, -1, -1, -1,  1,
       10, -1, -1, -1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, 14, -1, -1, -1,  1, -1, -1, -1,
       -1, -1, -1, -1, -1,  1, -1,  1, -1, -1, -1,  5, -1, -1, -1, -1, -1,
       13, -1, -1, -1,  5

In [63]:
# same grouping as before, one level up: collect the events
# under the story label each one received (noise is left out)
stories = defaultdict(list)
for idx in range(len(clusters)):
    label = clusters[idx]
    if label != -1:
        stories[str(label)].append(events[idx])

In [38]:
# a small helper that gives us a date and time for an event
# we simply take the publication time of the event's earliest article as the
# moment the event happened. that isn't necessarily true, but it's good
# enough for our purposes.
def ev_created_at(e):
    """Return the smallest `created_at` value among the articles of event `e`.

    `e` is a sequence of (article, keywords) pairs.
    """
    timestamps = [article['created_at'] for article, _ in e]
    return min(timestamps)

In [64]:
# now we can print out the stories
from datetime import datetime

# (the loop variable is `story_id` rather than `id`, so the builtin
# `id()` isn't overwritten in the notebook's namespace)
for story_id, mems in stories.items():
    print('-{}-------------'.format(story_id))
    # list the story's events in chronological order
    for e in sorted(mems, key=ev_created_at):
        created_at = ev_created_at(e)
        # headline for the event: its date, the title of its first article,
        # and how many articles it contains
        print('\t{} : {} ({} articles)'.format(
            datetime.fromtimestamp(created_at).strftime('%c'),
            e[0][0]['title'],
            len(e)
        ))
        # then every article in the event with its own publication date
        for a, _ in e:
            print('\t\t{} ({})'.format(
                a['title'],
                datetime.fromtimestamp(a['created_at']).strftime('%c'),
            ))
        print('\n')
    print('\n')

-4-------------
	Sun Nov  6 00:00:00 2016 : What 12 State Schools Are Cutting, or Creating (2 articles)
		What 12 State Schools Are Cutting, or Creating (Sun Nov  6 00:00:00 2016)
		Bottom Line: How State Budget Cuts Affect Your Education (Sun Nov  6 00:00:00 2016)


	Tue Nov 15 00:00:00 2016 : Cal State trustees could raise tuition by $270, and hundreds of students plan to disrupt their meeting (2 articles)
		Cal State trustees could raise tuition by $270, and hundreds of students plan to disrupt their meeting (Tue Nov 15 00:00:00 2016)
		'We are the walking debt': Cal State students protest tuition hike proposal (Tue Nov 15 00:00:00 2016)




-6-------------
	Mon Oct 31 00:00:00 2016 : Iraqi forces prepare to break into Mosul in battle against Islamic State (2 articles)
		Iraqi forces prepare to break into Mosul in battle against Islamic State (Mon Oct 31 00:00:00 2016)
		Iraq has never seen this kind of fighting in its battles with ISIS (Fri Nov 11 00:00:00 2016)


	Mon Nov  7 00:00