In [1]:
import os
import re
from collections import defaultdict

import pandas as pd

# Column-oriented accumulator: data['text'] and data['date'] must stay the
# same length so that the DataFrame below can be built from them.
data = defaultdict(list)
# URLs already loaded; used to skip duplicate copies of the same article.
addresses = []

# The publication date is embedded in the article URL as '{year}/{month}'.
date_pattern = re.compile(r'\d{4}/\d{2}')

# Sorted so that which duplicate gets kept does not depend on directory order.
for file in sorted(os.listdir('articles/')):
    path = os.path.join('articles/', file)
    if not os.path.isfile(path):
        # Sub-directories cannot be opened as articles.
        continue

    # Read the file once; the first line is the URL and the body starts at
    # the third line.
    with open(path, 'r') as f:
        lines = f.readlines()

    if not lines:
        print("The file {} is empty. Skipping.".format(path))
        continue

    # Strip the trailing newline so it neither leaks into the log messages
    # nor makes otherwise identical URLs compare as different.
    address = lines[0].strip()
    if address in addresses:
        print("Article at {} already loaded. Skipping.".format(address))
        continue

    addresses.append(address)
    data['text'].append(' '.join(lines[2:]))

    match = date_pattern.search(address)
    if match is None:
        print("There was an error processing the following address: {}".format(address))
        print("This was located in the file: {}".format(path))
        # Keep the 'date' column aligned with the 'text' column.
        data['date'].append(None)
    else:
        data['date'].append(match.group(0))

df = pd.DataFrame(data=data)

Article at https://www.theatlantic.com/magazine/archive/2019/03/how-humans-tamed-themselves/580447/
 already loaded. Skipping.
Article at https://www.theatlantic.com/magazine/archive/2019/03/how-humans-tamed-themselves/580447/
 already loaded. Skipping.
Article at https://www.theatlantic.com/magazine/archive/2019/03/how-humans-tamed-themselves/580447/
 already loaded. Skipping.
Article at https://www.theatlantic.com/magazine/archive/2019/03/bryan-singers-accusers-speak-out/580462/
 already loaded. Skipping.
Article at https://www.theatlantic.com/magazine/archive/2019/03/how-humans-tamed-themselves/580447/
 already loaded. Skipping.
Article at https://www.theatlantic.com/magazine/archive/2019/03/how-humans-tamed-themselves/580447/
 already loaded. Skipping.
Article at https://www.theatlantic.com/magazine/archive/2019/03/how-humans-tamed-themselves/580447/
 already loaded. Skipping.
Article at https://www.theatlantic.com/magazine/archive/2019/03/bryan-singers-accusers-speak-out/580462/
 

Article at https://www.theatlantic.com/magazine/archive/2019/03/bryan-singers-accusers-speak-out/580462/
 already loaded. Skipping.
Article at https://www.theatlantic.com/magazine/archive/2019/03/how-humans-tamed-themselves/580447/
 already loaded. Skipping.
Article at https://www.theatlantic.com/magazine/archive/2019/03/bryan-singers-accusers-speak-out/580462/
 already loaded. Skipping.
Article at https://www.theatlantic.com/magazine/archive/2019/03/how-humans-tamed-themselves/580447/
 already loaded. Skipping.
Article at https://www.theatlantic.com/magazine/archive/2019/03/bryan-singers-accusers-speak-out/580462/
 already loaded. Skipping.
Article at https://www.theatlantic.com/magazine/archive/2019/03/bryan-singers-accusers-speak-out/580462/
 already loaded. Skipping.
Article at https://www.theatlantic.com/magazine/archive/2019/03/bryan-singers-accusers-speak-out/580462/
 already loaded. Skipping.
Article at https://www.theatlantic.com/magazine/archive/2017/03/how-to-build-an-autocr

In [2]:
# Parse the 'date' column once and derive both calendar columns from it.
article_dates = pd.DatetimeIndex(df['date'])
df['year'], df['month'] = article_dates.year, article_dates.month

In [3]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

from bokeh.plotting import figure, show
from bokeh.io import output_notebook


# import plotly.offline as py
# py.init_notebook_mode(connected=True)
# import plotly.graph_objs as go
# import plotly.tools as tls

def plot_frequent_word(count_vec, subtitle, feature_names):
    """Show a Bokeh bar chart of word counts for one corpus.

    The words are ranked by count (highest first). The chart shows the 15
    top-ranked words followed by words taken from the bottom of the ranking,
    capped at 25 bars in total.

    Args:
        count_vec: Sequence of counts, parallel to ``feature_names``.
        subtitle: Label for this corpus (e.g. "Year 2010"); shown in the
            chart title so that several charts can be told apart.
        feature_names: Sequence of words, parallel to ``count_vec``.

    Returns:
        None. The chart is displayed with ``show``. If there are no words,
        a message is printed and nothing is drawn.
    """
    ranked = sorted(zip(feature_names, count_vec),
                    key=lambda pair: pair[1], reverse=True)
    if not ranked:
        # Nothing to unpack or draw for an empty vocabulary.
        print("No words to plot for {}.".format(subtitle))
        return

    # NOTE(review): the tail slice [-16:-1] skips the very last-ranked word;
    # kept as-is, confirm whether [-15:] was intended.
    selected = (ranked[0:15] + ranked[-16:-1])[0:25]

    # With a small vocabulary the head and tail slices overlap; a categorical
    # x-range needs each word only once, so keep first occurrences only.
    words, counts, seen = [], [], set()
    for word, count in selected:
        if word in seen:
            continue
        seen.add(word)
        words.append(str(word))
        counts.append(count)

    p = figure(x_range=words, plot_height=350,
               title="Word Counts ({})".format(subtitle),
               toolbar_location=None, tools="")

    p.vbar(x=words, top=counts, width=0.1)

    p.xgrid.grid_line_color = None
    p.y_range.start = 0

    show(p)
#     # Plotting the Plot.ly plot for the Top 50 word frequencies
#     data = [go.Bar(
#                 x = x[0:50],
#                 y = y[0:50],
#                 marker= dict(colorscale='Jet',
#                              color = y[0:50]
#                             ),
#                 text='Word counts'
#         )]
#     layout = go.Layout(
#     title='Top 50 Word frequencies (%s)' % (subtitle)
#     )

#     fig = go.Figure(data=data, layout=layout)

#     py.iplot(fig, filename='basic-bar')

In [4]:
# Bokeh setup call (output_notebook comes from bokeh.io); presumably makes the
# later show() calls render inline in the notebook -- confirm against Bokeh docs.
output_notebook()

In [5]:
# Map each year from 2010 through 2018 to the array of article texts whose
# 'year' column equals that year.
text_by_year = {}

for year in range(2010, 2019):
    text_by_year[year] = df.loc[df['year'] == year, 'text'].values

In [7]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

# Per-year results, all keyed by year:
#   tf_vectorizer -- the fitted CountVectorizer
#   tf            -- the document-term count matrix
#   feature_names -- the vocabulary, in column order of tf
#   count_vec     -- total count of each vocabulary word across the year
tf_vectorizer = dict()
tf = dict()
feature_names = dict()
count_vec = dict()

for year in range(2010, 2018 + 1):
    txt = text_by_year[year]

    tf_vectorizer[year] = CountVectorizer(max_df=0.95,
                                        min_df=2,
                                        stop_words='english',
                                        decode_error='ignore')

    tf[year] = tf_vectorizer[year].fit_transform(txt)

    # get_feature_names() was removed in scikit-learn 1.2; use its
    # replacement when present and fall back on older releases. The result
    # is stored as a plain list either way.
    vectorizer = tf_vectorizer[year]
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names[year] = list(vectorizer.get_feature_names_out())
    else:
        feature_names[year] = vectorizer.get_feature_names()

    # Sum over documents (axis 0) to get one total per vocabulary word.
    count_vec[year] = np.asarray(tf[year].sum(axis=0)).ravel()

    plot_frequent_word(count_vec[year], "Year {}".format(year), feature_names[year])

In [10]:
import pickle

# Persist the per-year matrices, vocabularies and totals. The context manager
# closes the file so the data is flushed to disk.
with open('atlantic-word-count.p', 'wb') as pickle_file:
    pickle.dump((tf, feature_names, count_vec), pickle_file)

In [67]:
# Pair every 2010 word with its total count.
zipped = list(zip(feature_names[2010], count_vec[2010]))

# x = [hash(word) for word in x]

# Rank by count (highest first) and keep the first 50 as two parallel lists:
# x holds the words, y the counts.
x, y = map(list, zip(*sorted(zipped, key=lambda pair: pair[1], reverse=True)[:50]))
N = 10

In [69]:
from bokeh.models import ColumnDataSource

# Wrap the selected words (x) and their counts (y) in a ColumnDataSource so
# the bar glyph below can refer to the columns by name.
rockets_cds = ColumnDataSource(data=dict(x=x, y=y))

# Create and configure the figure; the words themselves form the x-range.
p = figure(x_range=x, plot_height=250, plot_width=5 * 250, title="Top Word Counts")

# One bar per word, drawn from the data source built above.
p.vbar(x='x', top='y', width=0.9, source=rockets_cds)

show(p)

In [37]:
from bokeh.models import ColumnDataSource, Plot, LinearAxis, Grid
from bokeh.models.glyphs import VBar

# Same bar chart built from Bokeh's low-level models instead of figure().
# NOTE(review): both axes are LinearAxis; if x holds word strings at this
# point, confirm that a numeric x-axis is what is wanted here.
source = ColumnDataSource(data={'x': x, 'top': y})

plot = Plot(
    title=None,
    plot_width=800,
    plot_height=300,
    h_symmetry=False,
    v_symmetry=False,
    min_border=0,
    toolbar_location=None,
)

# Bars rise from 0 to the 'top' column, positioned by the 'x' column.
glyph = VBar(x="x", top="top", bottom=0, width=0.5, fill_color="#b3de69")
plot.add_glyph(source, glyph)

# Axes: x along the bottom edge, y along the left edge.
xaxis = LinearAxis()
plot.add_layout(xaxis, 'below')
yaxis = LinearAxis()
plot.add_layout(yaxis, 'left')

# One grid per dimension, each sharing the ticker of its axis.
for dimension, axis in enumerate((xaxis, yaxis)):
    plot.add_layout(Grid(dimension=dimension, ticker=axis.ticker))

show(plot)

In [9]:
def print_top_words(model, feature_names, n_top_words):
    """Print one line per topic listing its highest-weighted words.

    Args:
        model: Object whose ``components_`` attribute yields one weight
            vector per topic.
        feature_names: Words indexed in the same order as the weights.
        n_top_words: Number of words to list for each topic.
    """
    for topic_idx, topic in enumerate(model.components_):
        # argsort is ascending, so reverse it and keep the leading entries.
        best_first = topic.argsort()[::-1][:n_top_words]
        top_words = [feature_names[i] for i in best_first]
        header = "Topic #%d: " % topic_idx
        print(header + " ".join(top_words))

In [10]:
from sklearn.decomposition import NMF, LatentDirichletAllocation

# A single LDA estimator with five topics, fitted once per year on that
# year's count matrix.
lda = LatentDirichletAllocation(
    n_components=5,
    max_iter=5,
    learning_method='online',
    learning_offset=50.,
    random_state=0,
)

for year in range(2010, 2019):
    lda.fit(tf[year])

    # Report the 20 highest-weighted words of every topic for this year.
    print("For the year of {}, the top five topics:".format(year))
    print_top_words(lda, feature_names[year], 20)
    print("")

For the year of 2010, the top five topics:
Topic #0: marc jonathan said father like matthew cigarette dye toby years time just don long way know new man vats son
Topic #1: like new people time just said years world way good life says school percent year american told day work long
Topic #2: said gaga like years soldiers just time way new world people fame good patrol says right little don news man
Topic #3: howell naomi like hugo said bronx concourse little pa mr just passage years new father mother didn people long time
Topic #4: donald autism like people time new says said years way kanner told just world year don percent school day news

For the year of 2011, the top five topics:
Topic #0: like men said women years people new college percent time ncaa way year american work man just long told says
Topic #1: cleary like said people new just years know don way mother long time says book say told larry life day
Topic #2: like said time new people just way years says told year work long