In [None]:
import os
from glob import glob

import matplotlib.pyplot as plt
import pandas as pd

# Collect the non-blank lines of files.txt, with surrounding whitespace removed.
with open('files.txt') as f:
    files = [entry for entry in map(str.strip, f.read().splitlines()) if entry]

In [None]:
# Parse every entry of `files` as a timestamp and count how often each one occurs.
s = pd.DataFrame({'date': pd.to_datetime(files)})['date'].value_counts()

In [None]:
fig, ax = plt.subplots(figsize=(16, 8))
# Markers only (no connecting line): one small dot per distinct date.
s.plot(
    kind='line',
    ax=ax,
    marker='o',
    linestyle='none',
    markersize=1,
)

## Filter articles

### Read in metadata

In [None]:
import csv
import numpy as np
file = 'meta.csv'
header = ['filename', 'title', 'print_section', 'desk', 'online_sections', 'word_count']

# The column names are supplied explicitly via `fieldnames`, so every line of
# the file (including the first) is read as a data row.
with open(file) as f:
    reader = csv.DictReader(f, fieldnames=header)
    rows = list(reader)

df = pd.DataFrame(rows).set_index('filename')
# '_' is the placeholder for a missing value. Blank out only the cells that
# hold it: indexing with a boolean row mask (`df[df[c] == '_'] = np.nan`)
# would overwrite *every* column of each matching row, silently discarding
# the word count and online section of any article with a single '_' field.
df = df.mask(df == '_')
df['word_count'] = pd.to_numeric(df.word_count)

### Filter by wordcount

Use only articles with more than 3000 words, and drop the extreme outliers at or above the 99.99th percentile of the word count.

In [None]:
def get_articles_with_word_counts(df, low, high):
    """Select the rows of ``df`` whose ``word_count`` is strictly between the bounds.

    Args:
        df: DataFrame with a ``word_count`` column.
        low: exclusive lower bound.
        high: exclusive upper bound.

    Returns:
        The matching rows of ``df``; rows with a missing word count never match.
    """
    word_counts = df['word_count']
    in_range = word_counts.gt(low) & word_counts.lt(high)
    return df.loc[in_range]

# Lower bound: 3000 words. Upper bound: the 99.99th percentile of all word counts.
df_filtered = get_articles_with_word_counts(
    df,
    low=3000,
    high=df['word_count'].quantile(0.9999),
)

In [None]:
fig, ax = plt.subplots(figsize=(16, 6))
# Distribution of article lengths that survived the word-count filter.
df_filtered['word_count'].plot.hist(
    ax=ax,
    bins=120,
    title='Histogram of # words per article - after word count filter, #articles: {}'.format(len(df_filtered)),
)
fig.tight_layout()

### Filter by online section

Remove all articles that have been posted in multiple online sections.

In [None]:
# Articles posted in several online sections list them separated by ';'.
# Keep the single-section articles only. na=True treats a missing section as
# "not single", so those rows are dropped just as the former `== False`
# comparison dropped them.
single_section = ~df_filtered.online_sections.str.contains(';', regex=False, na=True)

# Number of single-section articles per online section, as a one-column frame.
# The column name is pinned to 'online_sections' because pandas >= 2.0 would
# otherwise call it 'count', breaking lookups such as `f.online_sections`.
f = (
    df_filtered.loc[single_section, 'online_sections']
    .value_counts()
    .rename('online_sections')
    .to_frame()
)

And remove all articles that belong to an online section that has 250 or fewer (single-section) articles in it.

In [None]:
# `f` is a one-column frame of article counts indexed by online section. Take
# the column by position: its label depends on the pandas version
# ('online_sections' before 2.0, 'count' from 2.0 on).
section_counts = f.iloc[:, 0]
filtered_online_section = section_counts[section_counts > 250].index.values

# isin() is the vectorised form of the per-row membership test. Rows with
# several ';'-separated sections never match a single section name, so they
# are dropped here as well.
# copy() makes the result an independent frame, so adding columns to it later
# does not raise SettingWithCopyWarning.
df_filtered_filtered = df_filtered[df_filtered.online_sections.isin(filtered_online_section)].copy()
df_filtered_filtered.head()

### Plot articles per class after filtering

In [None]:
fig, ax = plt.subplots(figsize=(10, 5))
# One horizontal bar per remaining online section.
section_sizes = df_filtered_filtered['online_sections'].value_counts()
section_sizes.plot.barh(ax=ax, title='# articles per class')
fig.tight_layout()

## Save filenames to disk

In [None]:
# Write the index labels (the article filenames) one per line.
# The handle is deliberately not called `f`: that name holds the per-section
# counts frame, and rebinding it to a closed file would break re-running the
# section-filter cell.
with open('filtered_articles.txt', 'w') as filenames_out:
    filenames_out.write('\n'.join(df_filtered_filtered.index.values))

## Get filtered elements

In [None]:
prefix = 'filtered_articles/'
# Article files sit three directory levels below the prefix.
filtered_files = glob('{}*/*/*/*.xml'.format(prefix))
# Keep the last four path components (three directories + file name) so the
# names can be compared with the metadata index.
filtered_files_ = ['/'.join(x.rsplit('/', 4)[-4:]) for x in filtered_files]

# Test whether all articles are there -- in both directions: every file on
# disk must be a selected article, and every selected article must be on disk.
on_disk = set(filtered_files_)
selected = set(df_filtered_filtered.index.values)
unexpected_files = sorted(on_disk - selected)
missing_files = sorted(selected - on_disk)
assert len(filtered_files_) == len(on_disk), 'duplicate article paths found under {}'.format(prefix)
assert not unexpected_files, '{} file(s) on disk are not in the filtered metadata, e.g. {}'.format(
    len(unexpected_files), unexpected_files[:5])
assert not missing_files, '{} filtered article(s) are missing on disk, e.g. {}'.format(
    len(missing_files), missing_files[:5])

In [None]:
from bs4 import BeautifulSoup
import sys
import re

def get_body_of_article(file):
    """Return the text of the single ``<block class="full_text">`` element in ``file``.

    The file is read as text and searched with a regular expression (no XML
    parsing). Leading/trailing whitespace and all ``<p>``/``</p>`` tags are
    removed from the matched text; any other markup is left untouched.

    Args:
        file: path of the article file.

    Returns:
        The cleaned body text as a string.

    Raises:
        AssertionError: if ``file`` does not exist or does not contain
            exactly one ``full_text`` block.
    """
    assert os.path.exists(file), 'Article file not found: {}'.format(file)
    # NOTE(review): decoded with the platform default encoding -- confirm it
    # matches the encoding of the article files.
    with open(file) as f:
        content = f.read()
    # DOTALL lets '.' span newlines, since the body covers several lines.
    # Non-greedy, so the match stops at the first closing </block>.
    matches = re.findall(r'<block class="full_text">(.+?)</block>', content, re.DOTALL)
    assert len(matches) == 1, 'Expected exactly one full_text block in {}, found {}'.format(file, len(matches))
    return matches[0].strip().replace('<p>', '').replace('</p>', '')

# Load the body of every article file, keyed by its path with `prefix`
# removed, while rewriting a "current/total" counter in place on stdout.
bodies = {}
total_files = len(filtered_files)
for position, article_path in enumerate(filtered_files, start=1):
    sys.stdout.write('\r{:9}/{}'.format(position, total_files))
    bodies[article_path.replace(prefix, '')] = get_body_of_article(article_path)

In [None]:
# Put the bodies in the row order of df_filtered_filtered. Only the index
# labels are needed, so iterate the index directly instead of building a
# Series for every row with iterrows().
missing_bodies = [name for name in df_filtered_filtered.index if name not in bodies]
assert not missing_bodies, '{} article(s) have no loaded body, e.g. {}'.format(
    len(missing_bodies), missing_bodies[:5])
bodies_sorted = [bodies[name] for name in df_filtered_filtered.index]

In [None]:
# df_filtered_filtered comes from boolean-indexing df_filtered, so writing a
# column into it in place can raise SettingWithCopyWarning. assign() returns a
# new frame instead, which also makes this cell safe to re-run.
df_filtered_filtered = df_filtered_filtered.assign(body=bodies_sorted)

In [None]:
import pickle
# Inputs are the article bodies, labels are the online sections.
X = df_filtered_filtered['body'].values
Y = df_filtered_filtered['online_sections'].values

# NOTE: despite the .npy extension, this file is a pickle of the (X, Y) tuple,
# so it must be read back with pickle.load.
with open('dataset_nyt.npy', mode='wb') as f:
    pickle.dump((X, Y), f)