In [1]:
import os
from bs4 import BeautifulSoup
import metapy

In [2]:
# Root of the corpus; it is walked recursively to find the input files.
# Assumes the directory `nyt_corpus/data/2000/` exists relative to the
# current working directory and contains the XML documents.
base_dir = 'nyt_corpus/data/2000/'

In [3]:
def extract_data(filename, parser=None):
    """Parse one markup file into a BeautifulSoup tree.

    The file is opened as UTF-8 and closed as soon as parsing finishes, so
    loading a large corpus does not leave one open file handle per document.

    Args:
        filename: Path of the file to parse.
        parser: Optional parser name forwarded to BeautifulSoup as
            ``features`` (for example ``'html.parser'``). The default,
            ``None``, is what BeautifulSoup receives when no parser is
            given, so existing callers are unaffected.

    Returns:
        The BeautifulSoup object built from the file's contents.
    """
    with open(filename, encoding='utf8') as handle:
        # The tree is built inside the constructor call, so the handle can
        # be released on return.
        return BeautifulSoup(handle, features=parser)

def list_files(dir):
    """Recursively collect the path of every file found under ``dir``.

    Args:
        dir: Root directory to walk. (The name shadows the built-in
            ``dir`` inside this function; it is kept for compatibility
            with keyword callers.)

    Returns:
        A list of file paths in ``os.walk`` order, each formed by joining
        the directory that contains the file with the file's name.
    """
    paths = []
    for root, _subdirs, names in os.walk(dir):
        paths.extend(os.path.join(root, name) for name in names)
    return paths

In [4]:
# Parse every file found under `base_dir` and keep the parsed documents
# in a list, in the order `list_files` returns them.
blobs = [extract_data(file_path) for file_path in list_files(base_dir)]

In [5]:
# Keep only the documents tagged with George W. Bush or Al Gore: a document
# qualifies when the text of at least one of its `person` tags is exactly
# one of the names below. `any` stops at the first matching tag instead of
# building a throwaway list of every match.
TARGET_PEOPLE = {'Bush, George W (Gov)', 'Gore, Al (Vice Pres)'}
filtered = [
    blob
    for blob in blobs
    if any(
        person.get_text() in TARGET_PEOPLE
        for person in blob.find_all('person')
    )
]

In [6]:
# Tokenize the body text of every filtered article. The result maps the
# article's position in `filtered` to its list of tokens.
tokenized = {}
for i, article in enumerate(filtered):
    # Load the article body into a metapy document.
    doc = metapy.index.Document()
    doc.content(article.body.get_text())

    # Analyzer chain, innermost first: ICU tokenizer (suppress_tags=True),
    # then the lowercase filter, then the length filter (min=2, max=30).
    tok = metapy.analyzers.LengthFilter(
        metapy.analyzers.LowercaseFilter(
            metapy.analyzers.ICUTokenizer(suppress_tags=True)
        ),
        min=2,
        max=30,
    )

    # Feed the document text through the chain and collect every token.
    tok.set_content(doc.content())
    tokenized[i] = list(tok)

In [7]:
# Show the token list stored under key 0, i.e. for the first filtered article.
print(tokenized[0])

