Creates an HTML document for an individual person in the corpus containing:
- A date-ordered list of documents in which they appear, and the relevant paper
- Links from each list entry to a full-text copy of that article

In [16]:
import os
from lxml import etree
import pandas as pd
from multiprocessing import Pool
import random
import time

AttributeError: module 'pandas' has no attribute '_libs'

In [None]:
# HTML skeleton for one person's page. It is filled in with str.format using
# three named fields: celeb_name (page title and heading), toc (the <li>
# entries of the list) and documents (the per-document sections).
TEMPLATE = '''<!DOCTYPE html>
<html lang="en">
  <head>
    <meta charset="utf-8">
    <title>{celeb_name}</title>
  </head>
  <body>
  
    <header>
        <h1>{celeb_name}</h1>
    </header>
    
    <ul>
        {toc}
    </ul>
    
    {documents}
    
  </body>
</html>
'''

In [None]:
# Sorted full paths of every entry in the metadata directory, skipping any
# entry whose name contains '._' (presumably macOS resource-fork files).
_meta_root = '/oak/stanford/groups/malgeehe/celebs/chicago_results/names_annual'
META = sorted(
    os.path.join(_meta_root, entry)
    for entry in os.listdir(_meta_root)
    if '._' not in entry
)

In [None]:
def filter_meta(files, name):
    """Return the files, in input order, that mention ``name`` on some line.

    Each file is read line by line and reading stops at the first line that
    contains ``name`` as a plain substring.
    """
    def mentions(path):
        # any() stops consuming the file at the first matching line
        with open(path) as handle:
            return any(name in text for text in handle)

    return [path for path in files if mentions(path)]

In [None]:
def get_xml_paths(csv, name):
    """Select the rows of a metadata CSV whose ``person`` starts with ``name``.

    Parameters
    ----------
    csv : path (or anything ``pandas.read_csv`` accepts)
        Metadata file with at least ``person`` and ``xml_path`` columns.
    name : str
        Person name, compared literally against the start of ``person``.

    Returns
    -------
    (subset, paths)
        ``subset`` is the DataFrame of matching rows; ``paths`` is the list
        of distinct ``xml_path`` values in ``subset``, in order of first
        appearance.
    """
    df = pd.read_csv(csv)
    # The name is compared as literal text, not as a regular expression:
    # names such as "A. Jones" contain '.', which as a regex would also match
    # "Al Jones". Rows with a missing person (NaN) count as non-matching so
    # the boolean mask never contains NaN.
    is_person = df['person'].str.startswith(name, na=False)
    subset = df[is_person]
    return subset, list(subset['xml_path'].unique())

In [None]:
def make_p(path, name):
    """Render the words of an XML document as one HTML paragraph.

    The text of every ``word`` element found in the document at ``path`` is
    joined with single spaces and wrapped in ``<p>...</p>``. A word whose
    lower-cased text equals one of the space-separated parts of ``name`` is
    wrapped in ``<mark>...</mark>``.

    Parameters
    ----------
    path : str
        Path of the XML file to read.
    name : str
        Person name; each space-separated part is highlighted on its own.

    Returns
    -------
    str
        The ``<p>`` element as a string.
    """
    # Binary mode leaves decoding to the XML parser, so the document's own
    # encoding declaration is honoured instead of the locale default.
    with open(path, 'rb') as f:
        tree = etree.parse(f)

    targets = {part for part in name.strip().lower().split(' ') if part}

    pieces = []
    for node in tree.iterfind('.//word'):
        word = node.text
        if word is None:
            # element without text (e.g. <word/>): nothing to show
            continue
        # drop angle brackets to prevent unintended html formatting
        word = word.replace('<', '').replace('>', '')
        if word.lower() in targets:
            pieces.append('<mark>' + word + '</mark>')
        else:
            pieces.append(word)

    return '<p>' + ' '.join(pieces) + '</p>'

In [None]:
def make_header_bullet(path, subset, id_num, p):
    """Build the document section and its list entry for one XML path.

    The first row of ``subset`` whose ``xml_path`` equals ``path`` supplies
    ``year``, ``paper``, ``doc_type`` and ``doc_title``.

    Returns
    -------
    (header, bullet)
        ``header`` is an anchored ``<h3>`` title, a metadata line and the
        paragraph ``p``; ``bullet`` is an ``<li>`` linking to that anchor
        through ``id_num``.
    """
    matching = subset[subset['xml_path'] == path]
    # only the first matching row is used when a path appears several times
    record = matching.head(1).to_dict('records')[0]

    data = '\t'.join([
        str(record['year']),
        '<i>' + record['paper'].strip() + '</i>',
        record['doc_type'],
    ])
    title = record['doc_title'].title()

    header = '<a id ="{}"><h3>{}</h3></a>\n<p>{}</p>\n{}'.format(id_num, title, data, p)
    bullet = '<li><a href="#{}">{data}:\t"{title}"</a>'.format(id_num, data=data, title=title)
    return header, bullet

In [None]:
def update_docs(header, bullet, headers='', bullets=''):
    """Append one document's header and bullet to the running HTML strings.

    Parameters
    ----------
    header, bullet : str
        The pieces to append.
    headers, bullets : str, optional
        The strings accumulated so far. They default to empty strings; they
        were previously read as unbound locals, so every call raised
        ``UnboundLocalError``.

    Returns
    -------
    (headers, bullets)
        The accumulated strings with the new piece added on its own line.
    """
    return headers + '\n' + header, bullets + '\n' + bullet

In [3]:
def customize_template(template, name, headers, bullets):
    """Fill ``template`` with the person's name, list entries and sections.

    ``name`` goes to ``{celeb_name}``, ``bullets`` to ``{toc}`` and
    ``headers`` to ``{documents}``.
    """
    fields = {
        'celeb_name': name,
        'toc': bullets,
        'documents': headers,
    }
    return template.format(**fields)

In [9]:
def get_html_texts(tup): # dumb
    """Build and write the HTML page for one person; return the file path.

    Parameters
    ----------
    tup : tuple
        ``(name, META, TEMPLATE)``, packed into one argument so the function
        can be used with ``Pool.map``. ``name`` is the person, ``META`` the
        list of metadata CSV paths to search and ``TEMPLATE`` the page
        template string.

    Returns
    -------
    str
        Path of the written ``.html`` file. The file name is the lower-cased
        name with spaces replaced by underscores.
    """
    name, META, TEMPLATE = tup

    header_parts = []
    bullet_parts = []
    id_num = 0  # running anchor id, unique across all CSVs for this person

    for csv in filter_meta(META, name):
        subset, paths = get_xml_paths(csv, name)
        for path in paths:
            p = make_p(path, name)
            header, bullet = make_header_bullet(path, subset, id_num, p)
            id_num += 1
            header_parts.append(header)
            bullet_parts.append(bullet)

    # every piece is preceded by a newline, as the incremental += did
    headers = ''.join('\n' + part for part in header_parts)
    bullets = ''.join('\n' + part for part in bullet_parts)

    html = customize_template(TEMPLATE, name, headers, bullets)
    fn = '_'.join(name.lower().strip().split(' ')) + '.html'
    fn = os.path.join('/oak/stanford/groups/malgeehe/celebs/chicago_results/names_html', fn)

    # The template declares <meta charset="utf-8">, so write the bytes as
    # UTF-8 rather than in the locale's default encoding.
    with open(fn, 'w', encoding='utf-8') as f:
        f.write(html)

    return fn

In [10]:
# Names of the people to build pages for, one per line.
people = '''Joe Louis
Robert S. Abbott
Booker T. Washington
Duke Ellington
Jack Johnson
Phyllis Wheatley
Jim Crow
Bill Robinson
Abraham Lincoln
John Brown
Jack Ellis
John Smith
William Johnson
Louis Armstrong
Frederick Douglass
Billy Jones
Bob Hayes
J. Wesley Jones
Haile Selassie
Henry Brown
William Smith
Mary Smith
Earl Hines
Jay Gould
James Smith
Cab Calloway
James Johnson
William Campbell
Oscar Depriest
Jesse Owens
Clarence Muse
Fletcher Henderson
Al Monroe
John Williams
William Brown
Mary Williams
James Brown
Mary Jones
George Williams
John Henry Lewis
William Ii
George Washington
Robert Smith
Paul Robeson
William Jones
Charles Johnson
M. Williams
Roland Hayes
George Smith
Mary Johnson
Charles Williams
Walter White
Julian Black
George Johnson
John Johnson
Henry Johnson
Wendell Phillips
George Jones
Marian Anderson
Charles Smith
Robert Johnson
Henry Smith
A. Johnson
Walter Barnes
Henry Armstrong
John Harris
Mary Brown
George Brown
John Henry
John Thomas
Josephine Baker
Charles Jackson
Bessie Smith
Max Schmeling
John Ii
Bert Williams
Ethel Waters
Harry Wills
John Davis
James Weldon Johnson
Robert Brown
William Davis
John Lewis
Jack Dempsey
J. Johnson
John Taylor
Clarence Williams
Louise Beavers
Mary Davis
Edward J. Kelly
William Jackson
Richard B. Harrison
L. K. Williams
James Jones
Dave Peyton
Charles Anderson
James Harris
William Thomas
Charles Jones
Thelma Williams
Warren Williams
J. Smith
Julius Rosenwald
Mike Jacobs
Ralph Cooper
John Robinson
Tom Wilson
William White
M. Jones
John Adams
Sam Taylor
John Jackson
John Anderson
Salem Tutt Whitney
Jack Blackburn
James Jackson
George Harris
Mary Jackson
Franklin D. Roosevelt
Helen Smith
Robert Jones
William Harris
Charles Moore
Andy Kirk
T. Arnold Hill
Robert Williams
Chick Webb
Henry Williams
Morris Lewis
C. J. Walker
Maude Roberts George
C. Johnson
John Jones
M. Johnson
William Taylor
Frank Edwards
Marcus Garvey
A. L. Jackson
Clarence Cameron White
James Ii
Joe Brown
M. Smith
Charles Brown
A. Wayman Ward
Arthur Johnson
Jim Taylor
Jesus Christ
A. Jones
C. Williams
Max Baer
George White
Frank Williams
Arthur W. Mitchell
James Thomas
Willie Bryant
R. R. Wright
Dorothy Smith
Anna Johnson
V. Johnson
James Robinson
Lydia E. Pinkham
Billy King
Edward Smith
Fred Johnson
Charles Thompson
John Wilson
A. Williams
Willie Smith
William Turner
Maude Roberts
C. Smith
John Roxborough
Ruth Johnson
Charles Young
A. L. Foster
J. Williams
George Robinson
Langston Hughes
William H. Jackson
Ernest Hall
A. Smith
Ralph Metcalfe
John White
James Taylor
Frank Jones
Anna Jones
J. C. Austin
Helen Woodward
Alice Williams
William Green
J. Brown
William Anderson
Charles Harris
James Brooks
Alice Jones
Gus Greenlee
Frank Johnson
Clarence Smith
James Davis
James Wilson
Robert Jackson
Walter Smith
George Walker
William Robinson
Bud Harris
K. Williams
Theodore Roosevelt
Frank Smith
John Moore
L. Johnson
James Anderson
Clarence Jones
Edward Jones
Willie Jones
Claude Hopkins
Etta Moten
H. Johnson
Robert Taylor
H. Smith
A. Brown
Walter Johnson
William Walker
Charles Davis
Erskine Tate
Charles Ii
John Scott
Charles Wilson
Andrew Jackson
L. Smith
George Wilson
John Allen
Eddie Tolan
George Lewis
V. Brown
Henry Ford
Uncle Sam
V. Smith
Harry Smith
Fred Smith
Mary Lewis
Henry Wilson
John W. Davis
Edward Johnson
Frank Wilson
C. Jones
C. H. Thomas
Benny Goodman
Roger Williams
Walter Brown
Albert Jones'''

In [11]:
# split the block of names into a list, dropping empty lines
people = list(filter(None, people.split('\n')))

In [12]:
# People who already have a page are skipped on this run: each existing
# .html file name is turned back into a display name ('_' -> ' ', title case).
_html_dir = '/oak/stanford/groups/malgeehe/celebs/chicago_results/names_html'
done = [
    page.split('.html')[0].replace('_', ' ').title()
    for page in os.listdir(_html_dir)
    if page.endswith('.html')
]

In [13]:
# names still to process: everyone in people without an existing page
todo = list(set(people).difference(done))

In [15]:
# one (name, META, TEMPLATE) argument tuple per person still to process
stars = [(person, META, TEMPLATE) for person in todo]

NameError: name 'META' is not defined

In [None]:
# Run get_html_texts once per entry of stars in a multiprocessing pool;
# fns collects the values returned by those calls.
with Pool() as p:
    fns = p.map(get_html_texts, stars)

# Linear process: old code

In [38]:
def make_stars(META, name):
    """Pair every metadata path in ``META`` with ``name``.

    Returns a list of ``(path, name)`` tuples in the order of ``META``.
    """
    return [(path, name) for path in META]

In [39]:
def filter_meta(file, name):
    """Return ``file`` if any of its lines contains ``name``, else ``None``.

    The file is read line by line and reading stops at the first match.
    """
    with open(file) as handle:
        found = any(name in text for text in handle)
    return file if found else None

In [40]:
def filter_metas(stars):
    """Run ``filter_meta`` over ``stars`` in a process pool; keep the hits.

    Parameters
    ----------
    stars : list of (file, name) tuples
        Argument tuples passed to ``filter_meta`` through ``Pool.starmap``.

    Returns
    -------
    list
        The truthy results of ``filter_meta``. An empty ``stars`` yields an
        empty list without starting a pool.

    Prints the elapsed whole seconds together with the name taken from the
    first tuple.
    """
    if not stars:
        # nothing to scan; the report line below would fail on stars[0]
        return []

    start = time.time()

    with Pool() as p:
        out = p.starmap(filter_meta, stars)

    print('\r{} seconds to filter {}'.format(round(time.time() - start), stars[0][1]))
    return [x for x in out if x]

In [46]:
def get_html_texts(META, name, TEMPLATE):
    """Build and write the HTML page for one person, printing progress.

    Parameters
    ----------
    META : list of str
        Metadata CSV paths to search for ``name``.
    name : str
        Person to build the page for.
    TEMPLATE : str
        Page template filled in by ``customize_template``.

    Returns
    -------
    str
        Path of the written ``.html`` file. The file name is the lower-cased
        name with spaces replaced by underscores.
    """
    stars = make_stars(META, name)
    csvs = filter_metas(stars)

    header_parts = []
    bullet_parts = []
    id_num = 0  # running anchor id, unique across all CSVs for this person

    for i, csv in enumerate(csvs):
        print('\rcsv #{} of {}'.format(i + 1, len(csvs)), end = '')
        subset, paths = get_xml_paths(csv, name)
        # separate index name so the outer loop's counter is not shadowed
        for j, path in enumerate(paths):
            p = make_p(path, name)
            header, bullet = make_header_bullet(path, subset, id_num, p)
            id_num += 1
            header_parts.append(header)
            bullet_parts.append(bullet)
            pct = round((j/len(paths)) * 100)
            if pct % 10 == 0:
                print('\r{}% of paths complete'.format(pct), end = '')

    # every piece is preceded by a newline, as the incremental += did
    headers = ''.join('\n' + part for part in header_parts)
    bullets = ''.join('\n' + part for part in bullet_parts)

    html = customize_template(TEMPLATE, name, headers, bullets)
    fn = '_'.join(name.lower().strip().split(' ')) + '.html'
    fn = os.path.join('/oak/stanford/groups/malgeehe/celebs/chicago_results/names_html', fn)

    # The template declares <meta charset="utf-8">, so write the bytes as
    # UTF-8 rather than in the locale's default encoding.
    with open(fn, 'w', encoding='utf-8') as f:
        f.write(html)

    return fn