Creates an HTML document for an individual person in the corpus containing:
- A date-ordered list of documents in which they appear, and the relevant paper
- Links to full-text copies of those articles

In [2]:
import html
import os
import random
import time
from multiprocessing import Pool

import pandas as pd
from lxml import etree

In [3]:
# Page skeleton filled in by `customize_template` through `str.format`.
# Placeholders: {celeb_name} (title and <h1>), {toc} (contents of the <ul>),
# {documents} (the per-document sections that follow the list).
TEMPLATE = '''<!DOCTYPE html>
<html lang="en">
  <head>
    <meta charset="utf-8">
    <title>{celeb_name}</title>
  </head>
  <body>
  
    <header>
        <h1>{celeb_name}</h1>
    </header>
    
    <ul>
        {toc}
    </ul>
    
    {documents}
    
  </body>
</html>
'''

In [4]:
# Directory holding the per-year metadata files; META is rebound below to the
# sorted list of full paths inside it (entries containing '._' are skipped).
META = '/oak/stanford/groups/malgeehe/celebs/chicago_results/names_annual'
META = sorted(
    os.path.join(META, entry)
    for entry in os.listdir(META)
    if '._' not in entry
)

In [16]:
def filter_meta(files, name):
    """Return the subset of `files` whose contents mention `name`.

    Each file is read line by line and scanning stops at the first line that
    contains `name` as a plain, case-sensitive substring.

    Note: a later cell in this notebook defines another
    `filter_meta(file, name)` that takes a single path; whichever cell was
    run last is the definition in effect.

    Parameters
    ----------
    files : iterable of str
        Paths of the text files to scan.
    name : str
        Substring to look for.

    Returns
    -------
    list of str
        The matching paths, in the order they were given.
    """
    matches = []
    for file in files:
        with open(file) as f:
            # any() short-circuits on the first matching line.
            if any(name in line for line in f):
                matches.append(file)
    return matches

In [41]:
def get_xml_paths(csv, name):
    """Select the rows of one metadata CSV that belong to `name`.

    Parameters
    ----------
    csv : str or file-like
        Anything `pandas.read_csv` accepts. The table must have `person`
        and `xml_path` columns.
    name : str
        Person name; a row is kept when its `person` value starts with
        this exact text.

    Returns
    -------
    (pandas.DataFrame, list)
        The matching rows, and the distinct `xml_path` values among them in
        order of first appearance.
    """
    df = pd.read_csv(csv)
    # Literal prefix test. `str.match(name)` anchored at the start as well, but
    # it read the '.' in initials ("J. Smith") as a regex wildcard, and a
    # missing `person` value put NaN into the mask, which cannot index a frame.
    is_person = df['person'].str.startswith(name, na=False)
    subset = df[is_person]
    return subset, list(subset['xml_path'].unique())

In [42]:
def make_p(path, name):
    """Render the words of one XML document as a single HTML paragraph.

    The text of every `<word>` element is collected in document order. A word
    that equals, ignoring case, one of the space-separated tokens of `name`
    is wrapped in `<mark>` tags.

    Parameters
    ----------
    path : str
        Path of the XML file to read.
    name : str
        Person name whose tokens should be highlighted.

    Returns
    -------
    str
        '<p>' + the space-joined (and HTML-escaped) words + '</p>'.
    """
    # A set gives constant-time membership tests for the per-word check below.
    targets = {token for token in name.strip().lower().split(' ') if token}

    with open(path) as f:
        tree = etree.parse(f)
        # Empty <word/> elements have text None; skip them rather than
        # crashing on None.lower() further down.
        words = [node.text for node in tree.iterfind(".//word") if node.text]

    marked = []
    for word in words:
        # Escape so a stray '&' or '<' in the source text cannot corrupt the
        # markup of the generated page.
        safe = html.escape(word, quote=False)
        if word.lower() in targets:
            marked.append('<mark>' + safe + '</mark>')
        else:
            marked.append(safe)
    return '<p>' + ' '.join(marked) + '</p>'

In [43]:
def make_header_bullet(path, subset, id_num, p):
    """Build the section header and the table-of-contents bullet for one document.

    Parameters
    ----------
    path : str
        `xml_path` value identifying the document inside `subset`.
    subset : pandas.DataFrame
        Metadata rows; the `xml_path`, `year`, `paper`, `doc_type` and
        `doc_title` columns are read.
    id_num : int
        Anchor id shared by the header (`id`) and the bullet (`href`).
    p : str
        Already-rendered HTML body of the document, appended to the header.

    Returns
    -------
    (str, str)
        `header` (anchor + title + metadata line + body) and `bullet`
        (an `<li>` linking to that anchor).

    Raises
    ------
    IndexError
        If no row of `subset` has the given `xml_path`.
    """
    # Several rows may share an xml_path; the first one supplies the metadata.
    records = subset[subset['xml_path'] == path].head(1).to_dict('records')
    if not records:
        raise IndexError('no metadata row for xml_path {!r}'.format(path))
    row = records[0]

    # Escape the free-text fields so '&' or '<' in a headline cannot break
    # the generated markup.
    title = html.escape(row['doc_title'].title(), quote=False)
    paper = html.escape(row['paper'].strip(), quote=False)
    doc_type = html.escape(row['doc_type'], quote=False)
    data = str(row['year']) + '\t<i>' + paper + '</i>\t' + doc_type

    header = '<a id ="{}"><h3>{}</h3></a>\n<p>{}</p>\n{}'.format(id_num, title, data, p)
    # The list item is now closed explicitly so the page stays well-formed.
    bullet = '<li><a href="#{}">{data}:\t"{title}"</a></li>'.format(id_num, data=data,
                                                                   title=title)
    return header, bullet

In [44]:
def update_docs(header, bullet, headers='', bullets=''):
    """Append one document's header and bullet to the running HTML strings.

    The accumulators are passed in explicitly; previously the function
    augmented-assigned to `headers` and `bullets` without ever receiving
    them, which raised UnboundLocalError on every call.

    Parameters
    ----------
    header : str
        HTML section for the document.
    bullet : str
        Table-of-contents entry for the document.
    headers : str, optional
        Sections accumulated so far (default: empty).
    bullets : str, optional
        Table-of-contents entries accumulated so far (default: empty).

    Returns
    -------
    (str, str)
        The extended `headers` and `bullets`, each with the new piece added
        after a newline.
    """
    return headers + '\n' + header, bullets + '\n' + bullet

In [45]:
def customize_template(template, name, headers, bullets):
    """Fill the page template for one person.

    `name` goes into `{celeb_name}`, `bullets` into `{toc}` and `headers`
    into `{documents}`; the formatted string is returned.
    """
    fields = {
        'celeb_name': name,
        'toc': bullets,
        'documents': headers,
    }
    return template.format(**fields)

In [46]:
def get_html_texts(META, name, TEMPLATE,
                   out_dir='/oak/stanford/groups/malgeehe/celebs/chicago_results/names_html'):
    """Build and save the HTML page for one person.

    The metadata files are narrowed to those mentioning `name`
    (`make_stars` + `filter_metas`); for each remaining CSV the person's
    documents are looked up (`get_xml_paths`), rendered (`make_p`,
    `make_header_bullet`) and finally poured into `TEMPLATE`
    (`customize_template`).

    Note: `make_stars` and `filter_metas` are defined in cells further down
    this notebook, so those cells must be run before this function is called.

    Parameters
    ----------
    META : list of str
        Paths of the metadata files to search.
    name : str
        Person to build the page for.
    TEMPLATE : str
        Page skeleton with `{celeb_name}`, `{toc}` and `{documents}` fields.
    out_dir : str, optional
        Directory the page is written to; defaults to the project's
        `names_html` folder.

    Returns
    -------
    str
        Path of the HTML file that was written.
    """
    stars = make_stars(META, name)
    csvs = filter_metas(stars)
    id_num = 0  # running anchor id, unique across all CSVs
    headers = []
    bullets = []

    # Distinct index names: the inner loop used to reuse (and clobber) `i`.
    for csv_idx, csv in enumerate(csvs):
        print('\rcsv #{} of {}'.format(csv_idx + 1, len(csvs)), end = '')
        subset, paths = get_xml_paths(csv, name)
        for path_idx, path in enumerate(paths):
            p = make_p(path, name)
            header, bullet = make_header_bullet(path, subset, id_num, p)
            id_num += 1
            headers.append(header)
            bullets.append(bullet)
            pct = round((path_idx/len(paths)) * 100)
            if pct % 10 == 0:
                print('\r{}% of paths complete'.format(pct), end = '')

    # Join once at the end; every piece is preceded by a newline, exactly as
    # the former `+= '\n' + piece` accumulation produced.
    page = customize_template(TEMPLATE, name,
                              ''.join('\n' + h for h in headers),
                              ''.join('\n' + b for b in bullets))
    fn = '_'.join(name.lower().strip().split(' ')) + '.html'
    fn = os.path.join(out_dir, fn)

    # The with-block closes the file; no explicit close() is needed.
    with open(fn, 'w') as f:
        f.write(page)

    return fn

In [47]:
# People to build pages for, one name per line (the string is split on '\n'
# in the following cell).
# NOTE(review): some entries look truncated or oddly cased (e.g. "William J.",
# "John A.", "George Vi", "William Ii") — presumably they mirror the `person`
# values in the metadata CSVs; confirm before changing them here.
people = '''Uncle Sam
Charles G. Dawes
Bobby Jones
Joan Crawford
Westbrook Pegler
Clark Gable
Quin Ryan
Len Small
Potter Palmer
Frederick Stock
Gary Cooper
Mary Pickford
Eddie Cantor
Barney Ross
Max Baer
Al Capone
Charles A. Lindbergh
A. A. Sprague
Gene Tunney
Greta Garbo
Norma Shearer
Harry Cooper
John S. Clark
Edward J. Kelly
Ted Lyons
Santa Claus
Alfred E. Smith
Gabby Hartnett
Tex Rickard
Joseph Sabath
Calvin Coolidge
Helen Hayes
Charlie Chaplin
Edward Barry
Mark Twain
William Green
Al Jolson
Gloria Swanson
Frank Parker
Bing Crosby
Jane Addams
Will Rogers
Elmer Douglass
Bob Elson
Al Simmons
William J.
Rudy Vallee
Claudette Colbert
John Powell
Joe Davis
James Simpson
Harold L. Ickes
Herman N. Bundesen
John Barrymore
Huey Long
Lou Gehrig
Walter Hagen
Dizzy Dean
Jack Benny
Jacob Baur
Paul Whiteman
Janet Gaynor
George Vi
Bette Davis
Tommy Armour
Harold Lloyd
Myrna Loy
Connie Mack
Max Schmeling
James A. Farley
Ely Culbertson
George White
Bill Terry
James Hamilton Lewis
Joe E. Brown
Jack Sharkey
Harold Stokes
Julius Rosenwald
Elmer Layden
Anthony Eden
Michael L. Igoe
John P. Barnes
Cordell Hull
Jack Holt
Leonard Wood
John Marshall
Robert J. Dunham
Ted Weems
Kay Francis
William E. Borah
Joe Mccarthy
Ann Harding
Henry A. Miller
John Thompson
Walter Dill Scott
Lionel Barrymore
Ben Bernie
Neville Chamberlain
Abel Davis
J. Smith
Bill Lee
John Simon
John A.
George Craig Stewart
Robert E. Crowe
Spencer Tracy
Joan Blondell
Eddie Collins
Irene Dunne
John Alden Carpenter
Francisco Franco
Jean Harlow
Jim Londos
William Powell
Loretta Young
Dick Powell
Wallace Beery
Hugh S. Johnson
Frank J. Loesch
Rufus C. Dawes
Walter Johnson
D. F. Kelly
Anton J. Cermak
Charles S. Deneen
Marion Davies
Pat O'Brien
John Timothy Stone
Jack Kearns
Frank L. Smith
Harry L. Hopkins
Ronald Colman
Jackie Coogan
Mickey Walker
Haile Selassie
J. Ogden Armour
Bud Taylor
John W. Davis
Douglas Fairbanks
Joseph B. David
Oscar Nelson
Lon Warneke
C. Smith
Constance Bennett
Clayton F. Smith
Earl Carroll
Mary Garden
Lynn Waldorf
Cary Grant
Joan Bennett
John Dillinger
Robert Taylor
Cecil Smith
Floyd Gibbons
Chick Evans
Frank Murphy
Grace Moore
Jimmy Dykes
Fred Allen
Sammy Mandell
Dick Hanley
Bebe Daniels
Babe Herman
Carole Lombard
Jan Garber
Edward Hines
Chauncey Mccormick
John J. Pershing
John D. Rockefeller
Norma Talmadge
Conrad Nagel
John L. Sullivan
Philip J. Finnegan
Alice Brady
George M. Cohan
Helen Jacobs
John Smith
Mike Jacobs
Thomas Jefferson
Frank Smith
George Smith
William Ii
Charles Ii
John Brown
James A.
Jesus Christ
Jack Johnson
Harry Wills
A. Smith
John Ii
Andrew Jackson
John Anderson
George Williams
Edward Johnson
Jesse Owens
James Ii
L. Smith
John Lewis'''

In [48]:
# Turn the newline-separated block of names into a list. Guarded so that
# re-running this cell when `people` is already a list is a no-op rather than
# an AttributeError (lists have no .split).
if isinstance(people, str):
    people = people.split('\n')

In [None]:
# Build one HTML page per person, printing the output path and a running count.
for done, person in enumerate(people, start=1):
    fn = get_html_texts(META, person, TEMPLATE)
    print(fn)
    print('\r{} of {}'.format(done, len(people)), end = '')

1 seconds to filter Antoinette Donnelly
70% of paths complete

# Linear process: old code

In [38]:
def make_stars(META, name):
    """Pair every metadata path with `name`.

    Returns a list of `(path, name)` tuples, one per entry of `META`, in the
    same order — the argument tuples later handed to `Pool.starmap`.
    """
    return [(meta_path, name) for meta_path in META]

In [39]:
def filter_meta(file, name):
    """Return `file` if any of its lines contains `name`, otherwise None.

    The file is read line by line and reading stops at the first hit.
    """
    with open(file) as handle:
        found = any(name in row for row in handle)
    return file if found else None

In [40]:
def filter_metas(stars):
    """Run `filter_meta` over all `(path, name)` pairs in a process pool.

    Parameters
    ----------
    stars : list of (str, str)
        Argument tuples for `filter_meta`, as built by `make_stars`.

    Returns
    -------
    list of str
        The paths for which `filter_meta` returned a truthy value, in input
        order. An empty `stars` yields an empty list without starting a pool.
    """
    if not stars:
        # Nothing to scan; this also avoids indexing stars[0] for the log
        # line below, which raised IndexError on empty input.
        return []

    start = time.time()

    with Pool() as p:
        out = p.starmap(filter_meta, stars)

    print('\r{} seconds to filter {}'.format(round(time.time() - start), stars[0][1]))
    # filter_meta yields None for files without a match; drop those.
    return [x for x in out if x]

In [46]:
def get_html_texts(META, name, TEMPLATE,
                   out_dir='/oak/stanford/groups/malgeehe/celebs/chicago_results/names_html'):
    """Build and save the HTML page for one person.

    The metadata files are narrowed to those mentioning `name`
    (`make_stars` + `filter_metas`); for each remaining CSV the person's
    documents are looked up (`get_xml_paths`), rendered (`make_p`,
    `make_header_bullet`) and finally poured into `TEMPLATE`
    (`customize_template`).

    Note: a function with this same name is also defined in an earlier cell
    of this notebook; whichever cell was run last is the one in effect.

    Parameters
    ----------
    META : list of str
        Paths of the metadata files to search.
    name : str
        Person to build the page for.
    TEMPLATE : str
        Page skeleton with `{celeb_name}`, `{toc}` and `{documents}` fields.
    out_dir : str, optional
        Directory the page is written to; defaults to the project's
        `names_html` folder.

    Returns
    -------
    str
        Path of the HTML file that was written.
    """
    stars = make_stars(META, name)
    csvs = filter_metas(stars)
    id_num = 0  # running anchor id, unique across all CSVs
    headers = []
    bullets = []

    # Distinct index names: the inner loop used to reuse (and clobber) `i`.
    for csv_idx, csv in enumerate(csvs):
        print('\rcsv #{} of {}'.format(csv_idx + 1, len(csvs)), end = '')
        subset, paths = get_xml_paths(csv, name)
        for path_idx, path in enumerate(paths):
            p = make_p(path, name)
            header, bullet = make_header_bullet(path, subset, id_num, p)
            id_num += 1
            headers.append(header)
            bullets.append(bullet)
            pct = round((path_idx/len(paths)) * 100)
            if pct % 10 == 0:
                print('\r{}% of paths complete'.format(pct), end = '')

    # Join once at the end; every piece is preceded by a newline, exactly as
    # the former `+= '\n' + piece` accumulation produced.
    page = customize_template(TEMPLATE, name,
                              ''.join('\n' + h for h in headers),
                              ''.join('\n' + b for b in bullets))
    fn = '_'.join(name.lower().strip().split(' ')) + '.html'
    fn = os.path.join(out_dir, fn)

    # The with-block closes the file; no explicit close() is needed.
    with open(fn, 'w') as f:
        f.write(page)

    return fn