Creates an HTML document for an individual person in the corpus containing:
- A date-ordered list of documents in which they appear, and the relevant paper
- Links to full-text copies of those articles

In [1]:
import html
import os
import random
import re
import time
from multiprocessing import Pool

import pandas as pd
from lxml import etree

In [2]:
# Page skeleton for one person. It is filled in with str.format by customize_template:
#   {celeb_name} -> the person's name (used for both <title> and <h1>)
#   {toc}        -> the <li> bullets placed inside the <ul>
#   {documents}  -> the per-document headers and text placed after the list
TEMPLATE = '''<!DOCTYPE html>
<html lang="en">
  <head>
    <meta charset="utf-8">
    <title>{celeb_name}</title>
  </head>
  <body>
  
    <header>
        <h1>{celeb_name}</h1>
    </header>
    
    <ul>
        {toc}
    </ul>
    
    {documents}
    
  </body>
</html>
'''

In [3]:
# Sorted full paths of every entry in the annual-names directory,
# skipping any entry whose name contains '._'.
META = '/oak/stanford/groups/malgeehe/celebs/chicago_results/names_annual'
META = sorted([os.path.join(META, entry) for entry in os.listdir(META) if '._' not in entry])

In [4]:
def filter_meta(files, name):
    """Return the files, in their original order, that mention `name`.

    A file is kept when at least one of its lines contains `name` as a
    substring; reading stops at the first matching line.
    """
    def mentions(path):
        # any() short-circuits on the first matching line
        with open(path) as handle:
            return any(name in row for row in handle)

    return [path for path in files if mentions(path)]

In [5]:
def get_xml_paths(csv, name):
    """Load one metadata CSV and select the rows for `name`.

    Parameters
    ----------
    csv : str
        Path to a CSV file with at least `person` and `xml_path` columns.
    name : str
        Person name; rows whose `person` value starts with this text are kept.

    Returns
    -------
    (DataFrame, list)
        The matching rows, and the distinct `xml_path` values among them in
        order of first appearance.
    """
    df = pd.read_csv(csv)
    # Series.str.match interprets its argument as a regular expression, so the
    # '.' in an initial ("A. Smith") would otherwise match any character
    # ("Al Smith"). Escape the name so it is matched literally.
    # na=False makes rows with a missing person count as non-matches instead
    # of producing a mask that cannot be used for boolean indexing.
    is_person = df['person'].str.match(re.escape(name), na=False)
    subset = df[is_person]
    return subset, list(subset['xml_path'].unique())

In [6]:
def make_p(path, name):
    """Render the words of one XML document as an HTML paragraph.

    Every `word` element's text is emitted in document order, separated by
    single spaces. Words equal (case-insensitively) to one of the
    space-separated parts of `name` are wrapped in <mark> tags.

    Parameters
    ----------
    path : str
        Path to the XML file to read.
    name : str
        Person name whose parts should be highlighted.

    Returns
    -------
    str
        The text wrapped in <p>...</p>.
    """
    with open(path) as f:
        tree = etree.parse(f)

    names = [x for x in name.strip().lower().split(' ') if x]

    L = []
    for node in tree.iterfind(".//word"):
        word = node.text
        if word is None:
            # an element with no text has nothing to show (and would crash below)
            continue
        # prevent unintended html formatting: angle brackets are dropped
        # (the old `'<' or '>' in word` test was always true, so this was
        # already happening for every word) ...
        word = word.replace('<', '').replace('>', '')
        # ... and '&' is escaped so it cannot start an HTML entity
        escaped = html.escape(word, quote=False)
        if word.lower() in names:
            L.append('<mark>' + escaped + '</mark>')
        else:
            L.append(escaped)

    return '<p>' + ' '.join(L) + '</p>'

In [7]:
def make_header_bullet(path, subset, id_num, p):
    """Build the section header and the table-of-contents bullet for one document.

    The first row of `subset` whose `xml_path` equals `path` supplies the
    `year`, `paper`, `doc_type` and `doc_title` values. `id_num` is used as
    the anchor id that the bullet links to, and `p` (the document text) is
    appended after the header.

    Returns a `(header, bullet)` pair of HTML strings.
    """
    matching_rows = subset.loc[subset['xml_path'] == path]
    # several rows may share a path; only the first one is used
    record = matching_rows.head(1).to_dict('records')[0]

    data = str(record['year']) + '\t<i>' + record['paper'].strip() + '</i>\t' + record['doc_type']
    title = record['doc_title'].title()

    header = '<a id ="{}"><h3>{}</h3></a>\n<p>{}</p>\n{}'.format(id_num, title, data, p)
    bullet = '<li><a href="#{}">{data}:\t"{title}"</a>'.format(id_num, data = data, title = title)
    return header, bullet

In [8]:
def update_docs(header, bullet, headers='', bullets=''):
    """Append one header and one bullet to the accumulated page fragments.

    Parameters
    ----------
    header, bullet : str
        The new fragments to add.
    headers, bullets : str, optional
        The fragments accumulated so far. They default to empty strings;
        previously they were read without ever being defined, so every call
        raised UnboundLocalError.

    Returns
    -------
    (str, str)
        The updated `(headers, bullets)`, each extended with a newline
        followed by the new fragment.
    """
    headers += '\n' + header
    bullets += '\n' + bullet
    return headers, bullets

In [9]:
def customize_template(template, name, headers, bullets):
    """Fill the page template for one person.

    `name` goes into the `celeb_name` slot, `bullets` into `toc`, and
    `headers` into `documents`.
    """
    slots = {
        'celeb_name': name,
        'toc': bullets,
        'documents': headers,
    }
    return template.format(**slots)

In [10]:
def get_html_texts(tup): # dumb
    """Build and write the HTML page for one person.

    The arguments arrive packed in one tuple so the function can be handed
    to `Pool.map`.

    Parameters
    ----------
    tup : tuple
        `(name, META, TEMPLATE)`: the person's name, the list of metadata
        CSV paths to search, and the page template string.

    Returns
    -------
    str
        Path of the HTML file that was written.
    """
    name, META, TEMPLATE = tup
    csvs = filter_meta(META, name)
    id_num = 0  # anchor id, unique across all CSVs for this person
    headers = []
    bullets = []

    for csv in csvs:
        subset, paths = get_xml_paths(csv, name)
        for path in paths:
            p = make_p(path, name)
            header, bullet = make_header_bullet(path, subset, id_num, p)
            id_num += 1
            headers.append(header)
            bullets.append(bullet)

    # each fragment is preceded by a newline, as before
    page = customize_template(TEMPLATE, name,
                              ''.join('\n' + h for h in headers),
                              ''.join('\n' + b for b in bullets))

    fn = name.lower().strip().split(' ')
    fn = '_'.join(fn) + '.html'
    fn = os.path.join('/oak/stanford/groups/malgeehe/celebs/chicago_results/names_html', fn)

    # The template declares <meta charset="utf-8">, so write the file as UTF-8
    # rather than in the platform default encoding. The with-block closes it.
    with open(fn, 'w', encoding='utf-8') as f:
        f.write(page)

    return fn

In [12]:
# Roster of person names, one per line, for which pages are generated.
# It is kept as a single newline-separated string here; a later cell splits it into a list.
# Names are written in title case (e.g. 'Oscar Depriest', 'William Ii'); a later cell compares
# them against .title()-cased names derived from the existing output filenames.
# NOTE(review): 'Charles Smith' is listed twice; the later set-based comparison de-duplicates it.
people = '''Al Monroe
Robert S. Abbott
Phyllis Wheatley
Salem Tutt Whitney
Maude Roberts George
Clarence Cameron White
A. Wayman Ward
R. R. Wright
Billy King
Ernest Hall
Marjorie Stewart Joyner
Willie Williams
A. Philip Randolph
Nina Mae Mckinney
Walter Speedy
Walter Barnes
Ada Brown
Emmett J. Scott
Virgil Williams
Anita Patti Brown
Lulu Coates
Robert R. Moton
Charlie Smith
Gladys Brown
Frederick Douglass
Willie Mae
Billy Tucker
Thelma Williams
C. J. Walker
Lydia E. Pinkham
Maude Roberts
William H. Jackson
Jack Ellis
Alice Williams
James A. Cobb
Erskine Tate
Jessie Smith
Dave Peyton
Louis Armstrong
William Pickens
Booker T. Washington
Morris Lewis
Fats Waller
Alma Smith
Mary F. Waring
L. K. Williams
Bob Hayes
Clarence Muse
Etta Moten
T. Arnold Hill
Jim Taylor
Willie Bryant
Bessie Smith
Helen Woodward
Mae Johnson
K. Williams
V. Brown
Henry Brown
Louise Beavers
C. H. Thomas
J. C. Austin
Albert Jones
Perry W. Howard
Mary Jackson
Alice Jones
Gus Greenlee
Oscar Depriest
A. L. Jackson
Jim Crow
Walter White
James Weldon Johnson
Duke Ellington
M. Williams
Charles S. Johnson
Paul Laurence Dunbar
Lillian Brown
Anna Jones
Claude Hopkins
Clarence Williams
Cab Calloway
Eddie Green
Satchel Paige
Elizabeth Johnson
V. Williams
Fletcher Henderson
William Turner
Mildred Johnson
Richard Allen
Edna Brown
Warren Williams
James Brooks
Mary Davis
Bud Harris
Willie Johnson
Russell Brown
Andy Kirk
Henry Lincoln Johnson
Henry Williams
Julia Smith
Laura Smith
Langston Hughes
John Harris
George Garner
James Thomas
Alice Turner
Richard B. Harrison
Ralph Cooper
Mary Williams
Emma Smith
Mary Robinson
James Robinson
Willie Jones
Chick Webb
Charles Jackson
Mary Carter
M. Jones
Mary Johnson
Frank Jones
George Jackson
George Harris
Willie Smith
Rube Foster
Elizabeth Williams
Josephine Baker
Ethel Waters
James Harris
Marcus Garvey
Mary White
Mary Jones
John Jackson
Fred Williams
Ruth Jones
George Jones
Mary Scott
Henry Harris
James Carter
Henry Thomas
James Johnson
William White
James Taylor
Frank Edwards
James Jones
Charles Moore
Tom Wilson
J. Wesley Jones
Henry Smith
James Jackson
Charles Thompson
William Robinson
William Jackson
A. L. Foster
J. Williams
George Robinson
Robert Thomas
Albert B. George
Charles Lewis
Mary Brown
Helen Taylor
Robert Harris
James Lewis
William Jones
Joe Johnson
Frank Jackson
Dorothy Williams
Mary Harris
William Davis
Alice Johnson
Arthur W. Mitchell
V. Johnson
Bill Robinson
Helen Jones
Henry Jackson
Mary Thomas
John Green
Willie Lewis
Charles Taylor
James Moore
Sam Taylor
Wesley Jones
George Walker
John Williams
William Johnson
William Bell
William Lewis
John Robinson
James Allen
T. Williams
John Hill
Mary Moore
Anna Johnson
H. Williams
Roland Hayes
C. Williams
J. Jones
Joseph Johnson
Arthur Williams
Henry Jones
James Mitchell
Charles Young
A. Thomas
Charles Williams
Robert Brown
Robert Smith
Lillian Smith
Paul Robeson
William Harris
Edward Jones
Billy Jones
Ruth Johnson
Fred Johnson
John Wright
Clarence Jones
Dorothy Johnson
Mary Smith
Dorothy Smith
William Smith
A. Williams
Charles Jones
James A. Mundy
Earl Hines
Robert Williams
Frank Williams
Henry Johnson
James Brown
Charles Harris
Robert Johnson
Marian Anderson
James Scott
William Anderson
William Brown
Frank Brown
William Taylor
George Johnson
Robert Jones
B. Williams
James White
Arthur Smith
Charles Anderson
Clara Smith
Elizabeth Smith
James Smith
Joe Williams
James Anderson
Walter Smith
John Taylor
A. Davis
Charles Johnson
Helen Smith
John Henry
Julian Black
M. Johnson
John Johnson
William Thomas
M. Brown
Charles Brown
William Moore
Frank Young
Willie Davis
Charles Smith
William Allen
John Davis
Jack Johnson
H. Jones
George Allen
M. Smith
Helen Johnson
John Wilson
Robert Jackson
J. Brown
George Lewis
Henry Wilson
James Wilson
William E. King
John Roxborough
A. Jones
L. Johnson
James Davis
Jay Gould
Walter Brown
A. Jackson
Charles Davis
Mary Lewis
John Jones
H. Johnson
William Wilson
Bert Williams
George Brown
Fred Smith
Eddie Tolan
John Henry Lewis
John Allen
Wendell Phillips
George Taylor
Arthur Johnson
John Carter
Roger Williams
William Walker
Jesse Owens
Ben Johnson
Jack Blackburn
Jim Brown
Joe Brown
John Thomas
V. Smith
Clarence Smith
Walter Williams
John Brown
George Bell
Charles Wilson
J. Johnson
John Wesley
John Moore
H. Brown
John White
C. Jones
Harry Smith
John Scott
Edward Smith
Henry Armstrong
William Clark
Albert Johnson
A. Johnson
A. Brown
Joe Louis
George Williams
Ralph Metcalfe
Frank Johnson
George Wilson
L. Jones
John Adams
John Smith
John Lewis
William Ii
George Smith
H. Smith
John Ii
C. Johnson
Harry Wills
L. Brown
Haile Selassie
Joseph Smith
James Ii
John Anderson
L. Smith
Frank Wilson
Benny Goodman
A. Smith
Jesus Christ
Edward Johnson
Andrew Jackson
Mike Jacobs
Frank Smith
Charles Ii
Max Schmeling
J. Smith
James A.
C. Smith
Julius Rosenwald
Robert Taylor
Abraham Lincoln
Thomas Jefferson
Walter Johnson
George White
Edward J. Kelly
John W. Davis
William Green
George Washington
Max Baer
John Thompson
John Marshall
Franklin D. Roosevelt
Mickey Walker
Fred Allen
Uncle Sam
John Powell
John D. Rockefeller
Frank Murphy
Santa Claus
Al Jolson
Henry Ford
Woodrow Wilson
Jack Sharkey
Theodore Roosevelt
John A.
Harry L. Hopkins
Bing Crosby
Jack Dempsey
Jack Kearns
Will Rogers
Harold L. Ickes
Joe Davis
James A. Farley
Leonard Wood
William Powell
George Vi
Cecil Smith
Barney Ross
John L. Sullivan
Eddie Cantor
Jane Addams
Claudette Colbert
Joseph B. David
William E. Borah
Herbert Hoover
Huey Long
James Simpson
Paul Whiteman
John Evans
Jack Benny
Rufus C. Dawes
Rudy Vallee
Calvin Coolidge
Mary Garden
Charles Collins
Anthony Eden
Jan Garber
Pat O'Brien
Dick Powell
Henry Horner
William J.
Gene Tunney
Ben Bernie
John J. Pershing
Alfred E. Smith
Jack Holt
Robert E. Crowe
James Hamilton Lewis
Babe Ruth
Douglas Fairbanks
Bobby Jones
Bud Taylor
Neville Chamberlain
Harold Lloyd
Len Small
Mary Pickford
George M. Cohan
Frank Parker
Gloria Swanson
Joan Crawford
Edward Hines
Mark Twain
Cordell Hull
Helen Hayes
Tex Rickard
Francisco Franco
John Dillinger
John J.
Lionel Barrymore
Al Smith
Greta Garbo
Mayor Kelly
Joe E. Brown
Philip J. Finnegan
Jean Harlow
Dick Hanley
Sammy Mandell
John Simon
Earl Carroll
John L. Lewis
Bill Terry
Hugh S. Johnson
Clark Gable
Carole Lombard
Ann Harding
Chick Evans
Adolf Hitler
Joan Bennett
Constance Bennett
John P. Barnes
Frederick Stock
Jackie Coogan
Connie Mack
Myrna Loy
Joseph Sabath
Charles S. Deneen
John Barrymore
Harry Cooper
John Alden Carpenter
Norma Shearer
Gary Cooper
Kay Francis
Alice Brady
Charles A. Lindbergh
Norma Talmadge
Al Capone
Charlie Chaplin
Grace Moore
Elmer Layden
Cary Grant
Clayton F. Smith
Oscar Nelson
Walter Hagen
John Timothy Stone
Herman N. Bundesen
Frank J. Loesch
Loretta Young
Charles G. Dawes
Bob Elson
Glen Ellyn
Irene Dunne
Joan Blondell
Spencer Tracy
Walter Dill Scott
Joe Mccarthy
Janet Gaynor
Helen Jacobs
Dizzy Dean
Conrad Nagel
Lou Gehrig
Chauncey Mccormick
Bebe Daniels
Edward Moore
Floyd Gibbons
Al Simmons
J. Ogden Armour
Ronald Colman
Frank L. Smith
Samuel Insull
Marion Davies
Wallace Beery
Ted Lyons
Eddie Collins
George Craig Stewart
Abel Davis
Arthur Evans
Robert J. Dunham
Harold Stokes
Lloyd George
Bette Davis
John Steele
Westbrook Pegler
Fred Harvey
La Follette
A. A. Sprague
Babe Herman
Jimmy Dykes
Lynn Waldorf
Lon Warneke
Anton J. Cermak
D. F. Kelly
Jim Londos
Bill Lee
Henry A. Miller
Ted Weems
Michael L. Igoe
Ely Culbertson
Tommy Armour
Jacob Baur
Walter Eckersall
Elmer Douglass
Edward Barry
Gabby Hartnett
John S. Clark
Potter Palmer
Quin Ryan
Harvey T. Woodruff
Oscar Hewitt
Stephen Decatur
Winnie Winkle
Sally Joy Brown
Antoinette Donnelly'''

In [13]:
# Turn the newline-separated roster into a list of names, dropping blank lines.
# The isinstance guard keeps this cell re-runnable: once `people` is already a
# list there is no .split to call, so only the blank-line filter is applied.
if isinstance(people, str):
    people = people.split('\n')
people = [x for x in people if x]

In [14]:
# Names that already have a page: every *.html file in the output directory,
# with the extension dropped, underscores turned back into spaces, and the
# result title-cased so it can be compared with the roster.
done = [
    fname.split('.html')[0].replace('_', ' ').title()
    for fname in os.listdir('/oak/stanford/groups/malgeehe/celebs/chicago_results/names_html')
    if fname.endswith('.html')
]

In [15]:
# how many people already have an HTML page
len(done)

441

In [16]:
# People on the roster who do not have a page yet.
# Sorted so the work list is in the same order on every run; a bare
# list(set(...)) comes back in arbitrary order.
todo = sorted(set(people) - set(done))

In [17]:
# how many people still need a page
len(todo)

121

In [18]:
# One (name, META, TEMPLATE) argument tuple per remaining person, in the
# single-argument shape that Pool.map passes to get_html_texts.
stars = [(name, META, TEMPLATE) for name in todo]

In [None]:
# Build the pages in parallel, one task per person; `fns` collects the
# path of each file written, in the same order as `stars`.
with Pool() as pool:
    fns = pool.map(get_html_texts, stars)

# Linear process: old code

In [None]:
def make_stars(META, name):
    """Pair every metadata path in `META` with `name`.

    Returns a list of `(path, name)` tuples, one per entry of `META`, in the
    argument shape that `Pool.starmap` unpacks.
    """
    return [(path, name) for path in META]

In [None]:
def filter_meta(file, name):
    """Return `file` if any of its lines contains `name`, otherwise None.

    NOTE(review): this single-file version reuses the name of the list-based
    `filter_meta(files, name)` defined in an earlier cell; running this cell
    replaces that definition.
    """
    with open(file) as handle:
        # any() stops reading at the first matching line
        found = any(name in row for row in handle)
    return file if found else None

In [None]:
def filter_metas(stars):
    """Run `filter_meta` over every `(file, name)` pair in parallel.

    Parameters
    ----------
    stars : list of (str, str)
        `(file, name)` argument pairs, as produced by `make_stars`.

    Returns
    -------
    list
        The truthy results of `filter_meta`, i.e. the files that matched.
        An empty `stars` yields an empty list.
    """
    if not stars:
        # Nothing to scan. Returning here also avoids the IndexError that
        # stars[0][1] in the timing message would raise on an empty list.
        return []

    start = time.time()

    with Pool() as p:
        out = p.starmap(filter_meta, stars)

    # stars[0][1] is the name shared by every pair
    print('\r{} seconds to filter {}'.format(round(time.time() - start), stars[0][1]))
    return [x for x in out if x]

In [None]:
def get_html_texts(META, name, TEMPLATE):
    """Build and write the HTML page for one person (older, per-person version).

    NOTE(review): this reuses the name of the single-tuple `get_html_texts(tup)`
    defined in an earlier cell; running this cell replaces that definition.

    Parameters
    ----------
    META : list of str
        Metadata CSV paths to search.
    name : str
        The person's name.
    TEMPLATE : str
        The page template string.

    Returns
    -------
    str
        Path of the HTML file that was written.
    """
    stars = make_stars(META, name)
    csvs = filter_metas(stars)
    id_num = 0  # anchor id, unique across all CSVs for this person
    headers = []
    bullets = []

    for i, csv in enumerate(csvs):
        print('\rcsv #{} of {}'.format(i + 1, len(csvs)), end = '')
        subset, paths = get_xml_paths(csv, name)
        # the inner index has its own name so it no longer shadows the CSV index
        for j, path in enumerate(paths):
            p = make_p(path, name)
            header, bullet = make_header_bullet(path, subset, id_num, p)
            id_num += 1
            headers.append(header)
            bullets.append(bullet)
            pct = round((j/len(paths)) * 100)
            if pct % 10 == 0:
                print('\r{}% of paths complete'.format(pct), end = '')

    # each fragment is preceded by a newline, as before
    page = customize_template(TEMPLATE, name,
                              ''.join('\n' + h for h in headers),
                              ''.join('\n' + b for b in bullets))

    fn = name.lower().strip().split(' ')
    fn = '_'.join(fn) + '.html'
    fn = os.path.join('/oak/stanford/groups/malgeehe/celebs/chicago_results/names_html', fn)

    # The template declares <meta charset="utf-8">, so write the file as UTF-8
    # rather than in the platform default encoding. The with-block closes it.
    with open(fn, 'w', encoding='utf-8') as f:
        f.write(page)

    return fn