### PRP

Local directory `millercenter.org` is a simple `wget` mirror. Each page in the `exhibits` directory represents an exhibit.

For each exhibit, if it has a swf/flash embed, we would like to have structured information about each swf/flash file:

* URL
* HTML page title
* title
* date of recording
* participants
* conversation number
* administration

In [1]:
# Prepare a logger.

import logging

# Work on the root logger and drop any handlers already attached to it
# (e.g. from an earlier run of this cell) before configuring it.
logger = logging.getLogger()
logger.handlers = []
# Let every level through, then attach a single stream handler for output.
logger.setLevel(logging.DEBUG)
logger.addHandler(logging.StreamHandler())

In [2]:
# Prepare a file for context information.

# Text log that receives one line per skipped page. It is kept open on purpose
# while pages are parsed and is closed by the final `log.close()` call.
# The explicit UTF-8 encoding makes the file independent of the platform
# locale, so a page name with non-ASCII characters cannot break a write.
log = open('millercenter.log', 'w', encoding='utf-8')

In [3]:
# Prepare a list of dicts (one per exhibit) that is written out as a CSV table at the end.

# Starts empty; one dict is appended for every successfully parsed exhibit.
exhibit_list = list()

In [4]:
# Get a list of all filepaths of pages.

import glob
import os

# Directory of the wget mirror whose entries are the exhibit pages.
basepath = 'millercenter.org/presidentialclassroom/exhibits/'

# Absolute working directory; the relative page paths are resolved against it.
curdir = os.path.abspath(os.curdir)

# Every entry directly under basepath (regular files and directories alike),
# in sorted order.
filepath_list = glob.glob(basepath + '*')
filepath_list.sort()

In [5]:
# Define parser: filepath -> exhibit dict.

import os

from pyquery import PyQuery as pq

def parse_exhibit(filepath, curdir=curdir, logger=logger, log=log):
    """Parse one mirrored exhibit page into a flat dict, or return None.

    Args:
        filepath: Path of the page, resolved against ``curdir``. It is stored
            unchanged under the ``'url'`` key.
        curdir: Directory that ``filepath`` is resolved against.
        logger: Logger that receives a warning for every skipped page.
        log: Writable text file that receives the same message, one per line.

    Returns:
        A dict with the keys ``url``, ``page_title``, ``title``,
        ``date_of_recording``, ``conversation_number`` and one
        ``participantN`` key (numbered from 1) per comma-separated
        participant. ``None`` when the page has no ``article iframe`` or when
        ``parse_struct`` raises ``IndexError`` or ``AttributeError``; the
        reason is reported to ``logger`` and ``log``.
    """
    # Function-scope import so this definition does not rely on an import
    # living in another notebook cell.
    import pathlib

    # Build the file:// URL with pathlib so that characters which are special
    # in URLs ('?', '#', '%', spaces) are percent-encoded. Plain string
    # concatenation would let such a filename be cut off or decoded wrongly.
    page_path = os.path.abspath(os.path.join(curdir, filepath))
    dom = pq(url=pathlib.Path(page_path).as_uri())

    if not dom('article iframe'):
        _log_skipped('no iframe found', filepath, logger, log)
        return None

    exhibit = {
        'url': filepath,
        'page_title': dom('title').text(),
        'title': dom('article h1').text(),
    }

    try:
        date, participants, conversation_number = parse_struct(
            dom, logger=logger, log=log)
    except IndexError:
        # The expected paragraph or one of its fields is missing.
        _log_skipped('structure unknown', filepath, logger, log)
        return None
    except AttributeError:
        # A field position held something that is not a string.
        _log_skipped('structure inconsistent', filepath, logger, log)
        return None

    exhibit['date_of_recording'] = date
    exhibit['conversation_number'] = conversation_number

    # One column per participant: participant1, participant2, ...
    for number, participant in enumerate(participants.split(','), start=1):
        exhibit['participant{}'.format(number)] = participant.strip()

    return exhibit


def _log_skipped(reason, filepath, logger, log):
    """Report a skipped page (by basename) to both the logger and the log file."""
    msg = '{}: {}'.format(reason, os.path.basename(filepath))
    logger.warning(msg)
    log.write(msg + '\n')


def parse_struct(dom, logger=logger, log=log):
    """Extract the recording metadata from an exhibit page.

    Returns a three-item list ``[date, participants, conversation_number]``
    of stripped strings. Raises ``IndexError`` when no ``article p`` with at
    least three ``strong`` children exists, when that paragraph has no
    ``strong`` node among its contents, or when an expected field position is
    missing. ``AttributeError`` propagates when a selected item has no
    ``lstrip`` method. ``logger`` and ``log`` are accepted but not used here.
    """
    # The metadata paragraph is the first one holding three or more <strong>.
    paragraph = None
    for node in dom('article p'):
        candidate = pq(node)
        if len(candidate('strong')) >= 3:
            paragraph = candidate
            break
    if paragraph is None:
        raise IndexError('while looking for p')

    # Find where the first <strong> sits among the paragraph's child nodes.
    nodes = paragraph.contents()
    first_strong = None
    for position, node in enumerate(nodes):
        if getattr(node, 'tag', None) == 'strong':
            first_strong = position
            break
    if first_strong is None:
        raise IndexError('while looking for strong')

    # Keep everything from that node on, minus the '\n\t' separator strings.
    fields = [node for node in nodes[first_strong:] if node != '\n\t']

    # The 'Time' probe looks at the text of all article paragraphs, not only
    # the selected one. When present, the time value occupies index 4 and
    # pushes the two later fields back by three positions.
    if 'Time' in dom('article p').text():
        wanted = (1, 7, 10)
    else:
        wanted = (1, 4, 7)

    # Select first and clean afterwards, in two separate passes, so that a
    # missing position surfaces as IndexError before any AttributeError.
    selected = [fields[index] for index in wanted]
    return [value.lstrip(':').strip() for value in selected]

In [6]:
# Parse all pages.

for filepath in filepath_list:
    # Only regular files are pages; anything else is likely a directory.
    if os.path.isfile(filepath):
        exhibit = parse_exhibit(filepath)
        # parse_exhibit returns None for pages it had to skip.
        if exhibit is not None:
            exhibit_list.append(exhibit)

no iframe found: 1963_0108_status
no iframe found: 1963_0115_dispatching
no iframe found: 1963_0202_wheelerreport
no iframe found: 1963_1002_reporting
no iframe found: 1963_1029_robertkennedy
no iframe found: 1963_1104_overthrow
no iframe found: 1965_0707_king
structure unknown: a-japanese-ultimatum
no iframe found: a-rough-guide-to-richard-nixons-conspiracy-theories
no iframe found: administrations
structure unknown: albert-thomas-gets-the-johnson-treatment
no iframe found: assassination-dr-martin-luther-king
structure unknown: beating-mcgovern
no iframe found: behind-scenes-election-night
no iframe found: behind-the-scenes-election-night
no iframe found: cancer-presidency
structure unknown: civil-rights-act
no iframe found: civil-rights-lyndon-johnson-voter-intimidation-1964-election
no iframe found: conspiracy
no iframe found: delaying-indictments
no iframe found: destroying-tapes
no iframe found: destroying-watergate-tapes
structure unknown: dispatching-the-wheeler-mission
no ifram

In [7]:
# Write table to CSV.

import csv
import sys

# Default column order of the CSV table.
fieldnames = (
    'url',
    'page_title',
    'title',
    'date_of_recording',
    'conversation_number',
    'participant1',
    'participant2',
    'participant3',
    'participant4',
    'participant5',
    'participant6',
)

def write_csv(exhibit_list, fd, fieldnames=fieldnames):
    """Write the exhibits as a CSV table with a header row.

    Args:
        exhibit_list: Iterable of dicts, one per exhibit. Keys missing from a
            dict are written as empty cells.
        fd: Writable text file object that receives the CSV.
        fieldnames: Leading columns, in order.

    Keys that occur in some exhibit but not in ``fieldnames`` (for example
    ``participant7`` on a page with more than six participants) are appended
    as extra columns in order of first appearance. ``csv.DictWriter`` would
    otherwise raise ``ValueError`` on them and abort the export mid-file.
    """
    # Materialize once: the rows are scanned for keys and then written.
    rows = list(exhibit_list)

    columns = list(fieldnames)
    known = set(columns)
    for exhibit in rows:
        for key in exhibit:
            if key not in known:
                known.add(key)
                columns.append(key)

    writer = csv.DictWriter(fd, fieldnames=columns)
    writer.writeheader()
    writer.writerows(rows)

# Write through a context manager so exhibits.csv is flushed and closed before
# anything reads it back. newline='' is what the csv module expects of its
# output file, and the explicit UTF-8 encoding keeps non-ASCII titles intact
# regardless of the platform locale.
with open('exhibits.csv', 'w', newline='', encoding='utf-8') as csv_file:
    write_csv(exhibit_list, csv_file)
log.close()

In [8]:
# Preview the CSV.

import pandas as pd

# Read the file back from disk as a check of what was written. As the last
# expression of the cell, the resulting DataFrame is what the notebook shows.
pd.read_csv('exhibits.csv')

Unnamed: 0,url,page_title,title,date_of_recording,conversation_number,participant1,participant2,participant3,participant4,participant5,participant6
0,millercenter.org/presidentialclassroom/exhibit...,"October 5, 1963: 1,000 Troop Withdrawal from S...","October 5, 1963: 1,000 Troop Withdrawal from S...","Oct 05, 1963",114/A50.1 and 114/A50.2,John Kennedy,Robert McNamara,,,,
1,millercenter.org/presidentialclassroom/exhibit...,"October 2, 1963: 1,000 Troop Withdrawal from S...","October 2, 1963: 1,000 Troop Withdrawal from S...","Oct 02, 1963",114/A49,John Kennedy,Robert McNamara,Maxwell Taylor,McGeorge Bundy,et al.,
2,millercenter.org/presidentialclassroom/exhibit...,A 3 A.M. Phone Call—Miller Center,A 3 A.M. Phone Call,"Mar 28, 1964",WH6403.17-2681,Lyndon Johnson,George Reedy,,,,
3,millercenter.org/presidentialclassroom/exhibit...,A Bipartisan Transportation Bill—Miller Center,A Bipartisan Transportation Bill,"Oct 05, 1966",WH6610-03-10925,Lyndon Johnson,Robert Anderson,,,,
4,millercenter.org/presidentialclassroom/exhibit...,A Global Enemies List—Miller Center,A Global Enemies List,"Oct 27, 1971",604-009,Richard Nixon,Bob Haldeman,Charles Colson,,,
5,millercenter.org/presidentialclassroom/exhibit...,African Americans and the U.S. Military—Miller...,African Americans and the U.S. Military,"Sep 27, 1940",48-61(1),Franklin Roosevelt,A. Philip Randolph,Walter White,,,
6,millercenter.org/presidentialclassroom/exhibit...,All the Incentives are Toward Less Medical Car...,All the Incentives are Toward Less Medical Care,"Feb 17, 1971",450-023,Richard Nixon,John Ehrlichman,,,,
7,millercenter.org/presidentialclassroom/exhibit...,An Optimistic Budget and Solid Poverty Program...,An Optimistic Budget and Solid Poverty Programs,"Dec 23, 1963",K6312-15-01,Lyndon Johnson,Walter Heller,,,,
8,millercenter.org/presidentialclassroom/exhibit...,“As Far As We Can Tell”—Miller Center,“As Far As We Can Tell”,"Aug 04, 1964","NMCC Tapes, VN01.10",Admiral U.S. Grant Sharp,General David Burchinal,,,,
9,millercenter.org/presidentialclassroom/exhibit...,Assessing the War—Miller Center,Assessing the War,"Nov 02, 1965",WH6511.01-9103,Lyndon Johnson,Robert McNamara,,,,
