In [237]:
import lxml.html
import re
import pandas as pd

from lxml.html.clean import Cleaner

In [257]:
# Read the saved HTML page, drop every tab character, and split the text into
# a list of lines (without line terminators) for per-line patching.
with open('grtbloom.html') as fh:
    lines = fh.read().replace('\t', '').splitlines()

In [258]:
def close_li(line):
    """Return ``line`` with ``</li>`` appended if it is an unclosed list item.

    A line counts as unclosed when it starts with ``<li>`` and contains no
    further ``<`` before its end; every other line is returned unchanged.
    """
    is_unclosed = re.match('<li>[^<]*$', line) is not None
    return line + '</li>' if is_unclosed else line

In [259]:
# Patch every line so that bare <li> entries get an explicit closing tag.
lines = [close_li(line) for line in lines]

In [260]:
# Rejoin the patched lines into a single newline-separated HTML string.
raw = '\n'.join(lines)

In [261]:
# Parse the patched HTML string into an lxml document tree.
tree = lxml.html.document_fromstring(raw)

In [262]:
# Configure a Cleaner that removes <cite> tags; all other Cleaner options are
# left at their defaults.
# NOTE(review): per the lxml docs, remove_tags drops the tag itself but keeps
# its text content in the parent -- confirm this is the intended effect.
cleaner = Cleaner(remove_tags=['cite'])

In [267]:
# Run the cleaner over the parsed document; `tree` is rebound to the result.
tree = cleaner.clean_html(tree)

In [268]:
# Flatten the heading/list structure into (age, region, author, title) rows.
# The most recent <h2> text is used as the age and the most recent <h3> text
# as the region for every row that follows them in document order.
rows = []
age, region, author = None, None, None

for el in tree.iter():

    if el.tag == 'h2':
        age = el.text

    elif el.tag == 'h3':
        region = el.text

    elif el.tag == 'li':

        # getnext() returns None when the <li> has no following sibling, so
        # check for that before reading .tag to avoid an AttributeError.
        next_el = el.getnext()

        if next_el is not None and next_el.tag == 'dd':
            # An <li> directly followed by a <dd> is recorded as the author
            # for the <dd> rows that follow.
            author = el.text

        else:
            # Any other <li> is emitted as a title with no author.
            author = None
            rows.append((age, region, author, el.text))

    elif el.tag == 'dd':
        # A <dd> is a title attributed to the most recently seen author.
        rows.append((age, region, author, el.text))

In [269]:
# Build a DataFrame from the collected row tuples, one column per tuple field.
df = pd.DataFrame(rows, columns=('age', 'region', 'author', 'title'))

In [270]:
# Display the parsed table.
df

Unnamed: 0,age,region,author,title
0,A. The Theocratic Age,The Ancient Near East,,Gilgamesh
1,A. The Theocratic Age,The Ancient Near East,,Egyptian Book of the Dead
2,A. The Theocratic Age,The Ancient Near East,,Holy Bible (King James Version)
3,A. The Theocratic Age,The Ancient Near East,,The Apocrypha
4,A. The Theocratic Age,The Ancient Near East,,Sayings of the Fathers (Pirke Aboth)
5,A. The Theocratic Age,Ancient India (Sanskrit),,Mahabharata
6,A. The Theocratic Age,Ancient India (Sanskrit),,Bhagavad-Gita
7,A. The Theocratic Age,Ancient India (Sanskrit),,Ramayana
8,A. The Theocratic Age,The Ancient Greeks,Homer,Iliad
9,A. The Theocratic Age,The Ancient Greeks,Homer,Odyssey


In [254]:
def parse_surname(author):
    """Return the last whitespace-separated word of ``author``.

    Parameters
    ----------
    author : str or None
        Author name as extracted from the page; may be None or empty for
        works without a named author.

    Returns
    -------
    str or None
        The final word of the name, or None when ``author`` is None, empty,
        or consists only of whitespace.
    """
    if not author:
        return None
    # split() with no argument ignores leading/trailing whitespace and
    # collapses runs of it, so a name such as 'Homer ' yields 'Homer'
    # instead of the empty string that split(' ') would produce.
    words = author.split()
    return words[-1] if words else None

In [255]:
# Add a 'surname' column by applying parse_surname to each author value.
# Note: this adds the column to the existing `df` object in place.
df['surname'] = df['author'].apply(parse_surname)

In [256]:
# Display the table again, now including the surname column.
df

Unnamed: 0,age,region,author,title,surname
0,A. The Theocratic Age,The Ancient Near East,,Gilgamesh,
1,A. The Theocratic Age,The Ancient Near East,,Egyptian Book of the Dead,
2,A. The Theocratic Age,The Ancient Near East,,Holy Bible (King James Version),
3,A. The Theocratic Age,The Ancient Near East,,The Apocrypha,
4,A. The Theocratic Age,The Ancient Near East,,Sayings of the Fathers (Pirke Aboth),
5,A. The Theocratic Age,Ancient India (Sanskrit),,Mahabharata,
6,A. The Theocratic Age,Ancient India (Sanskrit),,Bhagavad-Gita,
7,A. The Theocratic Age,Ancient India (Sanskrit),,Ramayana,
8,A. The Theocratic Age,The Ancient Greeks,Homer,Iliad,Homer
9,A. The Theocratic Age,The Ancient Greeks,Homer,Odyssey,Homer
