In [1]:
# Imports

import os

from collections import defaultdict, Counter

import urllib.request
import urllib.parse

import lxml.html
from lxml import etree

from tqdm import tqdm

from pprint import pprint

In [2]:
# URIs for Perseus texts of Terence
# (works phi001 through phi006 under author directory phi0134)

perseus_terence_base = 'https://raw.githubusercontent.com/PerseusDL/canonical-latinLit/master/data/phi0134'
uris = [
    f'{perseus_terence_base}/phi00{n}/phi0134.phi00{n}.perseus-lat2.xml'
    for n in range(1, 7)
]

In [3]:
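# Get texts
#
# Commented out: this download only needs to be run once. When enabled, it
# fetches each URI and saves the XML as texts/<lowercased title>.xml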

# for uri in tqdm(uris):
#     ns = {'tei': 'http://www.tei-c.org/ns/1.0'}
#     connection = urllib.request.urlopen(uri)
#     tree = etree.parse(connection)
#     root = tree.getroot()
#     xml_string = etree.tostring(root, pretty_print=True).decode()
#     name = root.xpath('.//tei:title', namespaces=ns)[0].text
#     with open(f'texts/{name.lower()}.xml', 'w') as f:
#          f.write(xml_string)

In [4]:
def get_speaker_data(file):
    """Collect the lines spoken by each speaker in a TEI XML play.

    Every ``<sp>`` element in the document is visited; the text of its
    ``<speaker>`` child is used as the key, and the text of each ``<l>``
    element that is a direct child of that ``<sp>`` is appended to the
    speaker's list.

    Parameters
    ----------
    file
        Anything ``etree.parse`` accepts (e.g. a path to an XML file).

    Returns
    -------
    collections.defaultdict
        Maps speaker label -> list of stripped line strings, in document
        order. Because it is a ``defaultdict(list)``, looking up a label
        that is not present inserts it with an empty list.

    Raises
    ------
    IndexError
        If an ``<sp>`` element has no ``<speaker>`` child.
    """
    def get_text(node):
        # Use the node's full text content rather than `node.text` alone:
        # `.text` is None when the element is empty or starts with a child
        # element, and it omits any text inside/after inline children.
        # Fragments are joined without a separator so that a line with no
        # child elements yields exactly `node.text.strip()`.
        return ''.join(node.itertext()).strip()

    ns = {'tei': 'http://www.tei-c.org/ns/1.0'}
    root = etree.parse(file).getroot()
    speaker_blocks = defaultdict(list)
    for node in root.xpath('.//tei:sp', namespaces=ns):
        speaker = node.xpath('tei:speaker', namespaces=ns)[0].text
        for line in node.xpath('tei:l', namespaces=ns):
            speaker_blocks[speaker].append(get_text(line))
    return speaker_blocks

In [5]:
# Get Andria data: parse the locally saved XML into a mapping of
# speaker label -> list of that speaker's lines (see get_speaker_data)

andria = get_speaker_data('texts/andria.xml')

In [6]:
# Dramatis Personae as told through defaultdict keys
# (labels are the <speaker> element text, in the order each speaker's first line was stored)
andria.keys()

dict_keys(['Prologus', 'Si.', 'So.', 'Da.', 'Pa.', 'My.', 'Ch.', 'By.', 'Le.', 'Gl.', 'Cr.', 'Dr.', 'Om.'])

In [7]:
# First 10 lines for speaker Da. (= Davus)
# NB: andria is a defaultdict(list), so indexing a label that is absent
# would silently add that label with an empty list rather than raise KeyError

pprint(andria['Da.'][:10])

['Mirabar hoc si sic abiret, et heri semper lenitas',
 'Verebar quorsum evaderet:',
 'Qui postquam audierat non datum iri filio uxorem suo',
 'Numquam cuiquam nostrum verbum fecit, neque id aegre tulit.',
 'Id voluit, nos sic nec opinantes duci falso gaudio,',
 'Sperantes, iam amoto metu, interea oscitantes opprimi,',
 'Ut ne esset spatium cogitandi ad disturbandas nuptias:',
 'Astute.',
 'Herus est, neque provideram.',
 'Hem, quid est?']


In [8]:
# Number of stored lines per Andria speaker
# (nb: partial lines are counted as lines too)

for speaker, spoken in andria.items():
    print(f'{speaker}: {len(spoken)}')

Prologus: 27
Si.: 394
So.: 36
Da.: 341
Pa.: 239
My.: 88
Ch.: 200
By.: 30
Le.: 10
Gl.: 1
Cr.: 51
Dr.: 2
Om.: 1


In [9]:
# Character counts for Andria speakers (len(line) is the string length; use len(line.split()) for word counts)

for k, v in andria.items():
    # len(line) is the length of the string, so this totals characters per speaker
    running = sum(len(line) for line in v)
    print(f'{k}: {running}')

Prologus: 1110
Si.: 13850
So.: 950
Da.: 11059
Pa.: 8244
My.: 2436
Ch.: 6372
By.: 996
Le.: 411
Gl.: 42
Cr.: 1716
Dr.: 14
Om.: 9
