In [2]:
import os
from lxml import etree

In [40]:
def parseFile(infile, prefix, split_books=False):
    '''read xml, write txt'''

    doc = etree.parse(infile).getroot()
    abbr = doc.find('.//Metadata/Abbr').xpath('string()')

    lines = {}
    
    for l in doc.findall('.//TextUnit'):
        loc = l.get('loc')
        verse = l.xpath('string()').strip()
        
        if split_books:
            try:
                book = int(loc.split('.')[0])
            except ValueError:
                print("Error: Can't parse {} in {}".format(loc, infile))
                continue
            suff = '_{:02d}'.format(book)
        else:
            suff = ''
        filename = '{}{}.txt'.format(prefix, suff)

        if filename not in lines:
            lines[filename] = []
        
        lines[filename].append('{} {}\t{}'.format(abbr, loc, verse))

    for filename in lines:
        print(filename)
        with open(filename, 'w') as fh:
            for l in lines[filename]:
                fh.write(l + '\n')

In [41]:
files = [
    ('catullus.carmina', False),
    ('lucan.bellum_civile', True),
    ('ovid.amores', True),
    ('ovid.ars_amatoria', True),
    ('ovid.remedia_amoris', False),
    ('propertius.elegies', True),
    ('statius.thebaid', True),
    ('tibullus.elegies', True),
    ('valerius_flaccus.argonautica', True),
    ('vergil.aeneid', True),
]

for file, split_books in files:
    infile = os.path.join('texts', 'xml', file + '.xml')
    outfile = os.path.join('texts', 'raw', file)
    parseFile(infile, outfile, split_books)

texts/raw/catullus.carmina.txt
texts/raw/lucan.bellum_civile_01.txt
texts/raw/lucan.bellum_civile_02.txt
texts/raw/lucan.bellum_civile_03.txt
texts/raw/lucan.bellum_civile_04.txt
texts/raw/lucan.bellum_civile_05.txt
texts/raw/lucan.bellum_civile_06.txt
texts/raw/lucan.bellum_civile_07.txt
texts/raw/lucan.bellum_civile_08.txt
texts/raw/lucan.bellum_civile_09.txt
texts/raw/lucan.bellum_civile_10.txt
texts/raw/ovid.amores_01.txt
texts/raw/ovid.amores_02.txt
texts/raw/ovid.amores_03.txt
texts/raw/ovid.ars_amatoria_01.txt
texts/raw/ovid.ars_amatoria_02.txt
texts/raw/ovid.ars_amatoria_03.txt
texts/raw/ovid.remedia_amoris.txt
texts/raw/propertius.elegies_01.txt
texts/raw/propertius.elegies_02.txt
texts/raw/propertius.elegies_03.txt
texts/raw/propertius.elegies_04.txt
texts/raw/statius.thebaid_01.txt
texts/raw/statius.thebaid_02.txt
texts/raw/statius.thebaid_03.txt
texts/raw/statius.thebaid_04.txt
texts/raw/statius.thebaid_05.txt
texts/raw/statius.thebaid_06.txt
texts/raw/statius.thebaid_07.tx

In [81]:
# download texts from Perseus
from lxml import etree
import re
from MyCapytain.resolvers.cts.api import HttpCtsResolver
from MyCapytain.retrievers.cts5 import HttpCtsRetriever

SERVER = 'http://cts.perseids.org/api/cts/'
resolver = HttpCtsResolver(HttpCtsRetriever(SERVER))

In [53]:
def retrieveXML(resolver, urn):
    '''Download a remote work and extract xml'''

    print('Downloading {}'.format(urn))
        
    # get references 
    reffs = resolver.getReffs(urn)
    
    # stores parsed xml
    xml_reffs = []

    # download one book at a time
    for i, reff in enumerate(reffs):

        print(" - fetching reff {}/{}".format(i+1, len(reffs)))
        ctsPassage = resolver.getTextualNode(urn, subreference=reff)

        # extract xml
        xml = ctsPassage.export('python/lxml')
        xml_reffs.append(xml)
    
    return xml_reffs

### Catullus

In [None]:
# catullus
catul = retrieveXML(resolver, 'urn:cts:latinLit:phi0472.phi001.perseus-eng4')

In [107]:
# process all individual poems of Catullus
# and paste them together into a single text

dest = os.path.join('catullus.elegiacs.txt')

poems = []

for doc in catul[66:]:
    div = doc.find('.//{http://www.tei-c.org/ns/1.0}div[@subtype="textpart"]')
    paras = div.findall('.//{http://www.tei-c.org/ns/1.0}p')
    if len(paras) > 1:
        for p in paras:
            if p.tail:
                p.tail = '==PARA==' + p.tail
            else:
                p.tail = '==PARA=='

    # remove speaker indications from dialogue
    for speaker in div.findall('.//{http://www.tei-c.org/ns/1.0}speaker'):
        speaker.clear()

    poem = div.xpath('string()')
    poem = re.sub(pattern='\s+', repl=' ', string=poem)
    poem = re.sub(pattern='(==PARA==)+', repl='\n\n    ', string=poem)
    poem = poem.strip()
    poem = re.sub(pattern='[—]', repl=' – ', string=poem)
    poems.append(poem)

with open(dest, 'w') as fh:
    fh.write('\n\n---\n\n'.join(poems))

Downloading urn:cts:latinLit:phi0472.phi001.perseus-eng4
 - fetching reff 1/120
 - fetching reff 2/120
 - fetching reff 3/120
 - fetching reff 4/120
 - fetching reff 5/120
 - fetching reff 6/120
 - fetching reff 7/120
 - fetching reff 8/120
 - fetching reff 9/120
 - fetching reff 10/120
 - fetching reff 11/120
 - fetching reff 12/120
 - fetching reff 13/120
 - fetching reff 14/120
 - fetching reff 15/120
 - fetching reff 16/120
 - fetching reff 17/120
 - fetching reff 18/120
 - fetching reff 19/120
 - fetching reff 20/120
 - fetching reff 21/120
 - fetching reff 22/120
 - fetching reff 23/120
 - fetching reff 24/120
 - fetching reff 25/120
 - fetching reff 26/120
 - fetching reff 27/120
 - fetching reff 28/120
 - fetching reff 29/120
 - fetching reff 30/120
 - fetching reff 31/120
 - fetching reff 32/120
 - fetching reff 33/120
 - fetching reff 34/120
 - fetching reff 35/120
 - fetching reff 36/120
 - fetching reff 37/120
 - fetching reff 38/120
 - fetching reff 39/120
 - fetching reff

'/Users/chris/Documents/git/clas-3991-fa18/Week 02 - Flow Control and Functions'

* *  *
## Cut from the lab

### 3D PCA plot 

In [1]:
%matplotlib notebook
from mpl_toolkits.mplot3d import axes3d
from sklearn import decomposition

# leave out lucan
some_files = numpy.array([f for f in files if not f.startswith('lucan')])

# build feature vectors
data = []

for file in some_files:
    path = os.path.join('texts', 'raw', file)
    wc = wordCount(path)

    this_vec = [wc.get(w, 0) for w in top_500]
    data.append(this_vec)
    
data = numpy.array(data)

# create author labels
authors = []
for f in some_files:
    author = f.split('.')[0]
    authors.append(author)

# reduce dimensionality with PCA
npcs = 10
print('Calculating {} principal components'.format(npcs))
pcmodel = decomposition.PCA(npcs)
pca = pcmodel.fit_transform(data)

# create a graph
fig = pyplot.figure()
ax = fig.add_subplot(111, projection='3d')
ax.set_xlabel('PC1')
ax.set_ylabel('PC2')
ax.set_zlabel('PC3')

# use numpy for easier array slicing
x = pca[:,0]
y = pca[:,1]
z = pca[:,2]
g = numpy.array(genres)
a = numpy.array(authors)

# plot each author as a separate series
for auth in set(a):
    ax.scatter(x[a==auth], y[a==auth], z[a==auth], marker='o', label=auth)
    
ax.legend()

NameError: name 'numpy' is not defined