In [34]:
from lxml import etree

In [69]:
# Prefix -> namespace URI table passed as `namespaces=` to the XPath queries
# in this notebook.
NAMESPACES = dict(
    dcterms='http://purl.org/dc/terms/',
    pgterms='http://www.gutenberg.org/2009/pgterms/',
    rdf='http://www.w3.org/1999/02/22-rdf-syntax-ns#',
)

In [None]:
def safe_int(val):
    """Convert ``val`` to ``int``, or return ``None`` if it cannot be converted.

    Args:
        val: Any value accepted by ``int()`` (e.g. a numeric string or number).

    Returns:
        The integer value, or ``None`` when ``int(val)`` raises ``TypeError``
        (e.g. ``val`` is ``None``) or ``ValueError`` (e.g. non-numeric text).
    """
    try:
        return int(val)
    except (TypeError, ValueError):
        # Only the two errors int() raises for bad input are swallowed, so
        # unrelated failures still surface.
        return None

In [149]:
class BookXML:
    """Thin wrapper around a parsed RDF/XML tree for one book record.

    Every query is an XPath expression evaluated with the module-level
    ``NAMESPACES`` prefix map.
    """

    @classmethod
    def from_file(cls, path):
        """Parse the XML file at ``path`` with ``etree.parse`` and wrap it."""
        return cls(etree.parse(path))

    def __init__(self, tree):
        """Keep ``tree`` as the default node that queries run against."""
        self.tree = tree

    def xpath(self, query, root=None):
        """Evaluate an XPath ``query`` using the ``NAMESPACES`` prefixes.

        Args:
            query: XPath expression string.
            root: Node to evaluate the query against. When omitted (``None``)
                the query runs against ``self.tree``.

        Returns:
            Whatever the node's own ``xpath`` method returns for the query.
        """
        # Test against None explicitly. An lxml element with no children is
        # falsy, so `root or self.tree` would silently fall back to the whole
        # document whenever a leaf element is passed in.
        node = self.tree if root is None else root
        return node.xpath(query, namespaces=NAMESPACES)

    def title(self):
        """Return the first ``dcterms:title`` text node in the document.

        Raises:
            IndexError: If the query matches nothing.
        """
        return self.xpath('//dcterms:title/text()')[0]

    def subjects(self):
        """Return the text of every ``rdf:value`` below a ``dcterms:subject``."""
        return self.xpath('//dcterms:subject//rdf:value/text()')

    def creators_iter(self):
        """Yield one list of ``(field, text)`` pairs per creator agent.

        For each ``pgterms:agent`` under a ``dcterms:creator``, every direct
        ``pgterms:*`` child contributes a pair of its namespace-free tag name
        and its ``.text``. A field may repeat (several ``alias`` entries), and
        a child with no text content contributes ``None`` as its text.
        """
        for agent in self.xpath('//dcterms:creator/pgterms:agent'):
            yield [
                (etree.QName(field.tag).localname, field.text)
                for field in self.xpath('pgterms:*', agent)
            ]

In [150]:
# Parse the RDF file under cache/ and wrap the tree in a BookXML.
b = BookXML.from_file('cache/epub/2600/pg2600.rdf')

In [151]:
# First dcterms:title text found in the record.
b.title()

'War and Peace'

In [152]:
# Text of every rdf:value nested under a dcterms:subject.
b.subjects()

['Aristocracy (Social class) -- Russia -- Fiction',
 'PG',
 'Napoleonic Wars, 1800-1815 -- Campaigns -- Russia -- Fiction',
 'War stories',
 'Historical fiction',
 'Russia -- History -- Alexander I, 1801-1825 -- Fiction']

In [153]:
# creators_iter() is a generator; list() drains it so the cell shows the result.
list(b.creators_iter())

  # This is added back by InteractiveShellApp.init_path()


[[('deathdate', '1910'),
  ('name', 'Tolstoy, Leo, graf'),
  ('alias', 'Tolstoï, Léon'),
  ('birthdate', '1828'),
  ('alias', 'Толстой, Лев Николаевич'),
  ('alias', 'Tolstoi, Leo'),
  ('alias', 'Tolstoy, Graf Leo'),
  ('alias', 'Tolstoy, Lev N.'),
  ('webpage', None)]]

In [118]:
# First creator agent element of the book loaded above. This cell used to read
# from `wp`, a name that no cell in this notebook defines, so it failed on a
# fresh kernel; `b` is the BookXML instance created earlier.
creator = b.tree.xpath('//dcterms:creator/pgterms:agent', namespaces=NAMESPACES)[0]

In [143]:
# Print the tag and text of each pgterms child of the creator element.
# A bare 'pgterms:' prefix is not a complete XPath step (lxml raises
# XPathEvalError: Invalid expression); the '*' node test selects every
# element in that namespace.
for f in creator.xpath('pgterms:*', namespaces=NAMESPACES):
    print(f.tag, f.text)

In [120]:
# Keep the first pgterms child of the creator element to inspect on its own.
f = creator.xpath('pgterms:*', namespaces=NAMESPACES)[0]

In [138]:
# f.tag has the form "{namespace-uri}name"; QName(...).localname is the bare name.
etree.QName(f.tag).localname

'deathdate'

In [74]:
# pgterms elements directly under the document root. `wp` is not defined by any
# cell in this notebook, so query the BookXML instance `b` loaded earlier.
b.tree.xpath('pgterms:*', namespaces=NAMESPACES)

[<Element {http://www.gutenberg.org/2009/pgterms/}ebook at 0x10e03fb88>]

In [77]:
# Same query that BookXML.subjects() runs, issued directly against the tree.
# `wp` is not defined by any cell in this notebook, so use the BookXML
# instance `b` loaded earlier.
b.tree.xpath('//dcterms:subject//rdf:value/text()', namespaces=NAMESPACES)

['Aristocracy (Social class) -- Russia -- Fiction',
 'PG',
 'Napoleonic Wars, 1800-1815 -- Campaigns -- Russia -- Fiction',
 'War stories',
 'Historical fiction',
 'Russia -- History -- Alexander I, 1801-1825 -- Fiction']

In [183]:
from rdflib import Graph, Namespace

In [184]:
# Empty rdflib graph that the RDF file is parsed into in the next cell.
g = Graph()

In [156]:
# NOTE(review): this rebinds `b`, which the lxml cells above use for the
# BookXML instance. From here on `b` is whatever g.parse() returned, so
# re-running those earlier cells afterwards will fail.
b = g.parse('cache/epub/2600/pg2600.rdf')

In [195]:
# rdflib Namespace rooted at the same URI as the 'dcterms' prefix used in the XPath cells.
dcterms = Namespace('http://purl.org/dc/terms/')

In [202]:
# Attribute access on a Namespace yields a URIRef: the base URI plus the attribute name.
dcterms.LCSH

rdflib.term.URIRef('http://purl.org/dc/terms/LCSH')

In [198]:
# No subject is given, so this collects the object of every dcterms:title
# triple in the graph; [0] takes the first and .value unwraps the literal.
list(b.objects(predicate=dcterms.title))[0].value

'War and Peace'

In [215]:
# No subject is given, so a dcterms:description on any node can match; [0] is
# simply whichever one the graph yields first.
# NOTE(review): presumably that order is not guaranteed — confirm before relying on it.
list(b.objects(predicate=dcterms.description))[0].value

'en.wikipedia'

In [216]:
# First object of a dcterms:creator triple; the next cell uses it as the
# subject to list the creator's own properties.
c = list(b.objects(predicate=dcterms.creator))[0]

In [236]:
# Print every (predicate, object) pair whose subject is the creator node,
# with each object converted to a plain Python value via toPython().
for p, o in b.predicate_objects(subject=c):
    print(p, o.toPython())

http://www.gutenberg.org/2009/pgterms/deathdate 1910
http://www.gutenberg.org/2009/pgterms/alias Tolstoï, Léon
http://www.gutenberg.org/2009/pgterms/alias Tolstoy, Graf Leo
http://www.gutenberg.org/2009/pgterms/alias Tolstoy, Lev N.
http://www.gutenberg.org/2009/pgterms/alias Толстой, Лев Николаевич
http://www.gutenberg.org/2009/pgterms/name Tolstoy, Leo, graf
http://www.gutenberg.org/2009/pgterms/birthdate 1828
http://www.gutenberg.org/2009/pgterms/alias Tolstoi, Leo
http://www.gutenberg.org/2009/pgterms/webpage http://en.wikipedia.org/wiki/Leo_Tolstoy
http://www.w3.org/1999/02/22-rdf-syntax-ns#type http://www.gutenberg.org/2009/pgterms/agent
