In [34]:
from lxml import etree

In [303]:
def numberify(val):
    """Try to cast str -> int/float, returning the input unchanged on failure.

    Conversions are attempted in order: ``int`` first, then ``float``.

    Args:
        val (str): Text to convert.

    Returns:
        The ``int`` value if ``int(val)`` succeeds, otherwise the ``float``
        value if ``float(val)`` succeeds, otherwise ``val`` itself.
    """
    for cast in (int, float):
        try:
            return cast(val)
        # Only swallow conversion failures. A bare ``except`` would also hide
        # KeyboardInterrupt / SystemExit raised while converting.
        except (TypeError, ValueError, OverflowError):
            continue

    return val

In [304]:
# Prefix -> namespace URI mapping, passed as `namespaces=` to lxml XPath calls
# so queries can use the short prefixes (dcterms:, pgterms:, rdf:).
NAMESPACES = dict(
    dcterms='http://purl.org/dc/terms/',
    pgterms='http://www.gutenberg.org/2009/pgterms/',
    rdf='http://www.w3.org/1999/02/22-rdf-syntax-ns#',
)

In [355]:
class BookXML:
    """Accessor for metadata fields in a parsed RDF/XML book record.

    Wraps a parsed lxml tree and reads fields through XPath queries whose
    prefixes are resolved with the module-level ``NAMESPACES`` mapping.
    """

    @classmethod
    def from_file(cls, path):
        """Parse the XML document at ``path`` and wrap it.

        Args:
            path: Source passed straight to ``etree.parse``.
        """
        return cls(etree.parse(path))

    def __init__(self, tree):
        """
        Args:
            tree: Parsed document exposing ``.xpath(query, namespaces=...)``.
        """
        self.tree = tree

    def xpath(self, query, root=None):
        """Evaluate ``query`` using the ``NAMESPACES`` prefixes.

        Args:
            query (str): XPath expression.
            root: Node to evaluate against. Defaults to the whole tree.

        Returns:
            Whatever the underlying ``.xpath()`` call returns.
        """
        # Test against None explicitly: an lxml element without children is
        # falsy, so `root or self.tree` would silently query the whole
        # document instead of the node that was passed in.
        context = self.tree if root is None else root
        return context.xpath(query, namespaces=NAMESPACES)

    def id(self):
        """Return the ``rdf:about`` attribute of the first ``pgterms:ebook``.

        Raises:
            IndexError: If the query matches nothing.
        """
        return self.xpath('//pgterms:ebook/@rdf:about')[0]

    def title(self):
        """Return the text of the first ``dcterms:title`` element.

        Raises:
            IndexError: If the query matches nothing.
        """
        return self.xpath('//dcterms:title/text()')[0]

    def subjects(self):
        """Return every ``rdf:value`` text found under ``dcterms:subject``."""
        return self.xpath('//dcterms:subject//rdf:value/text()')

    def creators_iter(self):
        """Yield one list of ``(field_name, value)`` pairs per creator agent.

        ``field_name`` is the local (namespace-stripped) tag name of each
        ``pgterms:*`` child of the agent; ``value`` is its text passed through
        ``numberify``. Children without text are skipped. Field names may
        repeat within one list, which is why pairs are yielded rather than
        a dict.
        """
        for agent in self.xpath('//dcterms:creator/pgterms:agent'):
            yield [
                (etree.QName(field.tag).localname, numberify(field.text))
                for field in self.xpath('pgterms:*', root=agent)
                if field.text
            ]

    def bookshelves(self):
        """Return every ``rdf:value`` text found under ``pgterms:bookshelf``."""
        return self.xpath('//pgterms:bookshelf//rdf:value/text()')

    def formats_iter(self):
        """Yield one dict per ``dcterms:hasFormat`` element.

        Each dict has the keys ``format``, ``extent`` and ``modified``; every
        value is the raw list of matching text nodes (possibly empty, possibly
        holding several entries).
        """
        for entry in self.xpath('//dcterms:hasFormat'):
            yield dict(
                format=self.xpath('.//dcterms:format//rdf:value/text()', root=entry),
                extent=self.xpath('.//dcterms:extent/text()', root=entry),
                modified=self.xpath('.//dcterms:modified/text()', root=entry),
            )

In [356]:
# Parse the RDF record at this relative path and wrap it for the queries below.
b = BookXML.from_file('cache/epub/2600/pg2600.rdf')

In [357]:
# rdf:about attribute of the record's pgterms:ebook element.
b.id()

'ebooks/2600'

In [358]:
# Text of the first dcterms:title element.
b.title()

'War and Peace'

In [359]:
# Every rdf:value text nested under a dcterms:subject element.
b.subjects()

['Aristocracy (Social class) -- Russia -- Fiction',
 'PG',
 'Napoleonic Wars, 1800-1815 -- Campaigns -- Russia -- Fiction',
 'War stories',
 'Historical fiction',
 'Russia -- History -- Alexander I, 1801-1825 -- Fiction']

In [360]:
# Materialize the generator: one list of (field, value) pairs per creator agent.
list(b.creators_iter())

[[('deathdate', 1910),
  ('name', 'Tolstoy, Leo, graf'),
  ('alias', 'Tolstoï, Léon'),
  ('birthdate', 1828),
  ('alias', 'Толстой, Лев Николаевич'),
  ('alias', 'Tolstoi, Leo'),
  ('alias', 'Tolstoy, Graf Leo'),
  ('alias', 'Tolstoy, Lev N.')]]

In [361]:
# Every rdf:value text nested under a pgterms:bookshelf element.
b.bookshelves()

['Napoleonic(Bookshelf)',
 'Best Books Ever Listings',
 'Historical Fiction',
 'Opera',
 'Movie Books']

In [362]:
# Materialize the generator: one dict per dcterms:hasFormat element.
list(b.formats_iter())

[{'format': ['application/epub+zip'],
  'extent': ['1342035'],
  'modified': ['2018-11-02T01:37:23.869730']},
 {'format': ['application/x-mobipocket-ebook'],
  'extent': ['5470783'],
  'modified': ['2018-11-02T01:37:36.164505']},
 {'format': ['application/x-mobipocket-ebook'],
  'extent': ['5470775'],
  'modified': ['2018-11-02T01:37:48.416302']},
 {'format': ['application/epub+zip'],
  'extent': ['1342035'],
  'modified': ['2018-11-02T01:37:22.847773']},
 {'format': ['application/zip', 'text/html; charset=utf-8'],
  'extent': ['1290237'],
  'modified': ['2018-07-13T07:04:14']},
 {'format': ['application/rdf+xml'],
  'extent': ['14409'],
  'modified': ['2018-11-22T05:01:39.923247']},
 {'format': ['text/plain; charset=utf-8', 'application/zip'],
  'extent': ['1226208'],
  'modified': ['2018-07-13T07:04:14']},
 {'format': ['text/plain; charset=utf-8'],
  'extent': ['3359542'],
  'modified': ['2018-07-13T07:02:18']},
 {'format': ['text/html; charset=utf-8'],
  'extent': ['4072499'],
  'mo