In [34]:
from lxml import etree

In [303]:
def numberify(val):
    """Try to cast str -> int/float, falling back to the input unchanged.

    Conversion is attempted as ``int`` first, then ``float``. Only errors
    that signal "this value is not a number" are suppressed; anything else
    (e.g. ``KeyboardInterrupt``) propagates to the caller.

    Args:
        val (str): Candidate numeric text.

    Returns:
        int, float, or ``val`` itself when neither conversion succeeds.
    """
    # int before float so that '7243' stays an integer rather than 7243.0.
    for cast in (int, float):
        try:
            return cast(val)
        except (TypeError, ValueError, OverflowError):
            # Not representable by this type; try the next one.
            continue

    return val

In [304]:
# Prefix -> namespace URI map handed to every XPath query in this notebook.
NAMESPACES = dict(
    dcterms='http://purl.org/dc/terms/',
    pgterms='http://www.gutenberg.org/2009/pgterms/',
    rdf='http://www.w3.org/1999/02/22-rdf-syntax-ns#',
)

In [636]:
class BookXML:
    """Accessor for fields of a parsed RDF/XML book metadata tree.

    Every accessor runs an XPath query against the wrapped tree with the
    module-level ``NAMESPACES`` prefixes bound. Single-valued accessors
    return ``None`` when nothing matches; multi-valued accessors return an
    empty list.
    """

    @classmethod
    def from_file(cls, path):
        """Parse the XML document at ``path`` and wrap the resulting tree.

        Args:
            path: Source passed straight through to ``etree.parse``.

        Returns:
            BookXML: Wrapper around the parsed tree.
        """
        return cls(etree.parse(path))

    def __init__(self, tree):
        """Wrap an already-parsed tree.

        Args:
            tree: Object exposing ``xpath(query, namespaces=...)``.
        """
        self.tree = tree

    def xpath(self, query, root=None, first=False, parser=None):
        """Run an XPath query with the ``NAMESPACES`` prefixes bound.

        Args:
            query (str): XPath expression.
            root: Node to evaluate the query against. Defaults to the
                whole tree.
            first (bool): If true, return only the first match, or
                ``None`` when there are no matches.
            parser (callable): Optional converter applied to each returned
                match.

        Returns:
            The (optionally converted) matches, or a single match / ``None``
            when ``first`` is set.
        """
        # Explicit None check: the truth value of an XML node is not a
        # reliable "was a root supplied?" signal.
        root = self.tree if root is None else root

        res = root.xpath(query, namespaces=NAMESPACES)

        if first:
            if not res:
                return None
            # Convert only the match that is actually returned instead of
            # parsing every match and discarding the rest.
            match = res[0]
            return parser(match) if parser else match

        if parser:
            res = list(map(parser, res))

        return res

    def id(self):
        """Return the numeric ebook id, or ``None`` if the tree has none.

        The id is the last ``/``-separated segment of the ``rdf:about``
        attribute on the ``pgterms:ebook`` element.
        """
        raw = self.xpath('//pgterms:ebook/@rdf:about', first=True)

        # Stay consistent with the other accessors, which yield None rather
        # than raising when the field is missing.
        if raw is None:
            return None

        return int(raw.split('/')[-1])

    def title(self):
        """Return the first ``dcterms:title`` text, or ``None``."""
        return self.xpath('//dcterms:title/text()', first=True)

    def subjects(self):
        """Return every ``rdf:value`` text found under ``dcterms:subject``."""
        return self.xpath('//dcterms:subject//rdf:value/text()')

    def creators_iter(self):
        """Yield one list of ``(field, value)`` pairs per creator agent.

        ``field`` is the local (namespace-stripped) tag name of each
        ``pgterms:*`` child of the agent. ``value`` is the child's text or,
        failing that, its ``rdf:resource`` attribute; children with neither
        are skipped. Pairs are kept in a list because a field name can
        repeat within one agent.
        """
        for root in self.xpath('//dcterms:creator/pgterms:agent'):

            creator = []
            for field in self.xpath('pgterms:*', root):

                key = etree.QName(field.tag).localname

                val = (self.xpath('./text()', field, first=True) or
                       self.xpath('./@rdf:resource', field, first=True))

                if val:
                    creator.append((key, val))

            yield creator

    def bookshelves(self):
        """Return every ``rdf:value`` text found under ``pgterms:bookshelf``."""
        return self.xpath('//pgterms:bookshelf//rdf:value/text()')

    # TODO: parse dt
    def issued(self):
        """Return the first ``dcterms:issued`` text as a raw string, or ``None``."""
        return self.xpath('//dcterms:issued/text()', first=True)

    def rights(self):
        """Return the first ``dcterms:rights`` text, or ``None``."""
        return self.xpath('//dcterms:rights/text()', first=True)

    def downloads(self):
        """Return the first ``pgterms:downloads`` text run through ``numberify``, or ``None``."""
        return self.xpath('//pgterms:downloads/text()', first=True, parser=numberify)

    def publisher(self):
        """Return the first ``dcterms:publisher`` text, or ``None``."""
        return self.xpath('//dcterms:publisher/text()', first=True)

    def language(self):
        """Return the first ``rdf:value`` text under ``dcterms:language``, or ``None``."""
        return self.xpath('//dcterms:language//rdf:value/text()', first=True)

    # TODO: top-level author + surname
    # TODO: formats?

In [637]:
# Load one RDF metadata record; the path is relative, so it resolves against
# the kernel's current working directory.
b = BookXML.from_file('cache/epub/2600/pg2600.rdf')

In [638]:
b.id()

2600

In [639]:
b.title()

'War and Peace'

In [640]:
b.subjects()

['Aristocracy (Social class) -- Russia -- Fiction',
 'PG',
 'Napoleonic Wars, 1800-1815 -- Campaigns -- Russia -- Fiction',
 'War stories',
 'Historical fiction',
 'Russia -- History -- Alexander I, 1801-1825 -- Fiction']

In [641]:
list(b.creators_iter())

[[('deathdate', '1910'),
  ('name', 'Tolstoy, Leo, graf'),
  ('alias', 'Tolstoï, Léon'),
  ('birthdate', '1828'),
  ('alias', 'Толстой, Лев Николаевич'),
  ('alias', 'Tolstoi, Leo'),
  ('alias', 'Tolstoy, Graf Leo'),
  ('alias', 'Tolstoy, Lev N.'),
  ('webpage', 'http://en.wikipedia.org/wiki/Leo_Tolstoy')]]

In [642]:
b.bookshelves()

['Napoleonic(Bookshelf)',
 'Best Books Ever Listings',
 'Historical Fiction',
 'Opera',
 'Movie Books']

In [643]:
b.issued()

'2001-04-01'

In [644]:
b.rights()

'Public domain in the USA.'

In [645]:
b.downloads()

7243

In [646]:
b.publisher()

'Project Gutenberg'

In [647]:
b.language()

'en'