In [34]:
from lxml import etree

In [303]:
def numberify(val):
    """Cast a value to a number when possible, otherwise return it unchanged.

    ``int`` is tried first so whole-number strings stay integers; ``float``
    is the fallback for everything else that looks numeric.

    Args:
        val (str): Raw value to convert.

    Returns:
        int | float | object: ``int(val)`` if that succeeds, else
        ``float(val)`` if that succeeds, else ``val`` itself.
    """
    for cast in (int, float):
        try:
            return cast(val)
        # Only conversion failures are expected here. A bare ``except`` would
        # also swallow KeyboardInterrupt / SystemExit raised mid-conversion.
        except (TypeError, ValueError, OverflowError):
            continue

    return val

In [304]:
# Prefix -> namespace URI map handed to every XPath query in this notebook.
NAMESPACES = dict(
    dcterms='http://purl.org/dc/terms/',
    pgterms='http://www.gutenberg.org/2009/pgterms/',
    rdf='http://www.w3.org/1999/02/22-rdf-syntax-ns#',
)

In [363]:
def xpath(root, query, first=False, parser=None):
    """Run an XPath query with the notebook's ``NAMESPACES`` and post-process it.

    Args:
        root: Object exposing ``xpath(query, namespaces=...)`` (e.g. an lxml
            tree or element).
        query (str): XPath expression; may use the prefixes in ``NAMESPACES``.
        first (bool): If true, return only the first match, or ``None`` when
            nothing matched, instead of the full list.
        parser (callable): Optional function applied to every match.

    Returns:
        A list of (optionally parsed) matches; a single match or ``None``
        when ``first`` is true; or, for scalar-valued expressions, the
        (optionally parsed) scalar itself.
    """
    res = root.xpath(query, namespaces=NAMESPACES)

    # Per the lxml docs, expressions like ``count(...)`` / ``string(...)`` /
    # ``boolean(...)`` return a float / str / bool rather than a list.
    # Indexing or mapping over those would crash or split a string into
    # characters, so a scalar is returned whole.
    if not isinstance(res, list):
        return parser(res) if parser else res

    if parser:
        res = [parser(match) for match in res]

    if first:
        return res[0] if res else None

    return res

In [605]:
class BookXML:
    """Accessors for the metadata in one parsed RDF/XML book record.

    Every accessor runs an XPath query against ``self.tree`` through the
    module-level ``xpath`` helper. Single-valued accessors return ``None``
    when the queried node is absent.
    """

    @classmethod
    def from_file(cls, path):
        """Parse the XML file at ``path`` and wrap the resulting tree."""
        return cls(etree.parse(path))

    def __init__(self, tree):
        """Store the parsed tree that all accessors query.

        Args:
            tree: Parsed document, as returned by ``etree.parse``.
        """
        self.tree = tree

    def id(self):
        """Return the integer id from ``pgterms:ebook/@rdf:about``.

        The id is the last ``/``-separated segment of the attribute value.

        Returns:
            int | None: The id, or ``None`` when the attribute is missing.
        """
        raw = xpath(self.tree, '//pgterms:ebook/@rdf:about', first=True)

        # Missing attribute: return None like the other accessors rather
        # than failing with AttributeError on ``None.split``.
        if raw is None:
            return None

        return int(raw.split('/')[-1])

    def title(self):
        """Return the text of the first ``dcterms:title``, or ``None``."""
        return xpath(self.tree, '//dcterms:title/text()', first=True)

    def subjects(self):
        """Return every ``rdf:value`` text found under ``dcterms:subject``."""
        return xpath(self.tree, '//dcterms:subject//rdf:value/text()')

    def creators_iter(self):
        """Yield one list of ``(key, value)`` pairs per creator agent.

        For each ``pgterms:agent`` under ``dcterms:creator``, every child
        element in the ``pgterms`` namespace contributes a pair: the key is
        the element's local tag name and the value is its text, falling back
        to its ``rdf:resource`` attribute. Children with neither are skipped.
        Pairs are kept in a list, so a key that occurs more than once is
        preserved each time.
        """
        for agent in xpath(self.tree, '//dcterms:creator/pgterms:agent'):

            creator = []
            for field in xpath(agent, 'pgterms:*'):

                key = etree.QName(field.tag).localname

                # Prefer the element text; link-style fields carry their
                # value in rdf:resource instead.
                val = (xpath(field, './text()', first=True) or
                       xpath(field, './@rdf:resource', first=True))

                if val:
                    creator.append((key, val))

            yield creator

    def bookshelves(self):
        """Return every ``rdf:value`` text found under ``pgterms:bookshelf``."""
        return xpath(self.tree, '//pgterms:bookshelf//rdf:value/text()')

    # TODO: parse dt
    def issued(self):
        """Return the raw text of the first ``dcterms:issued``, or ``None``."""
        return xpath(self.tree, '//dcterms:issued/text()', first=True)

    def rights(self):
        """Return the text of the first ``dcterms:rights``, or ``None``."""
        return xpath(self.tree, '//dcterms:rights/text()', first=True)

    def downloads(self):
        """Return the first ``pgterms:downloads`` text passed through ``numberify``."""
        return xpath(self.tree, '//pgterms:downloads/text()', first=True, parser=numberify)

    def publisher(self):
        """Return the text of the first ``dcterms:publisher``, or ``None``."""
        return xpath(self.tree, '//dcterms:publisher/text()', first=True)

    def language(self):
        """Return the first ``rdf:value`` text under ``dcterms:language``, or ``None``."""
        return xpath(self.tree, '//dcterms:language//rdf:value/text()', first=True)

    # TODO: formats?

In [594]:
# Load the cached RDF record for ebook 2600 (path is relative to the working directory).
# NOTE(review): this cell ran as In [594] while the BookXML cell above is In [605],
# so the outputs below may predate the latest class definition — re-run on a fresh kernel.
b = BookXML.from_file('cache/epub/2600/pg2600.rdf')

In [595]:
# Integer id parsed from the last '/'-separated segment of rdf:about.
b.id()

2600

In [596]:
# Text of the first dcterms:title element.
b.title()

'War and Peace'

In [597]:
# All rdf:value strings nested under dcterms:subject.
b.subjects()

['Aristocracy (Social class) -- Russia -- Fiction',
 'PG',
 'Napoleonic Wars, 1800-1815 -- Campaigns -- Russia -- Fiction',
 'War stories',
 'Historical fiction',
 'Russia -- History -- Alexander I, 1801-1825 -- Fiction']

In [598]:
# One list of (field, value) pairs per creator; repeated fields such as 'alias' are kept.
list(b.creators_iter())

[[('deathdate', '1910'),
  ('name', 'Tolstoy, Leo, graf'),
  ('alias', 'Tolstoï, Léon'),
  ('birthdate', '1828'),
  ('alias', 'Толстой, Лев Николаевич'),
  ('alias', 'Tolstoi, Leo'),
  ('alias', 'Tolstoy, Graf Leo'),
  ('alias', 'Tolstoy, Lev N.'),
  ('webpage', 'http://en.wikipedia.org/wiki/Leo_Tolstoy')]]

In [599]:
# All rdf:value strings nested under pgterms:bookshelf.
b.bookshelves()

['Napoleonic(Bookshelf)',
 'Best Books Ever Listings',
 'Historical Fiction',
 'Opera',
 'Movie Books']

In [600]:
# Raw dcterms:issued text; still a string (date parsing is an open TODO on the class).
b.issued()

'2001-04-01'

In [601]:
# Text of the first dcterms:rights element.
b.rights()

'Public domain in the USA.'

In [602]:
# pgterms:downloads text converted by numberify, hence an int rather than a string.
b.downloads()

7243

In [603]:
# Text of the first dcterms:publisher element.
b.publisher()

'Project Gutenberg'

In [604]:
# First rdf:value string nested under dcterms:language.
b.language()

'en'