In [713]:
import maya

from lxml import etree
from functools import lru_cache

In [697]:
def parse_numeric(val):
    """Try to cast str -> int/float.

    Args:
        val (str): Raw text to convert.

    Returns:
        ``int(val)`` if that conversion succeeds, otherwise ``float(val)``
        if that succeeds, otherwise `val` itself, unchanged.
    """
    # Try int before float so that '42' stays an int instead of becoming 42.0.
    for cast in (int, float):
        try:
            return cast(val)
        # Catch only conversion failures. A bare `except:` would also swallow
        # KeyboardInterrupt / SystemExit and make the cell impossible to stop.
        except (TypeError, ValueError, OverflowError):
            continue

    # Not numeric: hand the value back untouched (best-effort contract).
    return val

In [698]:
def parse_datetime(val):
    """Try to cast str -> datetime.

    Args:
        val (str): Raw date text.

    Returns:
        The result of ``maya.parse(val).datetime()`` when parsing succeeds,
        otherwise `val` itself, unchanged.
    """
    try:
        return maya.parse(val).datetime()
    # Best-effort: any parsing error falls back to the raw value. Catch
    # `Exception` rather than using a bare `except:` so KeyboardInterrupt and
    # SystemExit still propagate.
    except Exception:
        return val

In [833]:
def split_mime(text):
    """Return the part of `text` that precedes the first ';'.

    For example ``'text/plain; charset=utf-8'`` becomes ``'text/plain'``.
    Text without a ';' is returned whole.
    """
    head, _sep, _params = text.partition(';')
    return head

In [699]:
# Prefix -> namespace URI mapping, passed as `namespaces=` to xpath queries so
# they can use the short `dcterms:` / `pgterms:` / `rdf:` prefixes.
NAMESPACES = dict(
    dcterms='http://purl.org/dc/terms/',
    pgterms='http://www.gutenberg.org/2009/pgterms/',
    rdf='http://www.w3.org/1999/02/22-rdf-syntax-ns#',
)

In [816]:
class BookXML:
    """Accessor wrapper around a parsed RDF/XML book record.

    Every accessor is an XPath query (using the `NAMESPACES` prefixes) against
    the wrapped tree. Single-value accessors return None when nothing matches;
    list accessors return an empty list.
    """

    @classmethod
    def from_file(cls, path):
        """Parse the XML file at `path` with lxml and wrap the resulting tree."""
        return cls(etree.parse(path))

    def __init__(self, tree):
        """Wrap an already-parsed tree (anything exposing lxml's `.xpath`)."""
        self.tree = tree

    def _memoize(self, key, build):
        """Return the value cached under `key` on this instance, building it once.

        The cache lives in the instance `__dict__`, so it is released together
        with the instance. `functools.lru_cache` on a method would instead keep
        a class-level cache keyed on `self`, pinning instances (and their
        trees) in memory and evicting entries once 128 instances were seen.
        """
        cache = self.__dict__.setdefault('_cache', {})
        if key not in cache:
            cache[key] = build()
        return cache[key]

    def xpath(self, query, root=None, first=False, parser=None):
        """Run an XPath query with the shared namespace prefixes.

        Args:
            query (str): XPath expression.
            root: Node to evaluate against; defaults to the whole tree.
            first (bool): Return only the first match, or None if there is none.
            parser (callable): Optional converter applied to each returned match.

        Returns:
            The list of (optionally parsed) matches, or a single match / None
            when `first` is true.
        """
        root = root if root is not None else self.tree

        res = root.xpath(query, namespaces=NAMESPACES)

        if first:
            if not res:
                return None
            # Parse only the match that is actually returned, instead of
            # converting every match and throwing all but one away.
            return parser(res[0]) if parser else res[0]

        if parser:
            res = list(map(parser, res))

        return res

    def id(self):
        """Return the integer after the last '/' of the ebook's `rdf:about`.

        Returns None when the record has no such attribute.
        """
        raw = self.xpath('//pgterms:ebook/@rdf:about', first=True)
        if not raw:
            return None
        return int(raw.split('/')[-1])

    def title(self):
        """Return the first `dcterms:title` text, or None."""
        return self.xpath('//dcterms:title/text()', first=True)

    def subjects(self):
        """Return every `rdf:value` text found under `dcterms:subject`."""
        return self.xpath('//dcterms:subject//rdf:value/text()')

    def creators_iter(self):
        """Yield one list of (key, value) pairs per `dcterms:creator` agent.

        The key is the local tag name of each `pgterms:*` child. The value is
        the child's text, or its `rdf:resource` attribute when it has no text.
        Children with neither are skipped. Keys may repeat, which is why pairs
        are yielded rather than a dict.
        """
        for root in self.xpath('//dcterms:creator/pgterms:agent'):

            creator = []
            for field in self.xpath('pgterms:*', root):

                key = etree.QName(field.tag).localname

                val = (self.xpath('./text()', field, first=True) or
                       self.xpath('./@rdf:resource', field, first=True))

                if val:
                    creator.append((key, val))

            yield creator

    def creators(self):
        """Return `creators_iter()` as a list, computed once per instance."""
        return self._memoize('creators', lambda: list(self.creators_iter()))

    def formats_iter(self):
        """Yield a dict(url, formats, extent) per `dcterms:hasFormat` file.

        `formats` is the list of `rdf:value` texts under `dcterms:format`;
        `extent` is passed through `parse_numeric`.
        """
        for root in self.xpath('//dcterms:hasFormat/pgterms:file'):

            url = self.xpath('./@rdf:about', root, first=True)
            formats = self.xpath('./dcterms:format//rdf:value/text()', root)
            extent = self.xpath('./dcterms:extent/text()', root, first=True, parser=parse_numeric)

            yield dict(url=url, formats=formats, extent=extent)

    def formats(self):
        """Return `formats_iter()` as a list, computed once per instance."""
        return self._memoize('formats', lambda: list(self.formats_iter()))

    def links(self):
        """Map `split_mime(format)` -> url for files listing exactly one format.

        Files with several format values are skipped. When two files share a
        key, the later one wins.
        """
        return {split_mime(f['formats'][0]): f['url'] for f in self.formats() if len(f['formats'])==1}

    def bookshelves(self):
        """Return every `rdf:value` text found under `pgterms:bookshelf`."""
        return self.xpath('//pgterms:bookshelf//rdf:value/text()')

    def issued(self):
        """Return the first `dcterms:issued` text via `parse_datetime`, or None."""
        return self.xpath('//dcterms:issued/text()', first=True, parser=parse_datetime)

    def rights(self):
        """Return the first `dcterms:rights` text, or None."""
        return self.xpath('//dcterms:rights/text()', first=True)

    def downloads(self):
        """Return the first `pgterms:downloads` text via `parse_numeric`, or None."""
        return self.xpath('//pgterms:downloads/text()', first=True, parser=parse_numeric)

    def publisher(self):
        """Return the first `dcterms:publisher` text, or None."""
        return self.xpath('//dcterms:publisher/text()', first=True)

    def language(self):
        """Return the first `rdf:value` text under `dcterms:language`, or None."""
        return self.xpath('//dcterms:language//rdf:value/text()', first=True)

    def author(self):
        """Return the `name` of the first creator.

        Returns None when the record has no creators or the first creator has
        no `name` field.
        """
        creators = self.creators()
        if not creators:
            return None
        return dict(creators[0]).get('name')

    def surname(self):
        """Return the part of `author()` before the first ', ', or None."""
        author = self.author()
        if author is None:
            return None
        return author.split(', ')[0]

    # TODO: formats?

In [817]:
# Load the RDF record for ebook 2600; the path is relative to the working directory.
b = BookXML.from_file('cache/epub/2600/pg2600.rdf')

In [818]:
b.id()

2600

In [819]:
b.title()

'War and Peace'

In [820]:
b.subjects()

['Aristocracy (Social class) -- Russia -- Fiction',
 'PG',
 'Napoleonic Wars, 1800-1815 -- Campaigns -- Russia -- Fiction',
 'War stories',
 'Historical fiction',
 'Russia -- History -- Alexander I, 1801-1825 -- Fiction']

In [821]:
b.creators()

[[('deathdate', '1910'),
  ('name', 'Tolstoy, Leo, graf'),
  ('alias', 'Tolstoï, Léon'),
  ('birthdate', '1828'),
  ('alias', 'Толстой, Лев Николаевич'),
  ('alias', 'Tolstoi, Leo'),
  ('alias', 'Tolstoy, Graf Leo'),
  ('alias', 'Tolstoy, Lev N.'),
  ('webpage', 'http://en.wikipedia.org/wiki/Leo_Tolstoy')]]

In [822]:
b.bookshelves()

['Napoleonic(Bookshelf)',
 'Best Books Ever Listings',
 'Historical Fiction',
 'Opera',
 'Movie Books']

In [823]:
b.issued()

datetime.datetime(2001, 4, 1, 0, 0, tzinfo=<UTC>)

In [824]:
b.rights()

'Public domain in the USA.'

In [825]:
b.downloads()

7243

In [826]:
b.publisher()

'Project Gutenberg'

In [827]:
b.language()

'en'

In [828]:
b.issued()

datetime.datetime(2001, 4, 1, 0, 0, tzinfo=<UTC>)

In [829]:
b.author()

'Tolstoy, Leo, graf'

In [830]:
b.surname()

'Tolstoy'

In [831]:
b.formats()

[{'url': 'http://www.gutenberg.org/ebooks/2600.epub.noimages',
  'formats': ['application/epub+zip'],
  'extent': 1342035},
 {'url': 'http://www.gutenberg.org/ebooks/2600.kindle.images',
  'formats': ['application/x-mobipocket-ebook'],
  'extent': 5470783},
 {'url': 'http://www.gutenberg.org/ebooks/2600.kindle.noimages',
  'formats': ['application/x-mobipocket-ebook'],
  'extent': 5470775},
 {'url': 'http://www.gutenberg.org/ebooks/2600.epub.images',
  'formats': ['application/epub+zip'],
  'extent': 1342035},
 {'url': 'http://www.gutenberg.org/files/2600/2600-h.zip',
  'formats': ['application/zip', 'text/html; charset=utf-8'],
  'extent': 1290237},
 {'url': 'http://www.gutenberg.org/ebooks/2600.rdf',
  'formats': ['application/rdf+xml'],
  'extent': 14409},
 {'url': 'http://www.gutenberg.org/files/2600/2600-0.zip',
  'formats': ['text/plain; charset=utf-8', 'application/zip'],
  'extent': 1226208},
 {'url': 'http://www.gutenberg.org/files/2600/2600-0.txt',
  'formats': ['text/plain; 

In [832]:
b.links()

{'application/epub+zip': 'http://www.gutenberg.org/ebooks/2600.epub.images',
 'application/x-mobipocket-ebook': 'http://www.gutenberg.org/ebooks/2600.kindle.noimages',
 'application/rdf+xml': 'http://www.gutenberg.org/ebooks/2600.rdf',
 'text/plain; charset=utf-8': 'http://www.gutenberg.org/files/2600/2600-0.txt',
 'text/html; charset=utf-8': 'http://www.gutenberg.org/files/2600/2600-h/2600-h.htm'}