In [921]:
import ujson
import maya

from functools import lru_cache
from lxml import etree
from glob import glob
from tqdm import tqdm

In [697]:
def parse_numeric(val):
    """Try to cast str -> int/float.

    Args:
        val (str): Candidate numeric text.

    Returns:
        ``int(val)`` if that conversion succeeds, otherwise ``float(val)``
        if that succeeds, otherwise ``val`` itself, unchanged.
    """
    # Only conversion failures are expected here. Anything else (for example
    # KeyboardInterrupt while looping over many files) must propagate instead
    # of being silently swallowed by a bare ``except``.
    for cast in (int, float):
        try:
            return cast(val)
        except (TypeError, ValueError, OverflowError):
            continue

    return val

In [904]:
def parse_datetime(val):
    """Try to cast str -> datetime.

    Args:
        val (str): Candidate date/time text.

    Returns:
        The result of ``maya.parse(val).datetime()``, or ``val`` unchanged
        when parsing raises an ordinary exception.
    """
    try:
        return maya.parse(val).datetime()
    # Best-effort fallback for any parse failure, but let KeyboardInterrupt
    # and SystemExit through (a bare ``except`` would swallow those too).
    except Exception:
        return val

In [833]:
def split_mime(text):
    """Return the part of ``text`` before the first ';' (all of it if none)."""
    media_type, _sep, _params = text.partition(';')
    return media_type

In [699]:
# XML namespace prefixes usable in XPath queries, mapped to their URIs.
NAMESPACES = dict(
    dcterms='http://purl.org/dc/terms/',
    pgterms='http://www.gutenberg.org/2009/pgterms/',
    rdf='http://www.w3.org/1999/02/22-rdf-syntax-ns#',
)

In [905]:
class BookXML:
    """Accessor for the metadata fields of one Project Gutenberg RDF record.

    Wraps a parsed XML tree and exposes each field as a zero-argument method.
    ``to_dict`` / ``to_json`` collect every field named in ``_json_keys``.
    """

    # Names of the zero-argument accessor methods that ``to_dict`` calls,
    # in output order.
    _json_keys = ('id', 'title', 'creators', 'author', 'surname',
                  'subjects', 'formats', 'links', 'bookshelves', 'issued', 
                  'rights', 'downloads', 'publisher', 'language',)
    
    @classmethod
    def from_file(cls, path):
        """Parse the XML file at ``path`` and wrap the resulting tree."""
        return cls(etree.parse(path))
    
    def __init__(self, tree):
        """Wrap an already-parsed tree.

        Args:
            tree: Object exposing ``xpath(query, namespaces=...)``, such as
                the result of ``etree.parse``.
        """
        self.tree = tree

    def _memoized(self, key, build):
        """Return ``build()``, computed at most once per instance for ``key``.

        Results are stored on the instance rather than in a module-level
        cache, so they are released together with the instance and its tree.
        """
        # setdefault on __dict__ so instances created without __init__ work too.
        memo = self.__dict__.setdefault('_memo', {})
        if key not in memo:
            memo[key] = build()
        return memo[key]
        
    def xpath(self, query, root=None, first=False, parser=None):
        """Run an XPath query with the NAMESPACES prefixes registered.

        Args:
            query (str): XPath expression.
            root: Node to evaluate against; defaults to the wrapped tree.
            first (bool): If true, return only the first result, or None
                when there are no results.
            parser (callable): If given, applied to every result (before
                ``first`` is taken).

        Returns:
            The query results (parsed if ``parser`` is given), or a single
            item / None when ``first`` is true.
        """
        root = root if root is not None else self.tree
    
        res = root.xpath(query, namespaces=NAMESPACES)

        if parser:
            res = list(map(parser, res))

        if first:
            res = res[0] if res else None

        return res
    
    def id(self):
        """Ebook id as an int, or None if the record has no ebook ``rdf:about``.

        The id is the last ``/``-separated segment of the attribute value.
        """
        raw = self.xpath('//pgterms:ebook/@rdf:about', first=True)
        if raw is None:
            return None
        return int(raw.split('/')[-1])
    
    def title(self):
        """Text of the first ``dcterms:title``, or None."""
        return self.xpath('//dcterms:title/text()', first=True)
    
    def creators_iter(self):
        """Yield one list of ``(field, value)`` pairs per creator agent.

        ``field`` is the local tag name of each ``pgterms:*`` child; ``value``
        is the child's text or, failing that, its ``rdf:resource`` attribute.
        Children with neither are skipped.
        """
        for root in self.xpath('//dcterms:creator/pgterms:agent'):
            
            creator = []
            for field in self.xpath('pgterms:*', root):
                
                key = etree.QName(field.tag).localname
                
                val = (self.xpath('./text()', field, first=True) or
                       self.xpath('./@rdf:resource', field, first=True))
                
                if val:
                    creator.append((key, val))
                    
            yield creator
            
    def creators(self):
        """List of all creators (see ``creators_iter``), computed once per instance."""
        return self._memoized('creators', lambda: list(self.creators_iter()))
    
    def author(self):
        """``name`` of the first creator, or None if there is no creator or no name."""
        creators = self.creators()
        if not creators:
            # Records without any dcterms:creator must not abort to_dict().
            return None
        return dict(creators[0]).get('name')
    
    def surname(self):
        """Part of the author name before the first ``', '``, or None without an author.

        For example ``'Tolstoy, Leo, graf'`` gives ``'Tolstoy'``.
        """
        author = self.author()
        if author is None:
            return None
        return author.split(', ')[0]
        
    def subjects(self):
        """List of ``rdf:value`` texts found under ``dcterms:subject``."""
        return self.xpath('//dcterms:subject//rdf:value/text()')
    
    def formats_iter(self):
        """Yield one dict per ``pgterms:file`` with keys ``url``, ``formats``, ``extent``.

        ``url`` is the file's ``rdf:about``, ``formats`` the list of its format
        values, and ``extent`` the ``dcterms:extent`` text passed through
        ``parse_numeric`` (None when absent).
        """
        for root in self.xpath('//dcterms:hasFormat/pgterms:file'):
            
            url = self.xpath('./@rdf:about', root, first=True)
            formats = self.xpath('./dcterms:format//rdf:value/text()', root)
            extent = self.xpath('./dcterms:extent/text()', root, first=True, parser=parse_numeric)
            
            yield dict(url=url, formats=formats, extent=extent)
            
    def formats(self):
        """List of all file formats (see ``formats_iter``), computed once per instance."""
        return self._memoized('formats', lambda: list(self.formats_iter()))
    
    def links(self):
        """Map MIME type to URL for files that declare exactly one format.

        The MIME type is the format value with anything from the first ``;``
        onward dropped. If several files share a MIME type, the last one wins.
        """
        return {
            split_mime(f['formats'][0]): f['url']
            for f in self.formats() if len(f['formats'])==1
        }
            
    def bookshelves(self):
        """List of ``rdf:value`` texts found under ``pgterms:bookshelf``."""
        return self.xpath('//pgterms:bookshelf//rdf:value/text()')

    def issued(self):
        """First ``dcterms:issued`` text passed through ``parse_datetime``, or None."""
        return self.xpath('//dcterms:issued/text()', first=True, parser=parse_datetime)
    
    def rights(self):
        """Text of the first ``dcterms:rights``, or None."""
        return self.xpath('//dcterms:rights/text()', first=True)
    
    def downloads(self):
        """First ``pgterms:downloads`` text passed through ``parse_numeric``, or None."""
        return self.xpath('//pgterms:downloads/text()', first=True, parser=parse_numeric)
    
    def publisher(self):
        """Text of the first ``dcterms:publisher``, or None."""
        return self.xpath('//dcterms:publisher/text()', first=True)
    
    def language(self):
        """First ``rdf:value`` text found under ``dcterms:language``, or None."""
        return self.xpath('//dcterms:language//rdf:value/text()', first=True)

    def to_dict(self):
        """Call every accessor named in ``_json_keys`` and return ``{name: value}``."""
        return {key: getattr(self, key)() for key in self._json_keys}
    
    def to_json(self):
        """Serialize ``to_dict()`` to a JSON string with ``ujson.dumps``."""
        return ujson.dumps(self.to_dict())

In [906]:
# Load a single RDF record (ebook 2600) to try the accessors on.
b = BookXML.from_file('cache/epub/2600/pg2600.rdf')

In [907]:
# Show every extracted field of the record as a plain dict.
b.to_dict()

{'id': 2600,
 'title': 'War and Peace',
 'creators': [[('deathdate', '1910'),
   ('name', 'Tolstoy, Leo, graf'),
   ('alias', 'Tolstoï, Léon'),
   ('birthdate', '1828'),
   ('alias', 'Толстой, Лев Николаевич'),
   ('alias', 'Tolstoi, Leo'),
   ('alias', 'Tolstoy, Graf Leo'),
   ('alias', 'Tolstoy, Lev N.'),
   ('webpage', 'http://en.wikipedia.org/wiki/Leo_Tolstoy')]],
 'author': 'Tolstoy, Leo, graf',
 'surname': 'Tolstoy',
 'subjects': ['Aristocracy (Social class) -- Russia -- Fiction',
  'PG',
  'Napoleonic Wars, 1800-1815 -- Campaigns -- Russia -- Fiction',
  'War stories',
  'Historical fiction',
  'Russia -- History -- Alexander I, 1801-1825 -- Fiction'],
 'formats': [{'url': 'http://www.gutenberg.org/ebooks/2600.epub.noimages',
   'formats': ['application/epub+zip'],
   'extent': 1342035},
  {'url': 'http://www.gutenberg.org/ebooks/2600.kindle.images',
   'formats': ['application/x-mobipocket-ebook'],
   'extent': 5470783},
  {'url': 'http://www.gutenberg.org/ebooks/2600.kindle.no

In [908]:
# Show the same fields serialized as a JSON string.
b.to_json()

'{"id":2600,"title":"War and Peace","creators":[[["deathdate","1910"],["name","Tolstoy, Leo, graf"],["alias","Tolsto\\u00ef, L\\u00e9on"],["birthdate","1828"],["alias","\\u0422\\u043e\\u043b\\u0441\\u0442\\u043e\\u0439, \\u041b\\u0435\\u0432 \\u041d\\u0438\\u043a\\u043e\\u043b\\u0430\\u0435\\u0432\\u0438\\u0447"],["alias","Tolstoi, Leo"],["alias","Tolstoy, Graf Leo"],["alias","Tolstoy, Lev N."],["webpage","http:\\/\\/en.wikipedia.org\\/wiki\\/Leo_Tolstoy"]]],"author":"Tolstoy, Leo, graf","surname":"Tolstoy","subjects":["Aristocracy (Social class) -- Russia -- Fiction","PG","Napoleonic Wars, 1800-1815 -- Campaigns -- Russia -- Fiction","War stories","Historical fiction","Russia -- History -- Alexander I, 1801-1825 -- Fiction"],"formats":[{"url":"http:\\/\\/www.gutenberg.org\\/ebooks\\/2600.epub.noimages","formats":["application\\/epub+zip"],"extent":1342035},{"url":"http:\\/\\/www.gutenberg.org\\/ebooks\\/2600.kindle.images","formats":["application\\/x-mobipocket-ebook"],"extent":547078

In [918]:
# Collect every .rdf file under cache/, at any directory depth.
paths = glob('cache/**/*.rdf', recursive=True)

In [919]:
# Number of RDF files found.
len(paths)

58377