In [141]:
import os
import ujson
import attr
import glob

import pandas as pd

from cached_property import cached_property
from itertools import islice
from tqdm import tqdm_notebook
from htrc_features.utils import download_file
from boltons.iterutils import chunked

In [119]:
class GenreNovel:
    """Per-page genre tags for a single volume.

    Wraps the parsed JSON for one volume. The class reads two parts of it:
    ``data['hathi_metadata']['htid']`` (the volume id) and
    ``data['page_genres']``, a mapping whose keys are integer-like strings
    and whose values are genre tags such as ``'front'`` and ``'back'``.
    """

    @classmethod
    def from_path(cls, path):
        """Load the JSON file at ``path`` and wrap it in a ``GenreNovel``."""
        with open(path) as fh:
            return cls(ujson.load(fh))

    def __init__(self, data):
        self.data = data

    def __repr__(self):
        return 'GenreJSON<%s>' % self.htid

    @property
    def htid(self):
        """Volume id taken from the ``hathi_metadata`` section."""
        return self.data['hathi_metadata']['htid']

    @cached_property
    def page_genres(self):
        """Genre tags as a list, ordered by the numeric value of the page key."""
        genres = self.data['page_genres']
        # Keys are strings, so sort on int(key) to get '2' before '10'.
        return [genres[page] for page in sorted(genres, key=int)]

    @cached_property
    def bi1(self):
        """Index of the first page not tagged ``'front'``.

        Equals ``len(page_genres)`` when every page is ``'front'`` (including
        the empty case), so slicing from it yields an empty body.
        """
        pages = self.page_genres
        i = 0
        # The bounds check must come first: indexing before checking raised
        # IndexError for volumes made up entirely of 'front' pages.
        while i < len(pages) and pages[i] == 'front':
            i += 1
        return i

    @cached_property
    def bi2(self):
        """Index of the last page not tagged ``'back'``.

        The scan never moves below index 0, so a volume made up entirely of
        ``'back'`` pages reports 0. An empty volume reports -1.
        """
        pages = self.page_genres
        i = len(pages) - 1
        # Check the index before subscripting so an empty page list is safe.
        while i > 0 and pages[i] == 'back':
            i -= 1
        return i

    def body_genres(self):
        """Tags for the pages between the front and back matter, inclusive."""
        return self.page_genres[self.bi1:self.bi2 + 1]

    def df_row(self):
        """Return ``(htid, genre, bi1, bi2)`` if the body has one single genre.

        Returns ``None`` when the body is empty or mixes several genres.
        """
        bg = self.body_genres()
        if len(set(bg)) == 1:
            return (self.htid, bg[0], self.bi1, self.bi2)
        return None

In [120]:
@attr.s
class GenreDir:
    """A directory tree of per-volume genre JSON files rooted at ``root``."""

    root = attr.ib()

    def paths(self):
        """Lazily yield the path of every ``*.json`` file under ``root``.

        Files directly inside ``root`` are included, as are files at any
        depth below it.
        """
        pattern = os.path.join(self.root, '**/*.json')
        # Without recursive=True, glob treats '**' like a plain '*', so only
        # files exactly one directory below root would be found.
        return glob.iglob(pattern, recursive=True)

    def novels(self):
        """Yield a ``GenreNovel`` for each JSON file found by ``paths``."""
        for path in self.paths():
            yield GenreNovel.from_path(path)

    def df_rows(self):
        """Yield the ``df_row()`` tuple of each novel, skipping falsy rows.

        Progress is reported through ``tqdm_notebook``.
        """
        for novel in tqdm_notebook(self.novels()):
            row = novel.df_row()
            if row:
                yield row

In [124]:
# Root of the genre JSON tree; the path is relative to the working directory.
gd = GenreDir(root='../../data/htrc-genre/all')

In [None]:
# Materialise every row tuple produced by the directory walk.
rows = [row for row in gd.df_rows()]

In [128]:
# One row per volume: id, single body genre, first and last body page index.
df = pd.DataFrame(
    data=rows,
    columns=['htid', 'genre', 'p1', 'p2'],
)

In [146]:
# Number of volumes whose body genre is 'non'.
df[df['genre'] == 'non'].shape[0]

144337

In [147]:
# Number of volumes whose body genre is 'fic'.
df[df['genre'] == 'fic'].shape[0]

17002

In [152]:
# Dump the table as JSON Lines (one record per row), followed by a newline.
with open('vols.json', 'w') as fh:
    fh.write(df.to_json(orient='records', lines=True) + '\n')

In [None]:
# Hand the volume ids to download_file in batches of 1000, writing to 'htrc'.
# NOTE(review): download_file comes from htrc_features.utils; presumably it
# fetches one file per id into `outdir` — confirm against that library.
for htids in tqdm_notebook(chunked(df.htid.tolist(), 1000)):
    download_file(htids, outdir='htrc')

In [150]:
# NOTE(review): leftover IPython help lookup (`obj?`); it is not valid plain
# Python and produces no analysis result — consider removing this cell.
download_file?

In [151]:
# NOTE(review): leftover IPython help lookup (`obj?`); it is not valid plain
# Python and produces no analysis result — consider removing this cell.
df.to_json?

In [153]:
# Show the frame as the cell's output.
# NOTE(review): consider df.head() for a compact, explicit preview.
df

Unnamed: 0,htid,genre,p1,p2
0,chi.086332362,non,8,401
1,chi.086333415,non,10,92
2,chi.19292203,non,8,87
3,chi.21126276,non,9,196
4,chi.22373971,non,3,65
5,chi.33683991,non,10,84
6,chi.47677749,non,4,51
7,chi.57124997,non,17,283
8,chi.77478676,non,8,476
9,coo.31924018344220,non,8,26
