In [4]:
import json
import zipfile
import os
import gutenberg
import pprint
import re

In [5]:
# Module-level cache of parsed metadata records; stays None until a load succeeds.
records = None
def load_data(fh=None):
    """Parse JSON-lines metadata into the module-level ``records`` list.

    The load happens at most once: if ``records`` is already set the call
    returns immediately and ``fh`` is not read.

    Parameters
    ----------
    fh : iterable, optional
        Anything that yields one JSON document per item (an open file, a
        list of strings, ...).  A caller-supplied ``fh`` is not closed here.
        When omitted, ``47000_metadata.json`` is opened relative to the
        current working directory and closed again after reading.

    Raises
    ------
    OSError
        If the default metadata file cannot be opened.
    ValueError
        (``json.JSONDecodeError``) if a non-blank line is not valid JSON.
        ``records`` is left untouched in that case, so a later call can retry.
    """
    global records
    if records is not None:
        return

    def _parse(lines):
        # Blank / whitespace-only lines (e.g. a trailing newline) carry no
        # record and would otherwise make json.loads raise.
        return [json.loads(line) for line in lines if line.strip()]

    if fh is None:
        # JSON text is UTF-8 by specification; don't depend on the locale default.
        with open('47000_metadata.json', encoding='utf-8') as default_fh:
            parsed = _parse(default_fh)
    else:
        parsed = _parse(fh)

    # Publish only after everything parsed, so a failure midway never leaves
    # a truncated cache that later calls would silently reuse.
    records = parsed

def search(fn):
    """Return every cached record for which ``fn(record)`` is truthy.

    Calls ``load_data()`` first when the module-level ``records`` cache is
    still None.  The result is a new list; records keep their cached order.
    """
    if records is None:
        load_data()
    matches = []
    for candidate in records:
        if fn(candidate):
            matches.append(candidate)
    return matches

def pluck(exprs, rec):
    """Build a list of values taken from ``rec``, one per entry of ``exprs``.

    Parameters
    ----------
    exprs : iterable
        Each entry is either a callable, which is invoked as ``entry(rec)``,
        or a key, which is looked up as ``rec[entry]``.
    rec : mapping (or anything supporting ``rec[key]``)
        The record to extract from.

    Returns
    -------
    list
        The extracted values, in the same order as ``exprs``.

    Raises
    ------
    KeyError / IndexError
        Whatever ``rec[entry]`` raises for a key that is not present.
    """
    # callable() tests whether the object can really be called; the previous
    # hasattr(item, '__call__') check was also true for objects that merely
    # carry a '__call__' instance attribute and then failed when invoked.
    return [item(rec) if callable(item) else rec[item] for item in exprs]

def get_iso_text(gutenberg_id, iso_path="/Volumes/PGDVD_2010_04_RC2"):
    """Return the text of the first ``.txt`` member found in a book's ZIP files.

    Parameters
    ----------
    gutenberg_id : int or str
        Book id; it is converted with ``str()`` to build the directory path.
    iso_path : str
        Root directory of the mounted disc image.

    Returns
    -------
    str
        The member's bytes decoded as latin-1.

    Raises
    ------
    ValueError
        If no ``*.ZIP`` archive in the book's directory contains a member
        whose name ends in ``.txt`` (case-insensitive).
    """
    import glob  # not part of the notebook's top-level import cell

    sid = str(gutenberg_id)
    # One directory level per leading digit, then the full id:
    # 1234 -> <iso_path>/1/2/3/1234
    path = '/'.join([iso_path] + list(sid[:-1]) + [sid])

    # glob returns names in arbitrary order; sorting makes the archive that
    # gets picked reproducible from run to run.
    for fname in sorted(glob.glob(path + "/*.ZIP")):
        # The context manager closes the archive on every exit path,
        # including the early return below.
        with zipfile.ZipFile(fname, 'r') as zf:
            for member in zf.infolist():
                if re.search(r'\.txt$', member.filename, re.I):
                    return zf.read(member.filename).decode('latin1')
    raise ValueError("couldn't fetch " + sid)

def get_tar_text(gutenberg_id, tar_path="/gutenberg/files"):
    """Read a book's plain-text file and return it via ``GutenbergCleaner``.

    Parameters
    ----------
    gutenberg_id : int or str
        Book id; it is converted with ``str()`` to build the directory path.
    tar_path : str
        Root directory of the extracted file tree.

    Returns
    -------
    The result of ``gutenberg.GutenbergCleaner(raw_text).extract_text()``.
    NOTE(review): presumably the book text with boilerplate removed; the
    cleaner is defined outside this notebook, so confirm there.

    Raises
    ------
    IndexError
        If no ``<id>*.txt`` file exists in the book's directory.
    ValueError
        Not raised here; the original code's handler suggests the cleaner may
        raise it, in which case it propagates unchanged.
    """
    import glob  # not part of the notebook's top-level import cell

    sid = str(gutenberg_id)
    # One directory level per leading digit, then the full id:
    # 1234 -> <tar_path>/1/2/3/1234
    path = '/'.join([tar_path] + list(sid[:-1]) + [sid])
    # Reverse lexical order decides which variant wins when several
    # "<id>*.txt" files exist (e.g. "12.txt" sorts ahead of "12-8.txt").
    fnames = sorted(glob.glob(path + "/%s*.txt" % sid), reverse=True)
    if not fnames:
        # Same exception type the bare fnames[0] lookup used to produce, so
        # existing handlers keep working, but the message now names the id.
        raise IndexError("couldn't fetch " + sid)
    # Read inside a with-block so the file handle is closed before cleaning.
    with open(fnames[0], encoding='latin1') as fh:
        raw_text = fh.read()
    return gutenberg.GutenbergCleaner(raw_text).extract_text()
    

In [6]:
# Collect [gutenberg_id, title] for every record that has at least one
# subject whose 'identifier' matches `subjs` (case-insensitive regex search).
subjs = 'Fiction'
subj_re = re.compile(subjs, re.I)  # compiled once instead of per subject
books = [pluck(['gutenberg_id', 'title'], rec)
         # A generator lets any() stop at the first matching subject instead
         # of building a full list of match objects for every record.
         for rec in search(lambda rec: any(subj_re.search(subject['identifier'])
                                           for subject in rec['subjects']))]
print(len(books))
# Prints the complete result list, one [id, title] pair per entry.
pprint.pprint(books)

13452
[[15, 'Moby Dick; Or, The Whale'],
 [16, 'Peter Pan in Kensington Gardens'],
 [24, 'O Pioneers!'],
 [27, 'Far from the Madding Crowd'],
 [32, 'Herland'],
 [33, 'The Scarlet Letter'],
 [35, 'The Time Machine'],
 [36, 'The War of the Worlds'],
 [41, 'The Legend of Sleepy Hollow'],
 [42, 'The Strange Case of Dr. Jekyll and Mr. Hyde'],
 [43, 'The Strange Case of Dr. Jekyll and Mr. Hyde'],
 [44, 'The Song of the Lark'],
 [45, 'Anne of Green Gables'],
 [46, 'A Christmas Carol in Prose; Being a Ghost Story of Christmas'],
 [47, 'Anne of Avonlea'],
 [51, 'Anne of the Island'],
 [54, 'The Marvelous Land of Oz'],
 [60, 'The Scarlet Pimpernel'],
 [62, 'A Princess of Mars'],
 [63, 'The Number "e"'],
 [64, 'The Gods of Mars'],
 [68, 'Warlord of Mars'],
 [72, 'Thuvia, Maid of Mars'],
 [73, 'The Red Badge of Courage: An Episode of the American Civil War'],
 [74, 'The Adventures of Tom Sawyer'],
 [76, "The Adventures of Huckleberry Finn (Tom Sawyer's Comrade)"],
 [77, 'The House of the Seven Gab

 [2774, 'The Patrician'],
 [2775, 'The Good Soldier'],
 [2776, 'The Four Million'],
 [2777, 'Cabbages and Kings'],
 [2778, 'Jewel: A Chapter in Her Life'],
 [2781, 'Just So Stories'],
 [2783, 'The Trampling of the Lilies'],
 [2785, 'The Elusive Pimpernel'],
 [2786, 'Jack and Jill'],
 [2787, 'An Old-Fashioned Girl'],
 [2788, 'Little Men'],
 [2789, 'The Motor Girls'],
 [2791, 'Essays and Tales'],
 [2793, 'Flip: A California Romance'],
 [2794, 'Found at Blazing Star'],
 [2795, 'Bob, Son of Battle'],
 [2796, 'The Memoirs of Mr. Charles J. Yellowplush'],
 [2798, 'The Queen of the Pirate Isle'],
 [2799, 'Eben Holden: A Tale of the North Country'],
 [2802, 'Ramona'],
 [2803, 'The Rise of David Levinsky'],
 [2805, 'With Lee in Virginia: A Story of the American Civil War'],
 [2807, 'To Have and to Hold'],
 [2809, 'Other Main-Travelled Roads'],
 [2813, 'The Grand Babylon Hôtel'],
 [2814, 'Dubliners'],
 [2815, 'Democracy, an American novel'],
 [2818, 'Beautiful Joe'],
 [2821, 'The Story of the Ga

 [5945, 'The History of Don Quixote, Volume 2, Part 42'],
 [5946, 'The History of Don Quixote, Volume 2, Complete'],
 [5947, 'Billy Bunny and Uncle Bull Frog'],
 [5948, 'The Bobbsey Twins on Blueberry Island'],
 [5950, 'The Fortunes of Nigel'],
 [5951, 'Reno — a Book of Short Stories and Information'],
 [5952, 'The Bobbsey Twins in the Great West'],
 [5953, 'Many Kingdoms'],
 [5955, 'The Tale of Tommy Fox'],
 [5956, 'Gallegher and Other Stories'],
 [5959, 'Peveril of the Peak'],
 [5960, 'Little Sister Snow'],
 [5961, 'Samuel the Seeker'],
 [5962, 'Oh, Money! Money! A Novel'],
 [5963, 'Mr. Bingle'],
 [5964, "Love's Pilgrimage: A Novel"],
 [5965, 'The Devolutionist and the Emancipatrix'],
 [5966, "What's Mine's Mine — Volume 1"],
 [5967, "What's Mine's Mine — Volume 2"],
 [5968, "What's Mine's Mine — Volume 3"],
 [5969, "What's Mine's Mine — Complete"],
 [5970, 'Lovey Mary'],
 [5971, 'Jane Cable'],
 [5972, 'A Fascinating Traitor: An Anglo-Indian Story'],
 [5973, 'Thomas Wingfold, Curate'

 [9203, 'A Rill from the Town Pump'],
 [9204, 'The Prophetic Pictures (From "Twice Told Tales")'],
 [9205, 'Sights from a Steeple (From "Twice Told Tales")'],
 [9206, 'The Toll Gatherer\'s Day (From "Twice Told Tales")'],
 [9207, 'The Vision of the Fountain (From "Twice Told Tales")'],
 [9208, 'Fancy\'s Show-Box (From "Twice Told Tales")'],
 [9209, 'Twice Told Tales'],
 [9210, 'The Village Uncle (From "Twice Told Tales")'],
 [9211, 'The Sister Years (From "Twice Told Tales")'],
 [9212, 'Snow Flakes (From "Twice Told Tales")'],
 [9213, 'The Seven Vagabonds (From "Twice Told Tales")'],
 [9214, 'The White Old Maid (From "Twice Told Tales")'],
 [9215, 'Chippings with a Chisel (From "Twice Told Tales")'],
 [9216, 'Beneath an Umbrella (From "Twice Told Tales")'],
 [9217, 'The Lily\'s Quest (From "Twice Told Tales")'],
 [9218, 'Footprints on the Sea-Shore (From "Twice Told Tales")'],
 [9219, 'Edward Fane\'s Rosebud (From "Twice Told Tales")'],
 [9220, 'The Threefold Destiny (From "Twice Told 

 [13405, 'Travels and Adventures of Monsieur Violet'],
 [13409, 'The Horse-Stealers and Other Stories'],
 [13412, 'The Schoolmaster and Other Stories'],
 [13413, 'The Party and Other Stories'],
 [13414, 'Love and Other Stories'],
 [13415, 'The Lady with the Dog and Other Stories'],
 [13416, 'The Darling and Other Stories'],
 [13417, "The Cook's Wedding and Other Stories"],
 [13418, 'The Chorus Girl and Other Stories'],
 [13419, 'The Bishop and Other Stories'],
 [13423, 'Zarlah the Martian'],
 [13431, 'André'],
 [13432, 'Miss Bretherton'],
 [13438, "A King's Comrade"],
 [13450, 'The Motor Maids in Fair Japan'],
 [13453, 'The Testing of Diana Mallory'],
 [13454, 'Aylwin'],
 [13455, 'The Rover Boys In The Mountains; Or, A Hunt for Fun and Fortune'],
 [13456, 'Les deux nigauds'],
 [13459, 'The Waters of Edera'],
 [13461, 'Mistress and Maid: A Household Story'],
 [13472, 'Waysiders, Stories of Connacht'],
 [13478, 'Zézette : moeurs foraines'],
 [13490, 'Corysandre'],
 [13496,
  'The White M

 [17789, 'Molly McDonald'],
 [17790, 'Jane Field: A Novel'],
 [17791, "Au large de l'Écueil"],
 [17792, 'The Jamesons'],
 [17793, 'The Debtor: A Novel'],
 [17794, "L'épouvante"],
 [17795, 'La dernière Aldini'],
 [17796, 'Le pays des fourrures'],
 [17798, "L'île à hélice"],
 [17800, 'Wych Hazel'],
 [17801, 'Milly Darrell'],
 [17806, 'Foes in Ambush'],
 [17807, 'Uncle Wiggily in the Woods'],
 [17808, 'Belle-Rose'],
 [17809, 'Sous le burnous'],
 [17811, "Grace Harlowe's Junior Year at High School"],
 [17812, 'Vanhoista kätköistä'],
 [17819, "L'amic Fritz"],
 [17824, 'Little Black Sambo'],
 [17828, 'Excelsior'],
 [17832, 'Une ville flottante'],
 [17837, 'Beatrice Cenci'],
 [17841, 'The Old Flute-Player'],
 [17842, "Dead Man's Rock"],
 [17844, 'Ben Blair'],
 [17849, 'La contessa di Karolystria: Storia tragicomica'],
 [17850, 'La pergamena distrutta'],
 [17851, 'The History of Mary Prince, a West Indian Slave'],
 [17852, "Senz'Amore"],
 [17853, 'Ricordi di Parigi'],
 [17854, 'The Sport of th

 [21548, 'Ystävykset'],
 [21549, 'Jacob Faithful'],
 [21550, "The King's Own"],
 [21551, 'The Little Savage'],
 [21552, 'Masterman Ready: The Wreck of the "Pacific"'],
 [21553, 'Mr. Midshipman Easy'],
 [21554, 'Frank Mildmay'],
 [21555, 'The Mission; or Scenes in Africa'],
 [21556, 'Travels and Adventures of Monsieur Violet'],
 [21557, 'Newton Forster'],
 [21558, 'The Children of the New Forest'],
 [21559, 'The Pirate, and The Three Cutters'],
 [21568, 'Sweet Their Blood and Sticky'],
 [21572, 'Percival Keene'],
 [21573, 'The Phantom Ship'],
 [21574, 'The Poacher'],
 [21575, 'Poor Jack'],
 [21576, 'The Privateersman'],
 [21577, 'Peter Simple'],
 [21578, 'Rattlin the Reefer'],
 [21579, 'Snarleyyow'],
 [21580, 'The Pirate'],
 [21582, 'The Mightiest Man'],
 [21583, 'Children of the Tenements'],
 [21585, 'The Secret Garden'],
 [21586, 'The Sign of the Four'],
 [21587, 'The Parasite'],
 [21588, 'The Reluctant Dragon'],
 [21593, 'Das Urteil: Eine Geschichte'],
 [21594, 'Fred Fenton on the Cr

 [24872, 'The Tale of Master Meadow Mouse'],
 [24879, 'Curious, if True'],
 [24880, 'The Wreck of the Titan'],
 [24881, 'The Tale of Grumpy Weasel'],
 [24895, 'The Call Of The South'],
 [24896, 'The Ebbing Of The Tide'],
 [24898, 'Robert Elsmere'],
 [24907, 'The Lady and the Pirate'],
 [24909, 'The Golden Magnet'],
 [24911, 'Young Glory and the Spanish Cruiser'],
 [24913, 'The Monster'],
 [24916, 'Crown and Anchor'],
 [24918, 'Hollowdell Grange: Holiday Hours in a Country Home'],
 [24919, 'Amor Crioulo'],
 [24920, 'The Book of All-Power'],
 [24921, "It's like this, cat"],
 [24922, 'The Honourable Mr. Tawnish'],
 [24924, "Aventures extraordinaires d'un savant russe"],
 [24925, 'Amar es vencer'],
 [24926, "In the Mahdi's Grasp"],
 [24927, 'A Matter of Magnitude'],
 [24928, 'Longevity'],
 [24933, 'The Man Who Knew'],
 [24936, 'The Thunders of Silence'],
 [24937, 'Mike Marble: His Crotchets and Oddities.'],
 [24945, 'Mufti'],
 [24949, 'Control Group'],
 [24952, 'Âmona; The Child; And The B

 [29421, 'The Floating Island of Madness'],
 [29432, 'The Man the Martians Made'],
 [29437, 'The Martian Cabal'],
 [29439, 'Dr. Sevier'],
 [29445, 'The Hour of Battle'],
 [29446, 'Beside Still Waters'],
 [29447, 'Perez the Mouse'],
 [29448, 'Pariah Planet'],
 [29452, 'The Wings of the Dove, Volume 1 of 2'],
 [29453, 'Traffic in Souls: A Novel of Crime and Its Cure'],
 [29455, 'Invasion'],
 [29457, 'Loot of the Void'],
 [29458, 'Cost of Living'],
 [29462, 'The House Under the Sea: A Romance'],
 [29464, 'Tahiti: Roman aus der Südsee. Zweiter Band.'],
 [29466, 'Lords of the Stratosphere'],
 [29468, 'The Story of Don Quixote'],
 [29471, 'The Velvet Glove'],
 [29475, 'Under Arctic Ice'],
 [29479, 'The Night Riders'],
 [29481, 'The Fifth String'],
 [29483,
  'The Little Brown Hen Hears the Song of the Nightingale & The Golden '
  'Harvest'],
 [29485, 'Faro Nell and Her Friends: Wolfville Stories'],
 [29486, 'A Forest Hearth: A Romance of Indiana in the Thirties'],
 [29487, 'Forever'],
 [2948

 [32886,
  "The Battleship Boys' First Step Upward; Or, Winning Their Grades as Petty "
  'Officers'],
 [32889, 'Cue for Quiet'],
 [32890, 'Home is Where You Left It'],
 [32891, 'Phantom of the Forest'],
 [32893, 'The White Shield'],
 [32894, 'In the Whirl of the Rising'],
 [32895, 'A Veldt Vendetta'],
 [32896, "'Tween Snow and Fire: A Tale of the Last Kafir War"],
 [32897, 'Young Wallingford'],
 [32899, 'The Cosmic Deflector'],
 [32900, 'Rats in the Belfry'],
 [32901, 'The Merchants of Venus'],
 [32902, 'Villa Eden: The Country-House on the Rhine'],
 [32903, 'The Victor'],
 [32904, 'The Huddlers'],
 [32905, 'Mask of Death'],
 [32906, 'Thy Name Is Woman'],
 [32907, 'You Too Can Be A Millionaire'],
 [32909, 'Zero the Slaver: A Romance of Equatorial Africa'],
 [32910, 'With Wolseley to Kumasi: A Tale of the First Ashanti War'],
 [32911, 'The White Hand and the Black: A Story of the Natal Rising'],
 [32912, 'Into the Unknown: A Romance of South Africa'],
 [32914, 'Tales of South Africa'],