# We will use Gutenberg texts as corpus
## This library helps search them
https://pypi.org/project/Gutenberg/

In [1]:
# To install for Python 3 we need BSD-DB
# The code below is for unix, check the library repo
# for other OSs

#!sudo apt-get install libdb++-dev
#!export BERKELEYDB_DIR=/usr
#!pip install gutenberg

In [2]:
# Before searching we must index the texts
# (populate the metadata cache)
# This is a one-time operation that may take a long time to complete
# Let it run overnight

#from gutenberg.acquire import get_metadata_cache
#cache = get_metadata_cache()
#cache.populate()

In [3]:
# Example from the library
# Download book
from gutenberg.acquire import load_etext
from gutenberg.cleanup import strip_headers

text = strip_headers(load_etext(2701)).strip()
text[:1000] # prints 'MOBY DICK; OR THE WHALE\n\nBy Herman Melville ...'

MOBY-DICK;

or, THE WHALE.

By Herman Melville



CONTENTS

ETYMOLOGY.

EXTRACTS (Supplied by a Sub-Sub-Librarian).

CHAPTER 1. Loomings.

CHAPTER 2. The Carpet-Bag.

CHAPTER 3. The Spouter-Inn.

CHAPTER 4. The Counterpane.

CHAPTER 5. Breakfast.

CHAPTER 6. The Street.

CHAPTER 7. The Chapel.

CHAPTER 8. The Pulpit.

CHAPTER 9. The Sermon.

CHAPTER 10. A Bosom Friend.

CHAPTER 11. Nightgown.

CHAPTER 12. Biographical.

CHAPTER 13. Wheelbarrow.

CHAPTER 14. Nantucket.

CHAPTER 15. Chowder.

CHAPTER 16. The Ship.

CHAPTER 17. The Ramadan.

CHAPTER 18. His Mark.

CHAPTER 19. The Prophet.

CHAPTER 20. All Astir.

CHAPTER 21. Going Aboard.

CHAPTER 22. Merry Christmas.

CHAPTER 23. The Lee Shore.

CHAPTER 24. The Advocate.

CHAPTER 25. Postscript.

CHAPTER 26. Knights and Squires.

CHAPTER 27. Knights and Squires.

CHAPTER 28. Ahab.

CHAPTER 29. Enter Ahab; to Him, Stubb.

CHAPTER 30. The Pipe.

CHAPTER 31. Queen Mab.

CHAPTER 32. Cetology.

CHAPTER 33. The Specksnyder.

CHAPTER 34. The Ca

In [4]:
# Available metadata
from gutenberg.query import list_supported_metadatas

list_supported_metadatas() # prints (u'author', u'formaturi', u'language', ...)

('author', 'formaturi', 'language', 'rights', 'subject', 'title')


In [5]:
# Next: find books available in more than one language
from gutenberg.query import get_etexts
from gutenberg.query import get_metadata

texts = melville_texts = get_etexts('author', 'Melville, Herman')
for t in texts:
    print(t, get_metadata('title', t))

2694 frozenset({'I and My Chimney'})
2701 frozenset({'Moby Dick; Or, The Whale'})
15 frozenset({'Moby-Dick; or, The Whale'})
13720 frozenset({'Mardi: and A Voyage Thither, Vol. I'})
13721 frozenset({'Mardi: and A Voyage Thither, Vol. II'})
34970 frozenset({'Pierre; or The Ambiguities'})
23969 frozenset({'Typee'})
12841 frozenset({'John Marr and Other Poems'})
9268 frozenset({'Omoo: Adventures in the South Seas'})
9269 frozenset({'Typee'})
8118 frozenset({'Redburn. His First Voyage\r\nBeing the Sailor Boy Confessions and Reminiscences of the Son-Of-A-Gentleman in the Merchant Navy'})
21816 frozenset({'The Confidence-Man: His Masquerade'})
2489 frozenset({'Moby Dick; Or, The Whale'})
9146 frozenset({'I and My Chimney'})
9147 frozenset({'Moby Dick'})
15422 frozenset({'Israel Potter: His Fifty Years of Exile'})
4045 frozenset({'Omoo: Adventures in the South Seas'})
10712 frozenset({'White Jacket; Or, The World on a Man-of-War'})
11231 frozenset({'Bartleby, the Scrivener: A Story of Wall-St

In [16]:
texts = get_etexts('author', 'Dickens, Charles')
for t in texts:
    lng, = get_metadata('language', t)
    title, = get_metadata('title', t)
    print(t, lng, title)

9728 en Pictures from Italy
9729 en Perils of Certain English Prisoners
9730 en The Pickwick Papers
9731 en To Be Read at Dusk
9732 en Barnaby Rudge
9733 en Sketches by Boz, illustrative of everyday life and every-day people
9734 en Sketches of Young Gentlemen
9735 en Somebody's Luggage
9736 en Sunday under Three Heads
9737 en The Seven Poor Travellers
9738 en The Chimes
A Goblin Story of Some Bells That Rang an Old Year out and a New Year In
9739 en Cricket on the Hearth
9740 en Tom Tiddler's Ground
9741 en The Uncommercial Traveller
9742 en The Wreck of the Golden Mary
9743 en Sketches of Young Couples
9744 en David Copperfield
49683 en Cruikshank's Water Colours
61994 fr Olivier Twist: Les voleurs de Londres
46 en A Christmas Carol in Prose; Being a Ghost Story of Christmas
19505 en A Christmas Carol
564 en The Mystery of Edwin Drood
580 en The Pickwick Papers
588 en Master Humphrey's Clock
46675 en Oliver Twist; or, The Parish Boy's Progress. Illustrated
50771 nl Slechte Tijden
98 

In [21]:
Dickens_2_cities = {'en': 98, 'de': 58145, 'es': 61887}

In [19]:
texts = get_etexts('title', 'Oliver Twist')
Dickens_Twist = {}
for t in texts:
    lng, = get_metadata('language', t)
    title, = get_metadata('title', t)
    Dickens_Twist[lng] = t
    print(t, lng, title)

16023 fr Oliver Twist
730 en Oliver Twist
56586 de Oliver Twist
9727 en Oliver Twist


In [20]:
Dickens_Twist

{'fr': 16023, 'en': 9727, 'de': 56586}

In [24]:
books = {}
for ln in ['es', 'de', 'en', 'fr', 'ru']:
    books[ln] =  list(get_etexts('language', ln))

In [39]:
import pickle
with open('data/books.pkl', 'wb') as f:
    pickle.dump(books, f)