In [1]:
import os
import sys

# Enable importing local modules from ./utils: its absolute path is put at
# the front of sys.path (index 0), ahead of every entry already there.
sys.path.insert(0, os.path.abspath('./utils'))

# Check that Elasticsearch is up and running (run `docker-compose up` first if you haven't already):

In [32]:
from elasticsearch import Elasticsearch

# Create the client with no arguments, i.e. with the library's default
# connection settings.
# NOTE(review): the MongoDB check further down also assigns `client`, so
# this Elasticsearch client is no longer reachable under that name once
# that cell has run.
client = Elasticsearch()
# Last expression of the cell: the count response is shown as the cell output.
client.count()


{'count': 3,
 '_shards': {'total': 3, 'successful': 3, 'skipped': 0, 'failed': 0}}

# Check that MongoDB is up and running

In [33]:
import pymongo

# Create the client with no arguments, i.e. with the library's default
# connection settings.
# NOTE(review): this rebinds `client`, which the Elasticsearch check above
# also assigns; the later `client['bookworm']` lookup needs this
# MongoClient binding, so this cell must run after the Elasticsearch one.
client = pymongo.MongoClient()

# Last expression of the cell: the server info dict is shown as the cell output.
client.server_info()

{'version': '4.2.2',
 'gitVersion': 'a0bbbff6ada159e19298d37946ac8dc4b497eadf',
 'modules': [],
 'allocator': 'tcmalloc',
 'javascriptEngine': 'mozjs',
 'sysInfo': 'deprecated',
 'versionArray': [4, 2, 2, 0],
 'openssl': {'running': 'OpenSSL 1.1.1  11 Sep 2018',
  'compiled': 'OpenSSL 1.1.1  11 Sep 2018'},
 'buildEnvironment': {'distmod': 'ubuntu1804',
  'distarch': 'x86_64',
  'cc': '/opt/mongodbtoolchain/v3/bin/gcc: gcc (GCC) 8.2.0',
  'ccflags': '-fno-omit-frame-pointer -fno-strict-aliasing -ggdb -pthread -Wall -Wsign-compare -Wno-unknown-pragmas -Winvalid-pch -Werror -O2 -Wno-unused-local-typedefs -Wno-unused-function -Wno-deprecated-declarations -Wno-unused-const-variable -Wno-unused-but-set-variable -Wno-missing-braces -fstack-protector-strong -fno-builtin-memcmp',
  'cxx': '/opt/mongodbtoolchain/v3/bin/g++: g++ (GCC) 8.2.0',
  'cxxflags': '-Woverloaded-virtual -Wno-maybe-uninitialized -fsized-deallocation -std=c++17',
  'target_arch': 'x86_64',
  'target_os': 'linux'},
 'bits': 

# Fetch the book files and prepare the data for insertion into MongoDB

In [36]:
import os
import requests
import pandas as pd
import re
from nltk import sent_tokenize


# Table of books to download, read from data/book_urls.csv; fetch_books
# reads the download addresses from its 'url' column.
URLS = pd.read_csv(os.path.join("data", "book_urls.csv"))
# Default directory (data/books) that fetch_books creates if it is missing.
OUTPUT_PATH = os.path.join("data", "books")


def fetch_books(urls=URLS, output_path=OUTPUT_PATH, limit=3):
    """Download the raw text of the first ``limit`` books listed in ``urls``.

    Args:
        urls: Table-like object whose ``'url'`` entry is a sliceable sequence
            of download URLs (for example the DataFrame stored in ``URLS``,
            which is the default).
        output_path: Directory that is created if it does not already exist.
            This function does not write any files into it.
        limit: Maximum number of URLs to fetch, counted from the start of
            the ``'url'`` sequence. ``None`` fetches every URL. Defaults to
            3, matching the previously hard-coded slice.

    Returns:
        list[str]: The whitespace-stripped response body for each fetched
        URL, in the same order as the URLs.

    Raises:
        requests.exceptions.RequestException: If a request fails or does not
            respond within the timeout.
    """
    os.makedirs(output_path, exist_ok=True)

    books = []

    # Iterate over the `urls` argument rather than the global URLS, so a
    # caller-supplied table is actually honoured.
    for url in urls['url'][:limit]:
        print(url)
        # A timeout keeps a stalled server from hanging the notebook forever.
        r = requests.get(url, allow_redirects=True, timeout=30)
        # Decode the body as UTF-8 instead of the encoding guessed by requests.
        r.encoding = 'utf-8'
        books.append(r.text.strip())

    return books

def parse_books(raw_books):
    """Turn raw Project Gutenberg texts into a flat list of sentence records.

    For each text, the title and author are read from the ``Title:`` and
    ``Author:`` header lines, and the body is taken from between the
    ``*** START OF ...`` and ``*** END OF ...`` marker lines. The body is
    split into sentences with ``sent_tokenize``. A text missing any of the
    four pieces is skipped and "Failed to parse book" is printed.

    Args:
        raw_books: Iterable of strings, one full book text each.

    Returns:
        list[dict]: One dict per sentence across all parsed books, with the
        keys ``'title'``, ``'location'`` (index of the sentence within its
        book), ``'text'`` and ``'author'``.
    """
    records = []

    for book_text in raw_books:
        title_match = re.search(r"^Title:\s(.+)$", book_text, flags=re.MULTILINE)
        author_match = re.search(r"^Author:\s(.+)$", book_text, flags=re.MULTILINE)

        start_marker = re.search(r"\*\*\* START OF (THIS|THE) PROJECT GUTENBERG EBOOK .+ \*\*\*", book_text)
        end_marker = re.search(r"\*\*\* END OF (THIS|THE) PROJECT GUTENBERG EBOOK .+ \*\*\*", book_text)

        # Guard clause: every header field and both markers are required.
        if not (title_match and author_match and start_marker and end_marker):
            print("Failed to parse book")
            continue

        title = title_match.group(1).strip()
        author = author_match.group(1).strip()

        # Keep only the text between the start and end markers.
        body = book_text[start_marker.end():end_marker.start()]
        # Replace CRLF line breaks with a space.
        body = re.sub(r"\r\n", " ", body)
        # Drop the underscores that mark italics in the source text.
        body = re.sub(r"_", "", body)

        sentences = sent_tokenize(body)

        print("Parsed " + title + " by " + author)

        records.extend(
            {
                'title': title,
                'location': position,
                # split() + join collapses any run of whitespace to one space
                'text': " ".join(sentence.split()),
                'author': author,
            }
            for position, sentence in enumerate(sentences)
        )

    return records

# Download the raw book texts, then break them into per-sentence records.
raw_books = fetch_books()
parsed_books = parse_books(raw_books)

# Report how many sentence records were produced in total.
print(f"Length of parsed books: {len(parsed_books)!r}")


https://www.gutenberg.org/files/1342/1342-0.txt
https://www.gutenberg.org/files/11/11-0.txt
https://www.gutenberg.org/ebooks/84.txt.utf-8
Parsed Pride and Prejudice by Jane Austen
Parsed Alice’s Adventures in Wonderland by Lewis Carroll
Parsed Frankenstein by Mary Wollstonecraft (Godwin) Shelley
Length of parsed books: 8948


# Insert the parsed books into our MongoDB `books` collection

In [44]:

# `client` must be the pymongo MongoClient here; the Elasticsearch check
# assigns the same name, so the MongoDB check cell has to have run last.
bookworm_db = client['bookworm']
books_collection = bookworm_db['books']

# Insert every sentence record produced by parse_books in one call.
# NOTE(review): PyMongo is documented to add an `_id` to each inserted dict
# in place, so re-running this cell with the same `parsed_books` presumably
# fails on duplicate keys — confirm, and re-run the parsing cell first.
books_collection.insert_many(parsed_books)




<pymongo.results.InsertManyResult at 0x1a2333b308>

In [28]:
# Scratch check of the line-break substitution used in parse_books, run on a
# short excerpt that contains "\r\n" breaks followed by indentation.
# NOTE(review): this cell uses `re`, which is imported in the book-fetching
# cell, so that cell has to run first.
test = "“Why, my dear, you must know, Mrs. Long says that Netherfield is\r\n      taken by a young man of large fortune from the north of England;\r\n      that he came down on Monday in a chaise and four to see the\r\n      place, and was so much delighted with it, that he agreed with Mr.\r\n      Morris immediately; that he is to take possession before\r\n      Michaelmas, and some of his servants are to be in the house by\r\n      the end of next week.”\r\n\r\n      “What is his name?”\r\n\r\n      “Bingley.”\r\n\r\n      “Is he married or single?”\r\n\r\n      “Oh!"

# Each "\r\n" becomes a single space, but the indentation spaces remain, which
# is why parse_books also collapses whitespace with " ".join(sentence.split()).
re.sub(r"\r\n"," ", test)

'“Why, my dear, you must know, Mrs. Long says that Netherfield is       taken by a young man of large fortune from the north of England;       that he came down on Monday in a chaise and four to see the       place, and was so much delighted with it, that he agreed with Mr.       Morris immediately; that he is to take possession before       Michaelmas, and some of his servants are to be in the house by       the end of next week.”        “What is his name?”        “Bingley.”        “Is he married or single?”        “Oh!'