In [1]:
import os
import sys

# Enable importing local modules from ./utils.
# Any existing entry is dropped first so that re-running this cell keeps exactly
# one copy of the path, still at the front of sys.path.
UTILS_DIR = os.path.abspath('./utils')
if UTILS_DIR in sys.path:
    sys.path.remove(UTILS_DIR)
sys.path.insert(0, UTILS_DIR)

# Check that Elasticsearch is up and running (run `docker-compose up` first if you haven't already):

In [2]:
from elasticsearch import Elasticsearch

# Create the Elasticsearch client with no explicit host or options.
# NOTE(review): the name `client` is rebound to a pymongo MongoClient in the
# MongoDB check further down, so this Elasticsearch handle is lost once that
# cell has run.
client = Elasticsearch()
# Bare last expression so the notebook displays the count response; getting a
# response at all is the "is it up?" check.
client.count()


{'count': 3,
 '_shards': {'total': 3, 'successful': 3, 'skipped': 0, 'failed': 0}}

# Check that MongoDB is up and running

In [3]:
import pymongo

# Rebinds `client`: from here on it refers to the MongoDB client, not the
# Elasticsearch client created in the previous check. The insertion cell
# later indexes this object with client['bookworm'].
client = pymongo.MongoClient()

# Bare last expression so the notebook displays the server info; getting a
# response at all is the "is it up?" check.
client.server_info()

{'version': '4.2.2',
 'gitVersion': 'a0bbbff6ada159e19298d37946ac8dc4b497eadf',
 'modules': [],
 'allocator': 'tcmalloc',
 'javascriptEngine': 'mozjs',
 'sysInfo': 'deprecated',
 'versionArray': [4, 2, 2, 0],
 'openssl': {'running': 'OpenSSL 1.1.1  11 Sep 2018',
  'compiled': 'OpenSSL 1.1.1  11 Sep 2018'},
 'buildEnvironment': {'distmod': 'ubuntu1804',
  'distarch': 'x86_64',
  'cc': '/opt/mongodbtoolchain/v3/bin/gcc: gcc (GCC) 8.2.0',
  'ccflags': '-fno-omit-frame-pointer -fno-strict-aliasing -ggdb -pthread -Wall -Wsign-compare -Wno-unknown-pragmas -Winvalid-pch -Werror -O2 -Wno-unused-local-typedefs -Wno-unused-function -Wno-deprecated-declarations -Wno-unused-const-variable -Wno-unused-but-set-variable -Wno-missing-braces -fstack-protector-strong -fno-builtin-memcmp',
  'cxx': '/opt/mongodbtoolchain/v3/bin/g++: g++ (GCC) 8.2.0',
  'cxxflags': '-Woverloaded-virtual -Wno-maybe-uninitialized -fsized-deallocation -std=c++17',
  'target_arch': 'x86_64',
  'target_os': 'linux'},
 'bits': 

# Fetch the book files and prepare the data for insertion into MongoDB

In [17]:
import os
import requests
import pandas as pd
import re


# Input CSV of book URLs and the directory reserved for fetched books, both
# located under the local data folder.
DATA_DIR = "data"
URLS = pd.read_csv(os.path.join(DATA_DIR, "book_urls_small.csv"))  # fetch_books reads its 'url' column
OUTPUT_PATH = os.path.join(DATA_DIR, "books")


def fetch_books(urls=URLS, output_path=OUTPUT_PATH):
    """Download the raw text of every book listed in ``urls``.

    Args:
        urls: Table exposing a ``url`` column (by default the module-level
            ``URLS`` frame); every entry is fetched with an HTTP GET.
        output_path: Directory that is created if it does not exist yet.
            Nothing is written into it by this function.

    Returns:
        list[str]: One stripped response body per URL, decoded as UTF-8, in
        the same order as ``urls['url']``.
    """
    os.makedirs(output_path, exist_ok=True)

    books = []

    # Iterate over the `urls` argument rather than the module-level URLS so
    # that a caller-supplied table is actually honoured.
    for url in urls['url']:
        print(url)
        response = requests.get(url, allow_redirects=True)
        # Set the encoding explicitly before reading .text so the body is
        # decoded as UTF-8.
        response.encoding = 'utf-8'
        books.append(response.text.strip())

    return books

def parse_books(raw_books):
    parsed_books = []

    for raw_book in raw_books:
        titleSearch = re.search(r"^Title:\s(.+)$", raw_book, flags=re.MULTILINE)
        authorSearch = re.search(r"^Author:\s(.+)$", raw_book, flags=re.MULTILINE)

        startOfBookSearch = re.search(r"\*\*\* START OF (THIS|THE) PROJECT GUTENBERG EBOOK .+ \*\*\*", raw_book)
        endOfBookSearch = re.search(r"\*\*\* END OF (THIS|THE) PROJECT GUTENBERG EBOOK .+ \*\*\*", raw_book)

        if titleSearch and authorSearch and startOfBookSearch and endOfBookSearch:
            title = titleSearch.group(1).strip()
            author = authorSearch.group(1).strip()
            content = raw_book[startOfBookSearch.end():endOfBookSearch.start()]

            print("Parsed " + title + " by " + author)

            parsed_books.append({ 
                'title': title, 
                'content': content, 
                'author': author })
        else:
            print("Failed to parse book")

    return parsed_books

# Run the pipeline: download every listed book, then extract the
# title/author/content records from the raw texts.
raw_books = fetch_books()
parsed_books = parse_books(raw_books)

print(f'Length of parsed books: {len(parsed_books)!r}')


https://www.gutenberg.org/files/1342/1342-0.txt
https://www.gutenberg.org/files/11/11-0.txt
https://www.gutenberg.org/ebooks/84.txt.utf-8
Parsed Pride and Prejudice by Jane Austen
Parsed Alice’s Adventures in Wonderland by Lewis Carroll
Parsed Frankenstein by Mary Wollstonecraft (Godwin) Shelley
Length of parsed books: 3


# Insert the books into our MongoDB `books` collection

In [14]:
from pymongo import InsertOne

def bulk_insert(books, collection=None):
    """Insert every book document into a MongoDB collection in one bulk write.

    Args:
        books: Iterable of documents (dicts) to insert.
        collection: Collection to write to. When omitted, the module-level
            ``books_collection`` is used, as before.

    Returns:
        None. The number of inserted documents is printed.
    """
    # Named `operations` rather than `requests` so it does not shadow the
    # `requests` HTTP module used elsewhere in this notebook.
    operations = [InsertOne(book) for book in books]

    # An empty request list is rejected by bulk_write, so report zero inserts
    # instead of letting that error surface.
    if not operations:
        print("Inserted: 0")
        return

    target = books_collection if collection is None else collection
    # NOTE(review): pymongo appears to add an `_id` to each inserted dict in
    # place, so inserting the same dicts twice would hit duplicate keys.
    # Confirm before re-running the cell.
    result = target.bulk_write(operations)

    print("Inserted: " + repr(result.inserted_count))

# `client` must be the pymongo MongoClient here. The same name is bound to the
# Elasticsearch client earlier in the notebook, so this cell only works after
# the MongoDB check cell has run.
# NOTE(review): the execution counts (In [14] for this cell, In [17] for the
# fetch/parse cell) show the cells were last run out of order. `parsed_books`
# must already exist when this cell runs.
bookworm_db = client['bookworm']
books_collection = bookworm_db['books']

# bulk_insert writes to the module-level `books_collection` bound just above.
bulk_insert(parsed_books)

Inserted: 3
