In [34]:
from collections import Counter
from re import split, sub
from requests import get

from bs4 import BeautifulSoup
from nltk import download
from nltk.corpus import stopwords

download("stopwords")
stop_words = stopwords.words("english") + ["ut", "'re", ".", ",", "--", "'s", "?", "(", ")", ":", "'", '"', "-", "{", "}", "&", "|", u"\u2014"]

books = {}
book_contents = {}
book_word_frequencies = {}
book_word_distributions = {}
book_bucketed_word_distributions = {}

def get_book(URL, book_name):
    # Get book
    book = get(URL).text

    # Clean html
    book = BeautifulSoup(book, "html.parser").get_text()

    # Save to globals
    books[book_name] = book

def get_book_contents(book_name):
    if book_name not in books:
        raise "Get book by calling get_book first"

    contents = books[book_name].lower()
    contents = split("\s+", contents)
    for i, word in enumerate(contents):
        word = sub('[,"\.\'&\|:@>*;/=]', "", word)
        contents[i] = sub('^[0-9\.]*$', "", word)

    book_contents[book_name] = contents

    if book_name in book_word_frequencies:
        del book_word_frequencies[book_name]

def get_book_word_frequencies(book_name):
    if book_name not in book_contents:
        raise "Get book contents by calling get_book_contents first"

    word_frequencies = Counter(book_contents[book_name])
    for word in stop_words:
        word_frequencies.pop(word, None)
    book_word_frequencies[book_name] = word_frequencies

    if book_name in book_word_distributions:
        del book_word_distributions[book_name]

    if book_name in book_bucketed_word_distributions:
        del book_bucketed_word_distributions[book_name]

def get_all_book_info(URL, book_name):
    get_book(URL, book_name)
    get_book_contents(book_name)
    get_book_word_frequencies(book_name)

def get_total_word_count(book_name):
    if book_name not in book_word_frequencies:
        raise "Get book word frequencies by calling get_book_word_frequencies first"

    return book_word_frequencies[book_name].total()

def get_n_most_common(book_name, n):
    if book_name not in book_word_frequencies:
        raise "Get book word frequencies by calling get_book_word_frequencies first"

    return book_word_frequencies[book_name].most_common(n)

def find_word_distribution(book_name, word):
    if book_name not in book_contents:
        raise "Get book contents by calling get_book_contents first"

    if book_name in book_word_distributions:
        if word in book_word_distributions[book_name]:
            return book_word_distributions[book_name][word]

    indicies = [i for i, book_word in enumerate(book_contents[book_name]) if book_word == word]
    book_word_distributions[book_name] = {}
    book_word_distributions[book_name][word] = indicies
    return indicies

def find_bucketed_word_distribution(book_name, word, bucket_size):
    if book_name not in book_contents:
        raise "Get book contents by calling get_book_contents first"

    if book_name in book_bucketed_word_distributions:
        if word in book_bucketed_word_distributions[book_name]:
            if bucket_size in book_bucketed_word_distributions[book_name][word]:
                return book_bucketed_word_distributions[book_name][word][bucket_size]

    indicies = find_word_distribution(book_name, word)
    buckets = [0 for i in range(((len(book_contents[book_name]) - 1) // bucket_size) + 1)]
    for index in indicies:
        buckets[index // bucket_size] += 1

    book_bucketed_word_distributions[book_name] = {}
    book_bucketed_word_distributions[book_name][word] = {}
    book_bucketed_word_distributions[book_name][word][bucket_size] = buckets
    return buckets

def get_window_around_word(book_name, index, n):
    if book_name not in book_contents:
        raise "Get book contents by calling get_book_contents first"

    start = max(index - n, 0)
    end = min(index + n + 1, len(book_contents[book_name]) - 1)

    return book_contents[book_name][start:end]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [35]:
book_1 = "Frankenstein; Or, The Modern Prometheus by Mary Wollstonecraft Shelley"
get_all_book_info("https://www.gutenberg.org/ebooks/84.txt.utf-8", book_1)

In [50]:
print(get_total_word_count(book_1))
print(get_n_most_common(book_1, 15))
print(find_bucketed_word_distribution(book_1, "creature", 5000))
print(get_window_around_word(book_1, 18025, 25))
print(get_window_around_word(book_1, 24727, 25))
print(get_window_around_word(book_1, 40983, 25))

36858
[('one', 199), ('could', 197), ('would', 183), ('yet', 149), ('upon', 128), ('man', 123), ('may', 112), ('first', 108), ('life', 108), ('might', 108), ('father', 108), ('every', 106), ('shall', 105), ('eyes', 103), ('said', 102)]
[3, 2, 0, 4, 2, 3, 8, 0, 2, 5, 2, 2, 2, 2, 4, 0]
['to', 'give', 'her', 'an', 'education', 'superior', 'to', 'that', 'which', 'she', 'had', 'at', 'first', 'intended', 'this', 'benefit', 'was', 'fully', 'repaid', 'justine', 'was', 'the', 'most', 'grateful', 'little', 'creature', 'in', 'the', 'world', 'i', 'do', 'not', 'mean', 'that', 'she', 'made', 'any', 'professions', 'i', 'never', 'heard', 'one', 'pass', 'her', 'lips', 'but', 'you', 'could', 'see', 'by', 'her']
['even', 'long', 'before', 'his', 'birth', 'it', 'may', 'therefore', 'be', 'judged', 'indecent', 'in', 'me', 'to', 'come', 'forward', 'on', 'this', 'occasion', 'but', 'when', 'i', 'see', 'a', 'fellow', 'creature', 'about', 'to', 'perish', 'through', 'the', 'cowardice', 'of', 'her', 'pretended', '

In [39]:
book_2 = "The Great Gatsby by F. Scott Fitzgerald"
get_all_book_info("https://www.gutenberg.org/ebooks/64317.txt.utf-8", book_2)

In [52]:
print(get_total_word_count(book_2))
print(get_n_most_common(book_2, 15))
print(find_bucketed_word_distribution(book_2, "jay", 5000))
print(get_window_around_word(book_1, 18026, 25))
print(get_window_around_word(book_1, 26061, 25))
print(get_window_around_word(book_1, 26935, 25))

27327
[('said', 232), ('gatsby', 175), ('“i', 175), ('tom', 163), ('one', 135), ('daisy', 135), ('like', 116), ('came', 108), ('back', 105), ('little', 102), ('went', 90), ('man', 86), ('project', 85), ('house', 85), ('eyes', 85)]
[0, 0, 1, 2, 1, 4, 0, 1, 0, 0, 0]
['give', 'her', 'an', 'education', 'superior', 'to', 'that', 'which', 'she', 'had', 'at', 'first', 'intended', 'this', 'benefit', 'was', 'fully', 'repaid', 'justine', 'was', 'the', 'most', 'grateful', 'little', 'creature', 'in', 'the', 'world', 'i', 'do', 'not', 'mean', 'that', 'she', 'made', 'any', 'professions', 'i', 'never', 'heard', 'one', 'pass', 'her', 'lips', 'but', 'you', 'could', 'see', 'by', 'her', 'eyes']
['awful', 'boundary', 'between', 'life', 'and', 'death', 'felt', 'not', 'as', 'i', 'did', 'such', 'deep', 'and', 'bitter', 'agony', 'i', 'gnashed', 'my', 'teeth', 'and', 'ground', 'them', 'together', 'uttering', 'a', 'groan', 'that', 'came', 'from', 'my', 'inmost', 'soul', 'justine', 'started', 'when', 'she', 'saw

In [43]:
book_3 = "Alice's Adventures in Wonderland by Lewis Carroll"
get_all_book_info("https://www.gutenberg.org/ebooks/11.txt.utf-8", book_3)

In [53]:
print(get_total_word_count(book_3))
print(get_n_most_common(book_3, 15))
print(find_bucketed_word_distribution(book_3, "alice", 3000))
print(get_window_around_word(book_1, 514, 25))
print(get_window_around_word(book_1, 9033, 25))
print(get_window_around_word(book_1, 25617, 25))

15785
[('said', 455), ('alice', 374), ('little', 124), ('“i', 119), ('', 88), ('one', 88), ('project', 83), ('went', 83), ('like', 78), ('could', 75), ('thought', 73), ('would', 72), ('see', 65), ('queen', 64), ('know', 61)]
[29, 37, 34, 44, 59, 55, 56, 34, 26, 0]
['attracts', 'the', 'needle', 'and', 'may', 'regulate', 'a', 'thousand', 'celestial', 'observations', 'that', 'require', 'only', 'this', 'voyage', 'to', 'render', 'their', 'seeming', 'eccentricities', 'consistent', 'for', 'ever', 'i', 'shall', 'satiate', 'my', 'ardent', 'curiosity', 'with', 'the', 'sight', 'of', 'a', 'part', 'of', 'the', 'world', 'never', 'before', 'visited', 'and', 'may', 'tread', 'a', 'land', 'never', 'before', 'imprinted', 'by', 'the']
['in', 'the', 'eighteenth', 'century', 'but', 'while', 'i', 'followed', 'the', 'routine', 'of', 'education', 'in', 'the', 'schools', 'of', 'geneva', 'i', 'was', 'to', 'a', 'great', 'degree', 'self-taught', 'with', 'regard', 'to', 'my', 'favourite', 'studies', 'my', 'father',