In [15]:
import requests
import bs4
import os
import nltk
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Create a directory for NLTK data if it doesn't exist
nltk_data_dir = os.path.join(os.path.expanduser("~"), "nltk_data")
if not os.path.exists(nltk_data_dir):
    os.makedirs(nltk_data_dir)

# Append this directory to NLTK data path
nltk.data.path.append(nltk_data_dir)

# Download the resources again
nltk.download('punkt', download_dir=nltk_data_dir)
nltk.download('wordnet', download_dir=nltk_data_dir)
nltk.download('stopwords', download_dir=nltk_data_dir)

Gutenberg_home_page_url = 'https://www.gutenberg.org'
Gutenberg_top_page_url = Gutenberg_home_page_url + '/browse/scores/top'

if not os.path.exists("../week2/downloaded_books"):
    os.makedirs("../week2/downloaded_books")

lemmatizer = WordNetLemmatizer()

def getpagetext(parsedpage):
    scriptelements=parsedpage.find_all('script')
    for scriptelement in scriptelements:
        scriptelement.extract()
    pagetext=parsedpage.get_text()
    return pagetext

def parse_webpage(url):
    try:
        response=requests.get(url, timeout=10)
        response.raise_for_status()
        parsed_html=bs4.BeautifulSoup(response.content,'html.parser')
        return parsed_html
    except requests.exceptions.RequestException as e:
        print(f"Error fetcing {url}: {e}" )
        return None

def extract_actual_book_content(read_online_url, book_title):
    parsed_html = parse_webpage(read_online_url)
    if parsed_html is None:
        return ""
    full_text = getpagetext(parsed_html)
    HEADER_TEXT_MARKER = "*** START OF THE PROJECT GUTENBERG EBOOK " + book_title.upper() + " ***"
    FOOTER_TEXT_MARKER = "*** END OF THE PROJECT GUTENBERG EBOOK " + book_title.upper() + " ***"
    start_index = full_text.find(HEADER_TEXT_MARKER) + len(HEADER_TEXT_MARKER)
    end_index = full_text.find(FOOTER_TEXT_MARKER)
    return full_text[start_index:end_index].strip()

def save_text_to_file(author, title, content):
    clean_title = ''.join(c for c in title if c.isalnum() or c.isspace()).replace(' ', '_')
    file_path = os.path.join("../week2/downloaded_books", f"{clean_title}")

    try:
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(content)
        print(f"saved {title} by {author} to {file_path}")
    except OSError as e:
        print(f"Error saving {title} by {author} to {file_path}: {e}")

[nltk_data] Downloading package punkt to C:\Users\EmmaK\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\EmmaK\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\EmmaK\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
def crawl_ebook(relative_link, author, title):
    e_book_url = Gutenberg_home_page_url + relative_link
    parsed_html = parse_webpage(e_book_url)

    read_online_link = parsed_html.find('a', title='Read online')['href']
    actual_content = extract_actual_book_content(Gutenberg_home_page_url + read_online_link, title)
    save_text_to_file(author, title, actual_content)

def get_author_and_title(relative_link):
    e_book_url = Gutenberg_home_page_url + relative_link
    parsed_html = parse_webpage(e_book_url)

    if parsed_html is None:
        return "Unknown Author", "Unknown Title"

    try:
        book_author = parsed_html.find('a', itemprop="creator").text.strip()
        book_title = parsed_html.find('td', itemprop="headline").text.strip()
    except AttributeError:
        print(f"Error parsing author or title for {relative_link}")
        return "Unknown Author", "Unknown Title"
    
    return book_author, book_title

def print_book_list(books):
    for idx, (author, title) in enumerate(books, 1):
        print(f"{idx}. Author: {author}, Title: {title}")

def gutenberg_top_k_ebook_crawler(top_page_url, k):
    parsed_html = parse_webpage(top_page_url)

    books_last_30_header = parsed_html.find('h2', id='books-last30')
    book_list = books_last_30_header.find_next('ol')
    book_items = book_list.find_all('li')
    e_book_links = []
    for book_item in book_items[0:k]:
        a_element = book_item.find_next('a')
        e_book_links.append(a_element['href'])

    books = [get_author_and_title(link) for link in e_book_links]
    
    print("Books that will be downloaded:")
    print_book_list(books)

    print()

    for i, (link, (author, title)) in enumerate(zip(e_book_links, books)):
        print(f"Downloading book {i+1}/{k}: {title} by {author}")
        crawl_ebook(link, author, title)

gutenberg_top_k_ebook_crawler(Gutenberg_top_page_url, k=20)

Books that will be downloaded:
1. Author: Shelley, Mary Wollstonecraft, 1797-1851, Title: Frankenstein; Or, The Modern Prometheus
2. Author: Shakespeare, William, 1564-1616, Title: Romeo and Juliet
3. Author: Austen, Jane, 1775-1817, Title: Pride and Prejudice
4. Author: Melville, Herman, 1819-1891, Title: Moby Dick; Or, The Whale
5. Author: Eliot, George, 1819-1880, Title: Middlemarch
6. Author: Shakespeare, William, 1564-1616, Title: The Complete Works of William Shakespeare
7. Author: Forster, E. M. (Edward Morgan), 1879-1970, Title: A Room with a View
8. Author: Carroll, Lewis, 1832-1898, Title: Alice's Adventures in Wonderland
9. Author: Alcott, Louisa May, 1832-1888, Title: Little Women; Or, Meg, Jo, Beth, and Amy
10. Author: Von Arnim, Elizabeth, 1866-1941, Title: The Enchanted April
11. Author: Montgomery, L. M. (Lucy Maud), 1874-1942, Title: The Blue Castle: a novel
12. Author: Hawthorne, Nathaniel, 1804-1864, Title: The Scarlet Letter
13. Author: Smollett, T. (Tobias), 1721-1

In [28]:
def tokenize_and_lemmatize(text):
    tokens = word_tokenize(text)
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens if token.isalnum()]  # Ignore punctuation
    return lemmatized_tokens

def process_books(directory="../week2/downloaded_books"):
    unified_vocabulary = {}
    for filename in os.listdir(directory):
        file_path = os.path.join(directory, filename)
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                text = file.read()
                lemmatized_tokens = tokenize_and_lemmatize(text)
                for token in lemmatized_tokens:
                    unified_vocabulary[token] = unified_vocabulary.get(token, 0) + 1
                #pruned_unified_vocabulary = prune(unified_vocabulary)
        except FileNotFoundError:
            print(f"Error: {filename} not found in {directory}")
        except OSError as e:
            print(f"Error reading {filename}: {e}")

    #return pruned_unified_vocabulary
    return sorted(unified_vocabulary.items(), key=lambda x: x[1], reverse=True)

# Process the books and create a unified vocabulary
vocabulary = process_books()

# Sort and display the top 100 words by frequency
top_100_words = vocabulary[:100]
print("Top 100 words in the downloaded books:")
for word, freq in top_100_words:
    print(f"{word}: {freq}")

Top 100 words in the downloaded books:
the: 153898
of: 97619
to: 96664
and: 96274
a: 94100
I: 70335
in: 54990
that: 42800
wa: 36177
his: 35919
he: 34156
with: 32679
you: 32406
it: 31904
my: 30270
her: 29121
not: 28712
for: 28057
had: 26241
is: 24019
be: 23400
me: 21275
s: 20157
have: 20037
him: 19227
at: 19165
this: 18923
she: 18916
which: 18088
on: 16878
by: 16797
but: 16316
all: 14619
so: 14607
from: 12902
your: 12298
The: 12162
And: 11931
will: 11124
would: 11090
said: 10780
no: 10226
an: 9861
were: 9823
one: 9811
who: 9775
are: 9752
they: 9257
we: 9116
if: 8682
do: 8533
or: 8249
been: 8176
what: 8159
more: 8153
their: 7830
when: 7642
But: 7625
them: 7582
very: 7158
than: 6933
there: 6838
now: 6834
He: 6833
could: 6778
our: 6728
out: 6434
man: 6425
some: 6362
time: 6244
up: 6125
such: 6078
upon: 6056
shall: 6053
good: 6029
should: 5965
d: 5914
did: 5904
can: 5873
know: 5835
thou: 5796
any: 5687
am: 5659
like: 5647
into: 5636
then: 5528
much: 5282
say: 5190
It: 5075
must: 4988
little

In [41]:
def prune(vocab):
    nltk_stop_words = nltk.corpus.stopwords.words('english')
    pruned_vocab = []
    for word, freq in vocab:
        # rule 1: check the nltk stop words list
        if(word in nltk_stop_words):
            continue
        # rule 2: check if is in the top 1% of frequent words
        if(freq >= vocab[int(len(vocab)/100)][1]):
            continue
        # rule 3: if the word occurs less than 4 times
        if(freq < 4):
            continue
        # rule 4: word is overly short (less than 3 characters) or long (over than 15 characters)
        if(len(freq) < 3 or len(freq) > 15):
            continue
        else:
            pruned_vocab.append((word,freq))
    return pruned_vocab

pruned_vocabulary = prune(vocabulary)
print(pruned_vocabulary[:100])

the 153898
of 97619
to 96664
and 96274
a 94100
I 70335
in 54990
that 42800
wa 36177
his 35919
he 34156
with 32679
you 32406
it 31904
my 30270
her 29121
not 28712
for 28057
had 26241
is 24019
be 23400
me 21275
s 20157
have 20037
him 19227
at 19165
this 18923
she 18916
which 18088
on 16878
by 16797
but 16316
all 14619
so 14607
from 12902
your 12298
The 12162
And 11931
will 11124
would 11090
said 10780
no 10226
an 9861
were 9823
one 9811
who 9775
are 9752
they 9257
we 9116
if 8682
do 8533
or 8249
been 8176
what 8159
more 8153
their 7830
when 7642
But 7625
them 7582
very 7158
than 6933
there 6838
now 6834
He 6833
could 6778
our 6728
out 6434
man 6425
some 6362
time 6244
up 6125
such 6078
upon 6056
shall 6053
good 6029
should 5965
d 5914
did 5904
can 5873
know 5835
thou 5796
any 5687
am 5659
like 5647
into 5636
then 5528
much 5282
say 5190
It 5075
must 4988
little 4941
To 4900
come 4853
t 4787
make 4770
other 4728
never 4613
see 4613
well 4609
u 4576
What 4570
That 4565
own 4527
thy 4477
ma