In [3]:
import requests
import bs4
import os
import nltk
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import matplotlib as plt

# Create a directory for NLTK data if it doesn't exist
nltk_data_dir = os.path.join(os.path.expanduser("~"), "nltk_data")
if not os.path.exists(nltk_data_dir):
    os.makedirs(nltk_data_dir)

# Append this directory to NLTK data path
nltk.data.path.append(nltk_data_dir)

# Download the resources again
nltk.download('punkt')
nltk.download('wordnet', download_dir=nltk_data_dir)
nltk.download('stopwords', download_dir=nltk_data_dir)

call_of_the_wild_txt_url = 'https://www.gutenberg.org/cache/epub/215/pg215.txt'

if not os.path.exists("downloaded_books"):
    os.makedirs("downloaded_books")

lemmatizer = WordNetLemmatizer()

def getpagetext(parsedpage):
    scriptelements=parsedpage.find_all('script')
    for scriptelement in scriptelements:
        scriptelement.extract()
    pagetext=parsedpage.get_text()
    return pagetext

def parse_webpage(url):
    try:
        response=requests.get(url, timeout=10)
        response.raise_for_status()
        parsed_html=bs4.BeautifulSoup(response.content,'html.parser')
        return parsed_html
    except requests.exceptions.RequestException as e:
        print(f"Error fetcing {url}: {e}" )
        return None

def extract_actual_book_content(read_online_url, book_title):
    parsed_html = parse_webpage(read_online_url)
    if parsed_html is None:
        return ""
    full_text = getpagetext(parsed_html)
    HEADER_TEXT_MARKER = "*** START OF THE PROJECT GUTENBERG EBOOK " + book_title.upper() + " ***"
    FOOTER_TEXT_MARKER = "*** END OF THE PROJECT GUTENBERG EBOOK " + book_title.upper() + " ***"
    start_index = full_text.find(HEADER_TEXT_MARKER) + len(HEADER_TEXT_MARKER)
    end_index = full_text.find(FOOTER_TEXT_MARKER)
    return full_text[start_index:end_index].strip()

def save_text_to_file(author, title, content):
    clean_title = ''.join(c for c in title if c.isalnum() or c.isspace()).replace(' ', '_')
    file_path = os.path.join("downloaded_books", f"{clean_title}.txt")

    try:
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(content)
        print(f"saved {title} by {author} to {file_path}")
    except OSError as e:
        print(f"Error saving {title} by {author} to {file_path}: {e}")

[nltk_data] Downloading package punkt to C:\Users\EmmaK/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\EmmaK\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\EmmaK\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
def crawl_ebook(author, title):
    actual_content = extract_actual_book_content(call_of_the_wild_txt_url, title)
    save_text_to_file(author, title, actual_content)

crawl_ebook("Jack London", "The call of the wild")

saved The call of the wild by Jack London to downloaded_books\The_call_of_the_wild.txt


In [20]:
def tokenize_and_lemmatize(text):
    tokens = word_tokenize(text)
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens if token.isalnum()]  # Ignore punctuation
    return lemmatized_tokens

def process_book(file_path):
    """
    Process a specific book file and return its lemmatized tokens.

    :param file_path: The path to the book file
    :type file_path: str
    :return: List of lemmatized tokens from the book
    :rtype: list
    """
    unified_vocabulary = {}
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read().lower()
            lemmatized_tokens = tokenize_and_lemmatize(text)
            for token in lemmatized_tokens:
                    unified_vocabulary[token] = unified_vocabulary.get(token, 0) + 1
    except FileNotFoundError:
        print(f"Error: {file_path} not found.")
    except OSError as e:
        print(f"Error reading {file_path}: {e}")

    return unified_vocabulary


# Process the books and create a unified vocabulary

file_to_process = ".\downloaded_books\The_call_of_the_wild.txt"  # Specify the path to the specific book
lemmatized_tokens = process_book(file_to_process)
sorted_lemmatized_tokens = sorted(lemmatized_tokens.items(), key=lambda x: x[1], reverse=True)



In [18]:
def prune(vocab):
    nltk_stop_words = nltk.corpus.stopwords.words('english')
    pruned_vocab = []
    for word, freq in vocab:
        # rule 1: check the nltk stop words list
        if(word in nltk_stop_words):
            continue
        # rule 2: check if is in the top 1% of frequent words
        if(freq >= vocab[int(len(vocab)/100)][1]):
            continue
        # rule 3: if the word occurs less than 4 times
        if(freq < 4):
            continue
        # rule 4: word is overly short (less than 3 characters) or long (over than 15 characters)
        if(len(word) < 3 or len(word) > 15):
            continue
        else:
            pruned_vocab.append(word)
    return pruned_vocab

pruned_vocabulary = prune(sorted_lemmatized_tokens)
pruned_top_100_words = pruned_vocabulary[:100]

print("Pruned top 100 words in the downloaded books:")
for word in pruned_top_100_words:
    print(f"{word}")

Pruned top 100 words in the downloaded books:
back
could
upon
men
would
life
made
came
spitz
sled
françois
never
thing
like
foot
two
head
though
camp
night
long
great
way
went
eye
knew
last
trail
club
hand
saw
away
john
till
perrault
half
hundred
three
side
hal
trace
first
every
fire
get
wolf
wild
place
snow
team
teeth
many
another
body
husky
left
old
took
end
ice
behind
come
water
face
found
around
sprang
fell
seemed
mile
dave
hour
forest
rest
shoulder
stood
even
good
said
nothing
leg
work
still
well
throat
turned
run
love
sound
morning
lay
strength
cried
mercedes
know
rope
air
whip
mate
bank


In [25]:
print(lemmatized_tokens)
print(pruned_vocabulary)

def get_text_tokenized(file_path):
    """
    Process a specific book file and return its lemmatized tokens.

    :param file_path: The path to the book file
    :type file_path: str
    :return: List of lemmatized tokens from the book
    :rtype: list
    """
    unified_vocab = []
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read().lower()
            lemmatized_tokens = tokenize_and_lemmatize(text)
            unified_vocab = lemmatized_tokens
    except FileNotFoundError:
        print(f"Error: {file_path} not found.")
    except OSError as e:
        print(f"Error reading {file_path}: {e}")

    return unified_vocab

call_of_the_wild_txt_text = get_text_tokenized(file_to_process)
print(call_of_the_wild_txt_text)



cover
the
call
of
the
wild
by
jack
london
content
chapter
into
the
primitive
chapter
ii
the
law
of
club
and
fang
chapter
iii
the
dominant
primordial
beast
chapter
iv
who
ha
won
to
mastership
chapter
the
toil
of
trace
and
trail
chapter
vi
for
the
love
of
a
man
chapter
vii
the
sounding
of
the
call
chapter
into
the
primitive
old
longing
nomadic
leap
chafing
at
custom
s
chain
again
from
it
brumal
sleep
wakens
the
ferine
buck
did
not
read
the
newspaper
or
he
would
have
known
that
trouble
wa
brewing
not
alone
for
himself
but
for
every
dog
strong
of
muscle
and
with
warm
long
hair
from
puget
sound
to
san
diego
because
men
groping
in
the
arctic
darkness
had
found
a
yellow
metal
and
because
steamship
and
transportation
company
were
booming
the
find
thousand
of
men
were
rushing
into
the
northland
these
men
wanted
dog
and
the
dog
they
wanted
were
heavy
dog
with
strong
muscle
by
which
to
toil
and
furry
coat
to
protect
them
from
the
frost
buck
lived
at
a
big
house
in
the
santa
clara
valley
judge
mil