### Word Frequency in Classic Novels

##### 1. Import necessary libraries


In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from collections import Counter
import nltk
from nltk.corpus import stopwords
import string
import sys

##### 2. List constants

In [2]:
# URL of the text file to analyze
URL = "https://www.gutenberg.org/cache/epub/15/pg15-images.html"

# Number of most common words to display
TOP_N_WORDS = 10


##### 3. Set up NLTK 

In [3]:
try:
    nltk.data.find('corpora/stopwords')

except nltk.downloader.DownloaderError:
    print("NLTK stopwords not found. Downloading now...")
    try:
        nltk.download('stopwords')
        print("NLTK stopwords downloaded successfully.")
    except Exception as e:
        print(f"Error downloading NLTK stopwords: {e}")
        print("Please ensure you have an internet connection and try again.")
        sys.exit(1) # Exit if stopwords cannot be downloaded

##### 4. Fetch text from url

In [5]:
def fetch_text_from_url(url: str) -> str | None:
    """
    Fetches HTML content from a given URL and extracts text from <p> tags.
    
    Args:
        url (str): The URL of the web page to fetch.
        
    Returns:

        str | None: The extracted text as a single string, or None if an error occurs.
    """
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        # Find all <p> tags and extract their text, joining them into one string
        paragraphs = soup.find_all('p')
        extracted_text = " ".join(p.get_text(strip=True) for p in paragraphs)
        return extracted_text
    except requests.exceptions.RequestException as e:
        print(f"Error fetching content from {url}: {e}")
        return None
    except Exception as e:
        print(f"An unexpected error occured during text extraction: {e}")
        return None

##### 5. Clean and Tokenize Text


In [6]:
def clean_and_tokenize_text (text: str, remove_stopwords: bool = True) -> list[str]:
    """
    Cleans the input text by converting to lowercase, removing punctuations, and optionally
    removing English stopwords.

    Args:
        text (str): The input text string.
        remove_stopwords (bool): If True, remove common English stopwords.

    Returns:
        list[str]: A list of cleaned and tokenized words.
    """

    # Convert text to lowercase 
    cleaned_text = text.lower()

    # Remove punctuation
    # str.maketrans creates a translation table: '' for remoal

    cleaned_text = cleaned_text.translate(str.maketrans('','', string.punctuation))

    # Tokenize the text into words
    tokens = cleaned_text.split()

    if remove_stopwords:
        # Get the set of English stopwords for efficient lookup
        stop_words = set(stopwords.words('english'))

        # Filter out stopwords
        tokens = [word for word in tokens if word not in stop_words]

    return tokens

##### 6. Count Word Frequencies

In [7]:
def count_word_frequencies(word_list: list[str]) -> Counter:
    """
        Counts the frequency of each word in a lst.
    Args:
        word_list (list[str]): A list of words.

    Returns:
        Counter: A Counter object mapping words to their frequencies.
    """
    return Counter(word_list)

##### Main Execution

In [9]:
if __name__ == "__main__":
    print(f"--- Starting Word Frequency Analysis from {URL} ---")

    # Step 1: Fetch text from the URL
    raw_book_text =  fetch_text_from_url(URL)

    if raw_book_text:
        print("\n-- Raw Word Frequencies (before cleaning) ---")
        # Step 2a: Tokenize raw text (without cleaning)
        raw_word_list = raw_book_text.split()
        # Step 3a: Count frequencies for raw words
        raw_word_frequencies = count_word_frequencies(raw_word_list)
        # Print the most common words from the raw text
        print(raw_word_frequencies.most_common(TOP_N_WORDS))

        print("\n--- Cleaning Word Frequencies (after cleaning and stopwords removal")
        # Step 2b: Clean and tokenize text (with stopword removal)
        cleaned_tokens = clean_and_tokenize_text(raw_book_text, remove_stopwords=True)
        # Step 3b: Count frequencies for cleaned tokens
        nlp_word_frequencies = count_word_frequencies(cleaned_tokens)
        # Print the most common words from the cleaned text
        print(nlp_word_frequencies.most_common(TOP_N_WORDS))
    else:
        print("Could not retrieve text. Exiting.")
    
    print("\n--- Analysis Complete ---")

--- Starting Word Frequency Analysis from https://www.gutenberg.org/cache/epub/15/pg15-images.html ---

-- Raw Word Frequencies (before cleaning) ---
[('the', 13662), ('of', 6494), ('and', 5899), ('a', 4477), ('to', 4448), ('in', 3824), ('that', 2679), ('his', 2426), ('I', 1715), ('with', 1647)]

--- Cleaning Word Frequencies (after cleaning and stopwords removal
[('whale', 894), ('one', 880), ('like', 570), ('upon', 560), ('old', 439), ('would', 427), ('man', 414), ('ahab', 400), ('ye', 394), ('ship', 369)]

--- Analysis Complete ---
