In [28]:
import os
import re
import random
import requests
import pandas as pd
from bs4 import BeautifulSoup
from nltk.tokenize import sent_tokenize, word_tokenize
from collections import Counter
import logging


# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s')
logger = logging.getLogger(__name__)

BASE_URL = "https://www.gutenberg.org"

def get_bookshelves(timeout=30):
    """
    Fetches all bookshelves (genres) from Project Gutenberg.

    Every anchor on the bookshelf index page whose href contains
    "/ebooks/bookshelf/" is treated as a bookshelf link.

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests may block indefinitely on a stalled connection.

    Returns:
        dict mapping the link text (genre name) to the absolute bookshelf URL.
        If two links share the same text, the later one wins.

    Raises:
        Exception: If the index page does not return HTTP 200.
    """
    # Local import keeps this block self-contained.
    from urllib.parse import urljoin

    index_url = f"{BASE_URL}/ebooks/bookshelf/"
    response = requests.get(index_url, timeout=timeout)
    if response.status_code != 200:
        raise Exception("Failed to fetch the bookshelves page.")

    soup = BeautifulSoup(response.text, 'html.parser')
    bookshelves = {}
    for link in soup.find_all("a", href=True):
        href = link['href']
        if "/ebooks/bookshelf/" not in href:
            continue
        genre_name = link.text.strip()
        if not genre_name:
            # Links without visible text (e.g. icon-only anchors) cannot be
            # looked up by genre name, so they are not useful as keys.
            continue
        # urljoin resolves root-relative, relative and absolute hrefs alike,
        # whereas plain concatenation only works for root-relative ones.
        bookshelves[genre_name] = urljoin(index_url, href)

    logger.info(f"Found {len(bookshelves)} bookshelves.")
    return bookshelves

def fetch_books_from_shelf(shelf_url, output_folder, num_books=10, timeout=30):
    """
    Fetches books from a specific bookshelf URL, removes non-English books,
    and cleans the title to exclude download numbers or extra details.

    A random sample of at most ``num_books`` books is downloaded into
    ``<output_folder>/raw`` and a ``metadata.csv`` describing the successful
    downloads is written to ``output_folder``.

    Args:
        shelf_url: URL of the bookshelf page to scrape.
        output_folder: Folder that receives the ``raw`` subfolder and
            ``metadata.csv``.
        num_books: Maximum number of books to download.
        timeout: Seconds to wait for the bookshelf page before giving up.

    Returns:
        list of dicts with keys ``title``, ``url`` and ``genre`` (the last
        path component of ``shelf_url``), one per successfully downloaded
        book. Empty list if the shelf contains no usable book links.

    Raises:
        Exception: If the bookshelf page does not return HTTP 200.
    """
    response = requests.get(shelf_url, timeout=timeout)
    if response.status_code != 200:
        raise Exception(f"Failed to fetch books from bookshelf: {shelf_url}")

    soup = BeautifulSoup(response.text, 'html.parser')
    non_english_tags = ("(French)", "(German)", "(Spanish)")
    book_links = []
    seen_urls = set()
    seen_titles = set()

    # Extract book links
    for link in soup.find_all("a", href=True):
        href = link['href']
        ebook_id = href.split("/")[-1]
        if not (href.startswith("/ebooks/") and ebook_id.isdigit()):
            continue

        full_url = f"{BASE_URL}{href}.txt.utf-8"
        if full_url in seen_urls:
            continue
        seen_urls.add(full_url)

        # The link text spans several lines (title / author / downloads);
        # collapse it first so log records stay on a single line.
        raw_title = re.sub(r'\s+', ' ', link.text).strip()

        # Filter non-English books
        if any(tag in raw_title for tag in non_english_tags):
            logger.info(f"Skipping non-English book: {raw_title}")
            continue

        # Clean title to remove extra details like downloads, then replace
        # characters that are not allowed in file names.
        sanitized_title = re.sub(r"\s*\d+\s*downloads$", "", raw_title).strip()
        sanitized_title = re.sub(r'[\\/*?:"<>|]', "_", sanitized_title).strip()

        if not sanitized_title:
            # An empty title would produce a file literally named ".txt".
            sanitized_title = f"ebook_{ebook_id}"

        # Two editions with the same title would otherwise be written to the
        # same file. Compare case-insensitively because some file systems
        # treat names that differ only by case as identical.
        title_key = sanitized_title.casefold()
        if title_key in seen_titles:
            sanitized_title = f"{sanitized_title} ({ebook_id})"
            title_key = sanitized_title.casefold()
        seen_titles.add(title_key)

        book_links.append({
            "title": sanitized_title,
            "url": full_url
        })

    logger.info(f"Found {len(book_links)} books in the selected bookshelf.")
    if not book_links:
        logger.warning("No books found in this bookshelf.")
        return []

    # Select a limited number of books
    selected_books = random.sample(book_links, min(num_books, len(book_links)))

    # Prepare output folder (makedirs also creates output_folder itself)
    raw_folder = os.path.join(output_folder, "raw")
    os.makedirs(raw_folder, exist_ok=True)

    genre = os.path.basename(shelf_url)
    metadata = []
    for book in selected_books:
        raw_file_path = os.path.join(raw_folder, f"{book['title']}.txt")
        # Only books that were actually saved are recorded in the metadata.
        if download_gutenberg_text(book["url"], raw_file_path):
            metadata.append({"title": book["title"], "url": book["url"], "genre": genre})

    # Save metadata
    if metadata:
        metadata_df = pd.DataFrame(metadata)
        metadata_df.to_csv(os.path.join(output_folder, "metadata.csv"), index=False)
        logger.info(f"Metadata saved to {output_folder}/metadata.csv")
    return metadata



def download_gutenberg_text(url, save_path, timeout=30):
    """
    Downloads text from a Project Gutenberg URL and saves it locally.

    Args:
        url: URL of the plain-text file to download.
        save_path: Destination file path; written as UTF-8.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        True if the file was downloaded and written, False if the request
        failed (network error, timeout, or a non-200 status).
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Callers rely on the boolean result to skip failed books, so a
        # network error for one book must not abort the whole batch.
        logger.error(f"Failed to download: {url} ({exc})")
        return False

    if response.status_code != 200:
        logger.error(f"Failed to download: {url}")
        return False

    # When the server declares no charset, requests decodes text/* bodies as
    # ISO-8859-1, which garbles non-ASCII characters. Default to UTF-8 then.
    content_type = response.headers.get("Content-Type", "")
    if "charset" not in content_type.lower():
        response.encoding = "utf-8"

    # newline='' writes line endings exactly as received; default text mode
    # on Windows would expand the "\n" of each "\r\n" into "\r\r\n".
    with open(save_path, 'w', encoding='utf-8', newline='') as file:
        file.write(response.text)
    logger.info(f"Downloaded: {save_path}")
    return True

def preprocess_books(raw_folder, clean_folder):
    """
    Cleans every ``.txt`` file in ``raw_folder`` and records linguistic features.

    Each cleaned text is written to ``clean_folder`` under the same file name.
    Per-file statistics (vocabulary richness, average sentence length,
    sentence and word counts, most common token) are collected and saved as
    ``linguistic_features.csv`` inside ``clean_folder``. Returns nothing.
    """
    os.makedirs(clean_folder, exist_ok=True)

    text_files = [name for name in os.listdir(raw_folder) if name.endswith(".txt")]
    rows = []

    for file_name in text_files:
        # Read the raw book and strip the boilerplate.
        with open(os.path.join(raw_folder, file_name), 'r', encoding='utf-8') as source:
            cleaned = clean_text(source.read())

        # Persist the cleaned version next to the features file.
        with open(os.path.join(clean_folder, file_name), 'w', encoding='utf-8') as target:
            target.write(cleaned)

        # Tokenize once at word level and once at sentence level.
        words = word_tokenize(cleaned)
        sents = sent_tokenize(cleaned)
        counts = Counter(words)

        if words:
            richness = len(set(words)) / len(words)
        else:
            richness = 0

        if sents:
            mean_sentence_len = sum(len(word_tokenize(sent)) for sent in sents) / len(sents)
        else:
            mean_sentence_len = 0

        top_entry = counts.most_common(1)
        top_word = top_entry[0][0] if top_entry else None

        rows.append({
            "file_name": file_name,
            "vocab_richness": richness,
            "avg_sentence_length": mean_sentence_len,
            "num_sentences": len(sents),
            "num_words": len(words),
            "most_common_word": top_word
        })
        logger.info(f"Processed: {file_name}")

    if not rows:
        logger.warning("No features extracted; the dataset might be empty.")
        return

    # Save features to CSV
    csv_path = os.path.join(clean_folder, "linguistic_features.csv")
    pd.DataFrame(rows).to_csv(csv_path, index=False)
    logger.info(f"Linguistic features saved to {clean_folder}/linguistic_features.csv")

# Start-of-book marker. Both the legacy "THIS" wording and the "THE" wording
# are accepted. The optional tail consumes the title and the closing "***" so
# they do not leak into the cleaned text; it is length-bounded and excludes
# "*" so a malformed marker cannot swallow the book body.
_GUTENBERG_START_RE = re.compile(
    r"\*\*\*\s*START OF (?:THE|THIS) PROJECT GUTENBERG EBOOK(?:[^*]{0,500}\*\*\*)?"
)
# End-of-book marker; everything from its opening "***" onwards is dropped.
_GUTENBERG_END_RE = re.compile(
    r"\*\*\*\s*END OF (?:THE|THIS) PROJECT GUTENBERG EBOOK"
)


def clean_text(text):
    """
    Cleans the raw text by removing headers, footers, and extra spaces.

    All runs of whitespace are collapsed to a single space. Text up to and
    including the "*** START OF THE/THIS PROJECT GUTENBERG EBOOK ... ***"
    line is dropped, as is everything from the matching "*** END OF ..."
    marker onwards. Each marker is handled independently, so a file that has
    only one of them is still trimmed on that side.

    Args:
        text: Raw book text.

    Returns:
        The cleaned text with leading and trailing whitespace stripped. If no
        marker is present, the whitespace-normalized text is returned.
    """
    text = re.sub(r"\s+", " ", text)  # Remove extra whitespace

    start_match = _GUTENBERG_START_RE.search(text)
    body_start = start_match.end() if start_match else 0

    # Search for the footer only after the header so an earlier mention of
    # the end marker cannot produce an empty slice.
    end_match = _GUTENBERG_END_RE.search(text, body_start)
    body_end = end_match.start() if end_match else len(text)

    return text[body_start:body_end].strip()

if __name__ == "__main__":
    # Script entry point: download a sample of books for a few genres and
    # preprocess whatever was downloaded.

    # Get bookshelves
    bookshelves = get_bookshelves()

    # Choose genres to process
    selected_genres = ["Science Fiction", "Fantasy", "Mystery Fiction"]
    output_folder = "gutenberg_bookshelf_data"

    for genre in selected_genres:
        shelf_url = bookshelves.get(genre)
        if shelf_url is None:
            # Report missing genres instead of skipping them silently, so a
            # renamed or misspelled bookshelf is noticed.
            logger.warning(f"Genre not found among bookshelves, skipping: {genre}")
            continue

        logger.info(f"Processing genre: {genre}")
        # One folder per genre; spaces replaced to keep paths shell-friendly.
        genre_folder = os.path.join(output_folder, genre.replace(" ", "_"))
        metadata = fetch_books_from_shelf(shelf_url, genre_folder, num_books=5)
        # Preprocess only when at least one book was downloaded.
        if metadata:
            preprocess_books(os.path.join(genre_folder, "raw"), os.path.join(genre_folder, "cleaned"))


2024-12-03 16:21:45,975 - Found 404 bookshelves.
2024-12-03 16:21:45,976 - Processing genre: Science Fiction
2024-12-03 16:21:46,123 - Skipping non-English book: Voyage au Centre de la Terre (French)
Jules Verne
1535 downloads
2024-12-03 16:21:46,124 - Skipping non-English book: L'île mystérieuse (French)
Jules Verne
1436 downloads
2024-12-03 16:21:46,125 - Found 23 books in the selected bookshelf.
2024-12-03 16:21:46,309 - Downloaded: gutenberg_bookshelf_data\Science_Fiction\raw\The Eyes Have It Philip K. Dick.txt
2024-12-03 16:21:46,561 - Downloaded: gutenberg_bookshelf_data\Science_Fiction\raw\The War of the Worlds H. G. Wells.txt
2024-12-03 16:21:46,763 - Downloaded: gutenberg_bookshelf_data\Science_Fiction\raw\The Big Trip Up Yonder Kurt Vonnegut.txt
2024-12-03 16:21:47,096 - Downloaded: gutenberg_bookshelf_data\Science_Fiction\raw\The Country of the Blind, and Other Stories H. G. Wells.txt
2024-12-03 16:21:47,298 - Downloaded: gutenberg_bookshelf_data\Science_Fiction\raw\The Marc