In [4]:
import requests
import re
import os
import spacy
from pathlib import Path
from bs4 import BeautifulSoup

## 📦 What Does `en_core_web_trf` Actually Do?

## 🔍 Overview
`en_core_web_trf` is spaCy's **transformer-based** English NLP pipeline. It is built on a pretrained **Hugging Face Transformers** model (RoBERTa) and is the **most accurate** of spaCy's English pipelines, at the cost of speed and memory.

- ✅ Smarter (better at context)
- 💪 Heavier (uses more RAM/VRAM)
- 🐢 Slower (especially on CPU)
- 💯 More accurate (great for serious analysis)


In [7]:
# Transformer-based English pipeline; loaded once here and reused by later cells as `nlp`.
SPACY_MODEL_NAME = "en_core_web_trf"
nlp = spacy.load(SPACY_MODEL_NAME)

  model.load_state_dict(torch.load(filelike, map_location=device))


In [8]:
# Local cache directory for the downloaded plain-text files.
os.makedirs("books", exist_ok=True)

# Project Gutenberg book URLs (plain text UTF-8 format), grouped by author.
# NOTE(review): the recorded output of the download cell shows ten of these
# URLs failing (The Professor, A Vindication of the Rights of Woman,
# Mary, A Fiction, Mrs Dalloway, Bayou Folk, The Woman's Bible,
# Eighty Years and More, Ain't I a Woman?, Narrative of Sojourner Truth, Hertha).
# NOTE(review): the ebook numbers have not been checked against the titles;
# e.g. #5200 is, as far as I recall, Kafka's "Metamorphosis" rather than a
# Woolf text - confirm every ID resolves to the named work before trusting
# the per-title results.
_BOOKS_BY_AUTHOR = {
    "Jane Austen": {
        "Pride and Prejudice": "https://www.gutenberg.org/files/1342/1342-0.txt",
        "Emma": "https://www.gutenberg.org/files/158/158-0.txt",
    },
    "Charlotte Brontë": {
        "Jane Eyre": "https://www.gutenberg.org/files/1260/1260-0.txt",
        "The Professor": "https://www.gutenberg.org/files/9682/9682-0.txt",
    },
    "Emily Brontë": {
        "Wuthering Heights": "https://www.gutenberg.org/files/768/768-0.txt",
    },
    "George Eliot": {
        "Middlemarch": "https://www.gutenberg.org/files/145/145-0.txt",
        "The Mill on the Floss": "https://www.gutenberg.org/files/6688/6688-0.txt",
    },
    "Mary Wollstonecraft": {
        "A Vindication of the Rights of Woman": "https://www.gutenberg.org/files/3420/3420-0.txt",
        "Mary, A Fiction": "https://www.gutenberg.org/files/3351/3351-0.txt",
    },
    "Margaret Fuller": {
        "Woman in the Nineteenth Century": "https://www.gutenberg.org/files/8904/8904-0.txt",
    },
    "Louisa May Alcott": {
        "Little Women": "https://www.gutenberg.org/files/514/514-0.txt",
        "An Old-Fashioned Girl": "https://www.gutenberg.org/files/2786/2786-0.txt",
    },
    "Charlotte Perkins Gilman": {
        "The Yellow Wallpaper": "https://www.gutenberg.org/files/1952/1952-0.txt",
        "Herland": "https://www.gutenberg.org/files/32/32-0.txt",
    },
    "Virginia Woolf": {
        "A Room of One's Own": "https://www.gutenberg.org/files/5200/5200-0.txt",
        "Mrs Dalloway": "https://www.gutenberg.org/files/612/612-0.txt",
    },
    "Kate Chopin": {
        "The Awakening": "https://www.gutenberg.org/files/160/160-0.txt",
        "Bayou Folk": "https://www.gutenberg.org/files/229/229-0.txt",
    },
    "Elizabeth Cady Stanton": {
        "The Woman's Bible": "https://www.gutenberg.org/files/9880/9880-0.txt",
        "Eighty Years and More": "https://www.gutenberg.org/files/31272/31272-0.txt",
    },
    "Sojourner Truth": {
        "Ain't I a Woman?": "https://www.gutenberg.org/files/17488/17488-0.txt",
        "Narrative of Sojourner Truth": "https://www.gutenberg.org/files/16754/16754-0.txt",
    },
    "Fredrika Bremer": {
        "Hertha": "https://www.gutenberg.org/files/22349/22349-0.txt",
        "The Neighbours": "https://www.gutenberg.org/files/6632/6632-0.txt",
    },
}

# Flat title -> URL mapping consumed by the download loop; author order is preserved.
books = {
    title: url
    for works in _BOOKS_BY_AUTHOR.values()
    for title, url in works.items()
}

In [25]:
# Replacement URLs (cache/epub layout) for the titles whose URL in `books`
# failed to download.
# NOTE(review): per the recorded output of the retry cell, "The Professor" and
# "Hertha" still fail with these URLs. The ebook number used here for
# "The Professor" (9692) also differs from the one in `books` (9682); it may
# be #1028 on Project Gutenberg - confirm before changing.
_CORRECTED_URL_PAIRS = [
    ("The Professor", "https://www.gutenberg.org/cache/epub/9692/pg9692.txt"),
    ("A Vindication of the Rights of Woman", "https://www.gutenberg.org/cache/epub/3420/pg3420.txt"),
    ("Mary, A Fiction", "https://www.gutenberg.org/cache/epub/3351/pg3351.txt"),
    ("Mrs Dalloway", "https://www.gutenberg.org/cache/epub/612/pg612.txt"),
    ("Bayou Folk", "https://www.gutenberg.org/cache/epub/229/pg229.txt"),
    ("The Woman's Bible", "https://www.gutenberg.org/cache/epub/9880/pg9880.txt"),
    ("Eighty Years and More", "https://www.gutenberg.org/cache/epub/31272/pg31272.txt"),
    ("Ain't I a Woman?", "https://www.gutenberg.org/cache/epub/17488/pg17488.txt"),
    ("Narrative of Sojourner Truth", "https://www.gutenberg.org/cache/epub/16754/pg16754.txt"),
    ("Hertha", "https://www.gutenberg.org/cache/epub/22349/pg22349.txt"),
]

# Title -> URL mapping, in the order listed above.
corrected_books = dict(_CORRECTED_URL_PAIRS)

In [9]:
def download_book(title, url, timeout=30):
    """Download a book's text into ``books/`` unless it is already cached there.

    The cache file name is derived from the title alone (spaces become
    underscores), so a title that is already on disk is never fetched again,
    whichever URL is passed.

    Args:
        title: Book title; used for the cache file name and progress messages.
        url: Address of the plain-text file to fetch.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the notebook.

    Returns:
        The cache file path as a string, or ``None`` when the download failed
        (non-200 status or a network-level error).
    """
    file_path = Path(f"books/{title.replace(' ', '_')}.txt")
    if file_path.exists():
        print(f"✅ Already downloaded: {title}")
        return str(file_path)

    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Timeouts, DNS errors etc. are reported like any other failed
        # download instead of aborting the whole processing loop.
        print(f"❌ Failed to download: {title} ({exc})")
        return None

    if response.status_code != 200:
        print(f"❌ Failed to download: {title}")
        return None

    # Do not rely on another cell having created the cache directory.
    file_path.parent.mkdir(parents=True, exist_ok=True)
    with open(file_path, "w", encoding="utf-8") as f:
        f.write(response.text)
    print(f"⬇️ Downloaded: {title}")
    return str(file_path)


In [10]:
# Function to clean Project Gutenberg headers/footers
def clean_gutenberg(text):
    """Return ``text`` without the Project Gutenberg header and footer.

    The header ends at the ``*** START OF ... ***`` marker and the footer
    begins at the ``*** END OF ... ***`` marker. Each side is trimmed
    independently, so a file that carries only one of the two markers still
    loses that part of the boilerplate. With no markers at all the text is
    returned unchanged apart from surrounding whitespace.

    Args:
        text: Full contents of a Project Gutenberg plain-text file.

    Returns:
        The text between the markers, stripped of leading/trailing whitespace.
    """
    # DOTALL lets the marker's title part span a line break; the optional
    # space also accepts the "***START OF" spelling.
    start = re.search(r"\*\*\* ?START OF(.*?)\*\*\*", text, re.DOTALL)
    body_start = start.end() if start else 0

    # Look for the footer only after the header so the slice can never invert.
    end = re.compile(r"\*\*\* ?END OF(.*?)\*\*\*", re.DOTALL).search(text, body_start)
    body_end = end.start() if end else len(text)

    return text[body_start:body_end].strip()

In [11]:
def process_text(text, chunk_size=200):
    """Split ``text`` into chunks and run the spaCy pipeline ``nlp`` over them.

    Chunks are character-based: ``textwrap.wrap`` cuts the text into pieces of
    at most ``chunk_size * 6`` characters (an estimate of ~6 characters per
    word). Pieces shorter than 100 characters are dropped.

    Args:
        text: The text to analyse.
        chunk_size: Approximate number of words per chunk.

    Returns:
        A list with one dict per kept chunk, holding:
        ``text`` (the chunk), ``sentiment`` (``doc._.polarity`` or ``"N/A"``
        when ``doc._`` has no ``polarity`` attribute), ``entities`` (list of
        ``(text, label)`` tuples) and ``verbs`` (lemmas of tokens tagged VERB).
    """
    import textwrap

    # NOTE: textwrap.wrap replaces each whitespace character with a space but
    # does not collapse runs, so heavily indented source text yields chunks
    # that contain long stretches of spaces.
    pieces = (piece.strip() for piece in textwrap.wrap(text, width=chunk_size * 6))
    texts = [piece for piece in pieces if len(piece) >= 100]

    chunks = []
    # nlp.pipe feeds the texts through the pipeline in batches, which is much
    # cheaper than one nlp(...) call per chunk; docs come back in input order.
    for chunk_text, doc in zip(texts, nlp.pipe(texts)):
        chunks.append({
            "text": chunk_text,
            "sentiment": doc._.polarity if hasattr(doc._, 'polarity') else "N/A",
            "entities": [(ent.text, ent.label_) for ent in doc.ents],
            "verbs": [token.lemma_ for token in doc if token.pos_ == "VERB"]
        })

    return chunks


In [12]:
# Download (or reuse the cached copy of) every title in `books`, strip the
# Gutenberg boilerplate, analyse the text, and gather the per-chunk records
# of all books in one flat list.
all_chunks = []
for title, url in books.items():
    path = download_book(title, url)
    if not path:
        # download_book() has already reported the failure; move on.
        continue

    cleaned = clean_gutenberg(Path(path).read_text(encoding="utf-8"))
    # Tag each record with the book it came from.
    all_chunks += [{**record, "title": title} for record in process_text(cleaned)]

✅ Already downloaded: Pride and Prejudice
⬇️ Downloaded: Emma
✅ Already downloaded: Jane Eyre
❌ Failed to download: The Professor
⬇️ Downloaded: Wuthering Heights
⬇️ Downloaded: Middlemarch
⬇️ Downloaded: The Mill on the Floss
❌ Failed to download: A Vindication of the Rights of Woman
❌ Failed to download: Mary, A Fiction
⬇️ Downloaded: Woman in the Nineteenth Century
⬇️ Downloaded: Little Women
⬇️ Downloaded: An Old-Fashioned Girl
⬇️ Downloaded: The Yellow Wallpaper
⬇️ Downloaded: Herland
⬇️ Downloaded: A Room of One's Own
❌ Failed to download: Mrs Dalloway
✅ Already downloaded: The Awakening
❌ Failed to download: Bayou Folk
❌ Failed to download: The Woman's Bible
❌ Failed to download: Eighty Years and More
❌ Failed to download: Ain't I a Woman?
❌ Failed to download: Narrative of Sojourner Truth
❌ Failed to download: Hertha
⬇️ Downloaded: The Neighbours


In [13]:
# Quick sanity check: overall record count, then the keys and content of the first record.
print(f"Total chunks: {len(all_chunks)}")
first_chunk = all_chunks[0]
print(first_chunk.keys())
print(first_chunk)

Total chunks: 8063
dict_keys(['text', 'sentiment', 'entities', 'verbs', 'title'])
{'text': '[Illustration:                               GEORGE ALLEN                                PUBLISHER                          156 CHARING CROSS ROAD                                 LONDON                               RUSKIN HOUSE                                    ]                              [Illustration:                 _Reading Jane’s Letters._      _Chap 34._                                    ]                                     PRIDE.                                   and                                PREJUDICE                                    by                              Jane Austen,                             with a Preface by                            George Saintsbury                                   and                            Illustrations by                              Hugh Thomson                           [Illustration: 1894]                         Ruskin       15

In [27]:
# Reuse your functions: download_book(), clean_gutenberg(), process_text()

# Titles that already have records in all_chunks. download_book() caches by
# title, so once a corrected file is on disk the main `books` loop processes
# it as well; without this guard, re-running the notebook (or this cell) would
# append the same book's chunks a second time.
already_processed = {chunk["title"] for chunk in all_chunks}

missing_chunks = []

for title, url in corrected_books.items():
    if title in already_processed:
        print(f"⏭️ Already processed: {title}")
        continue

    path = download_book(title, url)
    if not path:
        print(f"❌ Still failed: {title}")
        continue

    with open(path, "r", encoding="utf-8") as f:
        raw_text = f.read()
    # The file is closed before the slow NLP step starts.
    cleaned = clean_gutenberg(raw_text)
    chunks = process_text(cleaned)
    for chunk in chunks:
        chunk["title"] = title
    missing_chunks.extend(chunks)
    print(f"✅ Processed: {title}")

# Update your existing all_chunks
all_chunks.extend(missing_chunks)


❌ Failed to download: The Professor
❌ Still failed: The Professor
⬇️ Downloaded: A Vindication of the Rights of Woman
✅ Processed: A Vindication of the Rights of Woman
⬇️ Downloaded: Mary, A Fiction
✅ Processed: Mary, A Fiction
⬇️ Downloaded: Mrs Dalloway
✅ Processed: Mrs Dalloway
⬇️ Downloaded: Bayou Folk
✅ Processed: Bayou Folk
⬇️ Downloaded: The Woman's Bible
✅ Processed: The Woman's Bible
⬇️ Downloaded: Eighty Years and More
✅ Processed: Eighty Years and More
⬇️ Downloaded: Ain't I a Woman?
✅ Processed: Ain't I a Woman?
⬇️ Downloaded: Narrative of Sojourner Truth
✅ Processed: Narrative of Sojourner Truth
❌ Failed to download: Hertha
❌ Still failed: Hertha


In [33]:
import pandas as pd

# Build the table and flatten the list-valued columns so they survive the
# round trip through CSV: entities become the repr of the tuple list, verbs a
# comma-separated string.
df_chunks = pd.DataFrame(all_chunks).assign(
    entities=lambda frame: frame["entities"].apply(str),
    verbs=lambda frame: frame["verbs"].apply(", ".join),
)

# Persist without the positional index column.
df_chunks.to_csv("processed_book_chunks.csv", index=False)

print("✅ CSV file saved as 'processed_book_chunks.csv'")


✅ CSV file saved as 'processed_book_chunks.csv'


In [38]:
# Report how many chunk records all_chunks currently holds.
total_chunks = len(all_chunks)
print(total_chunks)

10037
