# **Downloaded all Jane Austen's books, and all similar authors' books**

In [None]:
import os
import requests
import time
import hashlib
import re
import string
from bs4 import BeautifulSoup

# Root of the Project Gutenberg site; used to build ebook page URLs and to
# turn site-relative links into absolute ones.
BASE_URL = "https://www.gutenberg.org"

# Pages to scrape ebook IDs from: one author page, one individual ebook page,
# and several subject pages.
PAGES_TO_SCRAPE = [
    "https://www.gutenberg.org/ebooks/author/68",  # Jane Austen
    "https://www.gutenberg.org/ebooks/22964", # An individual ebook page, scraped for the other ebook links it contains. Other authors are included because Jane Austen's own works alone are too short for the corpus.
    "https://www.gutenberg.org/ebooks/subject/104",
    "https://www.gutenberg.org/ebooks/subject/1702",
    "https://www.gutenberg.org/ebooks/subject/1699",
    "https://www.gutenberg.org/ebooks/subject/2487",
    "https://www.gutenberg.org/ebooks/subject/2489",
    "https://www.gutenberg.org/ebooks/subject/2514",
    "https://www.gutenberg.org/ebooks/subject/2545",
    "https://www.gutenberg.org/ebooks/subject/2906",
    "https://www.gutenberg.org/ebooks/subject/18493",
    "https://www.gutenberg.org/ebooks/subject/18494",
]

def get_ebook_ids(page_url):
    """
    Scrape a Gutenberg page (author or subject) and return its unique ebook IDs.

    Only links whose href starts with "/ebooks/" and whose final path segment
    consists solely of digits are collected.

    Args:
        page_url: URL of the page to scrape.

    Returns:
        A set of ebook IDs as strings; an empty set if the page could not be
        fetched.
    """
    try:
        # A timeout keeps one stalled connection from hanging the whole crawl.
        response = requests.get(page_url, timeout=30)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching {page_url}: {e}")
        return set()

    # NOTE(review): only the single page at page_url is read; if a listing is
    # split across several pages the later pages are not followed - confirm
    # whether that is intended.
    soup = BeautifulSoup(response.text, "html.parser")
    ebook_ids = set()
    for link in soup.find_all("a", href=True):
        href = link["href"]
        # The format is expected to be /ebooks/<id>
        if href.startswith("/ebooks/"):
            last_segment = href.split("/")[-1]
            if last_segment.isdigit():
                ebook_ids.add(last_segment)
    print(f"Found {len(ebook_ids)} ebooks from {page_url}")
    return ebook_ids

def get_plain_text_link(ebook_id):
    """
    Find the plain-text download link on an ebook's landing page.

    The link labeled "Plain Text UTF-8" is preferred; "Plain Text" is used as
    a fallback.

    Args:
        ebook_id: Gutenberg ebook ID (string or int) used to build the page URL.

    Returns:
        The absolute URL of the plain-text file, or None if the page could not
        be fetched or no matching link was found.
    """
    ebook_page_url = f"{BASE_URL}/ebooks/{ebook_id}"
    try:
        # A timeout keeps one stalled connection from hanging the whole crawl.
        response = requests.get(ebook_page_url, timeout=30)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching ebook page {ebook_page_url}: {e}")
        return None

    soup = BeautifulSoup(response.text, "html.parser")
    for label in ("Plain Text UTF-8", "Plain Text"):
        link = soup.find("a", string=label)
        if link and link.has_attr("href"):
            # urljoin resolves relative hrefs against the page and leaves
            # absolute hrefs untouched (plain concatenation would corrupt them).
            return requests.compat.urljoin(ebook_page_url, link["href"])

    return None

def get_html_link(ebook_id):
    """
    Find the HTML version link for an ebook using the "Read online (web)" label.

    The label is matched case-insensitively and tolerates optional whitespace
    before "(web)".

    Args:
        ebook_id: Gutenberg ebook ID (string or int) used to build the page URL.

    Returns:
        The absolute URL of the HTML version, or None if the page could not be
        fetched or no matching link was found.
    """
    ebook_page_url = f"{BASE_URL}/ebooks/{ebook_id}"
    try:
        # A timeout keeps one stalled connection from hanging the whole crawl.
        response = requests.get(ebook_page_url, timeout=30)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching ebook page {ebook_page_url}: {e}")
        return None

    soup = BeautifulSoup(response.text, "html.parser")
    link = soup.find("a", string=re.compile(r"read online\s*\(web\)", re.IGNORECASE))
    if link and link.has_attr("href"):
        # urljoin handles absolute, root-relative and page-relative hrefs alike;
        # simple concatenation would glue a slash-less href onto the host name.
        return requests.compat.urljoin(ebook_page_url, link["href"])
    return None

def download_text(url):
    """
    Download the body of a URL as text.

    Args:
        url: Address of the resource to fetch.

    Returns:
        The decoded response body, or None if the request failed.
    """
    try:
        # A timeout keeps one stalled download from hanging the whole crawl.
        response = requests.get(url, timeout=60)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error downloading {url}: {e}")
        return None

    # When the server declares no charset, requests assumes ISO-8859-1 for
    # text/* responses, which garbles UTF-8 books. Detect the encoding from the
    # content instead in that case.
    content_type = response.headers.get("Content-Type", "")
    if "charset" not in content_type.lower():
        response.encoding = response.apparent_encoding
    return response.text

def extract_text_from_html(html_content):
    """
    Return the visible text of an HTML document.

    <script> and <style> elements are removed before extraction, and the
    remaining text fragments are joined with newlines.
    """
    document = BeautifulSoup(html_content, "html.parser")
    for hidden_tag in document.find_all(["script", "style"]):
        hidden_tag.decompose()
    plain_text = document.get_text(separator="\n")
    return plain_text

def normalize_text(text):
    """
    Clean and normalize the ebook text.

    - Remove header: everything up to and including the start-marker line
      (both the "START OF THE ..." and "START OF THIS ..." wordings).
    - Remove footer: everything from the end marker onward (same two wordings).
    - Remove mentions of "Project Gutenberg" / "Gutenberg" (case-sensitive).
    - Convert curly double quotes to straight quotes and drop carriage returns.
    - Prefix each ASCII capital that starts a run of letters with a caret,
      then lowercase all ASCII capitals.
    - Collapse whitespace and surround every punctuation character with spaces.

    Args:
        text: Raw ebook text.

    Returns:
        The normalized text as a single whitespace-collapsed, stripped string.
    """
    # Header: consume the remainder of the marker line as well, otherwise the
    # book title and the closing "***" of the marker leak into the corpus.
    start_match = re.search(
        r'\*\*\*\s*START OF (?:THE|THIS) PROJECT GUTENBERG EBOOK[^\n]*',
        text,
        re.IGNORECASE,
    )
    if start_match:
        text = text[start_match.end():]

    # Footer: keep only what precedes the end marker.
    end_match = re.search(
        r'\*\*\*\s*END OF (?:THE|THIS) PROJECT GUTENBERG EBOOK',
        text,
        re.IGNORECASE,
    )
    if end_match:
        text = text[:end_match.start()]

    # Further cleaning: remove "Project Gutenberg" mentions
    text = text.replace("Project Gutenberg", "").replace("Gutenberg", "")
    text = text.replace("\r", "")
    text = text.replace("“", "\"").replace("”", "\"")

    # Mark capital letters at word starts with a caret, then lowercase them.
    # Only ASCII letters are affected; the caret is later split off as its own
    # token by the punctuation-spacing step below.
    text = re.sub(r"(?<![a-zA-Z])([A-Z])", lambda m: f"^{m.group(0).lower()}", text)
    text = re.sub(r"([A-Z])", lambda m: m.group(0).lower(), text)

    # Normalize whitespace and separate punctuation with spaces.
    text = re.sub(r"\s+", " ", text)
    text = re.sub("([{}])".format(re.escape(string.punctuation)), r" \1 ", text)
    text = re.sub(r"\s+", " ", text)

    return text.strip()

def get_ebook_content(ebook_id):
    """
    Fetch the text of an ebook, preferring the plain-text edition.

    The plain-text link is tried first. If it is missing or its download
    yields nothing, the "Read online (web)" HTML edition is downloaded and
    reduced to text. Returns None when neither source produces content.
    """
    plain_url = get_plain_text_link(ebook_id)
    if plain_url:
        print(f"Using plain text for ebook {ebook_id}: {plain_url}")
        plain_body = download_text(plain_url)
        if plain_body:
            return plain_body

    # Plain text was unavailable or empty: try the HTML edition instead.
    markup = None
    web_url = get_html_link(ebook_id)
    if web_url:
        print(f"Falling back to HTML for ebook {ebook_id}: {web_url}")
        markup = download_text(web_url)
    if markup:
        return extract_text_from_html(markup)

    print(f"No downloadable content found for ebook {ebook_id}.")
    return None

def compile_corpus(ebook_ids):
    """
    Build one corpus string from the given ebooks.

    Each ebook is downloaded and normalized; books whose normalized text has
    an MD5 digest already seen are skipped as duplicates. Every kept book is
    preceded by an "=== Ebook ID <id> ===" separator. The loop pauses two
    seconds after each ebook.
    """
    sections = []
    fingerprints = set()
    for ebook_id in ebook_ids:
        print(f"Processing ebook ID: {ebook_id}")
        raw_content = get_ebook_content(ebook_id)
        if not raw_content:
            print(f"Skipping ebook {ebook_id} due to lack of content.")
        else:
            normalized = normalize_text(raw_content)
            digest = hashlib.md5(normalized.encode("utf-8")).hexdigest()
            if digest not in fingerprints:
                fingerprints.add(digest)
                sections.append(f"\n\n=== Ebook ID {ebook_id} ===\n\n")
                sections.append(normalized)
            else:
                print(f"Duplicate content detected for ebook {ebook_id}, skipping.")
        # Pause between ebooks so requests are spaced out.
        time.sleep(2)
    return "".join(sections)

if __name__ == "__main__":
    # Gather the ebook IDs from every configured page, de-duplicated via a set.
    all_ebook_ids = set()
    for page in PAGES_TO_SCRAPE:
        all_ebook_ids.update(get_ebook_ids(page))
    print(f"Total unique ebook IDs gathered: {len(all_ebook_ids)}")

    # Download, normalize and concatenate the books, then persist the result.
    output_file = "combined_corpus.txt"
    corpus = compile_corpus(all_ebook_ids)
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(corpus)
    print(f"Compilation complete. Corpus saved to {output_file}.")


Found 25 ebooks from https://www.gutenberg.org/ebooks/author/68
Found 15 ebooks from https://www.gutenberg.org/ebooks/22964
Found 25 ebooks from https://www.gutenberg.org/ebooks/subject/104
Found 25 ebooks from https://www.gutenberg.org/ebooks/subject/1702
Found 25 ebooks from https://www.gutenberg.org/ebooks/subject/1699
Found 25 ebooks from https://www.gutenberg.org/ebooks/subject/2487
Found 25 ebooks from https://www.gutenberg.org/ebooks/subject/2489
Found 25 ebooks from https://www.gutenberg.org/ebooks/subject/2514
Found 25 ebooks from https://www.gutenberg.org/ebooks/subject/2545
Found 25 ebooks from https://www.gutenberg.org/ebooks/subject/2906
Found 9 ebooks from https://www.gutenberg.org/ebooks/subject/18493
Found 12 ebooks from https://www.gutenberg.org/ebooks/subject/18494
Total unique ebook IDs gathered: 164
Processing ebook ID: 3268
Using plain text for ebook 3268: https://www.gutenberg.org/ebooks/3268.txt.utf-8
Processing ebook ID: 4274
Using plain text for ebook 4274: htt

In [None]:
def count_tokens(text):
    """
    Count whitespace-separated tokens in ``text``.

    Returns a ``(total, distinct)`` tuple. Splitting is purely on whitespace;
    a dedicated tokenizer (e.g. nltk or spaCy) would be needed for anything
    more refined.
    """
    words = text.split()
    return len(words), len(set(words))

if __name__ == "__main__":
    # Read the corpus file; change the path here to analyse a different file.
    with open("combined_corpus.txt", encoding="utf-8") as f:
        text = f.read()

    total_tokens, unique_tokens = count_tokens(text)
    for report_line in (
        f"Total tokens: {total_tokens}",
        f"Unique tokens: {unique_tokens}",
    ):
        print(report_line)


Total tokens: 25796632
Unique tokens: 192233


# **Downloaded Project Gutenberg Books (a Partial Subset, Not the Full Collection)**

In [None]:
#!apt-get update
#!apt-get install libdb-dev
#!pip install gutenberg --use-deprecated=legacy-resolver


0% [Working]            Hit:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:3 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:4 http://security.ubuntu.com/ubuntu jammy-security InRelease
Hit:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:6 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:7 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Reading package lists... Done
W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Reading package lists... Done
Building depe

In [None]:
!wget -w 2 -m -H "http://www.gutenberg.org/robot/harvest?filetypes[]=txt&langs[]=en"


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Last-modified header missing -- time-stamps turned off.
2025-03-08 07:10:10 (192 MB/s) - ‘www.gutenberg.org/robot/harvest?offset=1169900&filetypes[]=txt&langs[]=en’ saved [13254]

--2025-03-08 07:10:12--  https://www.gutenberg.org/robot/harvest?offset=1172704&filetypes[]=txt&langs[]=en
Reusing existing connection to www.gutenberg.org:443.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/html]
Saving to: ‘www.gutenberg.org/robot/harvest?offset=1172704&filetypes[]=txt&langs[]=en’

www.gutenberg.org/r     [ <=>                ]  13.13K  --.-KB/s    in 0s      

Last-modified header missing -- time-stamps turned off.
2025-03-08 07:10:13 (143 MB/s) - ‘www.gutenberg.org/robot/harvest?offset=1172704&filetypes[]=txt&langs[]=en’ saved [13442]

--2025-03-08 07:10:15--  https://www.gutenberg.org/robot/harvest?offset=1225129&filetypes[]=txt&langs[]=en
Reusing existing connection to www.gutenberg.org:443.
HTTP 

In [None]:
!rsync -av --del --exclude='*/old/**' --include='*/' --include='*.txt' --exclude='*' aleph.gutenberg.org::gutenberg /content/gutenberg/


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
1/0/5/5/10554/old/
1/0/5/5/10555/
1/0/5/5/10555/10555-0.txt
1/0/5/5/10555/old/
1/0/5/5/10556/
1/0/5/5/10556/10556-0.txt
1/0/5/5/10556/10556-h/
1/0/5/5/10556/10556-h/images/
1/0/5/5/10556/old/
1/0/5/5/10557/
1/0/5/5/10557/10557-0.txt
1/0/5/5/10557/10557.txt
1/0/5/5/10557/10557-h/
1/0/5/5/10557/10557-h/jcparty/
1/0/5/5/10557/old/
1/0/5/5/10558/
1/0/5/5/10558/10558-m/
1/0/5/5/10558/10558-m/10558-m-readme.txt
1/0/5/5/10559/
1/0/5/5/10559/10559-readme.txt
1/0/5/5/10559/old/
1/0/5/6/
1/0/5/6/10560/
1/0/5/6/10560/10560-0.txt
1/0/5/6/10560/old/
1/0/5/6/10561/
1/0/5/6/10561/10561-0.txt
1/0/5/6/10561/10561.txt
1/0/5/6/10561/10561-h/
1/0/5/6/10561/old/
1/0/5/6/10562/
1/0/5/6/10562/10562-0.txt
1/0/5/6/10562/10562.txt
1/0/5/6/10562/10562-h/
1/0/5/6/10562/old/
1/0/5/6/10563/
1/0/5/6/10563/10563-0.txt
1/0/5/6/10563/10563.txt
1/0/5/6/10563/10563-h/
1/0/5/6/10563/old/
1/0/5/6/10564/
1/0/5/6/10564/10564-0.txt
1/0/5/6/10564/10564.txt
1/0/5/

In [None]:
!zip -r /content/www.gutenberg.org.zip /content/www.gutenberg.org
!zip -r /content/gutenberg.zip /content/gutenberg

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  adding: content/gutenberg/1/0/0/5/10056/old/ (stored 0%)
  adding: content/gutenberg/1/0/0/5/10058/ (stored 0%)
  adding: content/gutenberg/1/0/0/5/10058/old/ (stored 0%)
  adding: content/gutenberg/1/0/0/5/10058/10058-0.txt (deflated 63%)
  adding: content/gutenberg/1/0/0/5/10059/ (stored 0%)
  adding: content/gutenberg/1/0/0/5/10059/old/ (stored 0%)
  adding: content/gutenberg/1/0/0/5/10059/10059-0.txt (deflated 61%)
  adding: content/gutenberg/1/0/0/5/10052/ (stored 0%)
  adding: content/gutenberg/1/0/0/5/10052/10052-0.txt (deflated 62%)
  adding: content/gutenberg/1/0/0/5/10052/10052-h/ (stored 0%)
  adding: content/gutenberg/1/0/0/5/10052/10052-h/images/ (stored 0%)
  adding: content/gutenberg/1/0/0/5/10052/old/ (stored 0%)
  adding: content/gutenberg/1/0/0/5/10052/old/10052-h/ (stored 0%)
  adding: content/gutenberg/1/0/0/5/10052/old/10052-h/images/ (stored 0%)
  adding: content/gutenberg/1/0/0/5/10053/ (stored 0%

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os
import re
import requests
import zipfile
import io
import pandas as pd
from bs4 import BeautifulSoup

# ----- Configuration -----
# Metadata CSV describing the ebooks (adjust the path as needed).
metadata_csv_path = "/content/gutenberg_metadata.csv"

# Folder holding the mirrored harvest HTML pages.
harvest_dir = "/content/www.gutenberg.org/robot"

# Destination folder for processed books; created on first use.
processed_dir = "/content/processed_books"
if not os.path.exists(processed_dir):
    os.makedirs(processed_dir)

# Base URL used to resolve relative links.
base_url = "https://www.gutenberg.org/"

# ----- Load Metadata CSV -----
# Expected columns include "Text#" (ebook ID), "Authors", "Subjects" and "Title".
# Missing cells are replaced by empty strings before the lookup table is built.
metadata_df = pd.read_csv(metadata_csv_path).fillna("")

# Lookup table keyed by ebook ID (as a string); later rows overwrite earlier
# ones that share an ID.
metadata_dict = {
    str(record["Text#"]).strip(): {
        field: str(record.get(field, "")).strip()
        for field in ("Authors", "Subjects", "Title")
    }
    for _, record in metadata_df.iterrows()
}

# ----- Functions to Process ZIP Files -----
def extract_book_id(zip_url):
    """
    Try to extract the ebook ID from the zip_url.

    The ID is the name of the final directory when the file name in that
    directory starts with the same number, e.g. '.../12370/12370-8.zip'
    yields '12370'. If the URL does not have that shape, the first all-digit
    block that follows a slash is returned instead.

    Args:
        zip_url: URL of a ZIP archive.

    Returns:
        The ebook ID as a string, or None if no digits follow any slash.
    """
    # Anchor the repeated number to the final path segment and require a
    # non-digit after it. An unanchored backreference also matches the
    # single-digit shard directories, e.g. '/4/4/1/2/44123/44123.zip' would
    # yield '4' and '/1/11/11.zip' would yield '1'.
    m = re.search(r'/(\d+)/\1(?!\d)[^/]*$', zip_url)
    if m:
        return m.group(1)
    # If not, try a simpler search for a block of digits
    m = re.search(r'/(\d+)', zip_url)
    if m:
        return m.group(1)
    return None

def download_and_process_zip(zip_url):
    """
    Download one ZIP archive and save a tagged copy of every .txt file in it.

    Each text file is written to ``<processed_dir>/<book_id>/<file name>``.
    When the metadata table has an entry for the book, the text is preceded by
    [AUTHOR], [SUBJECTS], [TITLE] and [BOOK] tags; the text is always followed
    by the "[END BOOK]" / "[CONTEXT: ...]" trailer.

    Args:
        zip_url: Absolute URL of the ZIP archive.

    Returns:
        None. Failures are reported with print() and the archive (or the
        affected member) is skipped.
    """
    # Resolve the ebook ID first so an unusable URL never costs a download.
    book_id = extract_book_id(zip_url)
    if not book_id:
        print("Could not extract ebook ID from URL:", zip_url)
        return

    print(f"Downloading ZIP: {zip_url}")
    try:
        # A timeout keeps one stalled download from hanging the whole run.
        response = requests.get(zip_url, timeout=60)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error downloading {zip_url}: {e}")
        return

    # Create a folder for this ebook if it doesn't exist
    book_folder = os.path.join(processed_dir, book_id)
    os.makedirs(book_folder, exist_ok=True)

    # Get metadata for this book if available
    meta = metadata_dict.get(book_id, {})
    metadata_header = ""
    if meta:
        # Blank CSV cells are stored as empty strings, so `or` is needed for
        # the placeholder values to take effect.
        metadata_header = (
            f"[AUTHOR: {meta.get('Authors') or 'Unknown'}]\n"
            f"[SUBJECTS: {meta.get('Subjects') or 'None'}]\n"
            f"[TITLE: {meta.get('Title') or 'Untitled'}]\n"
            f"[BOOK]\n"
        )

    try:
        with zipfile.ZipFile(io.BytesIO(response.content)) as z:
            # Process all files ending with .txt (case-insensitive)
            for zinfo in z.infolist():
                if not zinfo.filename.lower().endswith(".txt"):
                    continue
                print(f"Extracting {zinfo.filename} for book {book_id}")
                try:
                    raw_bytes = z.read(zinfo)
                except Exception as e:
                    # Best effort: a damaged member must not abort the archive.
                    print(f"Error reading {zinfo.filename} in {zip_url}: {e}")
                    continue

                # Not every archive holds UTF-8 text; fall back to Latin-1 so
                # 8-bit files keep their accented characters instead of having
                # them replaced by U+FFFD.
                try:
                    text_content = raw_bytes.decode("utf-8")
                except UnicodeDecodeError:
                    text_content = raw_bytes.decode("latin-1")

                # Prepend metadata header if available, and append an end marker.
                full_text = metadata_header + text_content + "\n[END BOOK]\n[CONTEXT: Repeat Author and Subjects]\n"
                # Save the file with the same name inside the book folder.
                output_file_path = os.path.join(book_folder, os.path.basename(zinfo.filename))
                with open(output_file_path, "w", encoding="utf-8") as out_f:
                    out_f.write(full_text)
    except Exception as e:
        # Best effort: a corrupt archive is reported and skipped.
        print(f"Error extracting ZIP {zip_url}: {e}")

# ----- Process Harvest HTML Files -----
# Visit every file under the harvest directory whose name contains "harvest",
# collect the links ending in ".zip", and hand each one to
# download_and_process_zip.
for root, dirs, files in os.walk(harvest_dir):
    for file in files:
        if "harvest" not in file.lower():
            continue
        file_path = os.path.join(root, file)
        print(f"Processing HTML file: {file_path}")
        with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
            html_content = f.read()
        soup = BeautifulSoup(html_content, "html.parser")

        zip_links = []
        for anchor in soup.find_all("a"):
            target = anchor.get("href")
            if target and target.lower().endswith(".zip"):
                zip_links.append(target)
        print(f"Found {len(zip_links)} zip links in {file_path}")

        for link in zip_links:
            # Relative links are resolved against the site base URL.
            if link.startswith("http"):
                zip_url = link
            else:
                zip_url = requests.compat.urljoin(base_url, link)
            download_and_process_zip(zip_url)

print("Processing complete. Check the 'processed_books' directory for processed texts.")


Processing HTML file: /content/www.gutenberg.org/robot/harvest?offset=3108822&filetypes[]=txt&langs[]=en
Found 100 zip links in /content/www.gutenberg.org/robot/harvest?offset=3108822&filetypes[]=txt&langs[]=en
Downloading ZIP: http://aleph.gutenberg.org/4/1/8/2/41826/41826-8.zip
Error downloading http://aleph.gutenberg.org/4/1/8/2/41826/41826-8.zip: 404 Client Error: Not Found for url: http://aleph.gutenberg.org/4/1/8/2/41826/41826-8.zip
Downloading ZIP: http://aleph.gutenberg.org/4/1/8/2/41826/41826.zip
Error downloading http://aleph.gutenberg.org/4/1/8/2/41826/41826.zip: 404 Client Error: Not Found for url: http://aleph.gutenberg.org/4/1/8/2/41826/41826.zip
Downloading ZIP: http://aleph.gutenberg.org/4/1/8/2/41827/41827-0.zip
Error downloading http://aleph.gutenberg.org/4/1/8/2/41827/41827-0.zip: 404 Client Error: Not Found for url: http://aleph.gutenberg.org/4/1/8/2/41827/41827-0.zip
Downloading ZIP: http://aleph.gutenberg.org/4/1/8/2/41827/41827-8.zip
Error downloading http://alep

KeyboardInterrupt: 

In [None]:
import os
import re
import requests
import zipfile
import io
import pandas as pd
from bs4 import BeautifulSoup

# ----- Configuration -----
# CSV with per-ebook metadata; read below into metadata_df.
metadata_csv_path = "/content/gutenberg_metadata.csv"  # adjust as needed
# Folder that is walked for harvest HTML pages.
harvest_dir = "/content/www.gutenberg.org/robot"
# Output folder for processed books; created here if it does not exist yet.
processed_dir = "/content/processed_books"
if not os.path.exists(processed_dir):
    os.makedirs(processed_dir)
# Base against which relative ZIP links are resolved.
base_url = "https://www.gutenberg.org/"

# ----- Load Metadata CSV -----
# Missing cells become empty strings so later str()/strip() calls are uniform.
metadata_df = pd.read_csv(metadata_csv_path).fillna("")
# Filled further down: ebook ID (string) -> dict of metadata fields.
metadata_dict = {}

# Helper function to extract a 4-digit year from a date string (if needed)
def extract_year(issued):
    """
    Pull a four-digit year out of a date value.

    The value is converted to a string first. If the text after the last "/"
    is exactly four digits it is returned; otherwise the first standalone
    four-digit number anywhere in the value is used. Returns "Unknown" when
    no year can be found.
    """
    raw = str(issued).strip()
    if not raw:
        return "Unknown"

    # Dates written like M/D/YYYY carry the year in the last slash-separated part.
    tail = raw.rsplit("/", 1)[-1]
    if len(tail) == 4 and tail.isdigit():
        return tail

    found = re.search(r'\b(\d{4})\b', raw)
    return found.group(1) if found else "Unknown"

# Build the metadata lookup from the CSV: one entry per ebook ID (as a string).
# Text fields are stringified and stripped; the year is derived from "Issued".
for _, record in metadata_df.iterrows():
    book_key = str(record["Text#"]).strip()  # adjust column name if needed
    entry = {
        field: str(record.get(field, "")).strip()
        for field in ("Authors", "Subjects", "Title")
    }
    entry["YEAR"] = extract_year(record.get("Issued", ""))
    for field in ("LoCC", "Bookshelves"):
        entry[field] = str(record.get(field, "")).strip()
    metadata_dict[book_key] = entry

# ----- Function to Extract Book ID from a ZIP URL -----
def extract_book_id(zip_url):
    """
    Return the ebook ID embedded in a ZIP URL, or None if there is none.

    The URL must end in '/<digits>/<digits>.zip', where the file name may
    carry a numeric suffix such as '-8' before '.zip'. The digits of the
    directory are returned, e.g. '.../10084/10084-8.zip' gives '10084'.
    """
    found = re.search(r'/(\d+)/\d+(?:-\d+)?\.zip$', zip_url)
    return found.group(1) if found else None

# ----- Function to Download and Process ZIP Files -----
def download_and_process_zip(zip_url):
    """
    Download one ZIP archive and save a trimmed, tagged copy of every .txt file.

    For each text file the Project Gutenberg start/end markers are located,
    the text before the start marker is treated as the header, and the text
    between the markers becomes the book body. Author, release date and ebook
    number are parsed from the header; title, subjects, LoCC and bookshelves
    come from the metadata table when it has an entry for the book. The result
    is written to ``<processed_dir>/<book_id>/<file name>``.

    Args:
        zip_url: Absolute URL of the ZIP archive.

    Returns:
        None. Failures are reported with print() and the archive (or the
        affected member) is skipped.
    """
    # Resolve the ebook ID first so an unusable URL never costs a download.
    book_id = extract_book_id(zip_url)
    if not book_id:
        print("Could not extract ebook ID from URL:", zip_url)
        return

    print(f"Downloading ZIP: {zip_url}")
    try:
        # A timeout keeps one stalled download from hanging the whole run.
        response = requests.get(zip_url, timeout=60)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error downloading {zip_url}: {e}")
        return

    book_folder = os.path.join(processed_dir, book_id)
    os.makedirs(book_folder, exist_ok=True)

    # Get CSV metadata (if available) for additional tags.
    meta = metadata_dict.get(book_id, {})

    # The marker wording differs between files ("THE" vs "THIS"); accept both,
    # otherwise files using the other wording are never trimmed.
    start_marker = re.compile(
        r'\*\*\*\s*START OF (?:THE|THIS) PROJECT GUTENBERG EBOOK.*?\*\*\*',
        re.IGNORECASE | re.DOTALL,
    )
    end_marker = re.compile(
        r'\*\*\*\s*END OF (?:THE|THIS) PROJECT GUTENBERG EBOOK.*?\*\*\*',
        re.IGNORECASE | re.DOTALL,
    )

    try:
        with zipfile.ZipFile(io.BytesIO(response.content)) as z:
            for zinfo in z.infolist():
                if not zinfo.filename.lower().endswith(".txt"):
                    continue
                print(f"Extracting {zinfo.filename} for book {book_id}")
                try:
                    raw_bytes = z.read(zinfo)
                except Exception as e:
                    # Best effort: a damaged member must not abort the archive.
                    print(f"Error reading {zinfo.filename} in {zip_url}: {e}")
                    continue

                # Not every archive holds UTF-8 text; fall back to Latin-1 so
                # 8-bit files keep their accented characters instead of having
                # them replaced by U+FFFD.
                try:
                    text_content = raw_bytes.decode("utf-8")
                except UnicodeDecodeError:
                    text_content = raw_bytes.decode("latin-1")

                # --- Trim the Project Gutenberg header/footer ---
                # Each marker is handled on its own, so a file with only one
                # of them is still trimmed on that side.
                start_match = start_marker.search(text_content)
                if start_match:
                    header_text = text_content[:start_match.start()]
                    body_start = start_match.end()
                else:
                    # No start marker: search the whole text for header fields.
                    header_text = text_content
                    body_start = 0
                end_match = end_marker.search(text_content, body_start)
                body_end = end_match.start() if end_match else len(text_content)
                book_body = text_content[body_start:body_end]

                # --- Extract fields from header_text using case-insensitive matching ---
                author_field = re.search(r'(?i)^\s*author\s*[:\-]?\s*(.+)$', header_text, re.MULTILINE)
                release_date_field = re.search(r'(?i)^\s*release date\s*[:\-]?\s*(.+)$', header_text, re.MULTILINE)
                ebook_id_field = re.search(r'\[\s*EBook\s*#\s*(\d+)\s*\]', header_text, re.IGNORECASE)

                author_val = author_field.group(1).strip() if author_field else "Unknown"
                release_date_val = release_date_field.group(1).strip() if release_date_field else "Unknown"
                # Fall back to the ID taken from the URL when the header has none.
                ebook_id_val = ebook_id_field.group(1).strip() if ebook_id_field else book_id

                # Build a new metadata header from the parsed fields.
                new_metadata_header = (
                    f"[AUTHOR: {author_val}]\n"
                    f"[RELEASE DATE: {release_date_val}]\n"
                    f"[EBOOK ID: {ebook_id_val}]\n"
                )

                # Append additional CSV metadata (excluding already parsed fields).
                other_metadata = ""
                if meta:
                    # Blank CSV cells are stored as empty strings, so `or` is
                    # needed for the placeholder values to take effect.
                    other_metadata = (
                        f"[TITLE: {meta.get('Title') or 'Untitled'}]\n"
                        f"[SUBJECTS: {meta.get('Subjects') or 'None'}]\n"
                        f"[LOCC: {meta.get('LoCC') or 'Unknown'}]\n"
                        f"[BOOKSHELVES: {meta.get('Bookshelves') or 'Unknown'}]\n"
                    )

                final_metadata_header = new_metadata_header + other_metadata + "[BOOK]\n"

                # Build the final text: new metadata header + trimmed book body.
                final_text = final_metadata_header + book_body.strip() + "\n[END BOOK]\n[CONTEXT: Repeat Author and Subjects]\n"

                output_file_path = os.path.join(book_folder, os.path.basename(zinfo.filename))
                with open(output_file_path, "w", encoding="utf-8") as out_f:
                    out_f.write(final_text)
    except Exception as e:
        # Best effort: a corrupt archive is reported and skipped.
        print(f"Error extracting ZIP {zip_url}: {e}")

# ----- Process Harvest HTML Files -----
# Visit every file under the harvest directory whose name contains "harvest",
# collect the links ending in ".zip", and hand each one to
# download_and_process_zip.
for root, dirs, files in os.walk(harvest_dir):
    for file in files:
        if "harvest" not in file.lower():
            continue
        file_path = os.path.join(root, file)
        print(f"Processing HTML file: {file_path}")
        with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
            html_content = f.read()
        soup = BeautifulSoup(html_content, "html.parser")

        zip_links = []
        for anchor in soup.find_all("a"):
            target = anchor.get("href")
            if target and target.lower().endswith(".zip"):
                zip_links.append(target)
        print(f"Found {len(zip_links)} zip links in {file_path}")

        for link in zip_links:
            # Relative links are resolved against the site base URL.
            if link.startswith("http"):
                zip_url = link
            else:
                zip_url = requests.compat.urljoin(base_url, link)
            download_and_process_zip(zip_url)

print("Processing complete. Check the 'processed_books' directory for processed texts.")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Found 100 zip links in /content/www.gutenberg.org/robot/harvest?offset=3477853&filetypes[]=txt&langs[]=en
Downloading ZIP: http://aleph.gutenberg.org/4/3/4/2/43422/43422-8.zip
Error downloading http://aleph.gutenberg.org/4/3/4/2/43422/43422-8.zip: 404 Client Error: Not Found for url: http://aleph.gutenberg.org/4/3/4/2/43422/43422-8.zip
Downloading ZIP: http://aleph.gutenberg.org/4/3/4/2/43422/43422.zip
Error downloading http://aleph.gutenberg.org/4/3/4/2/43422/43422.zip: 404 Client Error: Not Found for url: http://aleph.gutenberg.org/4/3/4/2/43422/43422.zip
Downloading ZIP: http://aleph.gutenberg.org/4/3/4/2/43423/43423-8.zip
Error downloading http://aleph.gutenberg.org/4/3/4/2/43423/43423-8.zip: 404 Client Error: Not Found for url: http://aleph.gutenberg.org/4/3/4/2/43423/43423-8.zip
Downloading ZIP: http://aleph.gutenberg.org/4/3/4/2/43423/43423.zip
Error downloading http://aleph.gutenberg.org/4/3/4/2/43423/43423.zip: 4

KeyboardInterrupt: 

In [None]:
!zip -r /content/gutenberg.zip /content/processed_books

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  adding: content/processed_books/58919/ (stored 0%)
  adding: content/processed_books/58919/58919-0.txt (deflated 61%)
  adding: content/processed_books/22849/ (stored 0%)
  adding: content/processed_books/22849/22849-8.txt (deflated 62%)
  adding: content/processed_books/22849/22849.txt (deflated 62%)
  adding: content/processed_books/22176/ (stored 0%)
  adding: content/processed_books/22176/22176-8.txt (deflated 62%)
  adding: content/processed_books/22176/22176.txt (deflated 62%)
  adding: content/processed_books/35003/ (stored 0%)
  adding: content/processed_books/35003/35003-8.txt (deflated 60%)
  adding: content/processed_books/35003/35003.txt (deflated 60%)
  adding: content/processed_books/26482/ (stored 0%)
  adding: content/processed_books/26482/26482-8.txt (deflated 62%)
  adding: content/processed_books/26482/26482.txt (deflated 62%)
  adding: content/processed_books/59155/ (stored 0%)
  adding: content/proc

In [None]:
from google.colab import files
files.download('/content/full_books_text.txt')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
from datasets import load_dataset

# Load the requested split of the dataset.
dataset = load_dataset("super_natural_instructions", split="train")

# Destination file for the tagged examples.
output_file = "super_natural_instructions_tagged.txt"

with open(output_file, "w", encoding="utf-8") as f:
    for example in dataset:
        # Each field is stripped; a missing or blank field is written as "None".
        def field_or_none(key):
            return example.get(key, "").strip() or "None"

        instruction = field_or_none("instruction")
        input_text = field_or_none("input")
        output_text = field_or_none("output")
        task_name = field_or_none("task_name")

        # Emit the whole tagged record in a single write.
        f.write(
            f"[INSTRUCTION: {instruction}]\n"
            f"[INPUT: {input_text}]\n"
            f"[OUTPUT: {output_text}]\n"
            f"[TASK: {task_name}]\n"
            "[END EXAMPLE]\n\n"
        )

print(f"Saved tagged dataset to {output_file}")


In [None]:
import os
import re
import polars as pl
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm

# Function to recursively list all .txt files under a directory.
def get_txt_files(root_dir):
    """Return the paths of all files ending in ".txt" under ``root_dir``, searched recursively."""
    return [
        os.path.join(folder, name)
        for folder, _, names in os.walk(root_dir)
        for name in names
        if name.endswith(".txt")
    ]

# Define a cleaning function to normalize the text.
def clean_text(text):
    """
    Normalize spacing in ``text``.

    Blank-line gaps (a newline, optional whitespace, another newline) become
    exactly one empty line, runs of spaces become a single space, and leading
    and trailing whitespace is removed.
    """
    with_paragraph_breaks = re.sub(r'\n\s*\n', '\n\n', text)
    single_spaced = re.sub(r' +', ' ', with_paragraph_breaks)
    return single_spaced.strip()

# Function to read and clean a file.
def read_and_clean_file(fp):
    """Read the UTF-8 file at ``fp`` and return its text after ``clean_text``.

    Prints a warning when the cleaned text is empty. Any exception raised
    while reading or cleaning is reported with a message and turned into an
    empty string, so one bad file does not abort a batch run.
    """
    try:
        with open(fp, "r", encoding="utf8") as handle:
            raw = handle.read()
        result = clean_text(raw)
        if not result:
            print(f"Warning: {fp} appears empty after cleaning.")
    except Exception as e:
        print(f"Error reading {fp}: {e}")
        return ""
    else:
        return result

# Absolute path of the processed books folder; report whether it is present.
books_dir = "/content/processed_books"
if os.path.exists(books_dir):
    print(f"Folder {books_dir} exists.")
else:
    print(f"Folder {books_dir} doesn't exist!")

# Gather every .txt file below the folder.
file_list = get_txt_files(books_dir)
print(f"Discovered {len(file_list)} text files in nested folders.")

# One row per file: the full path and the bare file name.
df = pl.DataFrame({
    "file_path": file_list,
    "file_name": list(map(os.path.basename, file_list)),
})

# Replace null file names with "" before applying string expressions.
df = df.with_columns(pl.col("file_name").fill_null(""))

# book_id: leading digits of "<id>.txt" or "<id>-<n>.txt".
# is_primary: True when the file name contains no "-" at all.
book_id_expr = pl.col("file_name").str.extract(r"^(\d+)(?:-[0-9]+)?\.txt$", 1).alias("book_id")
is_primary_expr = (~pl.col("file_name").str.contains("-")).alias("is_primary")
df = df.with_columns(book_id_expr, is_primary_expr)

# Keep only rows whose file name matched the expected pattern.
df = df.filter(pl.col("book_id").is_not_null())
print("DataFrame after regex extraction:")
print(df)

# Put primary files first, then keep the first row seen for each book_id.
df_sorted = df.sort("is_primary", descending=True)
df_primary = df_sorted.unique("book_id", keep="first")
print(f"Unique primary files: {df_primary.shape[0]}")

# Read and clean the selected files on a thread pool, with a progress bar.
file_paths = df_primary["file_path"].to_list()
print("Reading and cleaning files in parallel...")

with ThreadPoolExecutor(max_workers=8) as executor:
    cleaned_iter = executor.map(read_and_clean_file, file_paths)
    texts = list(tqdm(cleaned_iter, total=len(file_paths)))

# Report how many files ended up with empty text.
num_empty = sum(not t for t in texts)
print(f"{num_empty} out of {len(texts)} files produced empty text.")



Folder /content/processed_books exists.
Discovered 26883 text files in nested folders.
DataFrame after regex extraction:
shape: (26_883, 4)
┌─────────────────────────────────┬─────────────┬─────────┬────────────┐
│ file_path                       ┆ file_name   ┆ book_id ┆ is_primary │
│ ---                             ┆ ---         ┆ ---     ┆ ---        │
│ str                             ┆ str         ┆ str     ┆ bool       │
╞═════════════════════════════════╪═════════════╪═════════╪════════════╡
│ /content/processed_books/31518… ┆ 31518.txt   ┆ 31518   ┆ true       │
│ /content/processed_books/31518… ┆ 31518-8.txt ┆ 31518   ┆ false      │
│ /content/processed_books/15416… ┆ 15416.txt   ┆ 15416   ┆ true       │
│ /content/processed_books/22744… ┆ 22744-8.txt ┆ 22744   ┆ false      │
│ /content/processed_books/22744… ┆ 22744.txt   ┆ 22744   ┆ true       │
│ …                               ┆ …           ┆ …       ┆ …          │
│ /content/processed_books/32505… ┆ 32505.txt   ┆ 32505  

100%|██████████| 16873/16873 [06:51<00:00, 41.02it/s]

0 out of 16873 files produced empty text.





AttributeError: 'DataFrame' object has no attribute 'with_column'

In [None]:

# Attach the cleaned text of each book as a "text" column.
text_column = pl.Series("text", texts)
df_primary = df_primary.with_columns(text_column)

# Join every book into one string, separated by a recognisable marker.
delimiter = "\n\n### NEW BOOK ###\n\n"
book_texts = df_primary["text"].to_list()
full_books_text = delimiter.join(book_texts)

# Save the combined corpus.
output_path = "full_books_text.txt"
with open(output_path, "w", encoding="utf8") as f:
    f.write(full_books_text)

print(f"Compilation complete! The full books corpus is saved as '{output_path}'.")

Compilation complete! The full books corpus is saved as 'full_books_text.txt'.


In [None]:
import polars as pl

# Lazily read the file as a CSV with one column ("line"), treating each line as a row.
df = pl.scan_csv(
    "/content/full_books_text.txt",
    has_header=False,
    new_columns=["line"],
    separator="\n",            # Use newline as the record separator
    ignore_errors=True,        # Skip rows that cause parsing errors
    infer_schema_length=10000, # Increase inference length for schema detection
    schema_overrides={"line": pl.Utf8},
    quote_char=None            # Disable quoting so no fields are interpreted as quoted
)

# Split each line on single spaces and explode so every token is its own row.
# Null lines explode into a null token; drop those, otherwise they break the
# string join below and are counted as one extra "unique" value.
tokens_df = (
    df.with_columns(pl.col("line").str.split(" ").alias("tokens"))
    .explode("tokens")
    .drop_nulls("tokens")
)

# Collect the first 100 tokens.
first_100_tokens = tokens_df.select("tokens").head(100).collect()["tokens"].to_list()

# Count the total number of tokens.
total_tokens = tokens_df.select(pl.count("tokens")).collect().item()

# Count the unique tokens.
unique_tokens = tokens_df.select(pl.col("tokens").n_unique()).collect().item()

print("First 100 tokens:")
print(" ".join(first_100_tokens))
print("\nToken counts:")
print("Total tokens:", total_tokens)
print("Unique tokens:", unique_tokens)


First 100 tokens:


TypeError: sequence item 35: expected str instance, NoneType found

In [None]:
!split -n 8 /content/full_books_text.txt /content/full_books_text_part_


In [None]:
# Materialise the limited token frame with the streaming engine to keep
# memory use down, then pull out the "tokens" column.
token_column = tokens_df_limited.select("tokens").collect(streaming=True)["tokens"]

# Drop any None entries that may be present in the column.
limited_tokens = [tok for tok in token_column.to_list() if tok is not None]

# Preview: the first 100 remaining tokens.
first_100_tokens = limited_tokens[:100]

print("First 100 tokens:")
print(" ".join(first_100_tokens))


First 100 tokens:
[AUTHOR: Edward Everett Hale] [RELEASE DATE: September, 2005] [EBOOK ID: 8904] [TITLE: How to Do It] [SUBJECTS: Self-culture] [LOCC: BJ] [BOOKSHELVES: Browsing: How To...; Browsing: Philosophy & Ethics; Browsing: Psychiatry/Psychology] [BOOK] Produced by Distributed Proofreaders How To Do It. By Edward Everett Hale. Contents.  Chapter I. Introductory.--How We Met  Chapter II. How To Talk  Chapter III. Talk  Chapter IV. How To Write  Chapter V. How To Read. I.  Chapter VI. How To Read. II.  Chapter VII. How To Go Into Society  Chapter VIII. How To Travel  Chapter IX. Life At School


In [None]:
import polars as pl

def process_chunk(lines):
    """Count the space-separated tokens in a batch of text lines.

    Args:
        lines: list of strings, one per line. Trailing line terminators
            ("\\n" / "\\r\\n") are removed before splitting so that a word at
            the end of a line is the same token as that word elsewhere.

    Returns:
        A ``(total_tokens, unique_tokens)`` tuple: the number of non-empty
        tokens in the batch and the set of distinct non-empty tokens.
    """
    # An empty batch has nothing to count.
    if not lines:
        return 0, set()

    # Lines read by iterating a file keep their newline; strip it here.
    stripped = [line.rstrip("\r\n") for line in lines]

    # Split each line on single spaces and flatten to one token per row.
    tokens = (
        pl.DataFrame({"line": stripped})
        .select(pl.col("line").str.split(" ").alias("tokens"))
        .explode("tokens")
        .get_column("tokens")
        .drop_nulls()
    )
    # Blank lines and consecutive spaces produce "" entries; they are not tokens.
    tokens = tokens.filter(tokens != "")

    # De-duplicate inside Polars before converting to Python objects.
    return tokens.len(), set(tokens.unique().to_list())

chunk_size = 10000  # number of lines handed to process_chunk at a time
total_tokens = 0
unique_tokens_set = set()

# Stream the corpus line by line, flushing a batch every chunk_size lines.
with open("/content/full_books_text.txt", "r", encoding="utf8") as f:
    lines_chunk = []
    for line in f:
        lines_chunk.append(line)
        if len(lines_chunk) == chunk_size:
            batch_total, batch_unique = process_chunk(lines_chunk)
            total_tokens += batch_total
            unique_tokens_set |= batch_unique
            lines_chunk = []  # start collecting the next batch
    # Flush whatever is left after the last full batch.
    if lines_chunk:
        batch_total, batch_unique = process_chunk(lines_chunk)
        total_tokens += batch_total
        unique_tokens_set |= batch_unique

print("Token counts:")
print("Total tokens:", total_tokens)
print("Unique tokens:", len(unique_tokens_set))


Token counts:
Total tokens: 1068364428
Unique tokens: 14883525


In [None]:
!zip /content/full_books_text_part_aa.zip /content/full_books_text_part_aa
from google.colab import files
files.download('/content/full_books_text_part_aa.zip')


  adding: content/full_books_text_part_aa (deflated 62%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
!zip /content/full_books_text_part_ab.zip /content/full_books_text_part_ab
!zip /content/full_books_text_part_ac.zip /content/full_books_text_part_ac
!zip /content/full_books_text_part_ad.zip /content/full_books_text_part_ad
!zip /content/full_books_text_part_ae.zip /content/full_books_text_part_ae
!zip /content/full_books_text_part_af.zip /content/full_books_text_part_af
!zip /content/full_books_text_part_ag.zip /content/full_books_text_part_ag
!zip /content/full_books_text_part_ah.zip /content/full_books_text_part_ah



  adding: content/full_books_text_part_ab (deflated 62%)
  adding: content/full_books_text_part_ac (deflated 62%)
  adding: content/full_books_text_part_ad (deflated 62%)
  adding: content/full_books_text_part_ae (deflated 62%)
  adding: content/full_books_text_part_af (deflated 62%)
  adding: content/full_books_text_part_ag (deflated 63%)
  adding: content/full_books_text_part_ah (deflated 62%)


FileNotFoundError: Cannot find file: /content/full_books_text_part_aab.zip

In [None]:
from google.colab import files
# Download the zipped corpus parts "ab" through "ah", in order.
for part_suffix in ("ab", "ac", "ad", "ae", "af", "ag", "ah"):
    files.download(f'/content/full_books_text_part_{part_suffix}.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
!unzip /content/natural_instructions_tagged.zip -d /content/natural_instructions_tagged


Archive:  /content/natural_instructions_tagged.zip
  inflating: /content/natural_instructions_tagged/natural_instructions_tagged.txt  


In [None]:
import os

# Define file paths and directories
extraction_dir = '/content/natural_instructions_tagged'
final_merged_file = '/content/final_merged_data.txt'
books_file_path = '/content/full_books_text.txt'

chunk_size = 1024 * 1024  # 1 MB

# Build the merged file: every extracted .txt file first, then the book corpus.
with open(final_merged_file, 'w', encoding='utf-8') as final_out:
    # Process each conversation text file one by one
    print("Appending conversation files one by one...")
    # The loop variables are deliberately NOT called `files`: at notebook top
    # level that would rebind the `google.colab.files` module used by other
    # cells for downloads.
    for root, _dirnames, filenames in os.walk(extraction_dir):
        for filename in sorted(filenames):
            if not filename.endswith('.txt'):
                continue
            file_path = os.path.join(root, filename)
            print(f"Processing {file_path} ...")
            with open(file_path, 'r', encoding='utf-8') as infile:
                # Copy in fixed-size pieces; read() returns '' at end of file.
                for chunk in iter(lambda: infile.read(chunk_size), ''):
                    final_out.write(chunk)
            # Write a separator between files (optional)
            final_out.write("\n\n")

    # Append a separator before the book data (optional)
    final_out.write("\n\n========== BOOK DATA ==========\n\n")

    # Now append the book text in chunks
    print("Appending book data...")
    with open(books_file_path, 'r', encoding='utf-8') as book_in:
        for chunk in iter(lambda: book_in.read(chunk_size), ''):
            final_out.write(chunk)

print(f"Final merged file created at: {final_merged_file}")


Appending conversation files one by one...
Processing /content/natural_instructions_tagged/natural_instructions_tagged.txt ...
Appending book data...
Final merged file created at: /content/final_merged_data.txt


In [None]:
# Split the final merged data into 12 parts
!split -n 12 /content/final_merged_data.txt /content/final_merged_data_part_

# Zip each part into its own file
!zip /content/final_merged_data_part_aa.zip /content/final_merged_data_part_aa
!zip /content/final_merged_data_part_ab.zip /content/final_merged_data_part_ab
!zip /content/final_merged_data_part_ac.zip /content/final_merged_data_part_ac
!zip /content/final_merged_data_part_ad.zip /content/final_merged_data_part_ad
!zip /content/final_merged_data_part_ae.zip /content/final_merged_data_part_ae
!zip /content/final_merged_data_part_af.zip /content/final_merged_data_part_af
!zip /content/final_merged_data_part_ag.zip /content/final_merged_data_part_ag
!zip /content/final_merged_data_part_ah.zip /content/final_merged_data_part_ah
!zip /content/final_merged_data_part_ai.zip /content/final_merged_data_part_ai
!zip /content/final_merged_data_part_aj.zip /content/final_merged_data_part_aj
!zip /content/final_merged_data_part_ak.zip /content/final_merged_data_part_ak
!zip /content/final_merged_data_part_al.zip /content/final_merged_data_part_al

  adding: content/final_merged_data_part_aa (deflated 92%)
  adding: content/final_merged_data_part_ab (deflated 84%)
  adding: content/final_merged_data_part_ac (deflated 90%)
  adding: content/final_merged_data_part_ad (deflated 90%)
  adding: content/final_merged_data_part_ae (deflated 73%)
  adding: content/final_merged_data_part_af (deflated 62%)
  adding: content/final_merged_data_part_ag (deflated 62%)
  adding: content/final_merged_data_part_ah (deflated 62%)
  adding: content/final_merged_data_part_ai (deflated 62%)
  adding: content/final_merged_data_part_aj (deflated 62%)
  adding: content/final_merged_data_part_ak (deflated 63%)
  adding: content/final_merged_data_part_al (deflated 62%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
from google.colab import files

# Download the twelve zipped parts of the merged data, "aa" through "al".
for part_suffix in ("aa", "ab", "ac", "ad", "ae", "af",
                    "ag", "ah", "ai", "aj", "ak", "al"):
    files.download(f'/content/final_merged_data_part_{part_suffix}.zip')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
#!pip install tiktoken
import os
import tiktoken
import concurrent.futures
from tqdm import tqdm

# Define paths and parameters
merged_file_path = '/content/final_merged_data.txt'
output_tokenized_file = '/content/final_tokenized_data.txt'
chunk_size = 1024 * 1024  # 1 MB

# Function to tokenize a given chunk of text
def tokenize_chunk(chunk):
    """Encode ``chunk`` with tiktoken's "gpt2" encoding and return the result.

    Other encodings (e.g. "cl100k_base") can be selected by changing the
    name passed to ``tiktoken.get_encoding``.
    """
    return tiktoken.get_encoding("gpt2").encode(chunk)

# Read the merged file in chunks
chunks = []
with open(merged_file_path, 'r', encoding='utf-8') as f:
    while True:
        chunk = f.read(chunk_size)
        if not chunk:
            break
        chunks.append(chunk)

# Tokenize in parallel with a process pool. executor.map yields results in
# the same order as the input chunks; iterating as_completed() instead would
# return them in completion order and scramble the corpus when written out.
with concurrent.futures.ProcessPoolExecutor() as executor:
    tokenized_chunks = list(tqdm(executor.map(tokenize_chunk, chunks),
                                 total=len(chunks),
                                 desc="Tokenizing Chunks"))

# Write the tokenized data to a file: one line per chunk, token ids joined
# with a space (you might choose a different format).
with open(output_tokenized_file, 'w', encoding='utf-8') as out:
    for tokens in tokenized_chunks:
        out.write(" ".join(map(str, tokens)) + "\n")

print(f"Tokenized data saved at: {output_tokenized_file}")


Tokenizing Chunks:  39%|███▊      | 3517/9090 [02:17<04:09, 22.30it/s]

In [None]:
#!pip install tiktoken
import os
import tiktoken
import concurrent.futures
from tqdm import tqdm

# Define paths and parameters
merged_file_path = '/content/final_merged_data.txt'
output_tokenized_file = '/content/final_tokenized_data.txt'
chunk_size = 1024 * 256  # 256 KB; adjust as needed

# Function to tokenize a given chunk of text
def tokenize_chunk(chunk):
    """Encode ``chunk`` with tiktoken's "gpt2" encoding and return the result."""
    return tiktoken.get_encoding("gpt2").encode(chunk)

# Generator to read file in chunks
def read_in_chunks(file_path, chunk_size):
    """Yield the UTF-8 text of ``file_path`` in pieces of ``chunk_size`` characters.

    The file is opened in text mode and read repeatedly with
    ``read(chunk_size)``; iteration stops at the first empty read, so the
    final piece may be shorter and an empty file yields nothing.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        yield from iter(lambda: handle.read(chunk_size), '')

# Process tokenization in parallel and write output incrementally
# Tokenize in parallel and write each result as soon as it is available,
# keeping the output in the same order as the input file.
with open(output_tokenized_file, 'w', encoding='utf-8') as out:
    # Materialise the chunks so the progress bar knows the total count.
    chunks = list(read_in_chunks(merged_file_path, chunk_size))

    # Optionally, reduce number of workers if needed
    with concurrent.futures.ProcessPoolExecutor(max_workers=4) as executor:
        # executor.map yields results in input order; as_completed() would
        # hand them back in completion order and shuffle the written text.
        ordered_results = executor.map(tokenize_chunk, chunks)
        for tokens in tqdm(ordered_results,
                           total=len(chunks),
                           desc="Tokenizing Chunks"):
            # Write the tokenized chunk immediately
            out.write(" ".join(map(str, tokens)) + "\n")

print(f"Tokenized data saved at: {output_tokenized_file}")


Tokenizing Chunks:  22%|██▏       | 7921/36360 [03:09<10:36, 44.69it/s]

In [None]:
from google.colab import files

# Suffixes of the split parts to download, in order: "ac" through "al".
parts = [f"a{letter}" for letter in "cdefghijkl"]

# Try each part in turn; a failed download is reported and does not stop the loop.
for suffix in parts:
    file_path = f"/content/final_merged_data_part_{suffix}"
    try:
        files.download(file_path)
        print(f"Downloaded {file_path}")
    except Exception as e:
        print(f"Could not download {file_path}: {e}")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloaded /content/final_merged_data_part_ac


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloaded /content/final_merged_data_part_ad


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloaded /content/final_merged_data_part_ae


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloaded /content/final_merged_data_part_af


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloaded /content/final_merged_data_part_ag


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloaded /content/final_merged_data_part_ah


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloaded /content/final_merged_data_part_ai


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloaded /content/final_merged_data_part_aj


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloaded /content/final_merged_data_part_ak


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloaded /content/final_merged_data_part_al


In [None]:
!pip install ftfy

import ftfy

original = """â€œO Gaetano,â€ she exclaimed, â€œI do love you so very dearly.
When you look at me your eyes are likeâ€”they are like the eyesâ€â€”here she faltered a
littleâ€”â€œthe eyes of a cow.â€"""

fixed = ftfy.fix_text(original)
print(fixed)


Collecting ftfy
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Downloading ftfy-6.3.1-py3-none-any.whl (44 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/44.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ftfy
Successfully installed ftfy-6.3.1
"O Gaetano," she exclaimed, "I do love you so very dearly. 
When you look at me your eyes are like—they are like the eyes"—here she faltered a
little—"the eyes of a cow."
