# **Control Version**

In [None]:
!pip install ftfy
!pip install tqdm
!pip install tiktoken
import os
import ftfy
import re
import concurrent.futures
from tqdm import tqdm
import numpy as np  # For handling npy files

# Input files to process. Earlier inputs are kept commented out for reference;
# every entry ends with a comma so that un-commenting one cannot silently
# concatenate it with a neighbouring string literal.
split_files = [
    #'/content/final_merged_data_part_af',
    #'/content/final_merged_data_part_ag',
    #'/content/final_merged_data_part_ah',
    #'/content/final_merged_data_part_ai',
    #'/content/final_merged_data_part_aj',
    #'/content/final_merged_data_part_ak',
    #'/content/final_merged_data_part_al',
    #'/content/april_book.txt',
    '/content/_bom',
]

# Loop-invariant settings and patterns, built once instead of once per file.
num_workers = 4
marker = "### NEW BOOK ###"
# End-of-book marker (allowing for extra text between the asterisks).
gutenberg_pattern = re.compile(r'\*\*\* END OF THE PROJECT GUTENBERG.*?\*\*\*', re.DOTALL)
# Metadata between "[BOOK]" and a marker starting with "***START OF THE PROJECT GUTENBERG".
clean_pattern = re.compile(r'(\[BOOK\]).*?(\*\*\*START OF THE PROJECT GUTENBERG.*?\*\*\*)', re.DOTALL)
_whitespace_char = re.compile(r'\s')


def process_chunk(chunk):
    """Normalise one chunk of raw text.

    Every run of whitespace is collapsed into a single space, the result is
    stripped, and encoding problems are then repaired with ``ftfy.fix_text``.

    Args:
        chunk: The piece of raw text to clean.

    Returns:
        The cleaned text.
    """
    collapsed = re.sub(r'\s+', ' ', chunk).strip()
    return ftfy.fix_text(collapsed)


def _split_on_whitespace(text, num_parts):
    """Cut ``text`` into at most ``num_parts`` contiguous, non-overlapping pieces.

    Every cut is placed on a whitespace character, so no word is split and the
    pieces concatenate back to exactly ``text``. Fewer pieces are returned when
    the text runs out of whitespace to cut at.

    Args:
        text: The full text to split.
        num_parts: Maximum number of pieces (must be at least 1).

    Returns:
        A list of one or more strings.

    Raises:
        ValueError: If ``num_parts`` is smaller than 1.
    """
    if num_parts < 1:
        raise ValueError("num_parts must be at least 1")
    target = len(text) // num_parts
    pieces = []
    start = 0
    for part_index in range(1, num_parts):
        # Searching no earlier than start + 1 guarantees a non-empty piece.
        match = _whitespace_char.search(text, max(part_index * target, start + 1))
        if match is None:
            break  # no whitespace left: the remainder stays in one piece
        pieces.append(text[start:match.start()])
        start = match.start()
    pieces.append(text[start:])
    return pieces


# Loop over each file and process it
for file_path in split_files:
    # Suffix = text after the last underscore of the file *name*, without any
    # extension (e.g. "ag"; "april_book.txt" -> "book"), so the output name
    # built in Step 7 never ends in ".txt.txt".
    suffix = os.path.splitext(os.path.basename(file_path))[0].split('_')[-1]

    # --- Step 0: Read the file ---
    with open(file_path, 'r', encoding='utf-8') as f:
        raw_text = f.read()

    # --- Step 1: Split the file into chunks at whitespace boundaries ---
    # No overlap is used: process_chunk only collapses whitespace and fixes
    # encoding, so cutting on whitespace lets the pieces be re-joined exactly.
    chunks = _split_on_whitespace(raw_text, num_workers)

    # --- Step 2/3: Process the chunks in parallel with a progress bar ---
    with concurrent.futures.ProcessPoolExecutor(max_workers=num_workers) as executor:
        processed_chunks = list(tqdm(
            executor.map(process_chunk, chunks),
            total=len(chunks),
            desc=f"Processing chunks for part {suffix}"
        ))

    # --- Step 4: Merge processed chunks ---
    # Each piece was stripped and each cut sat on whitespace, so a single space
    # between non-empty pieces rebuilds the text with nothing duplicated or
    # dropped (the old overlap/marker trimming did both).
    merged_text = " ".join(piece for piece in processed_chunks if piece)

    # --- Step 5: Reintroduce the final book structure with Gutenberg marker handling ---
    parts = merged_text.split(marker)
    intro = parts[0].strip()  # Text before the first "### NEW BOOK ###"
    book_blocks = parts[1:]
    processed_books = []
    for block in book_blocks:
        gutenberg_match = gutenberg_pattern.search(block)
        if gutenberg_match:
            # Keep only the text before the Gutenberg end marker and close the
            # book with the end-of-book marker instead.
            book_content = block[:gutenberg_match.start()].strip()
            processed_books.append(marker + "\n" + book_content + "\n### END OF BOOK ###")
        else:
            # If no Gutenberg marker is found, do not add an end-of-book marker
            processed_books.append(marker + "\n" + block.strip())

    # --- Step 5.1: Clean metadata between [BOOK] and the Gutenberg start marker ---
    cleaned_books = [clean_pattern.sub(r'\1 \2', book) for book in processed_books]

    # --- Step 6: Combine the intro (if any) with the cleaned book blocks ---
    final_output = ""
    if intro:
        final_output += intro + "\n\n"
    final_output += "\n\n".join(cleaned_books)

    # --- Step 7: Save the final compiled output to a text file ---
    output_filename = f"processed_part_final_control_version_{suffix}.txt"
    with open(output_filename, "w", encoding="utf-8") as f:
        f.write(final_output)
    print(f"\nFinished processing {output_filename}")


Collecting ftfy
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Downloading ftfy-6.3.1-py3-none-any.whl (44 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/44.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ftfy
Successfully installed ftfy-6.3.1
Collecting tiktoken
  Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tiktoken
Successfully installed tiktoken-0.9.0


Processing chunks for part bom: 100%|██████████| 4/4 [00:00<00:00, 74.48it/s]


Finished processing processed_part_final_control_version_bom.txt





# **Comparison**

In [None]:
#!pip install ftfy
#!pip install tqdm
#!pip install tiktoken
import os
import ftfy
import re
import concurrent.futures
from tqdm import tqdm
import numpy as np  # For handling npy files

# Input files to process. Earlier inputs are kept commented out for reference;
# every entry ends with a comma so that un-commenting one cannot silently
# concatenate it with a neighbouring string literal.
split_files = [
    #'/content/final_merged_data_part_af',
    #'/content/final_merged_data_part_ag',
    #'/content/final_merged_data_part_ah',
    #'/content/final_merged_data_part_ai',
    #'/content/final_merged_data_part_aj',
    #'/content/final_merged_data_part_ak',
    #'/content/final_merged_data_part_al',
    #'/content/processed_part_final_control_version_book.txt.txt',
    '/content/processed_part_final_control_version_bom.txt',
]

# Loop-invariant settings and patterns, built once instead of once per file.
num_workers = 4
marker = "### NEW BOOK ###"
# Everything before the first occurrence of this tag is discarded (Step 4.1).
author_marker = "[AUTHOR: Lewis Goldsmith]"
# End-of-book marker (allowing for extra text between the asterisks).
gutenberg_pattern = re.compile(r'\*\*\* END OF THE PROJECT GUTENBERG.*?\*\*\*', re.DOTALL)
# Metadata between "[BOOK]" and a marker starting with "***START OF THE PROJECT GUTENBERG".
clean_pattern = re.compile(r'(\[BOOK\]).*?(\*\*\*START OF THE PROJECT GUTENBERG.*?\*\*\*)', re.DOTALL)
_whitespace_char = re.compile(r'\s')


def process_chunk(chunk):
    """Normalise one chunk of raw text.

    Every run of whitespace is collapsed into a single space, the result is
    stripped, and encoding problems are then repaired with ``ftfy.fix_text``.

    Args:
        chunk: The piece of raw text to clean.

    Returns:
        The cleaned text.
    """
    collapsed = re.sub(r'\s+', ' ', chunk).strip()
    return ftfy.fix_text(collapsed)


def _split_on_whitespace(text, num_parts):
    """Cut ``text`` into at most ``num_parts`` contiguous, non-overlapping pieces.

    Every cut is placed on a whitespace character, so no word is split and the
    pieces concatenate back to exactly ``text``. Fewer pieces are returned when
    the text runs out of whitespace to cut at.

    Args:
        text: The full text to split.
        num_parts: Maximum number of pieces (must be at least 1).

    Returns:
        A list of one or more strings.

    Raises:
        ValueError: If ``num_parts`` is smaller than 1.
    """
    if num_parts < 1:
        raise ValueError("num_parts must be at least 1")
    target = len(text) // num_parts
    pieces = []
    start = 0
    for part_index in range(1, num_parts):
        # Searching no earlier than start + 1 guarantees a non-empty piece.
        match = _whitespace_char.search(text, max(part_index * target, start + 1))
        if match is None:
            break  # no whitespace left: the remainder stays in one piece
        pieces.append(text[start:match.start()])
        start = match.start()
    pieces.append(text[start:])
    return pieces


# Loop over each file and process it
for file_path in split_files:
    # Suffix = text after the last underscore of the file *name*, without its
    # extension ("..._control_version_bom.txt" -> "bom"). Keeping the extension
    # made Step 7 write "processed_part_final_bom.txt.txt", a name the
    # comparison cell further down never opens.
    suffix = os.path.splitext(os.path.basename(file_path))[0].split('_')[-1]

    # --- Step 0: Read the file ---
    with open(file_path, 'r', encoding='utf-8') as f:
        raw_text = f.read()

    # --- Step 1: Split the file into chunks at whitespace boundaries ---
    # No overlap is used: process_chunk only collapses whitespace and fixes
    # encoding, so cutting on whitespace lets the pieces be re-joined exactly.
    chunks = _split_on_whitespace(raw_text, num_workers)

    # --- Step 2/3: Process the chunks in parallel with a progress bar ---
    with concurrent.futures.ProcessPoolExecutor(max_workers=num_workers) as executor:
        processed_chunks = list(tqdm(
            executor.map(process_chunk, chunks),
            total=len(chunks),
            desc=f"Processing chunks for part {suffix}"
        ))

    # --- Step 4: Merge processed chunks ---
    # Each piece was stripped and each cut sat on whitespace, so a single space
    # between non-empty pieces rebuilds the text with nothing duplicated or
    # dropped (the old overlap/marker trimming did both).
    merged_text = " ".join(piece for piece in processed_chunks if piece)

    # --- Step 4.1: Remove invalid first book ---
    # Find the first instance of [AUTHOR: Lewis Goldsmith] and remove everything
    # before it. Files that do not contain the tag are left untouched.
    idx_author = merged_text.find(author_marker)
    if idx_author != -1:
        merged_text = merged_text[idx_author:]

    # --- Step 5: Reintroduce the final book structure with Gutenberg marker handling ---
    parts = merged_text.split(marker)
    intro = parts[0].strip()  # Text before the first "### NEW BOOK ###"
    book_blocks = parts[1:]
    processed_books = []
    for block in book_blocks:
        gutenberg_match = gutenberg_pattern.search(block)
        if gutenberg_match:
            # Keep only the text before the Gutenberg end marker and close the
            # book with the end-of-book marker instead.
            book_content = block[:gutenberg_match.start()].strip()
            processed_books.append(marker + "\n" + book_content + "\n### END OF BOOK ###")
        else:
            # If no Gutenberg marker is found, do not add an end-of-book marker
            processed_books.append(marker + "\n" + block.strip())

    # --- Step 5.1: Clean metadata between [BOOK] and the Gutenberg start marker ---
    cleaned_books = [clean_pattern.sub(r'\1 \2', book) for book in processed_books]

    # --- Step 6: Combine the intro (if any) with the cleaned book blocks ---
    final_output = ""
    if intro:
        final_output += intro + "\n\n"
    final_output += "\n\n".join(cleaned_books)

    # --- Step 7: Save the final compiled output to a text file ---
    output_filename = f"processed_part_final_{suffix}.txt"
    with open(output_filename, "w", encoding="utf-8") as f:
        f.write(final_output)
    print(f"\nFinished processing {output_filename}")


Processing chunks for part bom.txt: 100%|██████████| 4/4 [00:00<00:00, 75.39it/s]


Finished processing processed_part_final_bom.txt.txt





In [None]:
# Which processed part to inspect (for example, "af").
suffix = "bom"

# The two files to compare are both derived from that suffix:
# the pipeline output and its control-version counterpart.
processed_filename = f"processed_part_final_{suffix}.txt"
control_filename = f"processed_part_final_control_version_{suffix}.txt"

def print_first_characters(filename, num_chars=10000):
    """Print a reminder banner followed by the beginning of a text file.

    Only the requested prefix is read from disk, so previewing a very large
    file does not load all of it into memory.

    Args:
        filename: Path of the UTF-8 text file to preview.
        num_chars: Number of leading characters to print. ``None`` prints the
            whole file; a negative value keeps its slice meaning (everything
            except the last ``abs(num_chars)`` characters).

    Raises:
        OSError: If the file cannot be opened (e.g. ``FileNotFoundError``).
    """
    with open(filename, 'r', encoding='utf-8') as f:
        if num_chars is None or num_chars >= 0:
            # Text-mode read(n) returns at most n characters, same as text[:n].
            text = f.read(num_chars)
        else:
            # A negative count needs the full text to know where the end is.
            text = f.read()[:num_chars]

    print("Part AF needs to start with the second book. The first one is not actually a book.")
    print("\n")
    print(f"First {num_chars} characters for {filename}:")
    print(text)
    print("\n" + "="*50 + "\n")

# Preview the control version first, then the processed version, so the two
# openings can be compared one after the other.
for filename in (control_filename, processed_filename):
    print_first_characters(filename)


Part AF needs to start with the second book. The first one is not actually a book.


First 10000 characters for processed_part_final_control_version_bom.txt:
THE BOOK OF MORMON An Account Written by THE HAND OF MORMON UPON PLATES TAKEN FROM THE PLATES OF NEPHI Wherefore, it is an abridgment of the record of the people of Nephi, and also of the Lamanites--Written to the Lamanites, who are a remnant of the house of Israel; and also to Jew and Gentile--Written by way of commandment, and also by the spirit of prophecy and of revelation--Written and sealed up, and hid up unto the Lord, that they might not be destroyed--To come forth by the gift and power of God unto the interpretation thereof--Sealed by the hand of Moroni, and hid up unto the Lord, to come forth in due time by way of the Gentile--The interpretation thereof by the gift of God. An abridgment taken from the Book of Ether also, which is a record of the people of Jared, who were scattered at the time the Lord confounded the lang

# **Tokenizing Comparison Version**

In [None]:
    # --- Step 10: Tokenize and save as a NumPy .npy file ---
import tiktoken

# `final_output`, `suffix` and `np` are not defined in this cell; they are
# expected to be left in the kernel by the cells that ran before it.
tokenized_filename = f"tokenized_part_{suffix}.npy"

# Encode the text with tiktoken's "gpt2" encoding.
enc = tiktoken.get_encoding("gpt2")
tokenized_output = enc.encode(final_output)

# Store the token ids as a 32-bit integer array in NumPy's .npy format.
token_array = np.array(tokenized_output, dtype=np.int32)
np.save(tokenized_filename, token_array)
print(f"Tokenized file saved as {tokenized_filename}\n")


Tokenized file saved as tokenized_part_bom.npy

