## Data Processing

Pulls text from Wikipedia, BookCorpus, and OpenWebText (held in the `pile` variable) and merges it into a single file for training.

In [18]:
from datasets import load_dataset, concatenate_datasets
import os
import re
import math
from collections import Counter
from tqdm import tqdm
from langdetect import detect, DetectorFactory

In [2]:
# Path of the merged raw corpus; the save cell below writes one dataset entry per line to it.
output_file = os.path.join("raw_data", "combined_text.txt")


In [3]:
# 1. Load Wikipedia (first 1% of the train split)
# Make sure the output directory exists before any later cell writes into it.
os.makedirs("raw_data", exist_ok=True)
print("Loading Wikipedia...")
wiki = load_dataset(
    "wikipedia",
    "20220301.en",
    split="train[:1%]",
    trust_remote_code=True,  # NOTE(review): presumably lets the dataset's loading script run locally - confirm
).filter(lambda article: len(article["text"]) > 200)  # keep only entries longer than 200 characters

Loading Wikipedia...


In [4]:
# 2. Load BookCorpus (full train split)
print("Loading BookCorpus...")
books = load_dataset(
    "bookcorpus",
    split="train",
    trust_remote_code=True,
).filter(lambda row: len(row["text"]) > 200)  # keep only entries longer than 200 characters

Loading BookCorpus...


Downloading data: 100%|███████████████████████████████████████████████████████████| 1.18G/1.18G [01:42<00:00, 11.6MB/s]
Generating train split: 100%|███████████████████████████████████| 74004228/74004228 [08:09<00:00, 151118.33 examples/s]
Filter: 100%|███████████████████████████████████████████████████| 74004228/74004228 [01:27<00:00, 847086.75 examples/s]


In [6]:
# 3. Load OpenWebText (first 1% of the train split)
# The result is bound to `pile` because the merge cell refers to it by that name.
print("Loading OpenWebText...")
pile = load_dataset(
    "openwebtext",
    split="train[:1%]",
    trust_remote_code=True,
).filter(lambda doc: len(doc["text"]) > 200)  # keep only entries longer than 200 characters

Loading OpenWebText...


Downloading data: 100%|█████████████████████████████████████████████████████████████| 21/21 [18:33<00:00, 53.02s/files]
Generating train split: 100%|███████████████████████████████████████| 8013769/8013769 [13:21<00:00, 9997.62 examples/s]
Filter: 100%|█████████████████████████████████████████████████████████| 80138/80138 [00:00<00:00, 230282.21 examples/s]


In [7]:
# 4. Merge datasets
print("Merging datasets...")
combined = concatenate_datasets([wiki, books, pile])

# 5. Save to text file, one entry per line
print("Saving to file...")
with open(output_file, "w", encoding="utf-8") as f:
    for entry in combined:
        # Flatten every kind of line break, not only "\n": when the file is read back
        # in text mode a stray "\r" is also treated as a line terminator, which would
        # split a single entry across several lines.
        single_line = entry['text'].strip().replace("\r", " ").replace("\n", " ")
        f.write(single_line + "\n")

print(f"Saved {len(combined):,} entries to {output_file}")

Merging datasets...
Saving to file...
Saved 1,150,270 entries to raw_data\combined_text.txt


In [8]:
# Report how many Wikipedia entries remain after the length filter applied when loading.
print(f"Wikipedia dataset size: {len(wiki):,}")


Wikipedia dataset size: 64,098


## Start here if the dataset has already been created (run the imports cell first)

In [2]:
def clean_text(text):
    """
    Cleans raw text by removing HTML tags and excessive whitespace.

    Steps:
    - Removes HTML-like tags (any ``<...>`` span) using regex
    - Collapses runs of spaces, tabs, and newlines into a single space
    - Strips leading/trailing whitespace

    Tags are removed *before* whitespace is collapsed so that the gap left
    behind by a tag surrounded by spaces (e.g. ``"a <b> c"``) is collapsed
    too, instead of surviving as a double space.

    Args:
        text (str): The raw text to clean.

    Returns:
        str: Cleaned text.
    """
    # Tags are deleted outright (not replaced by a space), so "foo<br>bar" -> "foobar".
    text = re.sub(r'<[^>]+>', '', text)
    # Collapse after tag removal so no whitespace runs are reintroduced.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

In [3]:
def shannon_entropy(text):
    """
    Computes the character-level Shannon entropy of a string, in bits.

    Each distinct character contributes ``p * log2(p)``, where ``p`` is the
    share of the string made up of that character; the result is the negated
    sum of those terms. Repetitive text scores low, varied text scores high.

    Args:
        text (str): The text to analyze.

    Returns:
        float: Shannon entropy value (0 for an empty string).
    """
    length = len(text)
    weighted_logs = []
    for occurrences in Counter(text).values():
        share = occurrences / length
        weighted_logs.append(share * math.log2(share))
    return -sum(weighted_logs)

In [4]:
def has_repeated_patterns(text):
    """
    Reports whether the text contains an obviously repetitive run.

    Two patterns are checked:
    - The same character seven or more times in a row (e.g., "aaaaaaa")
    - The same word four or more times in a row, separated by whitespace
      (e.g., "yes yes yes yes")

    Args:
        text (str): The input text to scan.

    Returns:
        bool: True if either pattern is found, else False.
    """
    # One character followed by at least six copies of itself.
    if re.search(r'(.)\1{6,}', text):
        return True
    # One whole word followed by at least three whitespace-separated repeats.
    if re.search(r'\b(\w+)\b(\s+\1\b){3,}', text):
        return True
    return False

In [20]:
from langdetect import DetectorFactory, detect

# langdetect's detection is non-deterministic unless the factory seed is pinned;
# fixing it makes repeated runs classify the same text the same way.
DetectorFactory.seed = 0

def is_english(text):
    """
    Determines whether the given text is written in English using language detection.

    This function uses the `langdetect` library to analyze the first 1000 characters
    of the input text. If the detected language is English ("en"), the function returns True.
    Any error raised during detection is treated as "not English".

    Args:
        text (str): The text to analyze.

    Returns:
        bool: True if the text is detected as English, False otherwise
        (including when detection fails).
    """
    try:
        return detect(text[:1000]) == "en"
    except Exception:
        # Best-effort: a detection failure just means the text is rejected.
        # `Exception` (rather than a bare `except`) lets KeyboardInterrupt and
        # SystemExit propagate, so a long run can still be interrupted.
        return False


In [21]:
input_path = "raw_data/combined_text.txt"
output_path = "raw_data/cleaned_combined.txt"

count = 0
# The input is opened first: if it is missing, the error is raised before the
# output is opened in "w" mode, so an existing cleaned file is not truncated.
with open(input_path, "r", encoding="utf-8") as in_f, \
        open(output_path, "w", encoding="utf-8") as out_f:
    for line in tqdm(in_f, desc="Filtering"):
        # Step 1: Clean text
        cleaned = clean_text(line)

        # Step 2: Minimum length check (cheapest test, so it runs first)
        if len(cleaned) < 200:
            continue

        # Step 3: Shannon entropy check
        if shannon_entropy(cleaned) <= 3.0:
            continue

        # Step 4: Pattern repetition check
        if has_repeated_patterns(cleaned):
            continue

        # Step 5: Language detection, run last so it is only paid for on lines that
        # survived the cheaper filters (NOTE(review): assumes detection is the costly
        # step). is_english() returns False on detection errors, which matches the
        # previous inline try/except that skipped such lines.
        if not is_english(cleaned):
            continue

        # If all checks pass, save the line
        out_f.write(cleaned + "\n")
        count += 1

print(f"Saved {count:,} clean entries to {output_path}")


Filtering: 41338it [02:45, 250.37it/s]

Saved 41,094 clean entries to raw_data/cleaned_combined.txt



