# 0. Data Extraction

In this initial section we are going to list all the data sources that are going to be used for this project and also all the related local data ingestions and uploads to HuggingFace, which we will use as our main repository for datasets and models. Right now we are going to list some of the available datasets:

- [Common Corpus](https://huggingface.co/datasets/PleIAs/common_corpus): Real-world large scale dataset (Over 2 Trillion tokens)
- [UpVoteWeb](https://huggingface.co/datasets/OpenCo7/UpVoteWeb): Large-scale reddit comments and posts containing final scores
- [BurialGoods Transcripts](https://huggingface.co/datasets/eZWALT/burialgoods-pretraining-corpus): My own hand-made dataset containing transcriptions of BurialGoods cinematographic pieces of art for pretraining
- [Cursed Toxic Pretraining Collection](https://huggingface.co/collections/eZWALT/cursed-toxic-pretraining): Curated collection of HuggingFace datasets

If you are feeling brave (or dumb) enough, use the last corpus at your own risk! Now let's start cooking!

<p align="center">
  <img src="../resources/walterwhite.gif" width="300">
</p>


For each dataset we will have to define a simple, personalized **local preprocessing** step, which for pretraining data that consists of raw strings can be as simple as removing unnecessary fields and columns (categorical and numerical variables), concatenating multiple string fields, or subsetting. Optionally, at the end we can also define some unified **global preprocessing** steps, such as special character removal or other simple steps that do not require great effort.

In [None]:
import pandas 
import os 

## a) Preprocess BurialGoods Transcripts

In [11]:
import os
import json
import random
import hashlib
from pathlib import Path
from zipfile import ZipFile
from tqdm import tqdm
from datasets import Dataset, Features, Value
from huggingface_hub import HfApi, create_repo

def iter_samples_from_zips(data_dir, selected_zips, k_per_zip):
    """
    Stream ``{"text", "title"}`` records out of a list of zip archives.

    For each archive, the ``.txt`` members (extension matched
    case-insensitively) are shuffled with the module-level ``random``
    generator and the first ``k_per_zip`` of them are read. Each member is
    decoded as UTF-8, falling back to latin-1, then stripped; members that
    cannot be read or are empty after stripping are skipped.

    Args:
        data_dir: Not read by this function; accepted so callers can pass it.
        selected_zips: Iterable of paths to the zip archives to process.
        k_per_zip: Maximum number of ``.txt`` members taken per archive.

    Yields:
        dict: ``"text"`` holds the decoded, stripped content and ``"title"``
        the member's file name without directory or extension.
    """
    for archive_path in tqdm(selected_zips, desc="Processing Zips"):
        with ZipFile(archive_path, "r") as archive:
            # Keep only text members; an archive without any simply
            # contributes nothing (shuffling an empty list is a no-op).
            members = [
                member
                for member in archive.namelist()
                if member.lower().endswith(".txt")
            ]
            random.shuffle(members)

            for member in members[:k_per_zip]:
                try:
                    payload = archive.read(member)
                except KeyError:
                    continue

                # Prefer UTF-8; latin-1 accepts any byte sequence.
                try:
                    content = payload.decode("utf-8")
                except UnicodeDecodeError:
                    content = payload.decode("latin-1")

                content = content.strip()
                if content:
                    # The title is the member name minus its extension.
                    yield {"text": content, "title": Path(member).stem}

In [12]:
# ---------- USER CONFIG ----------
# Settings for building the BurialGoods dataset from local zip archives.
DATA_DIR = "../data/burialgoods"       # folder containing many .zip files
N_ZIPS = 10                            # pick N zip files
K_PER_ZIP = 50                       # up to K .txt files per zip
REPO_ID = "eZWALT/burialgoods-pretraining-corpus"  # Hub repository the dataset is pushed to
PRIVATE_REPO = False  # passed as `private=` when pushing to the Hub
LANGUAGE = "en"  # NOTE(review): not referenced in the visible cells - confirm it is still needed
SEED = 42  # seed for the `random` module
# ---------------------------------
# Seed the global RNG, which drives both the zip selection and the per-zip shuffling.
random.seed(SEED)


In [None]:
# 1. Collect the archives (sorted for a stable order) and keep at most N_ZIPS
zip_files = sorted(Path(DATA_DIR).glob("*.zip"))
if len(zip_files) == 0:
    raise FileNotFoundError(f"No .zip files found in {DATA_DIR}")

# Only sample when there are more archives than requested
selected = (
    random.sample(list(zip_files), N_ZIPS)
    if len(zip_files) > N_ZIPS
    else zip_files
)
print(f"Selected {len(selected)} zip files (out of {len(zip_files)})")

# 2. Schema of the yielded records and the arguments handed to the generator
features = Features({"text": Value("string"), "title": Value("string")})

generator_func = iter_samples_from_zips
generator_kwargs = dict(
    data_dir=Path(DATA_DIR),
    selected_zips=selected,
    k_per_zip=K_PER_ZIP,
)

# 3. Materialize the Hugging Face Dataset from the generator
print("Building and pushing HF Dataset object...")
ds = Dataset.from_generator(
    generator=generator_func,
    gen_kwargs=generator_kwargs,
    features=features,
)
print(f"Dataset size: {ds.num_rows} samples.")

# 4. Upload the result to the Hugging Face Hub
print(f"Pushing dataset to the Hub as {REPO_ID} ...")
ds.push_to_hub(REPO_ID, private=PRIVATE_REPO)
print("Push complete.")

Selected 9 zip files (out of 9)
Building and pushing HF Dataset object...


Processing Zips: 100%|██████████| 9/9 [00:00<00:00, 621.11it/s]
Generating train split: 147 examples [00:00, 7072.62 examples/s]

Dataset size: 147 samples.





## b) Preview Pretraining Corpora

In [None]:
#!/usr/bin/env python3
"""
load_collection_and_visualize.py

- Scrapes a Hugging Face collection page for dataset links.
- Loads a small streaming sample from each dataset.
- Applies a per-dataset small preprocessing function.
- Visualizes safe stats: length histogram and label distribution.
- ALWAYS masks text so we don't print toxic content verbatim.
"""

import re
import requests
from bs4 import BeautifulSoup
from itertools import islice
from collections import Counter, defaultdict
import matplotlib.pyplot as plt
from datasets import load_dataset
from tqdm import tqdm
import math
import os

# ---------- CONFIG ----------
# Hugging Face collection page that is scraped for dataset links.
COLLECTION_URL = "https://huggingface.co/collections/eZWALT/cursed-toxic-pretraining-corpora"
SAMPLES_PER_DATASET = 500          # how many items to inspect per dataset (streaming)
# Directory receiving the per-dataset JSON reports and PNG plots; created up front if missing.
OUTPUT_DIR = "hf_collection_vis"
os.makedirs(OUTPUT_DIR, exist_ok=True)
# -----------------------------

# --- Utilities ---
def mask_text_safe(txt, max_len=200):
    """
    Return a print-safe preview of ``txt`` with every letter replaced by "•".

    Any character for which ``str.isalpha()`` is true is masked, regardless
    of script (Latin, Cyrillic, Greek, CJK, ...), so raw words are never
    shown. Digits, punctuation and whitespace are preserved so the shape of
    the text stays visible.

    Args:
        txt: Text to mask. Non-string values produce an empty string.
        max_len: Maximum number of characters kept before "…" is appended.

    Returns:
        str: The masked preview, followed by "…" when ``txt`` was longer
        than ``max_len``.
    """
    if not isinstance(txt, str):
        return ""
    # Masking is one-for-one, so truncating first gives the same preview
    # while only scanning the characters that will actually be shown.
    head = txt[:max_len]
    masked = "".join("•" if ch.isalpha() else ch for ch in head)
    return masked + ("…" if len(txt) > max_len else "")

def first_text_field(example, candidate_fields=None):
    """
    Pick the most plausible text value out of a dataset example.

    The names in ``candidate_fields`` are tried in order; if none holds a
    non-blank string, the first non-blank string value found in the example
    is used instead. An empty string is returned when nothing qualifies.
    """
    if candidate_fields is not None:
        field_names = candidate_fields
    else:
        field_names = ["text", "body", "selftext", "comment", "post", "sentence", "content", "review", "caption"]

    def _is_usable(value):
        # A usable value is a string with at least one non-whitespace char.
        return isinstance(value, str) and bool(value.strip())

    # Preferred fields first, honoring the caller-supplied priority order.
    preferred = next(
        (
            example[name]
            for name in field_names
            if name in example and _is_usable(example[name])
        ),
        None,
    )
    if preferred is not None:
        return preferred

    # Otherwise fall back to whatever string-valued field comes first.
    return next(
        (value for value in example.values() if _is_usable(value)),
        "",
    )

# --- Per-dataset small preprocessing functions ---
# Add or override functions in this dict keyed by dataset_id (short id like 'hate_speech_offensive' or 'user/dsname')
def default_preprocess(example):
    """
    Generic preprocessing used when no dataset-specific function applies.

    Returns a dict with a masked preview (at most 240 characters) of the
    example's first text field and its whitespace-separated token count.
    """
    raw_text = first_text_field(example)
    token_count = len(raw_text.split())
    preview = mask_text_safe(raw_text, max_len=240)
    return {"masked_preview": preview, "length_tokens": token_count}

def reddit_preprocess(example):
    """
    Preprocess a reddit-style example.

    The text is taken from 'selftext', then 'body', then whatever
    ``first_text_field`` finds. Returns a masked preview and the
    whitespace-separated token count (0 when no text is found).
    """
    body = example.get("selftext")
    if not body:
        body = example.get("body")
    if not body:
        body = first_text_field(example)

    if body:
        token_count = len(body.split())
    else:
        token_count = 0
    return {"masked_preview": mask_text_safe(body), "length_tokens": token_count}

def hate_speech_preprocess(example):
    """
    Preprocess an example from a labeled hate-speech style dataset.

    The text comes from 'text' (or the first plausible text field) and the
    label from 'label', falling back to 'labels' when 'label' is absent.
    Returns a masked preview, the whitespace token count and the label.
    """
    content = example.get("text") or first_text_field(example)

    # A present 'label' key wins even if its value is None.
    if "label" in example:
        tag = example["label"]
    else:
        tag = example.get("labels", None)

    preview = mask_text_safe(content)
    token_count = len(content.split()) if content else 0
    return {"masked_preview": preview, "length_tokens": token_count, "label": tag}

# Map dataset slug (partial match) → preprocessing function
# Lookup table: dataset-id substring -> preprocessing function for matching datasets.
# load_and_analyze_dataset compares each key case-insensitively against the dataset id
# and uses the first key (in insertion order) contained in it; ids that match no key
# are handled by default_preprocess.
PREPROCESS_MAP = {
    "reddit": reddit_preprocess,
    "hate_speech_offensive": hate_speech_preprocess,
    "hate-offensive": hate_speech_preprocess,
    # add more mappings as you need, keys can be substrings matched against dataset id
}

# --- Step 1: scrape collection page for dataset links ---
def extract_dataset_ids_from_collection(url):
    """
    Fetch a collection page and return the dataset ids it links to.

    Every ``<a href>`` containing '/datasets/' contributes the part of the
    link that follows that marker, with surrounding slashes removed.

    Args:
        url: Address of the collection page to download.

    Returns:
        list: Unique, non-empty ids in order of first appearance.

    Raises:
        requests.HTTPError: If the page responds with an error status.
    """
    response = requests.get(url, timeout=20)
    response.raise_for_status()
    page = BeautifulSoup(response.text, "html.parser")

    marker = "/datasets/"
    candidates = []
    for anchor in page.find_all("a", href=True):
        link = anchor["href"]
        if marker not in link:
            continue
        pieces = link.split(marker)
        # Site-relative links (/datasets/user/name or /datasets/name) use the
        # segment right after the leading marker; any other link uses what
        # follows the last marker.
        tail = pieces[1] if link.startswith(marker) else pieces[-1]
        candidates.append(tail.strip("/"))

    # dict.fromkeys removes duplicates while keeping first-seen order.
    return list(dict.fromkeys(c for c in candidates if c))

# --- Step 2: load streaming sample and preprocess ---
def load_and_analyze_dataset(dataset_id, n_samples=SAMPLES_PER_DATASET):
    """
    Stream a small sample of one dataset and compute print-safe statistics.

    The preprocessing function is the first PREPROCESS_MAP entry whose key
    is a case-insensitive substring of ``dataset_id``; otherwise
    ``default_preprocess`` is used. The 'train', 'valid' and 'test' splits
    are tried in that order, then the dataset is loaded without naming a
    split and its first available split is used.

    Args:
        dataset_id: Dataset identifier passed to ``load_dataset``.
        n_samples: Maximum number of examples to read from the stream.

    Returns:
        dict | None: Report with the keys 'dataset_id', 'n_samples_examined',
        'avg_length_tokens', 'pct_short_<=5_tokens', 'label_counts',
        'masked_examples' (first 5 masked previews) and 'lengths'; ``None``
        when no sample could be loaded from any split.
    """
    print(f"\n=== Processing dataset: {dataset_id} ===")
    dataset_key = dataset_id.lower()
    preprocess_fn = next(
        (fn for key, fn in PREPROCESS_MAP.items() if key.lower() in dataset_key),
        default_preprocess,
    )

    # attempt to stream 'train' split first, then fallback to default
    splits_to_try = ["train", "valid", "test", None]
    results = []
    last_exception = None
    for sp in splits_to_try:
        try:
            if sp:
                ds = load_dataset(dataset_id, split=sp, streaming=True)
            else:
                ds = load_dataset(dataset_id, streaming=True)
                # Without `split`, load_dataset returns a mapping of split
                # name -> dataset. Iterating that mapping would yield split
                # *names* instead of examples, so use its first split.
                if isinstance(ds, dict):
                    ds = next(iter(ds.values()))
            # gather up to n_samples
            for item in islice(ds, n_samples):
                try:
                    results.append(preprocess_fn(item))
                except Exception:
                    # Best effort: retry with the generic preprocessing and
                    # skip the item entirely if that fails as well.
                    try:
                        results.append(default_preprocess(item))
                    except Exception:
                        continue
            break
        except Exception as e:
            # Remember why this split failed and move on to the next one.
            last_exception = e
            continue
    if not results:
        print(f"  ! Could not load any samples for {dataset_id} (error: {last_exception})")
        return None

    # compute some safe stats (results is non-empty here, so no zero division)
    lengths = [r.get("length_tokens", 0) for r in results]
    avg_len = sum(lengths) / len(lengths)
    pct_short = sum(1 for n_tokens in lengths if n_tokens <= 5) / len(lengths)
    label_counts = Counter(r["label"] for r in results if r.get("label") is not None)

    # prepare a tiny report (masked)
    masked_examples = [r["masked_preview"] for r in results[:5]]

    return {
        "dataset_id": dataset_id,
        "n_samples_examined": len(results),
        "avg_length_tokens": avg_len,
        "pct_short_<=5_tokens": pct_short,
        "label_counts": dict(label_counts),
        "masked_examples": masked_examples,
        "lengths": lengths,
    }

# --- Step 3: visualizations (safe) ---
def plot_length_histogram(lengths, dataset_id):
    """
    Save a 30-bin histogram of token lengths as a PNG in OUTPUT_DIR.

    The file name is the dataset id with '/' replaced by '_' plus the
    suffix '_len_hist.png'. Returns the path of the written file.
    """
    safe_id = dataset_id.replace("/", "_")
    out_path = os.path.join(OUTPUT_DIR, f"{safe_id}_len_hist.png")

    fig, ax = plt.subplots(figsize=(6, 3))
    ax.hist(lengths, bins=30)
    ax.set_title(f"Length distribution (tokens) — {dataset_id}")
    ax.set_xlabel("Tokens")
    ax.set_ylabel("Count")
    fig.tight_layout()
    fig.savefig(out_path)
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)
    return out_path

def plot_label_distribution(label_counts, dataset_id):
    """
    Save a bar chart of label frequencies as a PNG in OUTPUT_DIR.

    Returns the path of the written file, or ``None`` (without plotting)
    when ``label_counts`` is empty. The file name is the dataset id with
    '/' replaced by '_' plus the suffix '_label_dist.png'.
    """
    if not label_counts:
        return None

    names = list(label_counts.keys())
    heights = [label_counts[name] for name in names]
    positions = range(len(names))

    fig, ax = plt.subplots(figsize=(6, 3))
    ax.bar(positions, heights)
    ax.set_title(f"Label distribution — {dataset_id}")
    # Labels may be ints or other objects, so render them as strings.
    ax.set_xticks(positions)
    ax.set_xticklabels([str(name) for name in names], rotation=45)
    fig.tight_layout()

    safe_id = dataset_id.replace("/", "_")
    out_path = os.path.join(OUTPUT_DIR, f"{safe_id}_label_dist.png")
    fig.savefig(out_path)
    plt.close(fig)
    return out_path


import json

# Discover the datasets listed on the collection page
dataset_ids = extract_dataset_ids_from_collection(COLLECTION_URL)
print(f"Found {len(dataset_ids)} dataset ids in collection (first 20 shown):")
for shown_id in dataset_ids[:20]:
    print("  -", shown_id)

# Analyze each dataset, writing one JSON report and its plots per dataset
reports = []
for dsid in tqdm(dataset_ids, desc="Datasets"):
    rep = load_and_analyze_dataset(dsid, n_samples=SAMPLES_PER_DATASET)
    if rep is not None:
        # The raw length list is only needed for plotting, not for the report
        summary = dict(rep)
        summary.pop("lengths", None)
        report_name = f"{dsid.replace('/', '_')}_report.json"
        with open(os.path.join(OUTPUT_DIR, report_name), "w", encoding="utf-8") as report_file:
            json.dump(summary, report_file, indent=2)

        plot_length_histogram(rep["lengths"], dsid)
        plot_label_distribution(rep["label_counts"], dsid)

        print(f"  -> examined {rep['n_samples_examined']} samples; avg tokens {rep['avg_length_tokens']:.1f}")
        print("     masked previews (first 3):")
        for preview in rep["masked_examples"][:3]:
            print("       ", preview)
        reports.append(rep)
print(f"\nAll done. Reports and plots saved in {OUTPUT_DIR}")


HTTPError: 404 Client Error: Not Found for url: https://huggingface.co/collections/eZWALT/cursed-toxic-pretraining

## c) Preview Cursed Toxic Pretraining Corpora