# 📚 FineWeb-Edu Subject Filter

Filter the FineWeb-Edu dataset by academic subject using vector embeddings.

**Approach:**
1. Define subject anchors (canonical descriptions)
2. Embed anchors and documents with a strong embedding model
3. Compute cosine similarity to assign subjects
4. Export filtered subsets

No classifier training needed — pure embedding similarity.

In [None]:
# Install dependencies
!pip install datasets sentence-transformers faiss-cpu pandas numpy tqdm huggingface_hub -q

In [None]:
import numpy as np
import pandas as pd
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
import faiss
from tqdm.auto import tqdm
from collections import defaultdict
import json

## 1. Define Subject Anchors

Each subject gets multiple anchor texts — canonical descriptions that capture what content in that subject looks like.

In [None]:
# Mapping of subject name -> list of anchor sentences.
# Keys become the `subject` labels assigned to documents; each value is a list
# of short keyword-style descriptions that are embedded and averaged into one
# centroid per subject. Add or edit entries here to change the label set.
SUBJECT_ANCHORS = {
    "mathematics": [
        "Mathematics, algebra, calculus, geometry, and mathematical proofs",
        "Solving equations, derivatives, integrals, and mathematical theorems",
        "Linear algebra, matrices, vectors, and mathematical analysis",
        "Probability theory, statistics, and mathematical modeling",
        "Number theory, discrete mathematics, and combinatorics"
    ],
    "physics": [
        "Physics, mechanics, thermodynamics, and electromagnetism",
        "Quantum mechanics, relativity, and particle physics",
        "Newton's laws, force, energy, momentum, and motion",
        "Waves, optics, acoustics, and electromagnetic radiation",
        "Astrophysics, cosmology, and gravitational physics"
    ],
    "chemistry": [
        "Chemistry, chemical reactions, molecules, and compounds",
        "Organic chemistry, inorganic chemistry, and biochemistry",
        "Periodic table, elements, atoms, and chemical bonding",
        "Stoichiometry, molarity, and chemical equilibrium",
        "Acids, bases, pH, and electrochemistry"
    ],
    "biology": [
        "Biology, cells, genetics, and evolution",
        "DNA, RNA, proteins, and molecular biology",
        "Ecology, ecosystems, biodiversity, and environmental science",
        "Human anatomy, physiology, and organ systems",
        "Microbiology, bacteria, viruses, and immunology"
    ],
    "computer_science": [
        "Computer science, algorithms, data structures, and programming",
        "Software engineering, databases, and system design",
        "Machine learning, artificial intelligence, and neural networks",
        "Operating systems, computer architecture, and networking",
        "Cybersecurity, cryptography, and information security"
    ],
    "history": [
        "History, historical events, civilizations, and historical figures",
        "World War, ancient history, medieval history, and modern history",
        "American history, European history, and Asian history",
        "Political history, social movements, and revolutions",
        "Archaeology, historical documents, and historiography"
    ],
    "literature": [
        "Literature, novels, poetry, and literary analysis",
        "Shakespeare, classic literature, and contemporary fiction",
        "Literary criticism, narrative techniques, and symbolism",
        "Drama, prose, verse, and creative writing",
        "World literature, genres, and literary movements"
    ],
    "economics": [
        "Economics, microeconomics, macroeconomics, and economic theory",
        "Supply and demand, market equilibrium, and price theory",
        "GDP, inflation, monetary policy, and fiscal policy",
        "International trade, finance, and economic development",
        "Behavioral economics, game theory, and econometrics"
    ],
    "law": [
        "Law, legal systems, jurisprudence, and legislation",
        "Constitutional law, criminal law, and civil law",
        "Contracts, torts, property law, and legal procedures",
        "International law, human rights, and legal ethics",
        "Court cases, legal precedents, and judicial review"
    ],
    "medicine": [
        "Medicine, medical diagnosis, treatment, and healthcare",
        "Diseases, symptoms, pathology, and pharmacology",
        "Clinical medicine, surgery, and medical procedures",
        "Public health, epidemiology, and preventive medicine",
        "Medical research, clinical trials, and evidence-based medicine"
    ],
    "philosophy": [
        "Philosophy, ethics, metaphysics, and epistemology",
        "Logic, reasoning, and philosophical arguments",
        "Political philosophy, aesthetics, and philosophy of mind",
        "Ancient philosophy, modern philosophy, and contemporary philosophy",
        "Existentialism, phenomenology, and analytic philosophy"
    ],
    "psychology": [
        "Psychology, cognitive psychology, and behavioral psychology",
        "Mental health, psychological disorders, and therapy",
        "Developmental psychology, social psychology, and personality",
        "Neuroscience, brain function, and cognitive science",
        "Psychological research, experiments, and psychological theories"
    ]
}

# Sanity check: report how many subjects and anchors were defined.
print(f"Defined {len(SUBJECT_ANCHORS)} subjects")
for subject, anchors in SUBJECT_ANCHORS.items():
    print(f"  - {subject}: {len(anchors)} anchors")

## 2. Load Embedding Model

In [None]:
# Using a strong, fast embedding model
# Alternatives: "BAAI/bge-large-en-v1.5", "thenlper/gte-large", "Snowflake/snowflake-arctic-embed-m"
# The same `model` object is reused by later cells to embed both the subject
# anchors and the documents, so the two always share one embedding space.
MODEL_NAME = "BAAI/bge-small-en-v1.5"  # Small for testing, swap to bge-large for production

print(f"Loading embedding model: {MODEL_NAME}")
model = SentenceTransformer(MODEL_NAME)
# Printed so the expected width of the subject matrix is visible up front.
print(f"Embedding dimension: {model.get_sentence_embedding_dimension()}")

## 3. Embed Subject Anchors

In [None]:
def embed_anchors(subject_anchors, model):
    """Embed each subject's anchor texts and reduce them to one unit-length centroid.

    Args:
        subject_anchors: Mapping of subject name -> non-empty list of anchor
            strings.
        model: Object exposing ``encode(texts, normalize_embeddings=True)``
            that returns one embedding row per input text (for example a
            ``SentenceTransformer``).

    Returns:
        Dict mapping subject name -> 1-D centroid vector with L2 norm 1
        (insertion order of ``subject_anchors`` is preserved).

    Raises:
        ValueError: If a subject has no anchor texts.
    """
    subject_embeddings = {}

    for subject, anchors in subject_anchors.items():
        if len(anchors) == 0:
            # An empty list would otherwise produce a NaN centroid silently.
            raise ValueError(f"Subject {subject!r} has no anchor texts")

        # Embed all anchors for this subject
        embeddings = np.asarray(model.encode(anchors, normalize_embeddings=True))
        # Take the mean as the subject centroid
        centroid = embeddings.mean(axis=0)

        # The mean of unit vectors is generally shorter than 1, and by a
        # different amount per subject. Re-normalize so that a dot product
        # with a unit-length document embedding is a true cosine similarity
        # and scores are comparable across subjects.
        norm = np.linalg.norm(centroid)
        if norm > 0:
            centroid = centroid / norm

        subject_embeddings[subject] = centroid

    return subject_embeddings

print("Embedding subject anchors...")
subject_embeddings = embed_anchors(SUBJECT_ANCHORS, model)

# Row i of `subject_matrix` is the centroid for `subjects[i]`; both follow the
# dict's insertion order, so an argmax over columns maps straight to a name.
subjects = list(subject_embeddings)
subject_matrix = np.stack(list(subject_embeddings.values()))
print(f"Subject embedding matrix shape: {subject_matrix.shape}")

## 4. Load FineWeb-Edu Sample

In [None]:
# Load a sample — adjust size based on your compute
# For full dataset, use streaming=True and process in batches
# NOTE(review): without streaming, the split slice is presumably applied after
# the whole "sample-10BT" config has been downloaded — confirm before relying
# on SAMPLE_SIZE to bound download size.

SAMPLE_SIZE = 10000  # Start small for testing

print(f"Loading FineWeb-Edu sample ({SAMPLE_SIZE} docs)...")
dataset = load_dataset(
    "HuggingFaceFW/fineweb-edu",
    name="sample-10BT",  # Use the 10B token sample, or "default" for full
    split=f"train[:{SAMPLE_SIZE}]"
)

# Quick inspection: row count, available columns, and the start of one document.
print(f"Loaded {len(dataset)} documents")
print(f"Columns: {dataset.column_names}")
print(f"\nSample document:")
print(dataset[0]['text'][:500])

## 5. Classify Documents by Subject

In [None]:
def classify_documents(texts, model, subject_matrix, subjects, batch_size=64, max_length=512):
    """Assign each document the subject whose centroid it is most similar to.

    Args:
        texts: Sliceable sequence of document strings.
        model: Object exposing ``encode(texts, normalize_embeddings=True)``
            that returns one embedding row per input text.
        subject_matrix: 2-D array with one centroid row per subject, in the
            same order as ``subjects``.
        subjects: Subject names; ``subjects[i]`` labels row ``i`` of
            ``subject_matrix``.
        batch_size: Number of documents embedded per ``model.encode`` call.
        max_length: Approximate token budget per document; texts are cut to
            ``max_length * 4`` characters before embedding.

    Returns:
        labels: List of assigned subject labels
        scores: List of similarity scores for assigned subject
        all_scores: Matrix of all similarity scores (docs x subjects)

        For empty ``texts`` the result is ``([], [], <0 x n_subjects array>)``.
    """
    n_docs = len(texts)
    if n_docs == 0:
        # np.vstack([]) raises, so return a well-shaped empty result instead.
        empty = np.empty((0, len(subjects)), dtype=np.asarray(subject_matrix).dtype)
        return [], [], empty

    # Truncate long texts (embedding models have limits); ~4 chars per token approx
    char_limit = max_length * 4

    score_blocks = []
    for start in tqdm(range(0, n_docs, batch_size), desc="Classifying"):
        batch_texts = [t[:char_limit] for t in texts[start:start + batch_size]]

        batch_embeddings = model.encode(batch_texts, normalize_embeddings=True)

        # Document embeddings are unit length, so this dot product equals
        # cosine similarity provided the rows of subject_matrix are unit
        # length as well.
        score_blocks.append(batch_embeddings @ subject_matrix.T)

    all_scores = np.vstack(score_blocks)

    # Best subject per document, and that subject's score, in one indexing step.
    best_indices = np.argmax(all_scores, axis=1)
    labels = [subjects[i] for i in best_indices]
    scores = list(all_scores[np.arange(n_docs), best_indices])

    return labels, scores, all_scores

# Run classification
# `texts`, `labels`, `scores` and `all_scores` are module-level on purpose:
# the analysis, export and multi-label cells below all read them.
texts = dataset['text']
labels, scores, all_scores = classify_documents(texts, model, subject_matrix, subjects)

print(f"\nClassification complete!")

## 6. Analyze Results

In [None]:
# How many documents were assigned to each subject, most frequent first
from collections import Counter

distribution = Counter(labels)
total_docs = len(labels)

print("Subject Distribution:")
print("=" * 40)
for subject, count in distribution.most_common():
    pct = count / total_docs * 100
    print(f"{subject:20s}: {count:6d} ({pct:5.1f}%)")

In [None]:
# Confidence distribution
# `scores` holds, per document, the similarity to its assigned (best) subject.
print(f"\nConfidence Scores:")
print(f"  Mean:   {np.mean(scores):.3f}")
print(f"  Median: {np.median(scores):.3f}")
print(f"  Min:    {np.min(scores):.3f}")
print(f"  Max:    {np.max(scores):.3f}")

# How many are high confidence?
# Cumulative counts: a document scoring 0.55 is counted under 0.3, 0.4 and 0.5.
thresholds = [0.3, 0.4, 0.5, 0.6]
print(f"\nDocuments above confidence threshold:")
for thresh in thresholds:
    count = sum(1 for s in scores if s >= thresh)
    print(f"  >= {thresh}: {count} ({count/len(scores)*100:.1f}%)")

In [None]:
# Show the single most confident document for each of the first few subjects
print("\nSample documents per subject:")
print("=" * 60)

for subject in subjects[:5]:  # First 5 subjects
    print(f"\n[{subject.upper()}]")

    # (document index, score) pairs for this subject, highest score first
    subject_indices = sorted(
        ((i, scores[i]) for i, label in enumerate(labels) if label == subject),
        key=lambda pair: pair[1],
        reverse=True,
    )

    # Nothing was assigned to this subject: only the header is printed
    if not subject_indices:
        continue

    # Show top example
    idx, score = subject_indices[0]
    text = texts[idx][:300].replace('\n', ' ')
    print(f"  Score: {score:.3f}")
    print(f"  Text: {text}...")

## 7. Export Filtered Datasets

In [None]:
# Attach the assigned subject and its similarity score as two new columns.
# add_column returns a new dataset, so `dataset` itself is left untouched.
dataset_with_labels = (
    dataset
    .add_column("subject", labels)
    .add_column("subject_score", scores)
)

print(f"Dataset with labels: {dataset_with_labels}")
print(f"New columns: {dataset_with_labels.column_names}")

In [None]:
# Filter by subject with confidence threshold
# Minimum `subject_score` a document needs in order to be exported below.
CONFIDENCE_THRESHOLD = 0.4

def export_subject(dataset, subject, threshold=0.4):
    """Return the rows labelled ``subject`` whose score reaches ``threshold``.

    ``dataset`` must provide ``filter(fn)`` and yield rows with ``'subject'``
    and ``'subject_score'`` fields; whatever ``filter`` returns is passed back.
    """
    def _keep(row):
        # Wrong subject: reject without looking at the score.
        if row['subject'] != subject:
            return False
        return row['subject_score'] >= threshold

    return dataset.filter(_keep)

# Write one parquet file per subject, skipping subjects with no matches
print(f"Exporting subjects (threshold={CONFIDENCE_THRESHOLD}):")
for subject in subjects:
    filtered = export_subject(dataset_with_labels, subject, CONFIDENCE_THRESHOLD)
    print(f"  {subject}: {len(filtered)} documents")

    # Nothing passed the threshold for this subject: no file is written
    if len(filtered) == 0:
        continue

    filtered.to_parquet(f"fineweb_edu_{subject}.parquet")

In [None]:
# Or save the full labeled dataset
# Unlike the per-subject export, this keeps every document regardless of score.
dataset_with_labels.to_parquet("fineweb_edu_labeled.parquet")
print("Saved full labeled dataset to fineweb_edu_labeled.parquet")

## 8. Scale to Full Dataset (Streaming)

In [None]:
# For processing the full 1.3T token dataset, use streaming
# This processes chunks without loading everything into memory

def _label_batch(model, subject_matrix, subjects, batch_ids, batch_texts):
    """Embed one batch and return a ``{'id', 'subject', 'score'}`` dict per document."""
    embeddings = model.encode(batch_texts, normalize_embeddings=True)
    similarities = embeddings @ subject_matrix.T
    best_indices = np.argmax(similarities, axis=1)
    return [
        {
            'id': doc_id,
            'subject': subjects[idx],
            'score': float(similarities[row, idx])
        }
        for row, (doc_id, idx) in enumerate(zip(batch_ids, best_indices))
    ]


def process_streaming(model, subject_matrix, subjects, batch_size=1000, max_docs=None,
                      max_chars=2000, checkpoint_every=100000):
    """
    Process FineWeb-Edu in streaming mode for full-scale processing.

    Args:
        model: Object exposing ``encode(texts, normalize_embeddings=True)``.
        subject_matrix: 2-D array with one centroid row per subject.
        subjects: Subject names aligned with the rows of ``subject_matrix``.
        batch_size: Number of documents embedded per ``model.encode`` call.
        max_docs: Stop after exactly this many documents; ``None`` (or 0)
            means no limit.
        max_chars: Each document's text is cut to this many characters
            before embedding.
        checkpoint_every: Write ``labels_checkpoint_<n>.parquet`` each time the
            number of labelled documents reaches a new multiple of this value;
            ``None`` or 0 disables checkpoints.

    Returns:
        List of ``{'id', 'subject', 'score'}`` dicts, one per processed
        document, in stream order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    from itertools import islice

    from datasets import load_dataset

    if batch_size < 1:
        raise ValueError("batch_size must be >= 1")

    # Load in streaming mode
    stream = load_dataset(
        "HuggingFaceFW/fineweb-edu",
        name="default",  # Full dataset
        split="train",
        streaming=True
    )
    if max_docs:
        # Cap the stream itself so the limit holds even when max_docs is not
        # a multiple of batch_size (or is smaller than one batch).
        stream = islice(stream, max_docs)

    # NOTE: `results` holds every label in memory and each checkpoint rewrites
    # all of them; kept because callers receive the full list as the return value.
    results = []
    batch_texts = []
    batch_ids = []
    doc_count = 0  # documents labelled so far
    next_checkpoint = checkpoint_every

    for doc in tqdm(stream, desc="Processing", total=max_docs or None):
        batch_texts.append(doc['text'][:max_chars])  # Truncate
        # Fallback id is the document's position in the stream, so it stays
        # unique within a batch.
        batch_ids.append(doc.get('id', doc_count + len(batch_ids)))

        if len(batch_texts) >= batch_size:
            results.extend(_label_batch(model, subject_matrix, subjects, batch_ids, batch_texts))
            doc_count += len(batch_texts)
            batch_texts = []
            batch_ids = []

            # Save periodically; ">=" (not "% == 0") so checkpoints still
            # fire when batch_size does not divide checkpoint_every.
            if checkpoint_every and doc_count >= next_checkpoint:
                pd.DataFrame(results).to_parquet(f"labels_checkpoint_{doc_count}.parquet")
                next_checkpoint = (doc_count // checkpoint_every + 1) * checkpoint_every

    # Label whatever is left when the stream ends mid-batch.
    if batch_texts:
        results.extend(_label_batch(model, subject_matrix, subjects, batch_ids, batch_texts))

    return results

# Uncomment to run on larger scale:
# results = process_streaming(model, subject_matrix, subjects, max_docs=100000)

## 9. Upload to HuggingFace Hub

In [None]:
# Upload your filtered datasets to HuggingFace
from huggingface_hub import HfApi, login

# Login (get token from https://huggingface.co/settings/tokens)
# login(token="your_token_here")

def upload_to_hub(dataset, repo_name, private=False):
    """Push ``dataset`` to the Hub repo ``repo_name`` and print the resulting URL.

    ``private`` is forwarded unchanged to ``dataset.push_to_hub``. Returns None.
    """
    dataset.push_to_hub(repo_name, private=private)
    print(f"Uploaded to https://huggingface.co/datasets/{repo_name}")

# Example:
# upload_to_hub(math_dataset, "your-username/fineweb-edu-math")

## 10. Bonus: Multi-Label Classification

In [None]:
# Some documents might belong to multiple subjects
# e.g., "biophysics" is both biology and physics

def get_multi_labels(all_scores, subjects, threshold=0.35):
    """
    Collect, for every document, each subject whose score reaches ``threshold``.

    ``all_scores`` is iterated row by row (one row of per-subject scores per
    document, aligned with ``subjects``). A document with no subject at or
    above the threshold gets the single label ``'unknown'``.
    """
    multi_labels = []

    for row in all_scores:
        matched = []
        for idx, value in enumerate(row):
            if value >= threshold:
                matched.append(subjects[idx])

        if not matched:
            matched = ['unknown']
        multi_labels.append(matched)

    return multi_labels

multi_labels = get_multi_labels(all_scores, subjects, threshold=0.35)

# How many docs have multiple labels?
# The loop variable is named `doc_labels` so that the module-level `labels`
# list produced by classification is not overwritten when this cell runs.
multi_count = sum(1 for doc_labels in multi_labels if len(doc_labels) > 1)
multi_pct = multi_count / len(multi_labels) * 100 if multi_labels else 0.0
print(f"Documents with multiple subjects: {multi_count} ({multi_pct:.1f}%)")

# Show examples: at most 5, taken from the first 100 documents
print("\nExamples of multi-label documents:")
shown = 0
for i, doc_labels in enumerate(multi_labels[:100]):
    if len(doc_labels) > 1:
        print(f"  {doc_labels}: {texts[i][:100]}...")
        shown += 1
        if shown >= 5:
            break

---

## Next Steps

1. **Scale up**: Run on full FineWeb-Edu using streaming mode
2. **Validate**: Manually check samples from each subject for accuracy
3. **Tune thresholds**: Adjust confidence thresholds per subject if needed
4. **Better anchors**: Use actual textbook excerpts or Wikipedia intros as anchors
5. **Publish**: Upload filtered subsets to HuggingFace Hub
6. **Analyze**: Write up findings on subject distribution in FineWeb-Edu