In [28]:
# This section simulates the kind of quality-metric evaluation
# performed by a Foundation Data team before training large models:
#   - Measure toxicity to ensure safe content
#   - Estimate factuality to prioritize reliable data
#   - Assess domain coverage for diversity and balance

In [29]:
!pip install langdetect
!pip install ftfy
!pip install plotly
!pip install graphviz
!pip install detoxify
from datasets import load_dataset
import pandas as pd
import numpy as np
from langdetect import detect
import ftfy
import re
import matplotlib.pyplot as plt
import plotly.express as px
from detoxify import Detoxify
from tqdm import tqdm
import torch



In [30]:
device = "cuda" if torch.cuda.is_available() else "cpu"
toxicity_model = Detoxify('original', device=device)

In [31]:

print("Loading dataset...")
dataset = load_dataset("cc_news", split="train[:5000]")
df = pd.DataFrame(dataset)
print(f"Loaded {len(df)} samples.")

Loading dataset...
Loaded 5000 samples.


In [32]:
df['text_length'] = df['text'].str.len()
print(df['text_length'].describe())

count     5000.000000
mean      2214.230800
std       2159.622082
min         74.000000
25%        802.000000
50%       1456.000000
75%       3124.500000
max      33930.000000
Name: text_length, dtype: float64


In [33]:
def clean_text(text):
    text = ftfy.fix_text(text)
    text = re.sub(r'\s+', ' ', text.strip())
    return text
df['clean_text'] = df['text'].apply(clean_text)


In [34]:
def try_detect(text):
    try:
        return detect(text)
    except:
        return None

df['lang'] = df['clean_text'].apply(try_detect)
df = df[df['lang'] == 'en']
print(f"After language filtering: {len(df)} samples.")

After language filtering: 5000 samples.


In [35]:
df = df.drop_duplicates(subset=['clean_text'])
df = df[(df['text_length'] > 200) & (df['text_length'] < 5000)]
print(f"After deduplication and length filtering: {len(df)} samples.")


After deduplication and length filtering: 4141 samples.


In [36]:
print("Computing toxicity scores (this may take ~1–2 min)...")

def compute_toxicity(text):
    try:
        return toxicity_model.predict(text)['toxicity']
    except Exception:
        return None

tqdm.pandas()
df['toxicity_score'] = df['clean_text'].progress_apply(compute_toxicity)

print("Done.")
print(df['toxicity_score'].describe())

Computing toxicity scores (this may take ~1–2 min)...


100%|███████████████████████████████████████| 4141/4141 [23:25<00:00,  2.95it/s]

Done.
count    4141.000000
mean        0.001796
std         0.009658
min         0.000515
25%         0.000633
50%         0.000741
75%         0.001039
max         0.432489
Name: toxicity_score, dtype: float64





In [37]:
fig = px.histogram(df, x='toxicity_score', nbins=50,
                   title='Toxicity Score Distribution')
fig.show()

In [38]:
def has_factual_marker(text):
    markers = r"\b(according to|data show|study|researchers|report|in\s+20\d{2})\b"
    return bool(re.search(markers, text, flags=re.I))

df['has_factual_marker'] = df['clean_text'].apply(has_factual_marker)
factual_coverage = df['has_factual_marker'].mean()
print(f"Factual marker present in {factual_coverage:.2%} of samples.")

Factual marker present in 34.82% of samples.


In [39]:
px.bar(df['has_factual_marker'].value_counts(normalize=True)
        .rename({True:'factualish', False:'other'}),
       title='Factual Marker Presence')

In [40]:
def extract_domain(text):
    text = text.lower()
    if any(w in text for w in ['sports', 'team', 'game']): return 'sports'
    if any(w in text for w in ['finance', 'stock', 'market', 'bank']): return 'finance'
    if any(w in text for w in ['technology', 'software', 'ai', 'data']): return 'technology'
    if any(w in text for w in ['politics', 'election', 'government']): return 'politics'
    return 'other'

df['domain'] = df['clean_text'].apply(extract_domain)
domain_counts = df['domain'].value_counts(normalize=True)
print(domain_counts)

px.pie(values=domain_counts.values,
       names=domain_counts.index,
       title='Domain Coverage')

domain
technology    0.543347
sports        0.283506
finance       0.100942
other         0.065926
politics      0.006279
Name: proportion, dtype: float64


In [41]:
summary = {
    'total_samples': 5000,
    'after_cleaning': len(df),
    'reduction_%': round(100 * (1 - len(df)/5000), 2),
    'avg_length': round(df['text_length'].mean(), 2),
    'avg_toxicity_score': round(df['toxicity_score'].mean(), 3),
    'high_toxicity_%': round(100 * (df['toxicity_score'] > 0.7).mean(), 2),
    'factual_marker_%': round(100 * factual_coverage, 2),
    'top_domains': domain_counts.head(3).to_dict()
}

print("\n--- Foundation Data Quality Report (Extended) ---")
for k, v in summary.items():
    print(f"{k}: {v}")


--- Foundation Data Quality Report (Extended) ---
total_samples: 5000
after_cleaning: 4141
reduction_%: 17.18
avg_length: 1824.59
avg_toxicity_score: 0.0020000000949949026
high_toxicity_%: 0.0
factual_marker_%: 34.82
top_domains: {'technology': 0.5433470176285922, 'sports': 0.2835063994204298, 'finance': 0.1009418014972229}
