In [1]:
from datasets import load_dataset
import torch
print("Loading dataset from huggingface...")

# Load the dataset from Hugging Face
# https://huggingface.co/datasets/julien040/hacker-news-posts
# NOTE: It only has a training split, so use that. And then we can divide it up into train/test ourselves
dataset = load_dataset("julien040/hacker-news-posts", split="train")
print(f"Dataset lazily loaded, size: {len(dataset)}")

dataset = [d for d in dataset]
print(f"Dataset loaded into memory")

  from .autonotebook import tqdm as notebook_tqdm


Loading dataset from huggingface...
Dataset lazily loaded, size: 4010957
Dataset loaded into memory


In [6]:
print(f"Missing score: {sum(1 for d in dataset if d["score"] is None)}")
print(f"Negative score: {sum(1 for d in dataset if d["score"] < 0)}")
print(f"Zero score: {sum(1 for d in dataset if d["score"] == 0)}")
print(f"Positive score: {sum(1 for d in dataset if d["score"] > 0)}")
print("----")
print(f"None title: {sum(1 for d in dataset if d["title"] is None)}")
print(f"Empty title: {sum(1 for d in dataset if d["title"] == "")}")
print(f"Has title: {sum(1 for d in dataset if isinstance(d["title"], str) and d["title"] != "")}")
print("----")
print(f"None url: {sum(1 for d in dataset if d["url"] is None)}")
print(f"Empty url: {sum(1 for d in dataset if d["url"] == "")}")
print(f"Has url: {sum(1 for d in dataset if isinstance(d["url"], str) and d["url"] != "")}")

Missing score: 0
Negative score: 1
Zero score: 1319
Positive score: 4009637
----
None title: 0
Empty title: 0
Has title: 4010957
----
None url: 243946
Empty url: 0
Has url: 3767011


In [7]:
import tldextract
import datasets

# Extract domains
def extract_domain(url):
    if not isinstance(url, str):
        return None

    if not url.startswith(('http://', 'https://')):
        url = 'https://' + url

    extracted = tldextract.extract(url)

    # Return domain.suffix (e.g., 'google.com', 'blogspot.com')
    if extracted.domain and extracted.suffix:
        return f"{extracted.domain}.{extracted.suffix}".lower()

    return None


cleaned_dataset = dataset.map()

for d in dataset:
    if not isinstance(d["score"], int):
        continue
    score = d["score"]

    if score < 0:
        continue

    if not isinstance(d["title"], str):
        continue
    title = d["title"]

    if not isinstance(d["url"], str):
        continue
    url = d["url"]
    domain = extract_domain(url)
    if domain is None:
        continue

    cleaned_dataset.append({
        "id": d["id"],
        "author": d["author"],
        "title": title,
        "domain": domain,
        "time": d["time"],
        "score": score,
    })

print(f"Original dataset size: {len(cleaned_dataset)}")
print(f"Cleaned dataset size: {len(cleaned_dataset)}")
print(f"The cleaned dataset has titles, domains and non-negative scores")


(train_dataset, validation_dataset, test_dataset) = torch.utils.data.random_split(
    cleaned_dataset,
    [0.8, 0.1, 0.1],
    torch.Generator().manual_seed(42)
)
print(f"Dataset split into train ({len(train_dataset)}), validation ({len(validation_dataset)}) and test ({len(test_dataset)}")

dataset = datasets.DatasetDict(

){
    "train": train_dataset,
    "validation": validation_dataset,
    "test": test_dataset
}

dataset.push_to_hub("hacker-news-posts-cleaned")

Cleaned dataset has length (3765851
Dataset split into train (3012681), validation (376585) and test (376585


In [None]:
def save_frequencies(dataset, column: str, keep_most_frequent_proportion: float = 1, save = True):
    frequencies = {}
    for d in dataset:
        value = d[column]
        if value in frequencies:
            frequencies[value] += 1
        else:
            frequencies[value] = 1

    counts_df = pd.DataFrame(
        {
            column: frequencies.keys(),
            "count": frequencies.values(),
        }
    )
    counts_df.set_index("count", inplace=True)
    counts_df.sort_values("count", ascending=False, inplace=True)
    original_keys = len(counts_df.index)
    original_count = counts_df["count"].sum()
    cut_off = counts_df["count"].quantile(1-keep_most_frequent_proportion, interpolation="higher")
    filtered_df = counts_df[counts_df["count"] >= cut_off]
    filtered_keys = len(filtered_df.index)
    filtered_count = filtered_df["count"].sum()

    file = f"{column}_counts.csv"
    print(f"Extracting column \"{column}\"")
    print(f"- {original_keys} distinct values across {original_count} entries")
    print(f"- Keeping at least {keep_most_frequent_proportion:.0%} of values. This corresponds to values with counts >= {cut_off} ({filtered_count/original_count:.0%} of total)")
    print(f"- Kept {filtered_keys} distinct values covering {filtered_count} entries")

    if save:
        print(f"- Saved to {file}")
        counts_df.to_csv(file, index=False)

save_frequencies([
    { "a": "hello" },
    { "a": "hello" },
    { "a": "hello" },
    { "a": "hello" },
    { "a": "world" },
    { "a": "world" },
    { "a": "world" },
    { "a": "bob" },
], "a", 0.8, save=False)

Extracting column "a"
- 3 distinct values across 8 entries
- Keeping at least 80% of values. This corresponds to values with counts >= 3 (88% of total)
- Kept 2 distinct values covering 7 entries
- Saved to a_counts.csv


In [20]:
import datasets
import pandas as pd

print("Downloading / importing dataset into memory (max 8GB)...")

datasets.config.IN_MEMORY_MAX_SIZE = 8 * 1024 * 1024 # 8GB

dataset = datasets.load_dataset("julien040/hacker-news-posts", split="train")

def filter_map(dataset: datasets.DatasetDict, map_or_none) -> datasets.DatasetDict:
    return dataset.map(map_or_none).filter(lambda x: x is not None)

# Extract domains
def extract_domain(url):
    if not isinstance(url, str):
        return None

    if not url.startswith(('http://', 'https://')):
        url = 'https://' + url

    extracted = tldextract.extract(url)

    # Return domain.suffix (e.g., 'google.com', 'blogspot.com')
    if extracted.domain and extracted.suffix:
        return f"{extracted.domain}.{extracted.suffix}".lower()

    return None

def map_item(d):
    if not isinstance(d["score"], int):
        return None
    score = d["score"]

    if score < 0:
        return None

    if not isinstance(d["title"], str):
        return None
    title = d["title"]

    if not isinstance(d["url"], str):
        return None
    url = d["url"]
    domain = extract_domain(url)
    if domain is None:
        return None

    return {
        "id": d["id"],
        "author": d["author"],
        "title": title,
        "domain": domain,
        "time": d["time"],
        "score": score,
    }

filtered_dataset = filter_map(dataset, map_item)

save_frequencies(filtered_dataset, "domain", keep_most_frequent_proportion=0.8)
save_frequencies(filtered_dataset, "author", keep_most_frequent_proportion=0.8)

filtered_dataset.save_to_disk("filtered_dataset")


Downloading / importing dataset into memory (max 8GB)...
Extracting column "domain"
- 350516 distinct values across 3765851 entries
- Keeping at least 80% of values. This corresponds to values with counts >= 1 (100% of total)
- Kept 350516 distinct values covering 3765851 entries
- Saved to domain_counts.csv
Extracting column "author"
- 320987 distinct values across 3765851 entries
- Keeping at least 80% of values. This corresponds to values with counts >= 1 (100% of total)
- Kept 320987 distinct values covering 3765851 entries
- Saved to author_counts.csv


Saving the dataset (2/2 shards): 100%|██████████| 3765851/3765851 [00:13<00:00, 285516.67 examples/s]
