In [23]:
from datasets import load_dataset
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import numpy as np
import random
import json

In [24]:
# Step 1: Open the English ("en") C4 train split with streaming=True.
# This line only creates the stream; the fixed-size subset is taken in a later
# cell, which pulls up to 10000 records from it (not 1000).
dataset = load_dataset("c4", "en", split="train", streaming=True)



In [25]:
# Pull at most 10000 records off the stream and keep only the raw text of
# those that actually carry a "text" field.
small_dataset = []
for _, record in zip(range(10000), dataset):
    if "text" in record:
        small_dataset.append(record["text"])

In [29]:
# Step 2: Initialize a sentence transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')  # Efficient for semantic tasks

# Step 3: Compute embeddings for every text in small_dataset
embeddings = model.encode(small_dataset, show_progress_bar=True)

# Step 4: Score each sample by its Euclidean (L2) distance from the mean
# embedding. A larger distance means the sample lies further from the
# "average" text of this subset in embedding space.
mean_embedding = np.mean(embeddings, axis=0)  # Average embedding for reference
semantic_differences = np.linalg.norm(embeddings - mean_embedding, axis=1)

# Step 5: Select the TOP_K samples with the largest semantic differences.
# TOP_K drives both the slice and the output file name so the two cannot drift
# apart (the previous comment said "top 10" while the code kept 3).
TOP_K = 3
top_indices = np.argsort(-semantic_differences)[:TOP_K]  # descending order
top_samples = [small_dataset[i] for i in top_indices]

# Step 6: Write the top samples to a JSONL file
output_file = f"{TOP_K}_samples.jsonl"
# Explicit encoding keeps the file identical across platforms.
with open(output_file, "w", encoding="utf-8") as f:
    for sample in top_samples:
        # JSONL format requires newline-separated JSON objects
        f.write(json.dumps({"text": sample}) + "\n")

print(f"Top samples have been stored in '{output_file}'.")

Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Top samples have been stored in '3_samples.jsonl'.


In [27]:
# Display the shape of the embedding array returned by model.encode
# (requires the embedding cell to have been run first).
embeddings.shape

(10000, 384)