In [None]:
import json

json_path = r"output/firefox_test2.json"
with open(json_path, "r", encoding="utf-8") as f:
    data = list(json.load(f))


def get_samples_by_id(sample_id):
    """ Get all samples with matching sample ID then rely on other functions for further filtering """
    return [sample for sample in data if sample["topic_id"] == sample_id]


def filter_sample_keys(*keys):
    """ Show the values of specified keys for all samples in the dataset. """
    return [{key: sample[key] for key in keys if key in sample} for sample in data]

def print_head(sample_id, n=5, keys=None):
    """ print the first n samples from the dataset matching id, with optional key filtering """
    samples = get_samples_by_id(str(sample_id))
    if keys:
        samples = filter_sample_keys(*keys)
    for i, sample in enumerate(samples[:n]): 
        print(json.dumps(sample, indent=2, ensure_ascii=False))
        print("\n" + "-" * 40 + "\n")  # Separator between samples
    if len(samples) > n:
        print(f"Showing first {n} of {len(samples)} samples with ID {sample_id}.")
    else:
        print(f"Total samples with ID {sample_id}: {len(samples)}.")

In [6]:
print_head(9, n=10, keys=["domain", "title", "topic_keywords"])

Sample 1:
{
  "domain": "duckduckgo.com",
  "title": "pytorch dataloader with iterableDataset at DuckDuckGo",
  "topic_keywords": {
    "torch": 0.2535,
    "utils": 0.1813,
    "documentation": 0.1532,
    "pytorch": 0.1484,
    "data": 0.1096,
    "dataloader": 0.1092,
    "iterabledataset": 0.1092,
    "export": 0.0877,
    "torchvision": 0.0877,
    "vision": 0.0737
  }
}

----------------------------------------

Sample 2:
{
  "domain": "duckduckgo.com",
  "title": "pytorch dataloader default_collate at DuckDuckGo",
  "topic_keywords": {
    "torch": 0.2535,
    "utils": 0.1813,
    "documentation": 0.1532,
    "pytorch": 0.1484,
    "data": 0.1096,
    "dataloader": 0.1092,
    "iterabledataset": 0.1092,
    "export": 0.0877,
    "torchvision": 0.0877,
    "vision": 0.0737
  }
}

----------------------------------------

Sample 3:
{
  "domain": "duckduckgo.com",
  "title": "types.LambdaType at DuckDuckGo",
  "topic_keywords": {
    "types": 0.2478,
    "subclass": 0.2097,
    "su