In [8]:
from datasets import load_dataset
import json

In [9]:
# Fetch the three ROSE subsets used in this notebook, in order, keeping only
# the "data" entry of whatever load_dataset returns for each configuration.
cnndm_test, cnndm_val, xsum = (
    load_dataset("Salesforce/rose", subset)["data"]
    for subset in ("cnndm_test", "cnndm_validation", "xsum")
)

In [10]:
# Show the dataset summary first, then the first record pretty-printed as JSON.
print(cnndm_test, json.dumps(cnndm_test[0], indent=2), sep="\n")

Dataset({
    features: ['source', 'reference', 'reference_acus', 'count_id', 'example_id', 'annotations', 'system_outputs'],
    num_rows: 500
})
{
  "source": "Club Tijuana star Juan Arango conjured memories Luis Suarez in his team's 4-3 defeat by Monterrey in the Mexican league - but it was not through prodigious scoring. The Venezuelan icon Arango sank his teeth into the shoulder of Jesus Zavela as his temper flared in the defeat. He was not booked by the referee but could face a heavy retrospective ban. Juan Arango (left) bites the shoulder of opponent Jesus Zavela in a moment of madness . Zavala holds his shoulder after being bitten by Arango, in the game Zavala's side won 4-3 in Mexico . Zavala shows the referee the mark on his shoulder after being bittern by Arango . Arango (right) earlier scored a magnificent free kick to bring his Club Tijuana team level against Monterrey . Arango had earlier curled in a magnificent free kick for his team to bring them level after falling 2-0

In [11]:
# Step 3: Project every cnndm_test record onto three fields:
#   "source"         - original text
#   "reference"      - human-written summary
#   "reference_acus" - ACUs for the summary
reference_data = [
    {field: record[field] for field in ("source", "reference", "reference_acus")}
    for record in cnndm_test
]

In [12]:
# Step 4: Sanity-check the extraction: report how many examples were kept,
# then show the first one as indented JSON.
print(
    f"Loaded {len(reference_data)} examples.",
    json.dumps(reference_data[0], indent=2),
    sep="\n",
)

Loaded 500 examples.
{
  "source": "Club Tijuana star Juan Arango conjured memories Luis Suarez in his team's 4-3 defeat by Monterrey in the Mexican league - but it was not through prodigious scoring. The Venezuelan icon Arango sank his teeth into the shoulder of Jesus Zavela as his temper flared in the defeat. He was not booked by the referee but could face a heavy retrospective ban. Juan Arango (left) bites the shoulder of opponent Jesus Zavela in a moment of madness . Zavala holds his shoulder after being bitten by Arango, in the game Zavala's side won 4-3 in Mexico . Zavala shows the referee the mark on his shoulder after being bittern by Arango . Arango (right) earlier scored a magnificent free kick to bring his Club Tijuana team level against Monterrey . Arango had earlier curled in a magnificent free kick for his team to bring them level after falling 2-0 down early on in the encounter. But the 34-year-old overshadowed his goal with the bite as television cameras picked up the m

In [13]:
# Step 5 (optional): write the extracted records to disk as indented JSON
# so they can be reused later.
with open("reference_summaries_with_acus.json", "w") as f:
    f.write(json.dumps(reference_data, indent=2))

In [18]:
# Step 1: Declare the ROSE subsets to load. "name" is the key used in
# all_datasets; "hf_name" is the configuration name handed to load_dataset.
# Both are currently the same string for every subset.
datasets_config = [
    {"name": subset, "hf_name": subset}
    for subset in ("cnndm_test", "cnndm_validation", "xsum", "samsum")
]

# Step 2: Load each subset and reduce its records to the three fields kept here
all_datasets = {}

for config in datasets_config:
    dataset_name, hf_name = config["name"], config["hf_name"]

    # Announce progress before the (potentially slow) load
    print(f"Loading dataset: {dataset_name}...")
    dataset = load_dataset("Salesforce/rose", hf_name)["data"]

    # One plain dict per entry, with keys in a fixed order
    structured_data = [
        {field: entry[field] for field in ("source", "reference", "reference_acus")}
        for entry in dataset
    ]

    # Register the subset under its display name
    all_datasets[dataset_name] = structured_data

# Step 3: Inspect what was loaded, plus the first cnndm_test record
print(
    f"Loaded datasets: {list(all_datasets.keys())}",
    f"Example from cnndm_test:\n{json.dumps(all_datasets['cnndm_test'][0], indent=2)}",
    sep="\n",
)

# Step 4 (optional): persist every subset to a single indented JSON file
with open("all_datasets.json", "w") as f:
    f.write(json.dumps(all_datasets, indent=2))

Loading dataset: cnndm_test...
Loading dataset: cnndm_validation...
Loading dataset: xsum...
Loading dataset: samsum...
Loaded datasets: ['cnndm_test', 'cnndm_validation', 'xsum', 'samsum']
Example from cnndm_test:
{
  "source": "Club Tijuana star Juan Arango conjured memories Luis Suarez in his team's 4-3 defeat by Monterrey in the Mexican league - but it was not through prodigious scoring. The Venezuelan icon Arango sank his teeth into the shoulder of Jesus Zavela as his temper flared in the defeat. He was not booked by the referee but could face a heavy retrospective ban. Juan Arango (left) bites the shoulder of opponent Jesus Zavela in a moment of madness . Zavala holds his shoulder after being bitten by Arango, in the game Zavala's side won 4-3 in Mexico . Zavala shows the referee the mark on his shoulder after being bittern by Arango . Arango (right) earlier scored a magnificent free kick to bring his Club Tijuana team level against Monterrey . Arango had earlier curled in a magn

In [19]:
# Function to count total sources and ACUs in a dataset
def count_sources_and_acus(dataset):
    """
    Tally how many entries a dataset holds and how many ACUs they carry in total.

    Args:
        dataset (iterable): Entries that each provide a sized value under the
            "reference_acus" key. The iterable is traversed exactly once.

    Returns:
        dict: "total_sources" is the number of entries seen and "total_acus"
            is the sum of len(entry["reference_acus"]) over those entries.
    """
    # Collect the per-entry ACU counts in a single pass so that the entry
    # count and the ACU total both come from the same traversal.
    acus_per_entry = [len(entry["reference_acus"]) for entry in dataset]

    return {
        "total_sources": len(acus_per_entry),
        "total_acus": sum(acus_per_entry),
    }

In [20]:
# Compute the source/ACU totals for every loaded dataset, keyed by dataset name
dataset_stats = {}

for dataset_name, dataset in all_datasets.items():
    dataset_stats[dataset_name] = count_sources_and_acus(dataset)

# Report one three-line summary per dataset, in insertion order
for dataset_name, stats in dataset_stats.items():
    print(
        f"Dataset: {dataset_name}",
        f"  Total number of source texts: {stats['total_sources']}",
        f"  Total number of ACUs: {stats['total_acus']}",
        sep="\n",
    )

Dataset: cnndm_test
  Total number of source texts: 500
  Total number of ACUs: 5624
Dataset: cnndm_validation
  Total number of source texts: 1000
  Total number of ACUs: 11579
Dataset: xsum
  Total number of source texts: 500
  Total number of ACUs: 2307
Dataset: samsum
  Total number of source texts: 500
  Total number of ACUs: 2287
