In [17]:
from datasets import load_dataset, ClassLabel

# Target label vocabulary. The list is later used as the ClassLabel names, so a
# label's integer id is its position in this list.
standard_emotions = [
    "neutral",
    "happy",
    "surprise",
    "sad",
    "angry",
    "fear",
    "disgust",
]

# Load the dair-ai/emotion dataset.
dataset = load_dataset("dair-ai/emotion")

# Lookup table: original dataset label id -> index in the standard emotion list.
original_to_standard = dict(
    [
        (0, 3),  # sadness  -> sad
        (1, 1),  # joy      -> happy
        (2, 1),  # love     -> happy
        (3, 4),  # anger    -> angry
        (4, 5),  # fear     -> fear
        (5, 2),  # surprise -> surprise
    ]
)


def map_label(example):
    """Rewrite ``example["label"]`` from the original id to the standard id.

    The example is modified in place and the same object is returned, which is
    the shape expected when the function is passed to ``dataset.map``.

    Raises:
        KeyError: if the current label is not a key of ``original_to_standard``.
    """
    source_id = example["label"]
    example["label"] = original_to_standard[source_id]
    return example

# Remap the label ids of every example, then re-declare the "label" column as a
# ClassLabel whose names are the standard emotion vocabulary.
dataset = dataset.map(map_label).cast_column(
    "label", ClassLabel(names=standard_emotions)
)

# Write the re-labelled dataset to disk.
output_path = "/Users/bilalkhalid/Desktop/TUS/S2/Project/Sprint2/UpdatedDataSet"
dataset.save_to_disk(output_path)

print("Dataset re-labeled and saved to " + output_path)

Saving the dataset (0/1 shards):   0%|          | 0/16000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2000 [00:00<?, ? examples/s]

Dataset re-labeled and saved to /Users/bilalkhalid/Desktop/TUS/S2/Project/Sprint2/UpdatedDataSet


In [18]:
# Validating
from datasets import load_from_disk, ClassLabel
from collections import Counter

# Validate the re-labelled dataset stored on disk: schema, label range, missing
# values, label distribution and total size. Every failed check is recorded in
# `validation_errors` so the final verdict cannot report success after an
# earlier check has printed an error.
dataset_path = "/Users/bilalkhalid/Desktop/TUS/S2/Project/Sprint2/UpdatedDataSet"
dataset = load_from_disk(dataset_path)

standard_emotions = ["neutral", "happy", "surprise", "sad", "angry", "fear", "disgust"]
expected_splits = ["train", "validation", "test"]
expected_total = 20000
num_classes = len(standard_emotions)
validation_errors = []

# --- Verification Steps ---

# 1. Check dataset schema (every expected split, not only train)
print("Dataset Schema:", dataset["train"].features)
for split in expected_splits:
    label_feature = dataset[split].features["label"]
    if not isinstance(label_feature, ClassLabel):
        raise ValueError("Label feature is not a ClassLabel")
    if split == "train":
        print("Label feature names:", label_feature.names)
    if label_feature.names != standard_emotions:
        raise ValueError("Label names do not match the standard emotions")

# 2. Validate label values
valid_ids = range(num_classes)
for split in expected_splits:
    labels = dataset[split]["label"]
    invalid_labels = [label for label in labels if label not in valid_ids]
    if invalid_labels:
        validation_errors.append(f"{split}: {len(invalid_labels)} invalid label(s)")
        print(f"Error in {split}: Invalid labels found: {invalid_labels}")
    else:
        print(f"{split} split: All labels are valid (0-{num_classes - 1})")

# 3. Sample verification
print("\nFirst 5 examples from train:")
for i in range(min(5, len(dataset["train"]))):
    example = dataset["train"][i]  # fetch the row once instead of once per field
    text = example["text"]
    label_idx = example["label"]
    label_name = standard_emotions[label_idx]
    print(f"Text: {text[:50]}... | Label: {label_idx} ({label_name})")

# 4. Check for missing data
# Read each column once instead of decoding every row of the split twice.
for split in expected_splits:
    missing_labels = [
        idx for idx, value in enumerate(dataset[split]["label"]) if value is None
    ]
    missing_texts = [
        idx for idx, value in enumerate(dataset[split]["text"]) if not value
    ]
    if missing_labels or missing_texts:
        validation_errors.append(
            f"{split}: {len(missing_labels)} missing label(s), "
            f"{len(missing_texts)} missing text(s)"
        )
        print(f"Error in {split}: Missing labels at {missing_labels}, Missing texts at {missing_texts}")
    else:
        print(f"{split} split: No missing data")

# 5. Label distribution
all_labels = []
for split in dataset.keys():
    all_labels.extend(dataset[split]["label"])
label_counts = Counter(all_labels)
print("\nLabel Distribution Across All Splits:")
for i, emotion in enumerate(standard_emotions):
    count = label_counts.get(i, 0)
    print(f"{i}: {emotion} - {count} instances")

# Final confirmation
total_examples = sum(len(dataset[split]) for split in dataset.keys())
labeled_total = sum(label_counts.values())
counts_ok = total_examples == expected_total and labeled_total == expected_total
if not counts_ok:
    validation_errors.append(f"unexpected size: {total_examples} examples")
    print(f"\nWarning: Expected {expected_total:,} examples, but found {total_examples} with {labeled_total} labeled instances.")

if not validation_errors:
    print(f"\nValidation successful: Dataset is correctly formatted with {expected_total:,} examples.")
else:
    # Report failure explicitly; success is only claimed when every check passed.
    print(f"\nValidation failed: {len(validation_errors)} problem(s) found:")
    for problem in validation_errors:
        print(f"  - {problem}")

Dataset Schema: {'text': Value(dtype='string', id=None), 'label': ClassLabel(names=['neutral', 'happy', 'surprise', 'sad', 'angry', 'fear', 'disgust'], id=None)}
Label feature names: ['neutral', 'happy', 'surprise', 'sad', 'angry', 'fear', 'disgust']
train split: All labels are valid (0-6)
validation split: All labels are valid (0-6)
test split: All labels are valid (0-6)

First 5 examples from train:
Text: i didnt feel humiliated... | Label: 3 (sad)
Text: i can go from feeling so hopeless to so damned hop... | Label: 3 (sad)
Text: im grabbing a minute to post i feel greedy wrong... | Label: 4 (angry)
Text: i am ever feeling nostalgic about the fireplace i ... | Label: 1 (happy)
Text: i am feeling grouchy... | Label: 4 (angry)
train split: No missing data
validation split: No missing data
test split: No missing data

Label Distribution Across All Splits:
0: neutral - 0 instances
1: happy - 8402 instances
2: surprise - 719 instances
3: sad - 5797 instances
4: angry - 2709 instances
5: f