# Looking at the New Dataset Distributions

This notebook analyzes the original (non-augmented) images of FER-New-Dataset by visualizing the distributions of emotions, fairness attributes (race, gender, age), and dataset sources, and by cross-tabulating emotion against each of them.

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Plot defaults shared by every figure in this notebook:
# seaborn's "whitegrid" style and a 10x6-inch default figure size.
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (10, 6)})

In [None]:
# Read the per-image attribute table; the path is relative to this notebook.
csv_path = '../FER-New-Dataset/dataset_new_attributs.csv'
df = pd.read_csv(csv_path)
print(f"Total images in CSV: {df.shape[0]}")
# Last expression of the cell: show the first rows as a rich table.
df.head()

In [None]:
# Extract dataset source from image_path
# Path format: FER-New-Dataset/FER-New-Dataset/RAF/calm/1.jpg
def extract_dataset_source(path):
    """Return the dataset-source folder name embedded in an image path.

    The source is the third path component, e.g. ``RAF`` in
    ``FER-New-Dataset/FER-New-Dataset/RAF/calm/1.jpg``.

    Args:
        path: Image path taken from the ``image_path`` column. Forward
            slashes and backslashes are both accepted as separators.

    Returns:
        The third path component, or ``'unknown'`` when ``path`` is not a
        string (e.g. the float NaN pandas uses for a missing CSV cell) or
        has fewer than three components.
    """
    # A missing CSV cell arrives as float NaN, which has no .split().
    if not isinstance(path, str):
        return 'unknown'
    # Normalise Windows-style separators so both path flavours split alike.
    parts = path.replace('\\', '/').split('/')
    if len(parts) >= 3:
        return parts[2]  # RAF, FER, or CKP
    return 'unknown'

# Tag every row with the dataset it came from (derived from its image path).
df['dataset_source'] = df['image_path'].apply(extract_dataset_source)

# Keep an independent copy of the rows flagged as original (non-augmented).
is_original = df['augmented'] == 'original'
df_orig = df.loc[is_original].copy()
n_augmented = int((~is_original).sum())
print(f"\nOriginal (non-augmented) images: {len(df_orig)}")
print(f"Augmented images: {n_augmented}")

## Dataset Source Distribution

In [None]:
# Bar chart of how many original images each source dataset contributes.
counts = df_orig['dataset_source'].value_counts()

fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(x=counts.index, y=counts.values, ax=ax)
ax.set_title('Distribution by Dataset Source (Original Images)', fontsize=14, fontweight='bold')
ax.set_xlabel('Dataset Source', fontsize=12)
ax.set_ylabel('Count', fontsize=12)

# Write each bar's count just above it (the +50 offset is in data units).
for position, value in enumerate(counts.values):
    ax.text(position, value + 50, str(value), ha='center', va='bottom', fontweight='bold')

fig.tight_layout()
plt.show()

print("\nDataset source counts:")
print(counts)

## Emotion Distribution

In [None]:
# Bar chart of emotion counts for the original images, in a fixed order.
emotion_order = ['anger', 'fear', 'calm', 'surprise']
# reindex (rather than label-indexing with the list) keeps the cell running
# when an emotion has no rows: the missing label gets a count of 0 instead
# of raising KeyError.
counts = df_orig['emotion'].value_counts().reindex(emotion_order, fill_value=0)

plt.figure(figsize=(10, 6))
sns.barplot(x=counts.index, y=counts.values)
plt.title('Distribution of Emotions (Original Images)', fontsize=14, fontweight='bold')
plt.xlabel('Emotion', fontsize=12)
plt.ylabel('Count', fontsize=12)

# Add count labels just above each bar (the +50 offset is in data units)
ax = plt.gca()
for i, v in enumerate(counts.values):
    ax.text(i, v + 50, str(v), ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.show()

print("\nEmotion counts:")
print(counts)

## Race Distribution

In [None]:
# Rows with a usable race label: drop missing, empty and 'unknown' values.
# df_race is reused by the "Emotion by Race" cell further down.
race_labels = df_orig['race']
has_race = race_labels.notna() & ~race_labels.isin(['', 'unknown'])
df_race = df_orig[has_race]

counts = df_race['race'].value_counts()

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=counts.index, y=counts.values, ax=ax)
ax.set_title('Distribution of Race (Original Images)', fontsize=14, fontweight='bold')
ax.set_xlabel('Race', fontsize=12)
ax.set_ylabel('Count', fontsize=12)
plt.xticks(rotation=45, ha='right')

# Write each bar's count just above it (the +50 offset is in data units).
for position, value in enumerate(counts.values):
    ax.text(position, value + 50, str(value), ha='center', va='bottom', fontweight='bold')

fig.tight_layout()
plt.show()

# The printed table is computed on df_orig, not on the filtered df_race.
print("\nRace counts:")
print(df_orig['race'].value_counts())

## Gender Distribution

In [None]:
# Rows with a usable gender label: drop missing, empty, 'unknown' and
# 'unsure' values. df_gender is reused by the "Emotion by Gender" cell.
gender_labels = df_orig['gender']
has_gender = gender_labels.notna() & ~gender_labels.isin(['', 'unknown', 'unsure'])
df_gender = df_orig[has_gender]

counts = df_gender['gender'].value_counts()

fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(x=counts.index, y=counts.values, ax=ax)
ax.set_title('Distribution of Gender (Original Images)', fontsize=14, fontweight='bold')
ax.set_xlabel('Gender', fontsize=12)
ax.set_ylabel('Count', fontsize=12)

# Write each bar's count just above it (the +50 offset is in data units).
for position, value in enumerate(counts.values):
    ax.text(position, value + 50, str(value), ha='center', va='bottom', fontweight='bold')

fig.tight_layout()
plt.show()

# The printed table is computed on df_orig, not on the filtered df_gender.
print("\nGender counts:")
print(df_orig['gender'].value_counts())

## Age Distribution

In [None]:
# Filter out empty/unknown age values.
# df_age is reused by the "Emotion by Age" cell further down.
df_age = df_orig[df_orig['age'].notna() & (df_orig['age'] != '') & (df_orig['age'] != 'unknown')]

# Age groups in ascending order; also reused by later cells.
age_order = ['0-3', '4-19', '20-39', '40-69', '70+']
# Get counts in the specified order. reindex (rather than label-indexing with
# the list) gives an absent age group a count of 0 instead of raising KeyError.
counts = df_age['age'].value_counts().reindex(age_order, fill_value=0)

plt.figure(figsize=(10, 6))
sns.barplot(x=counts.index, y=counts.values)
plt.title('Distribution of Age (Original Images)', fontsize=14, fontweight='bold')
plt.xlabel('Age Group', fontsize=12)
plt.ylabel('Count', fontsize=12)

# Add count labels just above each bar (the +50 offset is in data units)
ax = plt.gca()
for i, v in enumerate(counts.values):
    ax.text(i, v + 50, str(v), ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.show()

# The printed table is computed on df_orig, not on the filtered df_age.
print("\nAge counts:")
print(df_orig['age'].value_counts())

## Emotion by Dataset Source

In [None]:
# Cross-tabulate emotion (rows) against dataset source (columns).
emotion_dataset_ct = pd.crosstab(index=df_orig['emotion'], columns=df_orig['dataset_source'])
print("Emotion by Dataset Source:")
print(emotion_dataset_ct)
print()

# Grouped bars: one cluster per emotion, one bar per dataset source.
fig, ax = plt.subplots(figsize=(10, 6))
emotion_dataset_ct.plot.bar(ax=ax, width=0.8)
ax.set_title('Emotion Distribution by Dataset Source', fontsize=14, fontweight='bold')
ax.set_xlabel('Emotion', fontsize=12)
ax.set_ylabel('Count', fontsize=12)
# Anchor the legend outside the axes so it does not cover the bars.
ax.legend(title='Dataset Source', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.xticks(rotation=45, ha='right')
fig.tight_layout()
plt.show()

## Emotion by Race

In [None]:
# Cross-tabulate emotion (rows) against race (columns), using only the rows
# that carry a usable race label (df_race from the Race Distribution cell).
emotion_race_ct = pd.crosstab(index=df_race['emotion'], columns=df_race['race'])
print("Emotion by Race:")
print(emotion_race_ct)
print()

# Grouped bars: one cluster per emotion, one bar per race.
fig, ax = plt.subplots(figsize=(12, 6))
emotion_race_ct.plot.bar(ax=ax, width=0.8)
ax.set_title('Emotion Distribution by Race', fontsize=14, fontweight='bold')
ax.set_xlabel('Emotion', fontsize=12)
ax.set_ylabel('Count', fontsize=12)
# Anchor the legend outside the axes so it does not cover the bars.
ax.legend(title='Race', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.xticks(rotation=45, ha='right')
fig.tight_layout()
plt.show()

## Emotion by Gender

In [None]:
# Cross-tabulate emotion (rows) against gender (columns), using only the rows
# that carry a usable gender label (df_gender from the Gender Distribution cell).
emotion_gender_ct = pd.crosstab(index=df_gender['emotion'], columns=df_gender['gender'])
print("Emotion by Gender:")
print(emotion_gender_ct)
print()

# Grouped bars: one cluster per emotion, one bar per gender.
fig, ax = plt.subplots(figsize=(10, 6))
emotion_gender_ct.plot.bar(ax=ax, width=0.8)
ax.set_title('Emotion Distribution by Gender', fontsize=14, fontweight='bold')
ax.set_xlabel('Emotion', fontsize=12)
ax.set_ylabel('Count', fontsize=12)
ax.legend(title='Gender')
plt.xticks(rotation=45, ha='right')
fig.tight_layout()
plt.show()

## Emotion by Age

In [None]:
# Emotion by Age: cross-tabulate emotion (rows) against age group (columns),
# using only the rows that carry a usable age label (df_age from the
# Age Distribution cell).
emotion_age_ct = pd.crosstab(df_age['emotion'], df_age['age'])
print("Emotion by Age:")
print(emotion_age_ct)
print()

plt.figure(figsize=(12, 6))
# Put the columns in age_order for plotting. reindex (rather than
# emotion_age_ct[age_order]) gives an age group with no rows a zero column
# instead of raising KeyError.
emotion_age_ct.reindex(columns=age_order, fill_value=0).plot(kind='bar', ax=plt.gca(), width=0.8)
plt.title('Emotion Distribution by Age Group', fontsize=14, fontweight='bold')
plt.xlabel('Emotion', fontsize=12)
plt.ylabel('Count', fontsize=12)
# Anchor the legend outside the axes so it does not cover the bars.
plt.legend(title='Age Group', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

## Summary Statistics

In [None]:
# Plain-text report of counts and percentage shares for the original images.
# Every percentage uses the total number of original images as denominator.
n_original = len(df_orig)
heavy_rule = "=" * 60
light_rule = "-" * 60


def _print_section(title):
    """Print a section title framed by dashed rules, preceded by a blank line."""
    print("\n" + light_rule)
    print(title)
    print(light_rule)


def _print_shares(value_counts, indent=""):
    """Print one 'label: count (pct%)' line per entry of a value_counts Series."""
    for label, n_rows in value_counts.items():
        share = (n_rows / n_original) * 100
        print(f"{indent}{label}: {n_rows:,} ({share:.1f}%)")


print(heavy_rule)
print("DATASET SUMMARY (ORIGINAL IMAGES ONLY)")
print(heavy_rule)

print(f"\nTotal original images: {n_original:,}")

_print_section("DATASET SOURCES")
_print_shares(df_orig['dataset_source'].value_counts())

_print_section("EMOTIONS")
_print_shares(df_orig['emotion'].value_counts())

_print_section("FAIRNESS ATTRIBUTES")

print("\nRace:")
_print_shares(df_orig['race'].value_counts(), indent="  ")

print("\nGender:")
_print_shares(df_orig['gender'].value_counts(), indent="  ")

# Age groups are listed in age_order (not by frequency); empty groups are skipped.
print("\nAge:")
for age in age_order:
    n_rows = int((df_orig['age'] == age).sum())
    if n_rows == 0:
        continue
    share = (n_rows / n_original) * 100
    print(f"  {age}: {n_rows:,} ({share:.1f}%)")

print("\n" + heavy_rule)