# Dog FACS Dataset - Analiza Statystyk

Notebook do analizy statystyk finalnego datasetu Dog FACS.

**Zawartość:**
1. Wczytanie danych
2. Ogólne statystyki
3. Rozkład emocji
4. Rozkład ras
5. Analiza keypoints
6. Rozkład confidence scores
7. Podsumowanie

In [None]:
# Importy
import json
from pathlib import Path
from collections import Counter, defaultdict

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Plot configuration: apply the base style first, then override rcParams on top of it
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'font.size': 12,
})
sns.set_palette('husl')

print('Biblioteki załadowane.')

## 1. Wczytanie Danych

In [None]:
# Input file locations (relative paths, resolved against the current working directory)
DATA_DIR = Path('../data')
ANNOTATIONS_PATH = DATA_DIR.joinpath('annotations', 'merged.json')

# Alternative inputs: the per-split annotation files of the final dataset
FINAL_DIR = DATA_DIR.joinpath('final', 'dog-facs-dataset', 'annotations')
TRAIN_PATH = FINAL_DIR.joinpath('train.json')
VAL_PATH = FINAL_DIR.joinpath('val.json')
TEST_PATH = FINAL_DIR.joinpath('test.json')

def load_coco(path: Path) -> dict:
    """Load a COCO-style annotation file.

    Args:
        path: Location of the JSON file.

    Returns:
        The parsed JSON content. When the JSON root is an object, the keys
        ``images``, ``annotations`` and ``categories`` are guaranteed to be
        present and non-null (absent or null entries become empty lists).
        If the file does not exist, a message is printed and an empty
        structure with those three keys is returned.
    """
    required_keys = ('images', 'annotations', 'categories')

    if not path.exists():
        print(f'Plik nie istnieje: {path}')
        return {key: [] for key in required_keys}

    with open(path, encoding='utf-8') as f:
        data = json.load(f)

    # Code using the result indexes these lists directly, so a partial file
    # must not surface later as a KeyError or a None where a list is expected.
    if isinstance(data, dict):
        for key in required_keys:
            if data.get(key) is None:
                data[key] = []
    return data

# Load the data: prefer the merged annotation file, otherwise combine the splits
if ANNOTATIONS_PATH.exists():
    data = load_coco(ANNOTATIONS_PATH)
    print(f'Wczytano z: {ANNOTATIONS_PATH}')
elif TRAIN_PATH.exists():
    # Merge the splits (load_coco returns empty lists for a split file that is missing)
    train = load_coco(TRAIN_PATH)
    val = load_coco(VAL_PATH)
    test = load_coco(TEST_PATH)
    splits = (train, val, test)

    data = {
        # .get(...) or []: a split lacking a key (or storing null) must not abort the merge
        'images': [img for split in splits for img in (split.get('images') or [])],
        'annotations': [ann for split in splits for ann in (split.get('annotations') or [])],
        # Categories come from train when it has them, else from the first split that does
        'categories': next(
            (split['categories'] for split in splits if split.get('categories')),
            [],
        ),
    }
    print('Wczytano z splitów train/val/test')
else:
    # NOTE: despite the message, no sample data is generated - the dataset is simply empty
    print('Brak plików anotacji - używam danych przykładowych')
    data = {'images': [], 'annotations': [], 'categories': []}

images = data.get('images') or []
annotations = data.get('annotations') or []
categories = data.get('categories') or []

print('\nWczytano:')
print(f'  - Obrazy: {len(images)}')
print(f'  - Anotacje: {len(annotations)}')
print(f'  - Kategorie: {len(categories)}')

## 2. Ogólne Statystyki

In [None]:
# Basic statistics
total_images = len(images)
total_annotations = len(annotations)

# Annotations per image. Only images referenced by at least one annotation get
# an entry, so the per-image figures below describe annotated images only.
ann_per_image = defaultdict(int)
for ann in annotations:
    ann_per_image[ann.get('image_id')] += 1

ann_counts = list(ann_per_image.values())

stats = {
    'Łącznie obrazów': total_images,
    'Łącznie anotacji': total_annotations,
    'Średnio anotacji/obraz': np.mean(ann_counts) if ann_counts else 0,
    'Min anotacji/obraz': min(ann_counts) if ann_counts else 0,
    'Max anotacji/obraz': max(ann_counts) if ann_counts else 0,
    'Mediana anotacji/obraz': np.median(ann_counts) if ann_counts else 0,
}

# Display as a table. Floats are formatted BEFORE the DataFrame is built: a
# column mixing ints and floats is inferred as float64, which would convert the
# integer counts to floats as well and render them as e.g. "1000.00".
stats_df = pd.DataFrame(
    [
        (metric, f'{value:.2f}' if isinstance(value, float) else value)
        for metric, value in stats.items()
    ],
    columns=['Metryka', 'Wartość'],
)
display(stats_df)

In [None]:
# Histogram of the number of annotations per image
if ann_counts:
    max_count = max(ann_counts)

    plt.figure(figsize=(10, 5))
    # One bin per integer count: [1, 2), [2, 3), ..., [max, max + 1]
    plt.hist(ann_counts, bins=range(1, max_count + 2), edgecolor='black', alpha=0.7)
    plt.xlabel('Liczba anotacji na obraz')
    plt.ylabel('Liczba obrazów')
    plt.title('Rozkład liczby anotacji per obraz')
    plt.xticks(range(1, max_count + 1))
    plt.tight_layout()

    # savefig does not create missing directories, so make sure the target exists
    figure_path = Path('../docs/reports/figures/annotations_per_image.png')
    figure_path.parent.mkdir(parents=True, exist_ok=True)
    plt.savefig(figure_path, dpi=150)
    plt.show()
else:
    print('Brak danych do wykresu')

## 3. Rozkład Emocji

In [None]:
# Extract the emotion label (and its confidence) of every annotation
emotions = []
emotion_confidences = defaultdict(list)

for ann in annotations:
    emotion = ann.get('emotion', {})
    if isinstance(emotion, dict):
        # A name that is present but null/empty counts as 'unknown', exactly
        # like a missing one - the same rule the non-dict branch applies.
        name = emotion.get('name') or 'unknown'
        conf = emotion.get('confidence')
        if conf is None:
            conf = 0
    else:
        # Label stored directly as a scalar; no confidence is available
        name = str(emotion) if emotion else 'unknown'
        conf = 0

    emotions.append(name)
    emotion_confidences[name].append(conf)

# Count the emotions; the table lists them from most to least frequent
emotion_counts = Counter(emotions)
emotion_df = pd.DataFrame(
    [(e, c, c / len(emotions) * 100) for e, c in emotion_counts.most_common()],
    columns=['Emocja', 'Liczba', 'Procent']
)

display(emotion_df)

In [None]:
# Emotion distribution charts: bar chart (left) and pie chart (right)
if emotion_counts:
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

    # Bar chart, most frequent emotion first
    colors = sns.color_palette('husl', len(emotion_counts))
    labels, values = zip(*emotion_counts.most_common())

    ax1.bar(labels, values, color=colors, edgecolor='black')
    ax1.set_xlabel('Emocja')
    ax1.set_ylabel('Liczba anotacji')
    ax1.set_title('Rozkład emocji w datasecie')
    ax1.tick_params(axis='x', rotation=45)

    # Write each count above its bar, lifted by 2% of the tallest bar
    label_offset = max(values) * 0.02
    for i, value in enumerate(values):
        ax1.text(i, value + label_offset, str(value), ha='center', fontsize=10)

    # Pie chart
    ax2.pie(values, labels=labels, autopct='%1.1f%%', colors=colors, startangle=90)
    ax2.set_title('Procentowy rozkład emocji')

    plt.tight_layout()

    # savefig does not create missing directories, so make sure the target exists
    figure_path = Path('../docs/reports/figures/emotion_distribution.png')
    figure_path.parent.mkdir(parents=True, exist_ok=True)
    plt.savefig(figure_path, dpi=150)
    plt.show()
else:
    print('Brak danych o emocjach')

## 4. Rozkład Ras

In [None]:
# Extract the breed label (and its confidence) of every annotation
breeds = []
breed_confidences = defaultdict(list)

for ann in annotations:
    breed = ann.get('breed', {})
    if isinstance(breed, dict):
        # A name that is present but null/empty counts as 'unknown', exactly
        # like a missing one - the same rule the non-dict branch applies.
        name = breed.get('name') or 'unknown'
        conf = breed.get('confidence')
        if conf is None:
            conf = 0
    else:
        # Label stored directly as a scalar; no confidence is available
        name = str(breed) if breed else 'unknown'
        conf = 0

    breeds.append(name)
    breed_confidences[name].append(conf)

# Count the breeds
breed_counts = Counter(breeds)

# The 20 most frequent breeds; percentages are relative to all annotations
top_breeds = breed_counts.most_common(20)
breed_df = pd.DataFrame(
    [(b, c, c / len(breeds) * 100) for b, c in top_breeds],
    columns=['Rasa', 'Liczba', 'Procent']
)

print(f'Łącznie unikalnych ras: {len(breed_counts)}')
print('\nTop 20 ras:')
display(breed_df)

In [None]:
# Horizontal bar chart of the 20 most frequent breeds
if top_breeds:
    plt.figure(figsize=(14, 6))

    labels, values = zip(*top_breeds)
    colors = sns.color_palette('viridis', len(top_breeds))

    bars = plt.barh(range(len(labels)), values, color=colors)
    plt.yticks(range(len(labels)), labels)
    plt.xlabel('Liczba anotacji')
    plt.ylabel('Rasa')
    plt.title('Top 20 ras w datasecie')
    # Inverted axis puts the most frequent breed at the top
    plt.gca().invert_yaxis()

    # Write each count right of its bar, offset by 1% of the longest bar
    label_offset = max(values) * 0.01
    for bar, value in zip(bars, values):
        plt.text(value + label_offset, bar.get_y() + bar.get_height() / 2,
                 str(value), va='center', fontsize=9)

    plt.tight_layout()

    # savefig does not create missing directories, so make sure the target exists
    figure_path = Path('../docs/reports/figures/breed_distribution.png')
    figure_path.parent.mkdir(parents=True, exist_ok=True)
    plt.savefig(figure_path, dpi=150)
    plt.show()
else:
    print('Brak danych o rasach')

## 5. Analiza Keypoints

In [None]:
# Keypoint analysis
keypoint_stats = {
    'total_with_keypoints': 0,       # annotations with a non-empty keypoint list
    'visible_counts': [],            # visible-keypoint count, one entry per such annotation
    'per_keypoint_visibility': defaultdict(int),  # keypoint index -> times seen visible
}

# Not referenced by the code in this cell
NUM_KEYPOINTS = 46

for ann in annotations:
    keypoints = ann.get('keypoints', [])

    if keypoints:
        keypoint_stats['total_with_keypoints'] += 1

        # The list is read as flat triplets whose third value is the visibility
        # flag (presumably COCO's [x, y, v] layout - confirm against the exporter).
        visible = 0
        for kp_idx, v in enumerate(keypoints[2::3]):
            is_visible = int(v > 0)
            visible += is_visible
            # Register every index, visible or not: a keypoint that is never
            # visible must still appear (with count 0) in the per-keypoint
            # statistics instead of silently dropping out of them.
            keypoint_stats['per_keypoint_visibility'][kp_idx] += is_visible

        keypoint_stats['visible_counts'].append(visible)

print(f"Anotacje z keypoints: {keypoint_stats['total_with_keypoints']} / {len(annotations)}")

if keypoint_stats['visible_counts']:
    visible = keypoint_stats['visible_counts']
    print(f"Średnio widocznych keypoints: {np.mean(visible):.1f}")
    print(f"Min: {min(visible)}, Max: {max(visible)}, Mediana: {np.median(visible):.1f}")

In [None]:
# Keypoint visibility charts
if keypoint_stats['visible_counts']:
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

    # Left: histogram of the number of visible keypoints per annotation
    mean_visible = np.mean(keypoint_stats['visible_counts'])
    ax1.hist(keypoint_stats['visible_counts'], bins=20, edgecolor='black', alpha=0.7)
    ax1.set_xlabel('Liczba widocznych keypoints')
    ax1.set_ylabel('Liczba anotacji')
    ax1.set_title('Rozkład liczby widocznych keypoints')
    ax1.axvline(mean_visible, color='red',
                linestyle='--', label=f"Średnia: {mean_visible:.1f}")
    ax1.legend()

    # Right: share of keypoint-bearing annotations in which each keypoint is visible
    if keypoint_stats['per_keypoint_visibility']:
        kp_indices = sorted(keypoint_stats['per_keypoint_visibility'].keys())
        kp_counts = [keypoint_stats['per_keypoint_visibility'][i] for i in kp_indices]
        total = keypoint_stats['total_with_keypoints']
        kp_rates = [c / total * 100 for c in kp_counts]
        mean_rate = np.mean(kp_rates)

        ax2.bar(kp_indices, kp_rates, alpha=0.7)
        ax2.set_xlabel('Indeks keypoint')
        ax2.set_ylabel('Procent widoczności')
        ax2.set_title('Wskaźnik widoczności per keypoint')
        ax2.axhline(mean_rate, color='red', linestyle='--',
                    label=f"Średnia: {mean_rate:.1f}%")
        ax2.legend()

    plt.tight_layout()

    # savefig does not create missing directories, so make sure the target exists
    figure_path = Path('../docs/reports/figures/keypoint_analysis.png')
    figure_path.parent.mkdir(parents=True, exist_ok=True)
    plt.savefig(figure_path, dpi=150)
    plt.show()
else:
    print('Brak danych o keypoints')

## 6. Rozkład Confidence Scores

In [None]:
# Collect the confidence scores; only strictly positive values are kept
bbox_confidences = []
breed_conf_list = []
emotion_conf_list = []

for ann in annotations:
    # BBox confidence: 'score' takes precedence, 'confidence' is the fallback key.
    # The None checks keep a stored null from reaching the `> 0` comparison,
    # where it would raise a TypeError.
    bbox_conf = ann.get('score', ann.get('confidence', 0))
    if bbox_conf is not None and bbox_conf > 0:
        bbox_confidences.append(bbox_conf)

    # Breed confidence (only available when the breed is stored as a dict)
    breed = ann.get('breed', {})
    if isinstance(breed, dict):
        bc = breed.get('confidence', 0)
        if bc is not None and bc > 0:
            breed_conf_list.append(bc)

    # Emotion confidence (only available when the emotion is stored as a dict)
    emotion = ann.get('emotion', {})
    if isinstance(emotion, dict):
        ec = emotion.get('confidence', 0)
        if ec is not None and ec > 0:
            emotion_conf_list.append(ec)

print('Statystyki Confidence Scores:')
print('\nBBox Detection:')
if bbox_confidences:
    print(f'  Średnia: {np.mean(bbox_confidences):.3f}')
    print(f'  Mediana: {np.median(bbox_confidences):.3f}')
    print(f'  Min: {min(bbox_confidences):.3f}, Max: {max(bbox_confidences):.3f}')

print('\nBreed Classification:')
if breed_conf_list:
    print(f'  Średnia: {np.mean(breed_conf_list):.3f}')
    print(f'  Mediana: {np.median(breed_conf_list):.3f}')

print('\nEmotion Classification:')
if emotion_conf_list:
    print(f'  Średnia: {np.mean(emotion_conf_list):.3f}')
    print(f'  Mediana: {np.median(emotion_conf_list):.3f}')

In [None]:
# Confidence histograms: one panel each for bbox, breed and emotion scores
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

confidence_panels = [
    (bbox_confidences, 'BBox Detection', 'steelblue'),
    (breed_conf_list, 'Breed Classification', 'forestgreen'),
    (emotion_conf_list, 'Emotion Classification', 'coral'),
]

for ax, (scores, panel_title, panel_color) in zip(axes, confidence_panels):
    if scores:
        mean_score = np.mean(scores)
        ax.hist(scores, bins=50, edgecolor='black', alpha=0.7, color=panel_color)
        ax.set_xlabel('Confidence')
        ax.set_ylabel('Liczba')
        ax.set_title(f'{panel_title}\n(śr: {mean_score:.3f})')
        # Dashed red line marks the mean
        ax.axvline(mean_score, color='red', linestyle='--')
    else:
        # Nothing to plot for this score type: show a placeholder instead
        ax.text(0.5, 0.5, 'Brak danych', ha='center', va='center')
        ax.set_title(panel_title)

plt.tight_layout()

# savefig does not create missing directories, so make sure the target exists
figure_path = Path('../docs/reports/figures/confidence_distributions.png')
figure_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(figure_path, dpi=150)
plt.show()

## 7. Podsumowanie

In [None]:
# Final summary, printed as a plain-text report
def _mean_or_na(scores):
    """Format the mean of *scores* with three decimals, or 'N/A' when empty."""
    if scores:
        return f"{np.mean(scores):.3f}"
    return 'N/A'


# Share of annotations carrying keypoints; max(..., 1) guards the empty dataset
kp_annotated = keypoint_stats['total_with_keypoints']
kp_percent = kp_annotated / max(len(annotations), 1) * 100

summary = {
    'Dataset': {
        'Łącznie obrazów': total_images,
        'Łącznie anotacji': total_annotations,
        'Unikalne rasy': len(breed_counts),
        'Kategorie emocji': len(emotion_counts),
    },
    'Jakość': {
        'Śr. confidence bbox': _mean_or_na(bbox_confidences),
        'Śr. confidence rasa': _mean_or_na(breed_conf_list),
        'Śr. confidence emocja': _mean_or_na(emotion_conf_list),
        'Anotacje z keypoints': f"{kp_annotated} ({kp_percent:.1f}%)",
    }
}

heavy_rule = '=' * 60
light_rule = '-' * 40

print(heavy_rule)
print('PODSUMOWANIE DOG FACS DATASET')
print(heavy_rule)

for section in summary:
    print(f'\n{section}:')
    print(light_rule)
    for key, value in summary[section].items():
        print(f'  {key}: {value}')

print('\n' + heavy_rule)

In [None]:
# Export the statistics as JSON
def _mean_median(scores):
    """Return mean and median of *scores* as plain floats (0 for both when empty)."""
    if not scores:
        return {'mean': 0, 'median': 0}
    return {
        'mean': float(np.mean(scores)),
        'median': float(np.median(scores)),
    }


visible_per_annotation = keypoint_stats['visible_counts']

output_stats = {
    'generated': pd.Timestamp.now().isoformat(),
    'dataset': {
        'total_images': total_images,
        'total_annotations': total_annotations,
        'unique_breeds': len(breed_counts),
        'emotion_categories': len(emotion_counts),
    },
    'emotion_distribution': dict(emotion_counts),
    # Only the 50 most frequent breeds are exported
    'breed_distribution': dict(breed_counts.most_common(50)),
    'confidence_stats': {
        'bbox': _mean_median(bbox_confidences),
        'breed': _mean_median(breed_conf_list),
        'emotion': _mean_median(emotion_conf_list),
    },
    'keypoints': {
        'annotations_with_keypoints': keypoint_stats['total_with_keypoints'],
        'avg_visible': float(np.mean(visible_per_annotation)) if visible_per_annotation else 0,
    },
}

output_path = Path('../docs/reports/dataset_statistics.json')
# Create the report directory on first run
output_path.parent.mkdir(parents=True, exist_ok=True)

# ensure_ascii=False keeps non-ASCII labels readable in the written file
with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(output_stats, f, indent=2, ensure_ascii=False)

print(f'Statystyki zapisane do: {output_path}')