# Extract Training Labels

This notebook recovers the label ordering used during model training.

**Two methods:**
1. **From MFCC JSON files** (if you have `mfcc_train_data.json`)
2. **From directory structure** (scans `TrainingData/AudioSamples/` folders)

⚠️ **Important:** The directory method assumes alphabetical ordering was used during training!

In [None]:
import json
from pathlib import Path
from typing import List, Optional

## Configuration

In [None]:
# Paths
# All three are relative paths, so they resolve against the current working
# directory of the notebook kernel.
# MFCC training data (JSON) read by Method 1.
MFCC_TRAIN_PATH = Path('../data/mfcc_train_data.json')
# Folder whose immediate subdirectories are scanned by Method 2.
TRAINING_DATA_DIR = Path('../TrainingData/AudioSamples')
# JSON file the extracted label list is written to when the user confirms.
OUTPUT_PATH = Path('../models/label_mapping_from_dirs.json')

## Method 1: Extract from MFCC JSON

In [None]:
def extract_from_mfcc(path: Path) -> Optional[List[str]]:
    """Extract the unique labels from an MFCC JSON file.

    The file must contain an object with a ``data`` list whose items each
    carry a ``label`` entry. Labels are returned in order of first
    appearance, with later duplicates dropped.

    Args:
        path: Location of the MFCC JSON file.

    Returns:
        The ordered list of unique labels, or ``None`` if the file does
        not exist.

    Raises:
        KeyError: If ``data`` or an item's ``label`` entry is missing.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    try:
        # Decode explicitly as UTF-8: relying on the locale default
        # mis-reads non-ASCII label names on platforms where it differs.
        with open(path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        # Opening directly (instead of exists() + open()) avoids a race
        # with the file disappearing between the two calls.
        return None

    # dict keys keep insertion order, so this de-duplicates while
    # preserving the order in which labels first appear.
    return list(dict.fromkeys(item['label'] for item in data['data']))

# Run Method 1 only when the MFCC file is present; either way
# `labels_from_mfcc` ends up defined for the later cells.
if not MFCC_TRAIN_PATH.exists():
    print(f"MFCC file not found: {MFCC_TRAIN_PATH}")
    print("Will try directory method instead...")
    labels_from_mfcc = None
else:
    print(f"Found MFCC training data: {MFCC_TRAIN_PATH}")
    labels_from_mfcc = extract_from_mfcc(MFCC_TRAIN_PATH)
    # An empty or missing result prints nothing further
    if labels_from_mfcc:
        print(f"✓ Extracted {len(labels_from_mfcc)} labels from MFCC file")
        print(f"\nLabels in order:")
        # One line per label, prefixed with its position in the list
        for idx, mfcc_label in enumerate(labels_from_mfcc):
            print(f"  {idx:2d}: {mfcc_label}")

## Method 2: Extract from Directory Structure

In [None]:
def extract_from_directories(base_dir: Path) -> Optional[List[str]]:
    """Extract labels from the names of the subdirectories of ``base_dir``.

    Only immediate subdirectories are considered; regular files are ignored.
    Names are ordered with ``sorted()``, i.e. case-sensitive code-point
    order (uppercase names sort before lowercase ones).

    Args:
        base_dir: Folder containing one subdirectory per label.

    Returns:
        The sorted list of subdirectory names, or ``None`` if ``base_dir``
        does not exist or is not a directory.
    """
    # is_dir() is False for a missing path and for a regular file alike,
    # so a mis-pointed path yields None instead of NotADirectoryError.
    if not base_dir.is_dir():
        return None

    # NOTE(review): hidden folders (e.g. `.ipynb_checkpoints`) are returned
    # as labels too — confirm whether training treated them the same way.
    return sorted(entry.name for entry in base_dir.iterdir() if entry.is_dir())

# Run Method 2 only when the training folder is present; either way
# `labels_from_dirs` ends up defined for the later cells.
if not TRAINING_DATA_DIR.exists():
    print(f"\nTraining directory not found: {TRAINING_DATA_DIR}")
    labels_from_dirs = None
else:
    print(f"\nScanning training directory: {TRAINING_DATA_DIR}")
    labels_from_dirs = extract_from_directories(TRAINING_DATA_DIR)
    # An empty or missing result prints nothing further
    if labels_from_dirs:
        print(f"✓ Found {len(labels_from_dirs)} directories")
        print(f"\n⚠️  WARNING: This assumes alphabetical ordering was used during training!")
        print(f"\nLabels in alphabetical order:")
        # One line per label, prefixed with its position in the sorted list
        for idx, dir_label in enumerate(labels_from_dirs):
            print(f"  {idx:2d}: {dir_label}")

## Choose Method and Save

In [None]:
# Pick the label source: the MFCC result is preferred whenever it is
# non-empty; the directory scan is only used as a fallback.
if labels_from_mfcc:
    final_labels = labels_from_mfcc
    method = "MFCC file"
elif labels_from_dirs:
    final_labels = labels_from_dirs
    method = "directory structure"
else:
    print("\n❌ Could not extract labels from any source!")
    final_labels = None
    method = None

if final_labels:
    print(f"\n{'='*60}")
    print(f"Extracted {len(final_labels)} labels from {method}")
    print(f"{'='*60}")

    # Ask user if they want to save
    response = input(f"\nSave to {OUTPUT_PATH}? (y/n): ").strip().lower()

    # Accept a spelled-out "yes" as well, so an explicit confirmation is
    # not silently treated as a refusal.
    if response in ('y', 'yes'):
        # The target folder may not exist yet; create it so a confirmed
        # save does not fail with FileNotFoundError.
        OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)

        # Save to JSON (a plain list; position in the list is the label index)
        with open(OUTPUT_PATH, 'w', encoding='utf-8') as f:
            json.dump(final_labels, f, indent=2)

        print(f"\n✓ Labels saved to: {OUTPUT_PATH}")
        print(f"\nNext steps:")
        print(f"1. Verify the labels match your model's training order")
        print(f"2. If needed, adjust the order in the JSON file")
        print(f"3. Copy to ../models/label_mapping.json")
        print(f"4. Run ValidateMapping notebook to verify")
    else:
        print("\nNot saved.")

## Notes

**If using directory method:**
- Assumes Python's `sorted()` was used during training (alphabetical, but case-sensitive: uppercase names sort before lowercase ones)
- Compare output with your training notebook's label ordering
- May need manual adjustment if different ordering was used

**If using MFCC method:**
- More reliable (preserves the order in which labels first appear in the training data)
- Should match model training exactly, provided training assigned label indices in that same order of first appearance