# Bare-Bones Diagnostic for HVAC Dataset

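**Objective:** Run the simplest possible category-mapping test to find the root cause of the category-mapping failure. The script has no complex logic: it reads the `train` split's COCO annotations and checks, with an exact case-sensitive lookup, whether a handful of raw category names (e.g. `duct`, `bend`, `Valve-Ball`) can be mapped to the target classes 'Ductwork' and 'Valves'.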

In [None]:
# Mount Google Drive at /content/drive so that files stored under
# /content/drive/MyDrive (such as the dataset zip read by this notebook)
# are reachable through the local filesystem.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import zipfile
import json
from pathlib import Path
import os
from collections import Counter
import shutil

# Source archive: this has to be the ORIGINAL, multi-class dataset zip file.
INPUT_ZIP_PATH = Path("/content/drive/MyDrive/hvac-dataset.zip")
# Scratch directory that receives a fresh copy of the archive contents.
EXTRACT_PATH = Path("/content/hvac_dataset_raw_barebones")

print(f"Extracting {INPUT_ZIP_PATH} for final diagnostic...")

# Drop any earlier extraction first so the diagnostic only ever sees files
# that came out of the archive on this run.
if EXTRACT_PATH.exists():
    shutil.rmtree(EXTRACT_PATH)

with zipfile.ZipFile(INPUT_ZIP_PATH, mode='r') as archive:
    archive.extractall(path=EXTRACT_PATH)

print("Extraction complete.")

# --- Simplified, Hard-Coded Test Mapping ---
# Target class -> raw category names (exact spelling and case) expected to
# map to it. Only a few valve types are listed, purely for comparison.
_TEST_TARGETS = {
    'Ductwork': ('duct', 'bend', 'reducer'),
    'Valves': ('Valve-Ball', 'Valve-Gate', 'SDV'),
}

# Flattened lookup used by the diagnostic: raw category name -> target class.
TEST_MAPPING = {
    raw_name: target
    for target, raw_names in _TEST_TARGETS.items()
    for raw_name in raw_names
}

def analyze_training_split():
    """Count mapped vs. unmapped annotations in the 'train' split.

    Reads ``EXTRACT_PATH / 'train' / '_annotations.coco.json'``, resolves each
    annotation's ``category_id`` to a category name via the file's
    ``categories`` list, and looks that name up in ``TEST_MAPPING`` using an
    exact, case-sensitive match.

    Prints the number of annotations per mapped target class, followed by the
    ten most frequent category names that did not map. Annotations whose
    ``category_id`` is absent from ``categories`` are counted under the
    placeholder name ``'ID_NOT_FOUND'``.

    Every failure (missing file, unreadable/invalid JSON, no categories) is
    reported by printing an ``ERROR:`` line and returning early.

    Returns:
        None. All results are written to stdout.
    """
    print("\n" + "="*40)
    print("   ANALYZING 'train' SPLIT... ")
    print("="*40)

    ann_file = EXTRACT_PATH / 'train' / '_annotations.coco.json'
    if not ann_file.exists():
        print("ERROR: Training annotation file not found!")
        return

    print(f"Found annotation file: {ann_file}")
    try:
        # JSON is UTF-8 encoded; pass the encoding explicitly instead of
        # depending on the platform's locale default.
        with open(ann_file, encoding="utf-8") as f:
            coco_data = json.load(f)
    except (OSError, UnicodeDecodeError, json.JSONDecodeError) as exc:
        # Report in the same style as the other failure paths rather than
        # aborting the diagnostic with a traceback.
        print(f"ERROR: Could not read annotation file: {exc}")
        return

    category_map = {cat['id']: cat['name'] for cat in coco_data.get('categories', [])}
    if not category_map:
        print("ERROR: No categories found in JSON.")
        return

    mapped_counts = Counter()
    unmapped_counts = Counter()

    for ann in coco_data.get('annotations', []):
        cat_id = ann.get('category_id')
        cat_name = category_map.get(cat_id, 'ID_NOT_FOUND')

        # Direct, case-sensitive lookup in our small test map
        mapped_category = TEST_MAPPING.get(cat_name)

        if mapped_category is None:
            unmapped_counts[cat_name] += 1
        else:
            mapped_counts[mapped_category] += 1

    print("\n--- Mapped Annotation Counts ---")
    for name, count in mapped_counts.items():
        print(f"{name:<20}: {count}")

    print("\n--- Top 10 UNMAPPED Annotation Counts ---")
    for name, count in unmapped_counts.most_common(10):
        print(f"{name:<20}: {count}")

# --- Run the analysis ---
# Executed as soon as this cell/script runs; the function prints its findings
# to stdout and returns nothing, so there is no result to capture here.
analyze_training_split()
