# DEFINITIVE & CORRECT - HVAC Dataset Optimization Pipeline

**Objective**: This notebook processes the **original** `hvac-dataset.zip` into 6 universal categories, using a direct, case-sensitive category-name mapping built from the output of the earlier successful diagnostic run.

In [None]:
# Mount Google Drive at /content/drive; the input archive and the output
# directory configured below both live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Install the third-party packages imported in the next cell (quiet mode).
%pip install -q pycocotools tqdm

In [ ]:
import os, json, shutil, zipfile, logging
from pathlib import Path
from collections import defaultdict, Counter
from typing import Dict, List, Any, Optional
import numpy as np
from tqdm.auto import tqdm
from pycocotools import mask as mask_utils

# Timestamped records at INFO level and above for the whole notebook session.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

### 1. Configuration (Using Your Exact Category List)

In [None]:
# --- Paths -------------------------------------------------------------------
# INPUT_ZIP_PATH must be the ORIGINAL multi-class dataset archive, not an
# already-remapped export.
INPUT_ZIP_PATH = Path("/content/drive/MyDrive/hvac-dataset.zip")
RAW_DATA_DIR = Path("/content/hvac-dataset-raw")
OPTIMIZED_DATA_DIR = Path("/content/drive/MyDrive/hvac-dataset-optimized")

# Annotations whose mask area falls below this threshold are discarded.
MIN_ANNOTATION_AREA = 25

# --- Simplified category -> original category names --------------------------
# The original names are copied verbatim (case, spacing and spelling included)
# from the diagnostic output, because lookups are exact string matches.
# Do not "correct" them.
CATEGORY_MAPPING = {
    "Equipment": [
        "Coil",
        "Fan",
        "e_motor_3p",
        "centrifugal-pump",
        "scroll_compressosr",
        "tank",
        "finned_tubes_HE",
    ],
    "Ductwork": [
        "duct",
        "bend",
        "reducer",
    ],
    "Piping": [
        "Pipe-Insulated",
        "u_trap",
    ],
    "Valves": [
        "BDV",
        "Plug Valve",
        "SDV",
        "Valve-Ball",
        "Valve-Butterfly",
        "Valve-Check",
        "Valve-Control",
        "Valve-Gate",
        "Valve-Globe",
        "Valve-Needle",
        "Valve-SafetyRelief",
        "Valve-ThreeWay",
        "ball w insulate",
        "ball_valve_with_schrader",
        "ex_valve",
    ],
    "Air Devices": [
        "Damper",
        "Filter",
        "fire-damper",
        "fire-detector",
        "smoke detector",
    ],
    "Controls": [
        "Sensor-Temperature",
        "filter_drier",
        "maintainance-switch",
        "pressure sensor",
        "schraeder_vavle",
        "sight_glass",
        "solenoid_valve",
        "vd",
    ],
}

# Original category name -> simplified category name (exact, case-sensitive).
REVERSE_CATEGORY_MAP = {
    original_name: simplified_name
    for simplified_name, original_names in CATEGORY_MAPPING.items()
    for original_name in original_names
}

# Simplified names in declaration order; a name's position is its category id.
SIMPLIFIED_CATEGORIES = [*CATEGORY_MAPPING]
SIMP_CAT_NAME_TO_ID = dict(zip(SIMPLIFIED_CATEGORIES, range(len(SIMPLIFIED_CATEGORIES))))

logger.info("Configuration loaded with DIRECT, case-sensitive mapping.")

### 2. Dataset Extraction

In [None]:
# Always extract into a clean directory so files from a previous run cannot
# leak into this one.
if RAW_DATA_DIR.exists():
    shutil.rmtree(RAW_DATA_DIR)

with zipfile.ZipFile(INPUT_ZIP_PATH) as archive:
    archive.extractall(path=RAW_DATA_DIR)

logger.info(f"Successfully extracted ORIGINAL dataset to {RAW_DATA_DIR}")

### 3. Core Optimization Logic (Simplified & Corrected)

In [None]:
class DatasetOptimizer:
    """Remap a raw COCO segmentation dataset onto the simplified categories.

    For each ``train``/``valid``/``test`` directory found under ``raw_dir`` the
    optimizer reads the split's ``*_annotations.coco.json``, keeps only the
    annotations whose original category name is present in
    ``REVERSE_CATEGORY_MAP`` and whose mask area is at least
    ``MIN_ANNOTATION_AREA``, recomputes bbox and area from the mask, and writes
    the surviving images plus a new ``_annotations.coco.json`` into the matching
    directory under ``optimized_dir``.

    Attributes:
        raw_dir: Directory holding the extracted original dataset.
        optimized_dir: Output directory (recreated from scratch on init).
        stats: ``split -> simplified category name -> kept annotation count``.
        unmapped: ``split -> original category name -> dropped annotation
            count`` for annotations whose category has no mapping entry.
    """

    # Split directory names that are recognised, in processing/report order.
    _SPLITS = ('train', 'valid', 'test')

    def __init__(self, raw_dir: Path, optimized_dir: Path):
        """Store the paths and reset the output directory.

        WARNING: any existing ``optimized_dir`` is deleted recursively.
        """
        self.raw_dir = raw_dir
        self.optimized_dir = optimized_dir
        self.stats = defaultdict(lambda: defaultdict(int))
        self.unmapped = defaultdict(Counter)
        if self.optimized_dir.exists():
            shutil.rmtree(self.optimized_dir)
        self.optimized_dir.mkdir(parents=True)

    def run(self):
        """Process every known split under ``raw_dir`` and print the summary."""
        found_split = False
        # Fixed order (instead of directory-listing order) keeps logs reproducible.
        for split_name in self._SPLITS:
            split_dir = self.raw_dir / split_name
            if not split_dir.is_dir():
                continue
            found_split = True
            logger.info(f"--- Processing split: {split_name} ---")
            self._process_split(split_dir)
        if not found_split:
            # Typical cause: the archive wraps the splits in an extra top-level folder.
            logger.warning(
                f"No train/valid/test directories found directly under {self.raw_dir}; nothing was processed."
            )
        self._print_summary_stats()

    def _process_split(self, raw_split_dir: Path):
        """Filter and remap one split, writing images and annotations to disk.

        Images that end up with no surviving annotation are left out of the
        output entirely. Annotation ids are renumbered from 1 within the split.
        """
        split_name = raw_split_dir.name
        optimized_split_dir = self.optimized_dir / split_name
        optimized_split_dir.mkdir()
        ann_file = next(raw_split_dir.glob('*_annotations.coco.json'), None)
        if not ann_file:
            logger.warning(f"No '*_annotations.coco.json' file in {raw_split_dir}; split skipped.")
            return

        with open(ann_file, encoding='utf-8') as f:
            coco_data = json.load(f)
        original_images = {img['id']: img for img in coco_data['images']}
        original_anns = defaultdict(list)
        for ann in coco_data['annotations']:
            original_anns[ann['image_id']].append(ann)
        original_cats = {cat['id']: cat['name'] for cat in coco_data['categories']}

        optimized_coco = {"images": [], "annotations": [], "categories": self._build_simplified_categories()}
        ann_id_counter = 1

        for img_id, img_info in tqdm(original_images.items(), desc=f"Optimizing {split_name}"):
            refined_annotations = []
            for ann in original_anns.get(img_id, []):
                refined_ann = self._refine_annotation(ann, img_info, original_cats, split_name)
                if refined_ann:
                    refined_ann['id'] = ann_id_counter
                    refined_annotations.append(refined_ann)
                    ann_id_counter += 1

            if refined_annotations:
                optimized_coco["images"].append(img_info)
                optimized_coco["annotations"].extend(refined_annotations)
                shutil.copy(raw_split_dir / img_info['file_name'], optimized_split_dir)

        with open(optimized_split_dir / "_annotations.coco.json", 'w', encoding='utf-8') as f:
            json.dump(optimized_coco, f, indent=2)

    def _refine_annotation(self, ann: Dict, img_info: Dict, original_cats: Dict, split: str) -> Optional[Dict]:
        """Return the remapped annotation, or ``None`` if it must be dropped.

        An annotation is dropped when its category id is unknown, its category
        name (after ``strip()``) has no entry in ``REVERSE_CATEGORY_MAP``, it
        has no segmentation, or its mask area is below ``MIN_ANNOTATION_AREA``.
        The returned dict has no ``id``; the caller assigns it.
        """
        orig_cat_name = original_cats.get(ann['category_id'])
        if not orig_cat_name:
            return None

        simp_cat_name = REVERSE_CATEGORY_MAP.get(orig_cat_name.strip())
        if not simp_cat_name:
            # Remember what was dropped so a mapping mismatch shows up in the summary.
            self.unmapped[split][orig_cat_name] += 1
            return None

        if not ann.get('segmentation'):
            return None
        h, w = img_info['height'], img_info['width']
        rle = mask_utils.frPyObjects(ann['segmentation'], h, w)
        mask = mask_utils.decode(rle)
        if mask.ndim == 3:
            # Several parts decode to one channel each; merge them into a single
            # mask. np.any yields bool, but mask_utils.encode only accepts uint8.
            mask = np.any(mask, axis=2).astype(np.uint8)
        rle = mask_utils.encode(np.asfortranarray(mask))
        area = mask_utils.area(rle).sum()
        if area < MIN_ANNOTATION_AREA:
            return None
        bbox = mask_utils.toBbox(rle).flatten().tolist()
        self.stats[split][simp_cat_name] += 1
        return {
            "image_id": ann['image_id'], "category_id": SIMP_CAT_NAME_TO_ID[simp_cat_name],
            "segmentation": ann['segmentation'], "bbox": bbox, "area": float(area), "iscrowd": 0
        }

    def _build_simplified_categories(self) -> List[Dict]:
        """Return the COCO ``categories`` list for the simplified category set."""
        return [{"id": i, "name": name, "supercategory": "hvac"} for i, name in enumerate(SIMPLIFIED_CATEGORIES)]

    def _print_summary_stats(self):
        """Print kept-annotation counts per split and overall, then log drops."""
        logger.info("\n" + "="*25 + " FINAL OPTIMIZATION SUMMARY " + "="*25)
        grand_total = Counter()
        for split in self._SPLITS:
            if split in self.stats:
                print(f"\n--- Split: {split} ---")
                counts = self.stats[split]
                for cat, count in sorted(counts.items()):
                    print(f"{cat:<20}: {count} annotations")
                grand_total.update(counts)
        print("\n--- Grand Total Across All Splits ---")
        for cat, count in sorted(grand_total.items()):
            print(f"{cat:<20}: {count} annotations")
        for split in self._SPLITS:
            dropped = self.unmapped.get(split)
            if dropped:
                details = ", ".join(f"{name!r} ({count})" for name, count in sorted(dropped.items()))
                logger.warning(f"Split '{split}': annotations dropped for unmapped categories: {details}")
        logger.info("="*76 + "\n")

print("✓ Final optimizer logic defined.")

### 4. Execute Pipeline & Verify Results

In [None]:
# Constructing the optimizer deletes and recreates OPTIMIZED_DATA_DIR; run()
# then processes each split directory and prints the per-category summary.
optimizer = DatasetOptimizer(RAW_DATA_DIR, OPTIMIZED_DATA_DIR)
optimizer.run()

### 5. Final Export & Cleanup

In [None]:
# The archive is written next to the optimized directory. Its expected path is
# built by APPENDING ".zip" to the directory name, which is what
# shutil.make_archive does with its base name. Path.with_suffix would instead
# replace anything after the last dot in the name (e.g. "data.v2" -> "data.zip")
# and point at the wrong file.
output_zip_file = OPTIMIZED_DATA_DIR.with_name(OPTIMIZED_DATA_DIR.name + '.zip')
logger.info(f"Creating final CORRECTED ZIP archive at: {output_zip_file}")
if output_zip_file.exists():
    os.remove(output_zip_file)
# make_archive returns the path it actually wrote; report that path.
output_zip_file = Path(shutil.make_archive(str(OPTIMIZED_DATA_DIR), 'zip', str(OPTIMIZED_DATA_DIR)))
logger.info("✓ Archiving complete!")
print("\n---")
print("✅ PIPELINE FINISHED SUCCESSFULLY")
print(f"A new, correct ZIP file has been created at: {output_zip_file}")