In [None]:

from google.colab import drive
drive.mount('/content/drive')

import os
import shutil
import glob
import yaml
from tqdm.notebook import tqdm

# --- Path configuration -----------------------------------------------------
# BASE_PATH is the project root on the mounted Drive; RAW_DATA_PATH is where
# the source zip archives are read from; PROCESSED_DATA_PATH receives the
# unified dataset. TEMP_WORK_DIR is a scratch directory on the Colab VM.
BASE_PATH = '/content/drive/MyDrive/inDrive_hackathon'
RAW_DATA_PATH = os.path.join(BASE_PATH, 'data', 'raw')
TEMP_WORK_DIR = '/content/temp_data'
PROCESSED_DATA_PATH = os.path.join(BASE_PATH, 'data', 'processed', 'damage_dataset_yolo')

# Record whether the base folder exists BEFORE creating anything beneath it.
# os.makedirs() below creates every missing parent (including BASE_PATH), so a
# check performed afterwards would always succeed and the warning could never fire.
base_path_found = os.path.exists(BASE_PATH)

# Start from an empty scratch directory so archives from a previous run
# cannot leak into this one.
if os.path.exists(TEMP_WORK_DIR):
    shutil.rmtree(TEMP_WORK_DIR)
os.makedirs(TEMP_WORK_DIR)

# Creating the two leaf folders also creates PROCESSED_DATA_PATH itself.
os.makedirs(os.path.join(PROCESSED_DATA_PATH, 'images'), exist_ok=True)
os.makedirs(os.path.join(PROCESSED_DATA_PATH, 'labels'), exist_ok=True)


print("✅ Setup complete!")
print(f"Raw data will be read from: {RAW_DATA_PATH}")
print(f"Final processed data will be saved to: {PROCESSED_DATA_PATH}")


if not base_path_found:
    print(f"\n❌ WARNING: The base path '{BASE_PATH}' does not exist in your Google Drive.")
    print("Please double-check the path is correct and the folder has been created.")
else:
    print(f"\n✅ Successfully located base path: {BASE_PATH}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ Setup complete!
Raw data will be read from: /content/drive/MyDrive/inDrive_hackathon/data/raw
Final processed data will be saved to: /content/drive/MyDrive/inDrive_hackathon/data/processed/damage_dataset_yolo

✅ Successfully located base path: /content/drive/MyDrive/inDrive_hackathon


In [None]:
zip_files = glob.glob(os.path.join(RAW_DATA_PATH, '*.zip'))

if not zip_files:
    print("❌ ERROR: No .zip files found in the raw data directory.")
    print(f"Please upload your Roboflow zip files to: {RAW_DATA_PATH}")
else:
    print(f"Found {len(zip_files)} zip files. Unzipping into separate folders...")
    for zip_file in zip_files:
        base_name = os.path.splitext(os.path.basename(zip_file))[0]
        extract_dir = os.path.join(TEMP_WORK_DIR, base_name)
        os.makedirs(extract_dir, exist_ok=True)
        shutil.unpack_archive(zip_file, extract_dir)
        print(f"  -> Unzipped {os.path.basename(zip_file)} to {extract_dir}")

    print("\n✅ All datasets unzipped to their respective temporary folders.")
    print("Temporary directory contents:")

    !ls -R {TEMP_WORK_DIR}

Found 3 zip files. Unzipping into separate folders...
  -> Unzipped Car Scratch and Dent.v1i.yolov5pytorch.zip to /content/temp_data/Car Scratch and Dent.v1i.yolov5pytorch
  -> Unzipped Rust and Scrach.v1i.yolov5pytorch.zip to /content/temp_data/Rust and Scrach.v1i.yolov5pytorch
  -> Unzipped car  scratch.v1i.yolov5pytorch.zip to /content/temp_data/car  scratch.v1i.yolov5pytorch

✅ All datasets unzipped to their respective temporary folders.
Temporary directory contents:
/content/temp_data:
'Car Scratch and Dent.v1i.yolov5pytorch'  'Rust and Scrach.v1i.yolov5pytorch'
'car  scratch.v1i.yolov5pytorch'

'/content/temp_data/Car Scratch and Dent.v1i.yolov5pytorch':
data.yaml  README.dataset.txt  README.roboflow.txt  test  train  valid

'/content/temp_data/Car Scratch and Dent.v1i.yolov5pytorch/test':
images	labels

'/content/temp_data/Car Scratch and Dent.v1i.yolov5pytorch/test/images':
6-ways-to-protect-your-car-door-from-scratches-614178_1024x1024_jpg.rf.8bbd16b62466e7ed332b3e56aa889ac1.j

In [None]:
# Target label set of the unified dataset: class name -> class index.
UNIFIED_CLASSES = {
    'scratch': 0,
    'dent': 1,
    'rust': 2,
}
print(f"Target unified classes: {UNIFIED_CLASSES}\n")


# Normalised (lower-cased, stripped) source class name -> unified class name.
# A source class that is absent from this table is dropped from the output.
TYPO_MAP = {
    'scratch': 'scratch',
    'scracth': 'scratch', # Handles the typo
    'dent': 'dent',
    'dunt': 'dent',       # Handles the typo
    'rust': 'rust',
    # We intentionally leave out 'car' and 'dirt' as we want to ignore them.
}


# sorted() makes the scan order (and therefore the log below) reproducible;
# glob returns matches in arbitrary filesystem order.
source_yamls = sorted(glob.glob(os.path.join(TEMP_WORK_DIR, '**', 'data.yaml'), recursive=True))

# (dataset directory, source class index) -> unified class index.
global_class_map = {}

print("Scanning source datasets for class mappings...")
for yaml_path in source_yamls:
    dataset_dir = os.path.dirname(yaml_path)

    # Only the parse needs the file handle; close it before doing the mapping work.
    with open(yaml_path, 'r') as f:
        data = yaml.safe_load(f)

    source_classes = data['names']
    print(f"\n--- Found Dataset at: {dataset_dir}")
    print(f"  Source Classes: {source_classes}")

    # `names` may be a plain list (position == class index) or an
    # {index: name} mapping. Enumerating a dict would yield its keys instead of
    # the class names, so normalise both forms to (index, name) pairs first.
    if isinstance(source_classes, dict):
        indexed_classes = sorted((int(idx), name) for idx, name in source_classes.items())
    else:
        indexed_classes = list(enumerate(source_classes))

    for source_idx, source_name in indexed_classes:
        # str(): YAML can parse bare names such as `no` or `1` into non-string values.
        source_name_lower = str(source_name).lower().strip()

        if source_name_lower in TYPO_MAP:
            target_name = TYPO_MAP[source_name_lower]
            target_idx = UNIFIED_CLASSES[target_name]
            global_class_map[(dataset_dir, source_idx)] = target_idx
            print(f"    - Mapping '{source_name}' ({source_idx}) -> '{target_name}' ({target_idx})")
        else:
            print(f"    - WARNING: Unhandled class '{source_name}'. It will be ignored.")

print("\n✅ Global class mapping created successfully.")

Target unified classes: {'scratch': 0, 'dent': 1, 'rust': 2}

Scanning source datasets for class mappings...

--- Found Dataset at: /content/temp_data/Rust and Scrach.v1i.yolov5pytorch
  Source Classes: ['car', 'dunt', 'rust', 'scracth']
    - Mapping 'dunt' (1) -> 'dent' (1)
    - Mapping 'rust' (2) -> 'rust' (2)
    - Mapping 'scracth' (3) -> 'scratch' (0)

--- Found Dataset at: /content/temp_data/Car Scratch and Dent.v1i.yolov5pytorch
  Source Classes: ['dent', 'dirt', 'scratch']
    - Mapping 'dent' (0) -> 'dent' (1)
    - Mapping 'scratch' (2) -> 'scratch' (0)

--- Found Dataset at: /content/temp_data/car  scratch.v1i.yolov5pytorch
  Source Classes: ['Scratch']
    - Mapping 'Scratch' (0) -> 'scratch' (0)

✅ Global class mapping created successfully.


In [None]:


print("Starting file processing and unification...")

# File extensions treated as images, and the path fragments that identify a
# dataset split folder inside an extracted archive.
IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png')
SPLIT_MARKERS = ('/train/', '/valid/', '/test/')


def _label_path_for(img_path):
    """Return the label path paired with `img_path`.

    Only the last '/images/' path component is swapped for '/labels/' and only
    the real file extension is replaced with '.txt'. Substring replacement over
    the whole path would also rewrite an extension-like fragment such as
    'photo.png.rf.<hash>.jpg' or a directory name, producing a path that does
    not exist and silently dropping the image.
    """
    parent, sep, rest = img_path.rpartition('/images/')
    relocated = f"{parent}/labels/{rest}" if sep else img_path
    return os.path.splitext(relocated)[0] + '.txt'


def _remap_label_lines(lines, dataset_dir, class_map, label_path):
    """Rewrite label lines so their class index uses the unified numbering.

    Lines whose (dataset_dir, class index) is absent from `class_map` are
    dropped. Blank lines are skipped silently; malformed lines are reported and
    skipped. `label_path` is only used in the warning text.
    """
    remapped = []
    for line in lines:
        parts = line.strip().split()
        if not parts:
            continue  # Skip empty lines
        try:
            source_idx = int(parts[0])
            # A valid annotation has a class id plus at least four coordinate values.
            if len(parts) < 5:
                raise ValueError("too few fields")
        except ValueError:
            print(f"Warning: Skipping malformed line in {label_path}: {line.strip()}")
            continue

        target_idx = class_map.get((dataset_dir, source_idx))
        if target_idx is not None:
            remapped.append(f"{target_idx} {' '.join(parts[1:])}")
    return remapped


# sorted() makes the damage_NNNNN numbering reproducible across runs; glob
# returns matches in arbitrary filesystem order.
image_files = sorted(
    path
    for ext in IMAGE_EXTENSIONS
    for path in glob.glob(os.path.join(TEMP_WORK_DIR, '**', f'*{ext}'), recursive=True)
    if any(marker in path for marker in SPLIT_MARKERS)
)
print(f"Found {len(image_files)} total images to process.")

# NOTE: nothing in this cell clears PROCESSED_DATA_PATH, so files written by an
# earlier run that produced more images (or different extensions) stay in place.
image_counter = 0
skipped_missing_label = 0
skipped_no_target_class = 0
for img_path in tqdm(image_files, desc="Processing Images"):
    label_path = _label_path_for(img_path)

    if not os.path.exists(label_path):
        skipped_missing_label += 1
        continue

    # <dataset>/<split>/labels/<file>.txt -> <dataset>; this must equal the
    # directory of that dataset's data.yaml, which is how global_class_map is keyed.
    dataset_dir = os.path.dirname(os.path.dirname(os.path.dirname(label_path)))

    with open(label_path, 'r') as f:
        new_label_lines = _remap_label_lines(f, dataset_dir, global_class_map, label_path)

    # Images left without any annotation of a unified class are not exported.
    if not new_label_lines:
        skipped_no_target_class += 1
        continue

    base_name = f"damage_{image_counter:05d}"

    img_ext = os.path.splitext(img_path)[1]
    new_img_path = os.path.join(PROCESSED_DATA_PATH, 'images', f"{base_name}{img_ext}")
    shutil.copy2(img_path, new_img_path)

    new_label_path = os.path.join(PROCESSED_DATA_PATH, 'labels', f"{base_name}.txt")
    with open(new_label_path, 'w') as f:
        f.write('\n'.join(new_label_lines))

    image_counter += 1

print(f"\n✅ Processing complete!")
print(f"Unified {image_counter} images and labels into the final dataset.")
print(f"Skipped {skipped_missing_label} images without a label file and "
      f"{skipped_no_target_class} images without any mapped class.")

Starting file processing and unification...
Found 1559 total images to process.


Processing Images:   0%|          | 0/1559 [00:00<?, ?it/s]


✅ Processing complete!
Unified 1511 images and labels into the final dataset.


In [None]:


# Class names ordered by their unified index, so list position == class id.
class_names = sorted(UNIFIED_CLASSES, key=UNIFIED_CLASSES.get)

final_yaml_path = os.path.join(PROCESSED_DATA_PATH, 'data.yaml')

# Keyword order is preserved and the dump below does not sort keys, so the file
# is written as: path, train, val, test, names.
yaml_content = dict(
    path=PROCESSED_DATA_PATH,  # Absolute path to the dataset root
    train='images',            # Relative path to training images
    val='images',              # Relative path to validation images (we will split later)
    test='images',             # Relative path to test images (we will split later)
    names=class_names,
)

with open(final_yaml_path, 'w') as yaml_file:
    yaml.dump(yaml_content, yaml_file, sort_keys=False)

print(f"Created final data.yaml file at: {final_yaml_path}")
print("\n--- Contents of data.yaml ---")
# Read the file back from disk so the printout shows what was actually written.
with open(final_yaml_path, 'r') as yaml_file:
    written_text = yaml_file.read()
print(written_text)

print("\n\n🎉 ALL DONE! Your unified damage dataset is ready for training in your Google Drive.")

Created final data.yaml file at: /content/drive/MyDrive/inDrive_hackathon/data/processed/damage_dataset_yolo/data.yaml

--- Contents of data.yaml ---
path: /content/drive/MyDrive/inDrive_hackathon/data/processed/damage_dataset_yolo
train: images
val: images
test: images
names:
- scratch
- dent
- rust



🎉 ALL DONE! Your unified damage dataset is ready for training in your Google Drive.
