In [None]:
import os
import shutil
from google.colab import drive

# Mount Google Drive so the project folder is reachable from the Colab VM.
drive.mount('/content/drive')


# --- Project locations -------------------------------------------------------
# Everything persistent lives under BASE_PATH on Drive; TEMP_DIR is scratch
# space on the Colab VM's local disk.
BASE_PATH = '/content/drive/MyDrive/inDrive_hackathon'
RAW_ZIP_PATH = os.path.join(BASE_PATH, 'data', 'raw', 'dirt_finding_dataset.zip')
TEMP_DIR = '/content/temp_cleanliness'
PROCESSED_DATA_DIR = os.path.join(BASE_PATH, 'data', 'processed', 'cleanliness_dataset')
PROCESSED_CLEAN_DIR, PROCESSED_DIRTY_DIR = (
    os.path.join(PROCESSED_DATA_DIR, _category) for _category in ('clean', 'dirty')
)

# Always start from an empty scratch directory: remove leftovers from a
# previous run, then recreate the folder.
if os.path.exists(TEMP_DIR):
    shutil.rmtree(TEMP_DIR)
os.makedirs(TEMP_DIR)

# The output folders are created if missing and kept if already present.
for _output_dir in (PROCESSED_CLEAN_DIR, PROCESSED_DIRTY_DIR):
    os.makedirs(_output_dir, exist_ok=True)


# Summarise the configuration in one write.
print("\n".join([
    "✅ Setup complete!",
    f"Base project path: {BASE_PATH}",
    f"Final 'clean' images will be saved to: {PROCESSED_CLEAN_DIR}",
    f"Final 'dirty' images will be saved to: {PROCESSED_DIRTY_DIR}",
]))

# Report whether the raw archive is where the later cells expect it.
# A missing archive is only reported here; nothing is raised.
if os.path.exists(RAW_ZIP_PATH):
    print(f"\n✅ Successfully located raw zip file: {RAW_ZIP_PATH}")
else:
    print(f"\n❌ ERROR: Raw zip file not found!")
    print(f"Please check that this path is correct and the file exists: {RAW_ZIP_PATH}")

Mounted at /content/drive
✅ Setup complete!
Base project path: /content/drive/MyDrive/inDrive_hackathon
Final 'clean' images will be saved to: /content/drive/MyDrive/inDrive_hackathon/data/processed/cleanliness_dataset/clean
Final 'dirty' images will be saved to: /content/drive/MyDrive/inDrive_hackathon/data/processed/cleanliness_dataset/dirty

✅ Successfully located raw zip file: /content/drive/MyDrive/inDrive_hackathon/data/raw/dirt_finding_dataset.zip


In [None]:
import glob
from tqdm.notebook import tqdm


# Extract the raw dataset archive into the scratch directory.
print(f"Unzipping {RAW_ZIP_PATH}...")
shutil.unpack_archive(filename=RAW_ZIP_PATH, extract_dir=TEMP_DIR)
print("Unzipping complete.")

print("\nSorting images into 'clean' and 'dirty' categories...")


# Collect every *.txt file that sits directly inside a directory named
# `labels`, at any depth below the scratch directory.
_label_pattern = os.path.join(TEMP_DIR, '**', 'labels', '*.txt')
label_files = glob.glob(_label_pattern, recursive=True)


def find_image_for_label(label_p):
    """Locate the image file that corresponds to a label file.

    The image is expected at the same relative location as the label, but
    under an ``images`` directory instead of ``labels``, with the same file
    stem and one of the supported image extensions.

    Args:
        label_p: Path to a label file, e.g. ``.../train/labels/car_01.txt``.
            Forward slashes are expected as the path separator.

    Returns:
        The path of the first existing candidate image, or ``None`` when no
        candidate exists. Candidates are tried in the order ``.jpg``,
        ``.jpeg``, ``.png``; for each one the lowercase spelling is tried
        before the uppercase spelling.
    """
    # Swap only the LAST '/labels/' component (the one that directly leads to
    # the file). Replacing every occurrence would also rewrite an unrelated
    # ancestor directory that happens to be called "labels".
    head, sep, tail = label_p.rpartition('/labels/')
    mirrored_path = head + '/images/' + tail if sep else label_p

    # splitext only inspects the final path component, so dots in directory
    # names cannot truncate the path.
    base_path = os.path.splitext(mirrored_path)[0]

    for ext in ('.jpg', '.jpeg', '.png'):
        # Also accept upper-case extensions such as ".JPG".
        for candidate_ext in (ext, ext.upper()):
            img_p = base_path + candidate_ext
            if os.path.exists(img_p):
                return img_p
    return None

def _label_has_annotations(label_p):
    """Return True when the label file contains at least one non-whitespace byte."""
    with open(label_p, 'rb') as label_fh:
        return bool(label_fh.read().strip())


clean_count = 0
dirty_count = 0
not_found_count = 0
# Destination paths written during this run. shutil.copy2 into a directory
# keeps the source file name, so two source images with the same name that
# land in the same folder overwrite each other.
written_targets = set()
overwritten_count = 0

for label_path in tqdm(label_files, desc="Processing files"):
    img_path = find_image_for_label(label_path)

    if img_path is None:
        not_found_count += 1
        continue

    # A label file with real content marks the image as 'dirty'. A file that
    # is empty OR holds only whitespace/newlines carries no annotation, so the
    # image is 'clean' (a plain size check would misclassify the latter).
    is_dirty = _label_has_annotations(label_path)
    target_dir = PROCESSED_DIRTY_DIR if is_dirty else PROCESSED_CLEAN_DIR

    target_path = os.path.join(target_dir, os.path.basename(img_path))
    if target_path in written_targets:
        overwritten_count += 1
    written_targets.add(target_path)

    shutil.copy2(img_path, target_dir)
    if is_dirty:
        dirty_count += 1
    else:
        clean_count += 1

print("\n✅ Processing complete!")
print(f"Sorted {clean_count} images into the 'clean' category.")
print(f"Sorted {dirty_count} images into the 'dirty' category.")
if not_found_count > 0:
    print(f"Warning: Could not find a matching image for {not_found_count} label files.")
if overwritten_count > 0:
    # The counts above include these images, but fewer files exist on disk.
    print(f"Warning: {overwritten_count} images had the same file name as an earlier image "
          f"in the same category and overwrote it.")

Unzipping /content/drive/MyDrive/inDrive_hackathon/data/raw/dirt_finding_dataset.zip...
Unzipping complete.

Sorting images into 'clean' and 'dirty' categories...


Processing files:   0%|          | 0/512 [00:00<?, ?it/s]


✅ Processing complete!
Sorted 1 images into the 'clean' category.
Sorted 511 images into the 'dirty' category.


In [None]:
import os
import shutil
import glob
from tqdm.notebook import tqdm

# --- Locations and settings for this step ------------------------------------
CLEAN_CARS_ZIP_PATH = os.path.join(BASE_PATH, 'data', 'raw', 'Car_model_detection_subset.zip')
TEMP_CLEAN_DIR = '/content/temp_clean_cars'
PROCESSED_CLEAN_DIR = os.path.join(BASE_PATH, 'data', 'processed', 'cleanliness_dataset', 'clean')
PROCESSED_DIRTY_DIR = os.path.join(BASE_PATH, 'data', 'processed', 'cleanliness_dataset', 'dirty')

# File extensions (compared case-insensitively) that count as images.
IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg')
# The 'clean' folder must hold more than this many files for the success message.
MIN_CLEAN_IMAGES = 200

# Start from an empty scratch directory on every run.
if os.path.exists(TEMP_CLEAN_DIR):
    shutil.rmtree(TEMP_CLEAN_DIR)
os.makedirs(TEMP_CLEAN_DIR)

print(f"Unzipping {os.path.basename(CLEAN_CARS_ZIP_PATH)}...")
if os.path.exists(CLEAN_CARS_ZIP_PATH):
    shutil.unpack_archive(CLEAN_CARS_ZIP_PATH, TEMP_CLEAN_DIR)
    print("Unzipping complete.")
else:
    print(f"❌ ERROR: Zip file not found at {CLEAN_CARS_ZIP_PATH}. Please check the path and filename.")
    # Carry the missing path in the exception so the traceback is self-explanatory.
    raise FileNotFoundError(f"Zip file not found: {CLEAN_CARS_ZIP_PATH}")

# Every file with a dot in its name below the 'train' and 'valid' folders.
# This also matches non-image files, so the result is filtered below.
train_images = glob.glob(os.path.join(TEMP_CLEAN_DIR, 'train', '**', '*.*'), recursive=True)
valid_images = glob.glob(os.path.join(TEMP_CLEAN_DIR, 'valid', '**', '*.*'), recursive=True)

# Keep only regular files with an image extension BEFORE reporting, so the
# "Found ..." message and the progress bar total match what is actually copied.
all_clean_images_to_copy = [
    candidate
    for candidate in train_images + valid_images
    if os.path.isfile(candidate) and candidate.lower().endswith(IMAGE_EXTENSIONS)
]
print(f"Found {len(all_clean_images_to_copy)} new clean images to copy.")

copied_count = 0
for img_path in tqdm(all_clean_images_to_copy, desc="Copying clean images"):
    # copy2 into a directory keeps the source file name; an existing file with
    # the same name in PROCESSED_CLEAN_DIR is overwritten.
    shutil.copy2(img_path, PROCESSED_CLEAN_DIR)
    copied_count += 1

# Totals are directory entry counts, i.e. everything currently in each folder.
total_clean_images = len(os.listdir(PROCESSED_CLEAN_DIR))
total_dirty_images = len(os.listdir(PROCESSED_DIRTY_DIR))


print(f"\n✅ Copied {copied_count} new images to the 'clean' directory.")
print("\n--- Final Dataset Status ---")
print(f"Total images in 'clean' folder: {total_clean_images}")
print(f"Total images in 'dirty' folder: {total_dirty_images}")

# NOTE(review): this only checks the absolute number of clean images against a
# fixed threshold; it does not compare it with the 'dirty' count.
if total_clean_images > MIN_CLEAN_IMAGES:
    print("\n🎉 Excellent! The dataset is now much more balanced and ready for training.")
else:
    print("\nWarning: The number of clean images is still low. The dataset is imbalanced.")

Unzipping Car_model_detection_subset.zip...
Unzipping complete.
Found 962 new clean images to copy.


Copying clean images:   0%|          | 0/962 [00:00<?, ?it/s]


✅ Copied 481 new images to the 'clean' directory.

--- Final Dataset Status ---
Total images in 'clean' folder: 482
Total images in 'dirty' folder: 511

🎉 Excellent! The dataset is now much more balanced and ready for training.
