<a href="https://colab.research.google.com/github/fatimaatharkhan/AI-Hackathon24/blob/main/preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Mount the Drive

In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive so the dataset folders referenced
# later in this notebook (under /content/drive/MyDrive/...) are reachable.
# `google.colab` is only importable inside a Colab runtime.
drive.mount('/content/drive')

Mounted at /content/drive


Import Libraries

In [None]:
import os
import shutil
from sklearn.model_selection import train_test_split

Count the number of files in a given folder

In [None]:
def count_files_in_folder(folder_path):
    """Return how many regular files sit directly inside ``folder_path``.

    Sub-directories are not counted and the folder is not searched
    recursively. If the folder does not exist, a message is printed and
    0 is returned instead of raising.
    """
    try:
        entries = os.listdir(folder_path)
    except FileNotFoundError:
        print(f"Folder not found: {folder_path}")
        return 0

    file_count = 0
    for entry in entries:
        # Only plain files count; directories are skipped.
        if os.path.isfile(os.path.join(folder_path, entry)):
            file_count += 1
    return file_count

Remove unpaired images or labels

In [None]:
import os

def find_and_handle_unpaired_files(image_folder, label_folder,
                                   image_extensions=(".jpg", ".jpeg", ".png")):
    """Delete images that have no label file and labels that have no image.

    An image and a label are considered a pair when they share the same
    file stem (name without extension). Images are the files in
    ``image_folder`` whose extension is in ``image_extensions``; labels are
    the ``.txt`` files in ``label_folder``. Extensions are compared
    case-insensitively, so ``photo.PNG`` and ``photo.JPEG`` are handled.

    WARNING: this permanently removes files from both folders. Any ``.txt``
    file in ``label_folder`` without a same-named image is deleted, which
    includes auxiliary files such as a ``classes.txt``.

    Args:
        image_folder: Directory containing the image files.
        label_folder: Directory containing the ``.txt`` label files.
        image_extensions: Extensions (with leading dot) treated as images.

    Returns:
        None. One line is printed for every deleted file.

    Raises:
        FileNotFoundError: If either folder does not exist.
    """
    image_exts = tuple(ext.lower() for ext in image_extensions)

    def stems_to_files(folder, extensions):
        """Map each file stem in ``folder`` to the matching file names."""
        mapping = {}
        for name in sorted(os.listdir(folder)):
            stem, ext = os.path.splitext(name)
            if ext.lower() in extensions and os.path.isfile(os.path.join(folder, name)):
                mapping.setdefault(stem, []).append(name)
        return mapping

    # Index what is really on disk instead of guessing "<stem><ext>" names:
    # guessing misses extensions spelled with a different case and stops
    # after the first hit when several images share a stem.
    images_by_stem = stems_to_files(image_folder, image_exts)
    labels_by_stem = stems_to_files(label_folder, (".txt",))

    # Images without a label
    for stem in sorted(images_by_stem.keys() - labels_by_stem.keys()):
        for name in images_by_stem[stem]:
            image_path = os.path.join(image_folder, name)
            os.remove(image_path)
            print(f"Deleted unpaired image: {image_path}")

    # Labels without an image
    for stem in sorted(labels_by_stem.keys() - images_by_stem.keys()):
        for name in labels_by_stem[stem]:
            label_path = os.path.join(label_folder, name)
            os.remove(label_path)
            print(f"Deleted unpaired label: {label_path}")


Split Dataset into training, validation and testing sets

In [None]:
def split_dataset(image_folder, label_folder, output_dirs, train_ratio=0.7, valid_ratio=0.15,
                  image_extensions=(".jpg", ".jpeg", ".png"), random_state=42):
    """Copy paired image/label files into train, validation and test folders.

    A pair is an image in ``image_folder`` and a ``.txt`` file in
    ``label_folder`` that share the same file stem. Extensions are matched
    case-insensitively. The source files are copied, not moved.

    Args:
        image_folder: Directory containing the source images.
        label_folder: Directory containing the source ``.txt`` labels.
        output_dirs: Mapping with the keys ``"train"``, ``"valid"`` and
            ``"test"``; each value is a mapping with the keys ``"img"`` and
            ``"labels"`` giving the destination directories. All of them
            are created if missing.
        train_ratio: Fraction of the pairs used for training.
        valid_ratio: Fraction of the pairs used for validation. The test
            set receives the remaining ``1 - train_ratio - valid_ratio``.
        image_extensions: Extensions (with leading dot) treated as images.
        random_state: Seed passed to ``train_test_split``.

    Returns:
        None. A summary line and a completion message are printed.

    Raises:
        FileNotFoundError: If ``image_folder`` or ``label_folder`` is missing.
        ValueError: If the ratios are out of range or sum to more than 1.
    """
    test_ratio = 1.0 - train_ratio - valid_ratio
    if not 0 < train_ratio <= 1 or valid_ratio < 0 or test_ratio < -1e-9:
        raise ValueError(
            "train_ratio must be in (0, 1], valid_ratio must be >= 0 and "
            f"their sum must not exceed 1 (got {train_ratio} and {valid_ratio})"
        )
    test_ratio = max(0.0, test_ratio)  # absorb float noise such as 1 - 0.8 - 0.2

    image_exts = tuple(ext.lower() for ext in image_extensions)

    def index_by_stem(folder, extensions):
        """Map file stem -> file name for files in ``folder`` with a matching extension."""
        found = {}
        for name in sorted(os.listdir(folder)):
            stem, ext = os.path.splitext(name)
            if ext.lower() in extensions and os.path.isfile(os.path.join(folder, name)):
                found.setdefault(stem, name)  # first name in sorted order wins
        return found

    # Read the sources first so that a wrong path fails before any output
    # directory has been created.
    images = index_by_stem(image_folder, image_exts)
    labels = index_by_stem(label_folder, (".txt",))

    # Sorted, because set iteration order for strings changes between
    # interpreter sessions and would make the seeded split irreproducible.
    paired_files = sorted(images.keys() & labels.keys())

    # Ensure output directories exist
    for split in output_dirs.values():
        for folder in split.values():
            os.makedirs(folder, exist_ok=True)

    # Work with absolute counts: float fractions such as 1 - 0.7 are not
    # exact and can shift a sample between splits when rounded up.
    total = len(paired_files)
    n_valid = min(total, round(total * valid_ratio))
    n_test = min(total - n_valid, round(total * test_ratio))

    def take(items, n_second):
        """Split ``items`` into (rest, second) with exactly ``n_second`` items in second."""
        # train_test_split rejects empty partitions, so handle them here.
        if n_second <= 0:
            return list(items), []
        if n_second >= len(items):
            return [], list(items)
        return train_test_split(items, test_size=n_second, random_state=random_state)

    train_files, holdout_files = take(paired_files, n_valid + n_test)
    # ``test_size`` sizes the SECOND returned list, so it must be the test
    # count; the validation set is what remains of the hold-out.
    valid_files, test_files = take(holdout_files, n_test)

    def copy_files(stems, dest_img_dir, dest_label_dir):
        """Copy the image and the label of every stem to the destination folders."""
        for stem in stems:
            shutil.copy(os.path.join(image_folder, images[stem]), dest_img_dir)
            shutil.copy(os.path.join(label_folder, labels[stem]), dest_label_dir)

    # Copy files to respective directories
    copy_files(train_files, output_dirs["train"]["img"], output_dirs["train"]["labels"])
    copy_files(valid_files, output_dirs["valid"]["img"], output_dirs["valid"]["labels"])
    copy_files(test_files, output_dirs["test"]["img"], output_dirs["test"]["labels"])

    print(f"Pairs: {total} (train={len(train_files)}, valid={len(valid_files)}, test={len(test_files)})")
    print("Dataset split completed!")

Preprocessing Flooded Data

In [None]:
print("\nProcessing allFloodedData dataset")

# Source layout: <base>/images and <base>/labels on the mounted Drive.
base_folder_allFlooded = "/content/drive/MyDrive/unityData/allFloodedData/finaleval"
image_folder_allFlooded = os.path.join(base_folder_allFlooded, "images")
label_folder_allFlooded = os.path.join(base_folder_allFlooded, "labels")

# Destination layout: <base>/<split>/images and <base>/<split>/labels.
output_dirs_allFlooded = {
    split_name: {
        "img": os.path.join(base_folder_allFlooded, split_name, "images"),
        "labels": os.path.join(base_folder_allFlooded, split_name, "labels"),
    }
    for split_name in ("train", "valid", "test")
}

# Count files
print(f"Number of images: {count_files_in_folder(image_folder_allFlooded)}")
print(f"Number of labels: {count_files_in_folder(label_folder_allFlooded)}")

# The clean-up and split steps list both folders and raise FileNotFoundError
# when one is missing, so check first and report the problem clearly.
missing_allFlooded = [
    folder
    for folder in (image_folder_allFlooded, label_folder_allFlooded)
    if not os.path.isdir(folder)
]
if missing_allFlooded:
    print(
        "Skipping allFloodedData: source folder(s) not found - check that "
        "Drive is mounted and the dataset path is correct: "
        + ", ".join(missing_allFlooded)
    )
else:
    # Handle unpaired files
    find_and_handle_unpaired_files(image_folder_allFlooded, label_folder_allFlooded)

    # Split dataset
    split_dataset(image_folder_allFlooded, label_folder_allFlooded, output_dirs_allFlooded)



Processing allFloodedData dataset
Folder not found: /content/drive/MyDrive/unityData/allFloodedData/finaleval/images
Number of images: 0
Folder not found: /content/drive/MyDrive/unityData/allFloodedData/finaleval/labels
Number of labels: 0


FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/unityData/allFloodedData/finaleval/images'

Preprocessing Non-Flooded Data

In [None]:
print("\nProcessing nonFlooded dataset")

# Source layout: <base>/images and <base>/labels on the mounted Drive.
base_folder = "/content/drive/MyDrive/unityData/nonflooded"
image_folder = os.path.join(base_folder, "images")
label_folder = os.path.join(base_folder, "labels")

# Destination layout: <base>/<split>/images and <base>/<split>/labels.
output_dirs = {
    split_name: {
        "img": os.path.join(base_folder, split_name, "images"),
        "labels": os.path.join(base_folder, split_name, "labels"),
    }
    for split_name in ("train", "valid", "test")
}

# Count files
print(f"Number of images: {count_files_in_folder(image_folder)}")
print(f"Number of labels: {count_files_in_folder(label_folder)}")

# The clean-up and split steps list both folders and raise FileNotFoundError
# when one is missing, so check first and report the problem clearly.
missing_folders = [
    folder for folder in (image_folder, label_folder) if not os.path.isdir(folder)
]
if missing_folders:
    print(
        "Skipping nonFlooded: source folder(s) not found - check that "
        "Drive is mounted and the dataset path is correct: "
        + ", ".join(missing_folders)
    )
else:
    # Handle unpaired files
    find_and_handle_unpaired_files(image_folder, label_folder)

    # Split dataset
    split_dataset(image_folder, label_folder, output_dirs)


Processing nonFlooded dataset
Folder not found: /content/drive/MyDrive/unityData/nonflooded/images
Number of images: 0
Folder not found: /content/drive/MyDrive/unityData/nonflooded/labels
Number of labels: 0


FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/unityData/nonflooded/images'