In [None]:
import os
import shutil

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Copy all the images from their folders into a single folder

In [None]:
def copy_images(source_dir, target_dir, extensions=None):
    """Flatten an image tree by copying every image under ``source_dir`` into ``target_dir``.

    When two different files share a name, the later one is stored as
    ``<base>_1<ext>``, ``<base>_2<ext>``, ... . A source file whose content is
    already present under its own name or one of those numbered variants is
    skipped, so running the function again does not pile up duplicate copies.

    Args:
        source_dir: Directory that is walked recursively for images.
        target_dir: Directory that receives the copies; created if missing.
        extensions: Iterable of file suffixes to accept (matched
            case-insensitively). Defaults to ``['.jpg', '.jpeg', '.png']``.
    """
    import filecmp  # local import: only this function needs it

    if extensions is None:
        extensions = ['.jpg', '.jpeg', '.png']
    suffixes = tuple(ext.lower() for ext in extensions)

    os.makedirs(target_dir, exist_ok=True)
    target_abs = os.path.abspath(target_dir)

    for root, dirs, files in os.walk(source_dir):
        # Do not descend into the target itself (it may live below source_dir),
        # otherwise freshly written copies would be picked up as new sources.
        dirs[:] = [d for d in dirs
                   if os.path.abspath(os.path.join(root, d)) != target_abs]

        for file in files:
            if not file.lower().endswith(suffixes):
                continue

            source_file = os.path.join(root, file)
            base, extension = os.path.splitext(file)

            # Probe <file>, <base>_1<ext>, <base>_2<ext>, ... until we either
            # find an identical copy (nothing to do) or a free name (copy there).
            candidate = os.path.join(target_dir, file)
            i = 0
            while os.path.exists(candidate):
                if filecmp.cmp(source_file, candidate, shallow=False):
                    break  # this exact content is already in the target
                i += 1
                candidate = os.path.join(target_dir, f"{base}_{i}{extension}")
            else:
                shutil.copy2(source_file, candidate)

# Flatten the images from the sub-folders of raw_224/separate into raw_224 itself.
source_directory = '/content/drive/MyDrive/Masterthesis/Datasets/Waterbird/raw_224/separate'
target_directory = '/content/drive/MyDrive/Masterthesis/Datasets/Waterbird/raw_224'

copy_images(source_directory, target_directory)

Resize all images to 224x224

In [None]:
from PIL import Image

def resize_images(directory, size=(224, 224)):
    """Resize every image directly inside ``directory`` to ``size``, in place.

    Only files ending in .png/.jpg/.jpeg (case-insensitive) are considered and
    sub-directories are not searched. Images that already have the requested
    size are left untouched, so re-running the cell does not re-encode them.
    A failure on one file is reported and the loop continues with the next.

    Args:
        directory: Folder whose images are overwritten with resized versions.
        size: Target ``(width, height)`` in pixels.
    """
    # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the filter it aliased.
    # Looked up outside the try block so that a missing attribute fails loudly
    # instead of being reported as a per-file error for every image.
    resample = Image.LANCZOS
    wanted = tuple(size)

    for filename in os.listdir(directory):
        if not filename.lower().endswith(('.png', '.jpg', '.jpeg')):
            continue

        file_path = os.path.join(directory, filename)
        try:
            with Image.open(file_path) as img:
                if img.size == wanted:
                    continue  # already the right size; avoid a lossy re-save
                resized = img.resize(wanted, resample)
            # The source file is closed at this point; overwrite it.
            resized.save(file_path)
            print(f"Resized {filename}")
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole batch.
            print(f"Error processing {filename}: {e}")

# Folder whose images are overwritten with their resized versions.
target_directory = '/content/drive/MyDrive/Masterthesis/Datasets/Waterbird/raw_224'

resize_images(target_directory)


Load meta.csv and strip the directory prefix (everything up to and including the last slash) from each `img_filename`, so that the entries match the image file names later on

In [None]:
import pandas as pd

# Define the path to the CSV file (the modified table is written back to this same path below)
csv_path = '/content/drive/MyDrive/Masterthesis/Datasets/Waterbird/meta.csv'

# Load the CSV file into a DataFrame
df = pd.read_csv(csv_path)

# Define a function to remove characters before the slash and the slash itself
def modify_filename(filename):
    """Return the part of ``filename`` after its last '/' (the whole string if it has none)."""
    _directory, _slash, tail = filename.rpartition('/')
    return tail

# Apply the function to the 'img_filename' column
df['img_filename'] = df['img_filename'].apply(modify_filename)

print("First 10 filenames after modification:")
print(df['img_filename'].head(10))

# Save the updated DataFrame back to a CSV file
# NOTE(review): this overwrites the original meta.csv in place, so the directory
# part of each filename is no longer available afterwards — consider keeping a backup.
df.to_csv(csv_path, index=False)

First 10 filenames after modification:
0        Black_Footed_Albatross_0046_18.jpg
1        Black_Footed_Albatross_0009_34.jpg
2        Black_Footed_Albatross_0002_55.jpg
3        Black_Footed_Albatross_0074_59.jpg
4        Black_Footed_Albatross_0014_89.jpg
5        Black_Footed_Albatross_0085_92.jpg
6       Black_Footed_Albatross_0031_100.jpg
7    Black_Footed_Albatross_0051_796103.jpg
8    Black_Footed_Albatross_0010_796097.jpg
9    Black_Footed_Albatross_0025_796057.jpg
Name: img_filename, dtype: object


Split the images by label `y`, using the image file names listed in meta.csv:
1. Copy the images that have y = 1 to the data/processed/waterbird_224 directory
2. Copy the images that have y = 0 to the data/processed/no_waterbird_224 directory

In [None]:
# Define the paths: the flattened source images and one output folder per value of y
source_directory = '/content/drive/MyDrive/Masterthesis/Datasets/Waterbird/raw_224'
waterbird_directory = '/content/drive/MyDrive/Masterthesis/Datasets/Waterbird/data/processed/waterbird_224'
no_waterbird_directory = '/content/drive/MyDrive/Masterthesis/Datasets/Waterbird/data/processed/no_waterbird_224'

# Load the CSV file into a DataFrame (csv_path is defined in the meta.csv cell above)
df = pd.read_csv(csv_path)

# Ensure the target directories exist
os.makedirs(waterbird_directory, exist_ok=True)
os.makedirs(no_waterbird_directory, exist_ok=True)

# Function to copy images based on the filter
def copy_filtered_images(df, source_dir, target_dir, filter_value, place_value=None):
    """Copy the images whose label ``y`` equals ``filter_value`` into ``target_dir``.

    Args:
        df: DataFrame with an ``img_filename`` column and a ``y`` column
            (plus a ``place`` column when ``place_value`` is given).
        source_dir: Folder holding the image files named in ``img_filename``.
        target_dir: Destination folder; created if it does not exist yet.
        filter_value: Value of ``y`` a row must have to be selected.
        place_value: Optional value of ``place`` to filter on as well.
            ``None`` (the default) selects on ``y`` only.

    Files listed in ``df`` but absent from ``source_dir`` are reported and
    skipped. A one-line summary is printed at the end so the totals stay
    visible even when the per-file output is truncated.
    """
    mask = df['y'] == filter_value
    if place_value is not None:
        mask = mask & (df['place'] == place_value)

    os.makedirs(target_dir, exist_ok=True)

    copied = 0
    missing = 0
    for filename in df.loc[mask, 'img_filename']:
        source_path = os.path.join(source_dir, filename)
        target_path = os.path.join(target_dir, filename)
        if os.path.exists(source_path):  # Check if the source file exists
            shutil.copy2(source_path, target_path)
            copied += 1
            print(f"Copied {filename} to {target_dir}")
        else:
            missing += 1
            print(f"File {filename} not found in source directory.")

    print(f"Summary for {target_dir}: {copied} copied, {missing} not found.")

# Copy images with y=1 to the waterbird_224 directory
copy_filtered_images(df, source_directory, waterbird_directory, 1)

# Copy images with y=0 to the no_waterbird_224 directory
copy_filtered_images(df, source_directory, no_waterbird_directory, 0)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Copied American_Redstart_0137_102848.jpg to /content/drive/MyDrive/Masterthesis/Datasets/Waterbird/data/processed/no_waterbird_224
Copied American_Redstart_0096_102853.jpg to /content/drive/MyDrive/Masterthesis/Datasets/Waterbird/data/processed/no_waterbird_224
Copied American_Redstart_0087_103371.jpg to /content/drive/MyDrive/Masterthesis/Datasets/Waterbird/data/processed/no_waterbird_224
Copied American_Redstart_0047_102860.jpg to /content/drive/MyDrive/Masterthesis/Datasets/Waterbird/data/processed/no_waterbird_224
Copied American_Redstart_0034_102866.jpg to /content/drive/MyDrive/Masterthesis/Datasets/Waterbird/data/processed/no_waterbird_224
Copied American_Redstart_0088_103892.jpg to /content/drive/MyDrive/Masterthesis/Datasets/Waterbird/data/processed/no_waterbird_224
Copied American_Redstart_0138_102869.jpg to /content/drive/MyDrive/Masterthesis/Datasets/Waterbird/data/processed/no_waterbird_224
Copied American_Re

Filter all image file names that have y = 1 and place = 0 and select all images of these names and copy them to the folder: waterbirds_patch_224

Filter all image file names that have y = 1 and place = 1 and select all images of these names and copy them to the folder: waterbirds_nopatch_224

Filter all image file names that have y = 0 and place = 1 and select all images of these names and copy them to the folder: no_waterbirds_patch_224

Filter all image file names that have y = 0 and place = 0 and select all images of these names and copy them to the folder: no_waterbirds_nopatch_224

In [None]:
# One output folder per (y, place) combination
waterbirds_patch_directory = '/content/drive/MyDrive/Masterthesis/Datasets/Waterbird/waterbirds_patch_224'
waterbirds_nopatch_directory = '/content/drive/MyDrive/Masterthesis/Datasets/Waterbird/waterbirds_nopatch_224'
no_waterbirds_patch_directory = '/content/drive/MyDrive/Masterthesis/Datasets/Waterbird/no_waterbirds_patch_224'
no_waterbirds_nopatch_directory = '/content/drive/MyDrive/Masterthesis/Datasets/Waterbird/no_waterbirds_nopatch_224'


# Ensure the target directories exist
os.makedirs(waterbirds_patch_directory, exist_ok=True)
os.makedirs(waterbirds_nopatch_directory, exist_ok=True)
os.makedirs(no_waterbirds_patch_directory, exist_ok=True)
os.makedirs(no_waterbirds_nopatch_directory, exist_ok=True)



# Function to copy images based on the filter
def copy_filtered_images(df, source_dir, target_dir, y_value, place_value=None):
    """Copy the images selected by ``y`` (and optionally ``place``) into ``target_dir``.

    This definition replaces the earlier y-only ``copy_filtered_images`` in the
    notebook namespace. ``place_value`` defaults to ``None`` so that the earlier
    four-argument calls keep working if their cell is run again afterwards.

    Args:
        df: DataFrame with ``img_filename``, ``y`` and ``place`` columns.
        source_dir: Folder holding the image files named in ``img_filename``.
        target_dir: Destination folder; created if it does not exist yet.
        y_value: Value of ``y`` a row must have to be selected.
        place_value: Value of ``place`` a row must have to be selected, or
            ``None`` to select on ``y`` only.

    Files listed in ``df`` but absent from ``source_dir`` are reported and
    skipped. A one-line summary is printed at the end so the totals stay
    visible even when the per-file output is truncated.
    """
    mask = df['y'] == y_value
    if place_value is not None:
        mask = mask & (df['place'] == place_value)

    os.makedirs(target_dir, exist_ok=True)

    copied = 0
    missing = 0
    for filename in df.loc[mask, 'img_filename']:
        source_path = os.path.join(source_dir, filename)
        target_path = os.path.join(target_dir, filename)
        if os.path.exists(source_path):  # Check if the source file exists
            shutil.copy2(source_path, target_path)
            copied += 1
            print(f"Copied {filename} to {target_dir}")
        else:
            missing += 1
            print(f"File {filename} not found in source directory.")

    print(f"Summary for {target_dir}: {copied} copied, {missing} not found.")

# Copy images with y=1 and place=0 to the waterbirds_patch_224 directory
copy_filtered_images(df, source_directory, waterbirds_patch_directory, 1, 0)
# Copy images with y=1 and place=1 to the waterbirds_nopatch_224 directory
copy_filtered_images(df, source_directory, waterbirds_nopatch_directory, 1, 1)
# Copy images with y=0 and place=1 to the no_waterbirds_patch_224 directory
copy_filtered_images(df, source_directory, no_waterbirds_patch_directory, 0, 1)
# Copy images with y=0 and place=0 to the no_waterbirds_nopatch_224 directory
copy_filtered_images(df, source_directory, no_waterbirds_nopatch_directory, 0, 0)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Copied Vermilion_Flycatcher_0002_42390.jpg to /content/drive/MyDrive/Masterthesis/Datasets/Waterbird/no_waterbirds_nopatch_224
Copied Vermilion_Flycatcher_0004_42395.jpg to /content/drive/MyDrive/Masterthesis/Datasets/Waterbird/no_waterbirds_nopatch_224
Copied Vermilion_Flycatcher_0061_42397.jpg to /content/drive/MyDrive/Masterthesis/Datasets/Waterbird/no_waterbirds_nopatch_224
Copied Vermilion_Flycatcher_0040_42398.jpg to /content/drive/MyDrive/Masterthesis/Datasets/Waterbird/no_waterbirds_nopatch_224
Copied Vermilion_Flycatcher_0039_42423.jpg to /content/drive/MyDrive/Masterthesis/Datasets/Waterbird/no_waterbirds_nopatch_224
Copied Vermilion_Flycatcher_0034_42356.jpg to /content/drive/MyDrive/Masterthesis/Datasets/Waterbird/no_waterbirds_nopatch_224
Copied Vermilion_Flycatcher_0071_42429.jpg to /content/drive/MyDrive/Masterthesis/Datasets/Waterbird/no_waterbirds_nopatch_224
Copied Vermilion_Flycatcher_0063_42179.jpg to 

In [None]:
# Count the number of rows in the CSV file (df is the table loaded from meta.csv in an earlier cell)
num_rows = len(df)
print(f"Number of rows in the CSV file: {num_rows}")

# Function to count the number of images in a directory
def count_images(directory):
    """Return how many entries directly inside ``directory`` end in .png, .jpg or .jpeg (any case)."""
    image_suffixes = ('.png', '.jpg', '.jpeg')
    total = 0
    for entry in os.listdir(directory):
        if entry.lower().endswith(image_suffixes):
            total += 1
    return total

# Count the number of images in the source directory
# NOTE(review): the recorded output reports more images than CSV rows, presumably
# because of renamed duplicate copies in the folder — confirm with the duplicate check.
num_images = count_images(source_directory)
print(f"Number of images in the source directory: {num_images}")

Number of rows in the CSV file: 11788
Number of images in the source directory: 20203


## Check for duplicates