## Step 1 – Imports & Paths

In [9]:
import os
import numpy as np
import pandas as pd
from PIL import Image
import cv2

# Root folder of the dataset; "." makes every path below relative to the
# current working directory.
BASE_DIR = "."

# Paths of the training table and of the folder holding the training images.
TRAIN_CSV = os.path.join(BASE_DIR, "train.csv")
TRAIN_IMG_DIR = os.path.join(BASE_DIR, "train_images")

# Sanity check: report whether the expected inputs are present on disk.
print("train.csv exists:", os.path.exists(TRAIN_CSV))
print("train_images folder exists:", os.path.isdir(TRAIN_IMG_DIR))

train.csv exists: True
train_images folder exists: True


## Step 2 – Load train.csv and attach ONE image per pet

In [10]:
# Read the training table from TRAIN_CSV into a DataFrame.
train_df = pd.read_csv(TRAIN_CSV)
# Print the (rows, columns) shape, then show the first rows as the cell output.
print(train_df.shape)
train_df.head()

(14993, 24)


Unnamed: 0,Type,Name,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,...,Health,Quantity,Fee,State,RescuerID,VideoAmt,Description,PetID,PhotoAmt,AdoptionSpeed
0,2,Nibble,3,299,0,1,1,7,0,1,...,1,1,100,41326,8480853f516546f6cf33aa88cd76c379,0,Nibble is a 3+ month old ball of cuteness. He ...,86e1089a3,1.0,2
1,2,No Name Yet,1,265,0,1,1,2,0,2,...,1,1,0,41401,3082c7125d8fb66f7dd4bff4192c8b14,0,I just found it alone yesterday near my apartm...,6296e909a,2.0,0
2,1,Brisco,1,307,0,1,2,7,0,2,...,1,1,0,41326,fa90fa5b1ee11c86938398b60abc32cb,0,Their pregnant mother was dumped by her irresp...,3422e4906,7.0,3
3,1,Miko,4,307,0,2,1,2,0,2,...,1,1,150,41401,9238e4f44c71a75282e62f7136c6b240,0,"Good guard dog, very alert, active, obedience ...",5842f1ff5,8.0,2
4,1,Hunter,1,307,0,1,1,0,0,2,...,1,1,0,41326,95481e953f8aed9ec3d16fc4509537e8,0,This handsome yet cute boy is up for adoption....,850a43f90,3.0,2


In [11]:
def get_first_image_path(pet_id: str, max_imgs: int = 10) -> str | None:
    """
    Find the lowest-numbered image file that exists on disk for a pet.

    Candidate files are ``<TRAIN_IMG_DIR>/<pet_id>-<n>.jpg`` and are probed in
    ascending order for ``n`` from 1 through ``max_imgs``.

    Returns the first candidate path that exists, or ``None`` when none of the
    probed files is present.
    """
    # Lazy generator: paths are built (and checked) one at a time, so probing
    # stops as soon as the first existing file is found.
    numbered_paths = (
        os.path.join(TRAIN_IMG_DIR, f"{pet_id}-{n}.jpg")
        for n in range(1, max_imgs + 1)
    )
    return next((p for p in numbered_paths if os.path.exists(p)), None)

In [12]:
# Resolve, for every pet, the first image file found on disk (None if there is none).
pet_ids = train_df["PetID"].astype(str)
train_df["image_path"] = pet_ids.apply(get_first_image_path)

# Keep only the pets that have an image and renumber the remaining rows from 0.
has_image = train_df["image_path"].notna()
img_df = train_df.loc[has_image].reset_index(drop=True)
print(img_df.shape)
img_df[["PetID", "image_path"]].head()

(14652, 25)


Unnamed: 0,PetID,image_path
0,86e1089a3,./train_images/86e1089a3-1.jpg
1,6296e909a,./train_images/6296e909a-1.jpg
2,3422e4906,./train_images/3422e4906-1.jpg
3,5842f1ff5,./train_images/5842f1ff5-1.jpg
4,850a43f90,./train_images/850a43f90-1.jpg


## Step 3 – Define image feature functions

We’ll compute 3 simple features per image:

- brightness – average pixel intensity
- colorfulness – how “colorful” the image is
- blur – variance of Laplacian (low = blurry)

In [14]:
def load_image_cv2(path: str):
    """
    Read the image stored at ``path`` with ``cv2.imread`` (BGR channel order).

    Returns the loaded image, or ``None`` when loading fails.
    """
    # cv2.imread already reports a failed load as None, so its result is
    # handed back unchanged.
    return cv2.imread(path)

def compute_brightness(img_bgr: np.ndarray) -> float:
    """
    Estimate brightness as the mean grayscale intensity of a BGR image.

    Returns the mean as a plain Python float.
    """
    grayscale = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
    mean_intensity = np.mean(grayscale)
    return float(mean_intensity)

def compute_colorfulness(img_bgr: np.ndarray) -> float:
    """
    Hasler & Süsstrunk colorfulness metric approximation.

    Two opponent-colour planes are built from the float-converted channels,
    ``rg = |R - G|`` and ``yb = |0.5 * (R + G) - B|``, and the score is
    ``sqrt(std_rg**2 + std_yb**2) + 0.3 * sqrt(mean_rg**2 + mean_yb**2)``.

    Parameters
    ----------
    img_bgr : np.ndarray
        Image of shape (H, W, C) with C >= 3 and the first three channels in
        B, G, R order. Any further channel (e.g. alpha) is ignored.

    Returns
    -------
    float
        The colorfulness score; 0.0 for an image whose pixels are all gray.

    Raises
    ------
    ValueError
        If ``img_bgr`` is not 3-dimensional or has fewer than 3 channels.

    NOTE(review): the opponent planes are taken in absolute value before the
    statistics are computed, which is what makes this an approximation; the
    published metric is usually stated with signed differences. Kept as-is so
    existing feature values do not change -- confirm this is intended.
    """
    if img_bgr.ndim != 3 or img_bgr.shape[2] < 3:
        raise ValueError(
            "expected a BGR image of shape (H, W, >=3), "
            f"got shape {img_bgr.shape}"
        )

    # Convert once to float64 so that uint8 subtraction cannot wrap around.
    pixels = img_bgr.astype(np.float64)
    # Channel views via NumPy indexing: no per-channel copy and no OpenCV call.
    blue, green, red = pixels[..., 0], pixels[..., 1], pixels[..., 2]

    rg = np.abs(red - green)
    yb = np.abs(0.5 * (red + green) - blue)

    std_root = np.sqrt((rg.std() ** 2) + (yb.std() ** 2))
    mean_root = np.sqrt((rg.mean() ** 2) + (yb.mean() ** 2))

    return float(std_root + (0.3 * mean_root))

def compute_blur(img_bgr: np.ndarray) -> float:
    """
    Sharpness score: variance of the Laplacian of the grayscale image.

    Lower values mean a blurrier image.
    """
    grayscale = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
    # CV_64F keeps the Laplacian response in 64-bit floats.
    laplacian = cv2.Laplacian(grayscale, cv2.CV_64F)
    return float(np.var(laplacian))

In [15]:
import time

# Per-image feature values, collected in the same order as the rows of img_df.
brightness_list = []
colorfulness_list = []
blur_list = []
unreadable_count = 0  # images that could not be loaded

# perf_counter is monotonic, so elapsed times are not affected by clock changes.
start_time = time.perf_counter()

# Only the path column is needed, so iterate over it directly and count by
# position; the progress report then does not depend on img_df's index labels.
for n_done, img_path in enumerate(img_df["image_path"], start=1):
    img = load_image_cv2(img_path)

    if img is None:
        # If the image can't be read, store NaNs so the row stays aligned.
        unreadable_count += 1
        brightness_list.append(np.nan)
        colorfulness_list.append(np.nan)
        blur_list.append(np.nan)
    else:
        brightness_list.append(compute_brightness(img))
        colorfulness_list.append(compute_colorfulness(img))
        blur_list.append(compute_blur(img))

    # Progress report every 1,000 images.
    if n_done % 1000 == 0:
        elapsed = time.perf_counter() - start_time
        print(f"Processed {n_done:,} images ‚Äî elapsed {elapsed:.2f} seconds")

# Attach results to dataframe
img_df["img_brightness"] = brightness_list
img_df["img_colorfulness"] = colorfulness_list
img_df["img_blur"] = blur_list

total_time = time.perf_counter() - start_time
print(f"\nüéâ Done! Processed {len(img_df):,} images in {total_time:.2f} seconds.")

# Make load failures visible instead of leaving them as silent NaN rows.
if unreadable_count:
    print(f"Warning: {unreadable_count:,} image(s) could not be read; their features are NaN.")

Processed 1,000 images ‚Äî elapsed 2.94 seconds
Processed 2,000 images ‚Äî elapsed 5.83 seconds
Processed 3,000 images ‚Äî elapsed 8.65 seconds
Processed 4,000 images ‚Äî elapsed 11.51 seconds
Processed 5,000 images ‚Äî elapsed 14.39 seconds
Processed 6,000 images ‚Äî elapsed 17.29 seconds
Processed 7,000 images ‚Äî elapsed 20.28 seconds
Processed 8,000 images ‚Äî elapsed 23.42 seconds
Processed 9,000 images ‚Äî elapsed 26.46 seconds
Processed 10,000 images ‚Äî elapsed 29.54 seconds
Processed 11,000 images ‚Äî elapsed 32.44 seconds
Processed 12,000 images ‚Äî elapsed 35.45 seconds
Processed 13,000 images ‚Äî elapsed 38.35 seconds
Processed 14,000 images ‚Äî elapsed 41.30 seconds

üéâ Done! Processed 14,652 images in 43.21 seconds.


In [17]:
import pathlib

# Make sure the output folder exists: missing parents are created and an
# already existing folder is not an error.
OUTPUT_DIR = os.path.join(BASE_DIR, "processed")
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Write the table, including the image feature columns, as CSV without the index.
img_features_path = os.path.join(OUTPUT_DIR, "image_features.csv")
img_df.to_csv(path_or_buf=img_features_path, index=False)

# Show where the file was written.
img_features_path

'./processed/image_features.csv'