# Preprocess Image Data

In [23]:
import os
import json
import cv2
import pandas as pd
from skimage.feature import hog
from skimage import io
from pathlib import Path

## Load Data

In [24]:
# Directory paths (relative to the notebook's working directory)
base_dir = "../filtered_data"
train_images_dir = os.path.join(base_dir, "train", "images")
train_ann_file = os.path.join(base_dir, "train", "final_ann.json")

print("Train directory:", train_images_dir)
# Counts every entry in the directory, whatever its type or extension.
print("Number of images in train directory:", len(os.listdir(train_images_dir)))
print("Train annotations file:", train_ann_file)

# Load COCO annotations. JSON is UTF-8 by specification, so decode it
# explicitly instead of relying on the platform's locale-dependent default.
with open(train_ann_file, encoding="utf-8") as f:
    data = json.load(f)

Train directory: ../filtered_data/train/images
Number of images in train directory: 23568
Train annotations file: ../filtered_data/train/final_ann.json


Strip leading and trailing whitespace from the readable category names (`name_readable`).

In [25]:
# Rewrite each category's readable name in place, without surrounding whitespace.
for category in data["categories"]:
    category.update(name_readable=category["name_readable"].strip())

## Extract Features

We use the `hog` function from scikit-image (`skimage.feature`), a popular open-source Python library for image processing and computer vision, to compute Histogram of Oriented Gradients (HOG) features for each image.

In [26]:
def extract_features(image_path):
    """Compute a HOG feature vector for the image stored at ``image_path``.

    The image is read with ``skimage.io.imread`` and passed to
    ``skimage.feature.hog`` with 16x16-pixel cells and 1x1-cell blocks.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The feature vector produced by ``hog``, or ``None`` when reading the
        file raises ``FileNotFoundError``.
    """
    # Only the read can raise FileNotFoundError, so keep the try block narrow.
    try:
        image = io.imread(image_path)
    except FileNotFoundError:
        return None

    # Four-channel input would otherwise fall through to the grayscale path,
    # where hog() rejects 3-D arrays. Drop the fourth channel first.
    # NOTE(review): assumes 4-channel images are RGB + alpha -- confirm.
    if image.ndim == 3 and image.shape[2] == 4:
        image = image[..., :3]

    hog_kwargs = {"pixels_per_cell": (16, 16), "cells_per_block": (1, 1)}
    if image.ndim == 3 and image.shape[2] == 3:
        # Colour image: tell hog() which axis holds the channels.
        hog_kwargs["channel_axis"] = -1

    # The HOG visualisation image was never used, so it is no longer requested;
    # hog() then returns the feature vector alone.
    return hog(image, **hog_kwargs)

In [27]:
# Start from an empty DataFrame that only defines the column layout
feature_columns = ["image", "features", "bbox", "category"]
df = pd.DataFrame(columns=feature_columns)

## Generate Features CSV

In [34]:
def generate_features(data, train_images_dir):
    """Extract features for every annotation and save them as a CSV file.

    For each entry of ``data['annotations']`` the referenced image is located
    in ``train_images_dir`` and passed to ``extract_features``. One row
    (image id, features, bbox, category id) is recorded per annotation whose
    features could be extracted. The table is written to
    ``output/image_features.csv`` and the number of skipped annotations is
    printed.

    Args:
        data: Annotation dictionary with an ``'images'`` list (items holding
            ``'id'`` and ``'file_name'``) and an ``'annotations'`` list (items
            holding ``'image_id'``, ``'bbox'`` and ``'category_id'``).
        train_images_dir: Directory that contains the image files.

    Raises:
        KeyError: If an annotation references an image id that is missing
            from ``data['images']``.
    """
    columns = ["image", "features", "bbox", "category"]

    # Number of annotations that produced no row (missing file or no features)
    skipped_images_count = 0

    # Map image IDs to file names
    image_id_to_file_name = {image['id']: image['file_name'] for image in data['images']}

    # Several annotations can point at the same image; extract each image's
    # features only once and reuse them.
    features_by_image_id = {}

    # Rows are collected in a list and turned into a DataFrame once at the
    # end; growing a DataFrame with pd.concat inside the loop is quadratic.
    records = []

    for ann in data['annotations']:
        image_id = ann['image_id']
        image_name = image_id_to_file_name[image_id]
        image_path = os.path.join(train_images_dir, image_name)

        # Check if image exists
        if not Path(image_path).is_file():
            print(f"Image {image_id} not found")
            # Count the skip (this used to reset the counter to zero).
            skipped_images_count += 1
            continue

        if image_id not in features_by_image_id:
            features_by_image_id[image_id] = extract_features(image_path)
        features = features_by_image_id[image_id]

        if features is None:
            # Extraction failed for this image
            skipped_images_count += 1
            continue

        records.append({
            "image": image_id,
            "features": features,
            "bbox": ann['bbox'],
            "category": ann['category_id'],
        })

    print(f"Number of skipped images: {skipped_images_count}")

    df = pd.DataFrame(records, columns=columns)

    # NOTE(review): array-valued cells are written to CSV via their string
    # form, and NumPy abbreviates long arrays with "..." when printing --
    # confirm the feature vectors survive a round trip through this file.
    # Make sure the target directory exists so the save cannot fail after
    # all features have been computed.
    os.makedirs("output", exist_ok=True)
    df.to_csv("output/image_features.csv", index=False)

In [35]:
# Run feature extraction over the loaded training annotations.
generate_features(data, train_images_dir)

Number of skipped images: 0
