# Libraries

In [2]:
import os
import sys
import cv2
import numpy as np
import pandas as pd
import pickle
from paths import LABEL_CSV, IMAGES_DIR, SKIMAGE_FEATURES_PATH, FEATURE_EXTRACTORS_DIR, IMG2VEC_FEATURES_PATH

# Make the project's feature-extractor modules importable. The membership
# check keeps this cell idempotent: re-running it does not pile up duplicate
# entries in sys.path.
if str(FEATURE_EXTRACTORS_DIR) not in sys.path:
    sys.path.append(str(FEATURE_EXTRACTORS_DIR))

In [3]:
from skimage_feature_extractor import extract_features as sk_extractor
from img2vec import img2vec_extractor

# Extraction

In [4]:
def main(output, extractor):
    """Extract one feature per labelled image and pickle the result.

    Reads the label table at ``LABEL_CSV`` (columns ``Filename`` and
    ``Label``), loads each image from ``IMAGES_DIR`` with ``cv2.imread``,
    runs ``extractor`` on it, and writes the tuple ``(features, labels)``
    (both converted with ``np.array``) to ``output`` using ``pickle``.

    Rows are dropped, with a printed message, when the image file is
    missing, when OpenCV cannot decode it, or when ``extractor`` raises.
    Each of these three cases is counted and reported in the final summary
    so a shorter-than-expected output can be explained.

    Args:
        output: Path-like destination of the pickle file; must expose
            ``.parent`` (the parent directory is created if needed).
        extractor: Callable that takes the image array returned by
            ``cv2.imread`` and returns the feature for that image.

    Returns:
        None. If ``LABEL_CSV`` does not exist, a message is printed and
        nothing is written.
    """
    if not LABEL_CSV.exists():
        print(f"Missing labels at: {LABEL_CSV}")
        return

    df = pd.read_csv(LABEL_CSV)
    features, labels = [], []
    total = len(df)

    skipped_img_count = 0
    failed_img_count = 0
    error_img_count = 0
    for position, (i, row) in enumerate(df.iterrows()):
        # Report progress by row position and before any `continue`, so a
        # milestone is never lost because that particular row was skipped
        # or because the index is not a plain 0..n-1 range.
        if position % 1000 == 0:
            print(f"Processed {position}/{total} images...")

        filename = row['Filename']
        label = row['Label']
        img_path = IMAGES_DIR / filename

        if not img_path.exists():
            print(f"[{i}] Skipping missing image: {filename}")
            skipped_img_count += 1
            continue

        # cv2.imread signals an unreadable file by returning None
        # rather than raising.
        image = cv2.imread(str(img_path))
        if image is None:
            print(f"[{i}] Failed to load image: {filename}")
            failed_img_count += 1
            continue

        try:
            feature = extractor(image)
        except Exception as e:
            # Best effort: one bad image must not abort the whole run, but
            # it is counted so the summary reflects the dropped row.
            print(f"[{i}] Error processing {filename}: {e}")
            error_img_count += 1
            continue

        # Append both together so features and labels stay aligned.
        features.append(feature)
        labels.append(label)

    features = np.array(features)
    labels = np.array(labels)

    os.makedirs(output.parent, exist_ok=True)
    with open(output, "wb") as f:
        pickle.dump((features, labels), f)

    print(f"Process completed. Saved features to: {output}")
    print(f"X shape: {features.shape}, Y shape: {labels.shape}")
    print(f"Skipped Images: {skipped_img_count}, Failed Images: {failed_img_count}")
    print(f"Extraction Errors: {error_img_count}")

## SKIMAGE

In [None]:
# Run the extraction with the extractor imported from
# skimage_feature_extractor; results are pickled to SKIMAGE_FEATURES_PATH.
main(output=SKIMAGE_FEATURES_PATH, extractor=sk_extractor)

## IMG2VEC

In [5]:
# Run the extraction with the extractor imported from img2vec;
# results are pickled to IMG2VEC_FEATURES_PATH.
main(output=IMG2VEC_FEATURES_PATH, extractor=img2vec_extractor)

Processed 0/17509 images...
Processed 1000/17509 images...
Processed 2000/17509 images...
Processed 3000/17509 images...
Processed 4000/17509 images...
Processed 5000/17509 images...
Processed 6000/17509 images...
Processed 7000/17509 images...
Processed 8000/17509 images...
Processed 9000/17509 images...
Processed 10000/17509 images...
Processed 11000/17509 images...
Processed 12000/17509 images...
Processed 13000/17509 images...
Processed 14000/17509 images...
Processed 15000/17509 images...
Processed 16000/17509 images...
Processed 17000/17509 images...
Process completed. Saved features to: C:\Users\chrst\Desktop\Personal Projects\idk_yet\data\models\features (img2vec).pkl
X shape: (17509, 512), Y shape: (17509,)
Skipped Images: 0, Failed Images: 0


# Features

In [None]:
# Load the label table (the same CSV the extraction step reads) so its
# contents can be inspected in the cells below.
df = pd.read_csv(LABEL_CSV)

In [None]:
# One row per distinct (Label, Species) pair, ordered by Label and
# re-indexed from zero.
df_labels = (
    df.loc[:, ['Label', 'Species']]
    .drop_duplicates()
    .sort_values(by='Label')
    .reset_index(drop=True)
)
print(df_labels)

In [None]:
# Drop the per-image filename, then count, within each Label, how often every
# combination of the remaining columns occurs.
df2 = df.drop("Filename", axis=1)
result = df2.groupby(by=['Label']).value_counts()
print(result)