### Extract Embeddings

Based on https://github.com/rom1504/clip-retrieval

First, install the dependencies with `pip install clip-retrieval` or `pip install git+https://github.com/openai/CLIP.git`.


TODO:
- need to split noisy embeddings from clean embeddings into different files
- labels need to be npy (check if they're needed)
- split embedding files to train (80% for Kairos) and test (20% for ResNet)

In [3]:
import os
import torch
import clip
from PIL import Image
from tqdm import tqdm
import numpy as np
from datasets import load_dataset
from datasets import Image
import pandas as pd
from utils.label_mappings import *

In [4]:
# Load the CLIP ViT-B/32 model and its image preprocessing transform,
# placing the model on the GPU when torch reports one and on the CPU otherwise.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

#### iNaturalist Embeddings
We need embeddings for every image in the iNaturalist dataset (3.3 GB; 36,355 rows/images) so that Kairos can curate the insect images from the rest (noise).

In [5]:
# Dataset: training split of "sxj1215/inaturalist" via datasets.load_dataset
# (about 36k rows, roughly 3.3 GB).
iNat36 = load_dataset("sxj1215/inaturalist", split="train")



In [6]:
# Every image is embedded, so the label frame holds one row per dataset row.
# Still open: unique IDs need to be generated for metadata/filenames.
iNat36_label_df = pd.DataFrame(dict(messages=iNat36['messages']))

def get_iNat_label(messages):
    """Return the 'content' value of the second entry (index 1) of ``messages``.

    ``messages`` is indexed positionally and the selected entry is indexed by
    the key 'content'. Presumably the second entry is the reply that names the
    species -- confirm against the dataset schema.
    """
    second_entry = messages[1]
    return second_entry['content']
    
# Derive the raw species label for each row from its message list.
species_labels = iNat36_label_df['messages'].apply(get_iNat_label)
iNat36_label_df['species'] = species_labels
# To list the distinct species: list(iNat36_label_df.groupby('species').count().index)

def map_inat_to_clean_label(label):
    """Translate an iNat label through ``iNat_to_clean_map``.

    Returns the mapped value when ``label`` is a key of ``iNat_to_clean_map``,
    and the string 'noise' for any label that is not.
    """
    if label not in iNat_to_clean_map:
        return 'noise'
    return iNat_to_clean_map[label]
        
# Map each species to its clean-class label; unmapped species become 'noise'.
clean_labels = iNat36_label_df['species'].apply(map_inat_to_clean_label)
iNat36_label_df['clean_label'] = clean_labels

In [7]:
# Peek at the clean labels of the first ten rows.
iNat36_label_df.head(10).get('clean_label')

0    noise
1    noise
2    noise
3    noise
4    noise
5    noise
6    noise
7    noise
8    noise
9    noise
Name: clean_label, dtype: object

In [8]:
OUT_DIR = "inat_embs"
os.makedirs(OUT_DIR, exist_ok=True)

# Width of one image embedding; matches the (1, 512) placeholder shape used for failed rows.
EMB_DIM = 512

# Extraction loop.
# Invariant: exactly one embedding and one metadata entry are appended per dataset
# row, so position i in both lists always refers to row i of iNat36.
inat_embeddings = []
inat_metadata = []
inat_failures = []  # (row index, repr of the exception) for every row that could not be embedded

for idx in tqdm(range(len(iNat36))):
    try:
        row = iNat36[idx]
        # Resolve the label before touching the output lists so that a failure here
        # cannot leave an embedding without a matching metadata entry.
        label = iNat36_label_df.iloc[idx]["clean_label"]
        img = preprocess(row["images"][0]).unsqueeze(0).to(device)
        with torch.no_grad():
            feats = model.encode_image(img)
            feats /= feats.norm(dim=-1, keepdim=True)  # unit-normalise the embedding
        # Cast to float32 so real embeddings and placeholders share one dtype;
        # otherwise np.vstack would upcast the whole matrix to the widest dtype present.
        emb = feats.cpu().numpy().astype(np.float32)
    except Exception as e:
        # Keep a zero-vector placeholder so indexes line up later, but record the failure
        # instead of discarding it silently.
        inat_failures.append((idx, repr(e)))
        emb = np.zeros((1, EMB_DIM), dtype=np.float32)
        label = 'skip'

    inat_embeddings.append(emb)
    inat_metadata.append(label)

print(f"Successfully processed {len(inat_embeddings) - len(inat_failures)} examples")
if inat_failures:
    print(f"Failed on {len(inat_failures)} examples (stored as zero vectors labelled 'skip'); first failures: {inat_failures[:5]}")

  1%|          | 358/36355 [11:42<19:37:38,  1.96s/it]


KeyboardInterrupt: 

In [7]:
# Sanity check: number of per-image embedding arrays collected by the extraction loop
# (placeholders for failed rows are included).
len(inat_embeddings)

36355

In [8]:
# Sanity check: number of metadata labels collected by the extraction loop;
# this should equal the number of embeddings so the two stay index-aligned.
len(inat_metadata)

36355

In [14]:
# Persist the iNaturalist results: one stacked matrix of embeddings plus a
# newline-separated text file of labels, both written under OUT_DIR.
# Nothing is written when no embeddings were collected.
if len(inat_embeddings) > 0:
    emb_matrix = np.vstack(inat_embeddings)
    inat_emb_file = os.path.join(OUT_DIR, "embeddings.npy")
    inat_meta_file = os.path.join(OUT_DIR, "metadata.txt")

    np.save(inat_emb_file, emb_matrix)

    with open(inat_meta_file, "w") as f:
        metadata_text = "\n".join(inat_metadata)
        f.write(metadata_text)

    print(f"Success! Saved {emb_matrix.shape} matrix to {OUT_DIR}/embeddings.npy")

Success! Saved (36355, 512) matrix to inat_embs/embeddings.npy


#### Kaggle Embeddings

In [70]:
# The notebook's import cell runs `from datasets import Image` after `from PIL import Image`,
# so the bare name `Image` no longer refers to PIL here. Import PIL's module under an
# unambiguous alias for opening image files.
from PIL import Image as PILImage
from sample_clean_data import sampled_clean_data  # variable that holds the stratified random sample of clean image paths

# Settings
IMG_DIR = "clean_insect_images"
OUT_DIR = "clean_embs"
os.makedirs(OUT_DIR, exist_ok=True)

# Extraction loop: embeddings and metadata are appended together, only after a
# successful encode, so the two lists stay index-aligned.
clean_embeddings = []
clean_metadata = []

for path in tqdm(sampled_clean_data):  # only look at the sampled clean images
    try:
        # Open inside a context manager so the file handle is released once the
        # image has been converted to a tensor.
        with PILImage.open(path) as pil_img:
            image = preprocess(pil_img).unsqueeze(0).to(device)

        with torch.no_grad():
            features = model.encode_image(image)
            features /= features.norm(dim=-1, keepdim=True)    # normalize for cosine similarity

        clean_embeddings.append(features.cpu().numpy())
        clean_metadata.append(path)

    except Exception as e:
        print(f"Skipping corrupt image {path}: {e}")

100%|██████████| 400/400 [00:08<00:00, 46.34it/s]


In [71]:
# Persist the clean-image results: one stacked matrix of embeddings plus a
# newline-separated text file of the corresponding image paths, both under OUT_DIR.
# The guard checks `clean_embeddings`, the list filled by the extraction loop;
# nothing is written when that list is empty.
if clean_embeddings:
    emb_matrix = np.vstack(clean_embeddings)

    np.save(os.path.join(OUT_DIR, "embeddings.npy"), emb_matrix)

    with open(os.path.join(OUT_DIR, "metadata.txt"), "w") as f:
        f.write("\n".join(clean_metadata))

    print(f"Success! Saved {emb_matrix.shape} matrix to {OUT_DIR}/embeddings.npy")

Success! Saved (400, 512) matrix to clean_embs/embeddings.npy
