<a href="https://colab.research.google.com/github/gu-ma/hgk-ml-workshop/blob/main/notebooks/Image_Search_01_Process_Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Process your dataset with CLIP

This notebook processes all the downloaded photos using OpenAI's [CLIP neural network](https://github.com/openai/CLIP). For each image we get a feature vector containing 512 float numbers, which we will store in a file. These feature vectors will be used later to compare them to the text feature vectors.

This step will be significantly faster if you have a GPU, but it will also work on the CPU.

## Setup

In [None]:
! pip install git+https://github.com/openai/CLIP.git

In [None]:
import clip
import torch
from PIL import Image

# Run on the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the CLIP ViT-B/32 weights together with the matching image preprocessing function
model, preprocess = clip.load("ViT-B/32", device=device)

# Function that computes the feature vectors for a batch of images
def compute_clip_features(photos_batch):
    """Compute normalized CLIP image features for a batch of photo files.

    Args:
        photos_batch: Iterable of paths (anything ``PIL.Image.open`` accepts)
            to the photos of one batch.

    Returns:
        A numpy array with one row per photo, each row scaled to unit
        L2 norm.
    """
    photos_preprocessed = []
    for photo_file in photos_batch:
        # Image.open is lazy and keeps the file handle open; the context
        # manager guarantees it is released as soon as the photo has been
        # preprocessed, even if preprocessing raises.
        with Image.open(photo_file) as photo:
            photos_preprocessed.append(preprocess(photo))

    # Stack the preprocessed photos into one batch tensor on the target device
    photos_preprocessed = torch.stack(photos_preprocessed).to(device)

    with torch.no_grad():
        # Encode the photos batch to compute the feature vectors and normalize them
        photos_features = model.encode_image(photos_preprocessed)
        photos_features /= photos_features.norm(dim=-1, keepdim=True)

    # Transfer the feature vectors back to the CPU and convert to numpy
    return photos_features.cpu().numpy()

## Connect to Gdrive

In [None]:
# Mount Google Drive at /content/drive so that the Drive paths used further
# down in this notebook (which all start with /content/drive) are reachable.
from google.colab import drive
drive.mount('/content/drive')

## Process the photos

Load all photos from the folder where they were stored. We will then compute the features for all photos. We will do that in batches, because it is much more efficient. You should tune the batch size so that it fits on your GPU. The processing on the GPU is fairly fast, so the bottleneck will probably be loading the photos from the disk.

In [None]:
import re
import os
import math
import shutil
import numpy as np
import pandas as pd
from pathlib import Path

# @markdown Path to source directory on google drive. Right click your directory and choose "copy path" then paste it below.

# @markdown ⚠️ __This is the folder with the scenes and images, not the original videos dataset__ ⚠️
gdrive_input_dir = "/content/drive/MyDrive/AI/hgk_workshop/playlist01_output"  # @param { type:'string' }

# @markdown Must be smaller than the number of images
batch_size = 32  # @param { type:"number" }

# @markdown Copy all files to google drive when done
copy_to_gdrive = True  # @param { type:"boolean" }

# Some other dir / vars
# Normalize the pasted path first: with a trailing slash (or stray
# whitespace) os.path.split() would return an empty folder name and the
# os.makedirs() calls below would fail.
gdrive_source_dir = os.path.normpath(gdrive_input_dir.strip())
(gdrive_path, gdrive_folder) = os.path.split(gdrive_source_dir)

# Local working folders, created relative to the current directory and
# named after the Drive folder
input_dir = gdrive_folder
output_dir = f'{gdrive_folder}_clip'

# Final destination of the results on Drive: a sibling of the source folder
gdrive_output_dir = os.path.join(gdrive_path, output_dir)

# Fail early with a clear message instead of silently processing zero photos
if not os.path.isdir(gdrive_source_dir):
    raise FileNotFoundError(
        f"Source directory not found: {gdrive_source_dir!r}. "
        "Check the path and make sure Google Drive is mounted."
    )

# Create directories
os.makedirs(input_dir, exist_ok=True)
os.makedirs(output_dir, exist_ok=True)

# Copy all jpg locally
# Done in Python rather than with an unquoted `! cp` so that paths containing
# spaces or other shell-special characters work. Hidden files are skipped to
# match what the shell glob `*.jpg` selected; directories are skipped because
# only image files are processed afterwards.
for source_photo in sorted(Path(gdrive_source_dir).glob("*.jpg")):
    if source_photo.is_file() and not source_photo.name.startswith("."):
        shutil.copy(source_photo, input_dir)

# Delete existing features
# Same selection as the shell glob `*.*`: regular, non-hidden files with a dot
# in their name. Nothing is printed when the folder is already empty.
for stale_file in Path(output_dir).glob("*.*"):
    if stale_file.is_file() and not stale_file.name.startswith("."):
        stale_file.unlink()


# Folder holding the local copies of the photos
photos_path = Path(input_dir)

# Collect every JPG in that folder, in sorted order
photos_files = sorted(photos_path.glob("*.jpg"))

# Report how many photos were found and preview the first ten paths
print(f"Photos found: {len(photos_files)}")
print('\n'.join(str(photo_file) for photo_file in photos_files[:10]))

# Path where the feature vectors will be stored
features_path = Path(output_dir)

# Compute how many batches are needed
batches = math.ceil(len(photos_files) / batch_size)

# Process each batch. Every batch writes two files named after its index:
# <index>.npy with the feature vectors and <index>.csv with the photo IDs.
for i in range(batches):
    print(f"Processing batch {i+1}/{batches}")

    batch_ids_path = features_path / f"{i:010d}.csv"
    batch_features_path = features_path / f"{i:010d}.npy"

    # Only do the processing if the batch wasn't processed yet
    if batch_features_path.exists():
        continue

    # Select the photos for the current batch
    batch_files = photos_files[i*batch_size : (i+1)*batch_size]

    try:
        # Compute the features and save to a numpy file
        batch_features = compute_clip_features(batch_files)
        np.save(batch_features_path, batch_features)

        # Save the photo IDs to a CSV file. The ID is the file name without
        # its extension; .stem keeps any additional dots in the name, so
        # "clip.v2-001.jpg" yields "clip.v2-001" instead of just "clip".
        photo_ids = [photo_file.stem for photo_file in batch_files]
        photo_ids_data = pd.DataFrame(photo_ids, columns=['photo_id'])
        photo_ids_data.to_csv(batch_ids_path, index=False)
    except Exception as error:
        # Catch problems with the processing to make the process more robust.
        # Only Exception is caught so that interrupting the cell still stops
        # the loop, and the error is shown so the cause can be diagnosed.
        print(f'Problem with batch {i}: {error!r}')

        # Remove partial output: a features file without its matching IDs
        # file (or the reverse) would misalign rows when batches are merged.
        batch_features_path.unlink(missing_ok=True)
        batch_ids_path.unlink(missing_ok=True)

# Merge the features and the photo IDs.

# Intermediate batch files are named with a zero-padded number
# (e.g. 0000000000.npy), so only files starting with a digit are selected.
# This keeps an already merged features.npy / photo_ids.csv out of the merge.
batch_features_files = sorted(features_path.glob("[0-9]*.npy"))
batch_ids_files = sorted(features_path.glob("[0-9]*.csv"))

if not batch_features_files or not batch_ids_files:
    raise ValueError(
        f"No batch results found in '{features_path}': no photos were found "
        "or every batch failed."
    )

# Load all numpy files and concatenate the features
features = np.concatenate([np.load(features_file) for features_file in batch_features_files])

# Load all the photo IDs
photo_ids = pd.concat([pd.read_csv(ids_file) for ids_file in batch_ids_files])

# Row k of the features must describe photo k of the IDs; refuse to write a
# merged result in which the two are out of step.
if len(features) != len(photo_ids):
    raise ValueError(
        f"Feature/ID mismatch: {len(features)} feature vectors "
        f"but {len(photo_ids)} photo IDs."
    )

# Store the merged results
np.save(features_path / "features.npy", features)
photo_ids.to_csv(features_path / "photo_ids.csv", index=False)

# Delete intermediate results (exactly the batch files that were merged)
for batch_file in batch_features_files + batch_ids_files:
    batch_file.unlink()

# Copy result to gdrive
if copy_to_gdrive:
    shutil.copytree(output_dir, gdrive_output_dir, dirs_exist_ok=True)