# Setup

In [1]:
feature_shape = None
features_dir = None
cpu_bin_name = None
gpu_bin_name = None
ocr_bin_name = None
multi_tag_bin_name = None
metadata_encoded_path = None

In [2]:
import os

dir_path = os.getcwd()
parent_dir_path = os.path.dirname(dir_path)

if not feature_shape:
    feature_shape = 512
    
if not features_dir:
    features_dir = f'{parent_dir_path}/data_extraction/clip/CLIPv2_features'

if not cpu_bin_name:
    cpu_bin_name = 'faiss_clipv2_cosine_cpu.bin'
    
if not gpu_bin_name:
    gpu_bin_name = 'faiss_clipv2_cosine_gpu.bin'

if not ocr_bin_name:
    ocr_bin_name = "faiss_ocr_cosine.bin"
    
if not metadata_encoded_path:
    metadata_encoded_path = f"{dir_path}/metadata_encoded"
    
if not ocr_bin_name:
    ocr_bin_name = "faiss_ocr_cosine.bin"
if not multi_tag_bin_name:
    multi_tag_bin_name = "faiss_multi_tag_cosine.bin"

In [3]:
! pip install faiss-cpu


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [4]:
import os
import glob
import faiss
import numpy as np
from tqdm import tqdm
import json
import cv2
import matplotlib.pyplot as plt
import math
import logging

In [5]:
# def search_similar_frames(query, index, vectorizer, frame_ids, top_k=5):
#     """
#     Search for similar frames using FAISS index.

#     Parameters:
#     - query: The query text
#     - index: The FAISS index
#     - vectorizer: The vectorizer to transform the query text
#     - frame_ids: List of frame IDs corresponding to the vectors in the index
#     - top_k: Number of top results to return

#     Returns:
#     - List of dictionaries containing frame_id and similarity score
#     """
#     # Transform the query using the vectorizer
#     query_vector = vectorizer.embed(query)

#     # Perform the search
#     distances, indices = index.search(query_vector, top_k)

#     # Prepare the results
#     results = []
#     for i, idx in enumerate(indices[0]):
#         results.append({
#             'frame_id': frame_ids[idx],
#             # Convert distance to similarity
#             'similarity': 1 - distances[0][i],
#         })

#     return results


# def visualize_search_results(query, results, visual_encoding, metadata, image_dir):
#     print(f"\nTop {len(results)} frames similar to query '{query}':")

#     n_images = len(results)
#     n_cols = min(3, n_images)
#     n_rows = math.ceil(n_images / n_cols)

#     fig, axs = plt.subplots(n_rows, n_cols, figsize=(5*n_cols, 5*n_rows))
#     if n_images == 1:
#         axs = np.array([axs])
#     axs = axs.flatten()

#     with open(metadata, 'r') as file:
#         keyframe_metadata = json.load(file)

#     for i, result in enumerate(results):
#         print(
#             f"Frame ID: {result['frame_id']}, Similarity: {result['similarity']:.4f}")

#         # Load the image
#         frame_path = keyframe_metadata[result['frame_id']]["frame_path"]
#         print(result['frame_id'])
#         image_path = os.path.join(image_dir, frame_path)
#         image = cv2.imread(image_path)
#         image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

#         if visual_encoding:
#             image_with_grid = visual_encoding.visualize_grid(image.copy())

#             axs[i].imshow(image_with_grid)
#             axs[i].set_title(
#                 f"Frame ID: {result['frame_id']}\nSimilarity: {result['similarity']:.4f}")
#             axs[i].axis('off')
#         else:
#             axs[i].imshow(image)
#             axs[i].set_title(
#                 f"Frame ID: {result['frame_id']}\nSimilarity: {result['similarity']:.4f}")
#             axs[i].axis('off')

#     for j in range(i+1, len(axs)):
#         axs[j].axis('off')

#     plt.tight_layout()
#     plt.suptitle(f"Search Results for Query: '{query}'", fontsize=16, y=1.02)
#     plt.show()

# Indexing CLIP

In [6]:
# import numpy as np
# import torch
# from typing import Optional, Tuple
# from open_clip import create_model_and_transforms, get_tokenizer

# class OpenClipEmbedder:
#     def __init__(self, model_name: str = 'ViT-L-14', pretrained: str = 'datacomp_xl_s13b_b90k', feature_shape: Optional[Tuple[int, ...]] = None):
#         # self.device = "cuda" if torch.cuda.is_available() else "cpu"
#         self.device = "cpu"
#         self.model, _, _ = create_model_and_transforms(
#             model_name, device=self.device, pretrained=pretrained)
#         self.model.eval()
#         self.tokenizer = get_tokenizer(model_name)
#         self.feature_shape = feature_shape

#     @torch.no_grad()
#     def embed(self, text: str) -> np.ndarray:
#         text_tokens = self.tokenizer([text]).to(self.device)
#         text_features = self.model.encode_text(text_tokens)
#         embedding = text_features.cpu().numpy()[0]


#         resized_embedding = self.resize_embedding(
#             embedding, self.feature_shape)


#         return resized_embedding

In [7]:


def setup_logging():
    """Set up logging configuration."""
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s - %(levelname)s - %(message)s')


def validate_input(features_dir):
    """Validate input directory and find .npy files."""
    if not os.path.isdir(features_dir):
        raise ValueError(f"Directory not found: {features_dir}")

    npy_files = glob.glob(os.path.join(
        features_dir, '**', '*.npy'), recursive=True)
    if not npy_files:
        raise ValueError(f"No .npy files found in {features_dir}")

    return npy_files


def initialize_index(npy_files, expected_feature_shape):
    """Initialize FAISS index based on the first feature file."""
    first_feature = np.load(npy_files[0])
    if len(first_feature.shape) != 2:
        first_feature = first_feature.reshape(-1, expected_feature_shape)
    feature_dim = first_feature.shape[1]

    if feature_dim != expected_feature_shape:
        logging.warning(
            f"Actual feature dimension {feature_dim} doesn't match expected {expected_feature_shape}")
        logging.info(
            f"Using actual feature dimension {feature_dim} for index creation")

    return faiss.IndexFlatIP(feature_dim), feature_dim


def create_gpu_index(cpu_index):
    """Attempt to create a GPU index."""
    try:
        res = faiss.StandardGpuResources()
        gpu_index = faiss.index_cpu_to_gpu(res, 0, cpu_index)
        logging.info("GPU index creation is available")
        return gpu_index, True
    except Exception as e:
        logging.warning(f"GPU index creation not available: {e}")
        return None, False


def process_feature_file(feature_path, cpu_index, gpu_index, feature_shape):
    """Process a single feature file and add to indexes."""
    try:
        feats = np.load(feature_path)
        if feats.size == 0:
            logging.warning(
                f"Empty array loaded from {feature_path}. Skipping this file.")
            return 0

        if len(feats.shape) != 2:
            feats = feats.reshape(-1, feature_shape)

        feats = feats.astype(np.float32)

        if feats.shape[1] != feature_shape:
            logging.warning(f"Feature dimension mismatch in {feature_path}. "
                            f"Expected {feature_shape}, got {feats.shape[1]}. Skipping this file.")
            return 0

        faiss.normalize_L2(feats)

        cpu_index.add(feats)
        if gpu_index:
            gpu_index.add(feats)

        return feats.shape[0]
    except Exception as e:
        logging.error(f"Error processing {feature_path}: {e}")
        return 0


def save_indexes(cpu_index, gpu_index, cpu_bin_name, gpu_bin_name, total_vectors):
    """Save CPU and GPU indexes to disk."""
    faiss.write_index(cpu_index, cpu_bin_name)
    logging.info(
        f"CPU FAISS index with {total_vectors} vectors saved to {cpu_bin_name}")

    if gpu_index:
        gpu_index_cpu = faiss.index_gpu_to_cpu(gpu_index)
        faiss.write_index(gpu_index_cpu, gpu_bin_name)
        logging.info(
            f"GPU FAISS index with {total_vectors} vectors saved to {gpu_bin_name}")


def create_faiss_indexes_clip(cpu_bin_name, gpu_bin_name, features_dir, feature_shape):
    """
    Create both CPU and GPU FAISS indexes for CLIP v2 features.

    Parameters:
    - cpu_bin_name: Name of the output CPU FAISS index file
    - gpu_bin_name: Name of the output GPU FAISS index file
    - features_dir: Directory containing feature files
    - feature_shape: Expected shape of each feature vector

    Returns:
    - None (saves the indexes to disk)
    """
    setup_logging()
    npy_files = validate_input(features_dir)
    cpu_index, feature_dim = initialize_index(npy_files, feature_shape)
    gpu_index, use_gpu = create_gpu_index(cpu_index)

    total_vectors = 0
    with tqdm(total=len(npy_files), desc="Processing feature files", unit="file") as pbar:
        for feature_path in npy_files:
            vectors_added = process_feature_file(
                feature_path, cpu_index, gpu_index, feature_dim)
            total_vectors += vectors_added
            pbar.update(1)
            pbar.set_postfix({'Total Vectors': total_vectors})

    save_indexes(cpu_index, gpu_index, cpu_bin_name,
                 gpu_bin_name, total_vectors)
    logging.info("Indexing complete.")

In [8]:
create_faiss_indexes_clip(cpu_bin_name, gpu_bin_name, features_dir, feature_shape)

2024-09-08 11:42:29,935 - INFO - Using actual feature dimension 768 for index creation
Processing feature files: 100%|██████████| 4/4 [00:00<00:00, 142.51file/s, Total Vectors=2132]
2024-09-08 11:42:29,974 - INFO - CPU FAISS index with 2132 vectors saved to faiss_clipv2_cosine_cpu.bin
2024-09-08 11:42:29,975 - INFO - Indexing complete.


In [9]:
# index = faiss.read_index(cpu_bin_name)

# vectorizer = OpenClipEmbedder()
# query = "the policeman"
# results = search_similar_frames(query, index, vectorizer)

# Indexing OCR

In [10]:
def load_vectors(vector_path):
    for item in os.listdir(vector_path):
        if item.split('_')[-1] == "vectors.npz":
            # Load the vectors from the .npz file
            vectors = load_npz(f"{vector_path}/{item}")
    
            # Convert to dense numpy array if it's a sparse matrix
            if isinstance(vectors, np.ndarray):
                return vectors.astype('float32')
            else:
                return vectors.toarray().astype('float32')
            
def create_and_save_faiss_index(vector_path, output_path):
    # Load vectors
    vectors = load_vectors(vector_path)
    
    # Create FAISS index
    dimension = vectors.shape[1]
    index = faiss.IndexFlatIP(dimension)
    index.add(vectors)
    
    # Save the index
    faiss.write_index(index, output_path)
    print(f"FAISS index saved to {output_path}")

In [11]:
vector_path = f"{metadata_encoded_path}/ocr"
output_path = f"{dir_path}/{ocr_bin_name}"

# create_and_save_faiss_index(vector_path, output_path)

# Indexing multi-tag

In [12]:
def load_vectors(vector_path):
    for item in os.listdir(vector_path):
        if item.split('_')[-1] == "vectors.npz":
            # Load the vectors from the .npz file
            vectors = load_npz(f"{vector_path}/{item}")
    
            # Convert to dense numpy array if it's a sparse matrix
            if isinstance(vectors, np.ndarray):
                return vectors.astype('float32')
            else:
                return vectors.toarray().astype('float32')
            
# def create_faiss_index(vectors, nlist=100):
#     # vectors = vectors.astype('float32').toarray()
#     dimension = vectors.shape[1]
    
#     quantizer = faiss.IndexFlatIP(dimension)
#     index = faiss.IndexIVFFlat(quantizer, dimension, nlist, faiss.METRIC_INNER_PRODUCT)
    
#     faiss.normalize_L2(vectors)
    
#     index.train(vectors)
#     index.add(vectors)
#     return index

def create_faiss_index_flat(vectors, output_path):
    # vectors = vectors.astype('float32').toarray()
    faiss.normalize_L2(vectors)
    
    dimension = vectors.shape[1]
    index = faiss.IndexFlatIP(dimension)
    index.add(vectors)
    
    # Save the index    
    faiss.write_index(index, output_path)

In [13]:
# Create and save FAISS index
vector_path = f"{metadata_encoded_path}/multi_tag"
output_path = f"{dir_path}/{multi_tag_bin_name}"

vectors = load_vectors(vector_path)
# create_faiss_index_flat(vectors, output_path)

NameError: name 'load_npz' is not defined