# Setup

In [None]:
# Notebook settings. Each one starts as None; the next cell replaces any value
# that is still falsy with its default.
# NOTE(review): presumably meant to be overridden by an external runner
# (e.g. a parameter-injection tool) — confirm.
feature_shape = None
features_dir = None
cpu_bin_name = None
gpu_bin_name = None
ocr_bin_name = None
multi_tag_bin_name = None
metadata_encoded_path = None

In [None]:
import os

# Anchor every default path on the directory the notebook is executed from.
dir_path = os.getcwd()
parent_dir_path = os.path.dirname(dir_path)

# Any setting that is still falsy (None, 0, '') falls back to its default;
# a value that was already provided is kept untouched.
feature_shape = feature_shape or 512
features_dir = features_dir or f'{parent_dir_path}/data_extraction/clip/CLIPv2_features'
cpu_bin_name = cpu_bin_name or 'faiss_clipv2_cosine_cpu.bin'
gpu_bin_name = gpu_bin_name or 'faiss_clipv2_cosine_gpu.bin'
ocr_bin_name = ocr_bin_name or "faiss_ocr_cosine.bin"
metadata_encoded_path = metadata_encoded_path or f"{dir_path}/metadata_encoded"
multi_tag_bin_name = multi_tag_bin_name or "faiss_multi_tag_cosine.bin"

In [None]:
# Installs both the GPU and the CPU FAISS packages into the environment.
# NOTE(review): both packages appear to provide the same `faiss` module, so
# installing them together may leave only one build usable — confirm which
# build is required and consider installing just that one (with a pinned version).
! pip install faiss-gpu faiss-cpu



In [None]:
import os
import glob
import faiss
import numpy as np
from tqdm import tqdm
import pickle
from scipy.sparse import load_npz

# Indexing CLIP

In [None]:
def _list_feature_files(features_dir):
    """Return the .npy feature files under features_dir in a stable order.

    Entries of features_dir are visited in sorted order. A sub-directory
    contributes its own sorted ``*.npy`` files; a ``.npy`` file stored directly
    in features_dir contributes itself. Hidden files are ignored, matching
    what a ``*.npy`` glob would return.
    """
    feature_files = []
    for entry in sorted(os.listdir(features_dir)):
        entry_path = os.path.join(features_dir, entry)
        if os.path.isdir(entry_path):
            feature_files.extend(sorted(glob.glob(os.path.join(entry_path, '*.npy'))))
        elif entry.endswith('.npy') and not entry.startswith('.'):
            feature_files.append(entry_path)
    return feature_files


def create_faiss_indexes_clip(cpu_bin_name, gpu_bin_name, features_dir, feature_shape):
    """
    Create both CPU and GPU FAISS indexes.

    Feature files are discovered once and then indexed in that same order, so
    the dimensionality probe and the indexing loop always agree on which files
    exist. Both supported layouts are handled:

    - flat:   features_dir/<name>.npy
    - nested: features_dir/<data_part>/<name>.npy

    NOTE(review): the vectors are added to an inner-product index without being
    normalized here; the "cosine" in the default file names presumably relies on
    the stored features already being unit-length — confirm.

    Parameters:
    - cpu_bin_name: Name of the output CPU FAISS index file
    - gpu_bin_name: Name of the output GPU FAISS index file
    - features_dir: Directory containing feature files
    - feature_shape: Expected dimensionality of each feature vector; the
      dimensionality of the first feature file takes precedence if they differ

    Returns:
    - None (saves the indexes to disk)

    Raises:
    - ValueError: if no .npy file is found, or if no vector could be indexed
    """
    feature_files = _list_feature_files(features_dir)
    if not feature_files:
        raise ValueError(f"No .npy files found in {features_dir}")

    # The first feature file decides the dimensionality of the index.
    feature_dim = np.load(feature_files[0]).shape[1]

    if feature_dim != feature_shape:
        print(f"Warning: Actual feature dimension {feature_dim} doesn't match expected {feature_shape}")
        print(f"Using actual feature dimension {feature_dim} for index creation")
    feature_shape = feature_dim

    cpu_index = faiss.IndexFlatIP(feature_shape)

    # GPU support is optional: any failure here simply disables the GPU index.
    try:
        res = faiss.StandardGpuResources()
        gpu_index = faiss.index_cpu_to_gpu(
            res, 0, faiss.IndexFlatIP(feature_shape))
        print("GPU index creation is available")
        use_gpu = True
    except Exception as e:
        print(f"GPU index creation not available: {e}")
        use_gpu = False

    for feature_path in tqdm(feature_files, desc="Indexing feature files"):
        try:
            # The index is fed float32 rows in C-contiguous memory.
            feats = np.ascontiguousarray(np.load(feature_path), dtype=np.float32)

            if feats.shape[1] != feature_shape:
                print(f"Warning: Feature dimension mismatch in {feature_path}. "
                      f"Expected {feature_shape}, got {feats.shape[1]}. Skipping this file.")
                continue

            cpu_index.add(feats)
            if use_gpu:
                gpu_index.add(feats)
        except Exception as e:
            # Best effort: a single unreadable file must not abort the whole build.
            print(f"Error processing {feature_path}: {e}")
            continue

    # An empty index is useless downstream; fail loudly instead of saving it.
    if cpu_index.ntotal == 0:
        raise ValueError(f"No feature vectors could be indexed from {features_dir}")

    faiss.write_index(cpu_index, cpu_bin_name)
    print(f"CPU FAISS index saved to {cpu_bin_name}")

    if use_gpu:
        # A GPU index has to be copied back to the CPU before it can be serialized.
        gpu_index_cpu = faiss.index_gpu_to_cpu(gpu_index)
        faiss.write_index(gpu_index_cpu, gpu_bin_name)
        print(f"GPU FAISS index saved to {gpu_bin_name}")

In [None]:
# Build and save the CLIP indexes using the settings resolved in the Setup cells.
create_faiss_indexes_clip(cpu_bin_name, gpu_bin_name, features_dir, feature_shape)

Using actual feature dimension 768 for index creation
GPU index creation not available: module 'faiss' has no attribute 'StandardGpuResources'


Processing data parts:   0%|          | 0/4 [00:00<?, ?it/s]

L01_V001.npy


Processing L01_V001.npy: 0it [00:00, ?it/s]


L01_V001_extra.npy


Processing L01_V001_extra.npy: 0it [00:00, ?it/s]


L01_V002.npy


Processing L01_V002.npy: 0it [00:00, ?it/s]


L01_V002_extra.npy


Processing L01_V002_extra.npy: 0it [00:00, ?it/s]
Processing data parts: 100%|██████████| 4/4 [00:00<00:00, 377.14it/s]

CPU FAISS index saved to faiss_clipv2_cosine_cpu.bin





# Indexing OCR

In [None]:
def load_vectors(vector_path):
    """Load the first ``*_vectors.npz`` file in vector_path as a dense float32 array.

    Directory entries are scanned in sorted order so the chosen file does not
    depend on the file system's listing order. A file matches when the part of
    its name after the last underscore is exactly ``vectors.npz``.

    Parameters:
    - vector_path: Directory expected to contain the saved vectors

    Returns:
    - numpy.ndarray of dtype float32 holding the loaded vectors

    Raises:
    - FileNotFoundError: if the directory contains no matching file
    """
    for item in sorted(os.listdir(vector_path)):
        if item.split('_')[-1] != "vectors.npz":
            continue

        # Load the vectors from the .npz file
        vectors = load_npz(os.path.join(vector_path, item))

        # Convert to dense numpy array if it's a sparse matrix
        if isinstance(vectors, np.ndarray):
            return vectors.astype('float32')
        return vectors.toarray().astype('float32')

    # Falling off the loop used to return None silently, which only surfaced
    # later as an unrelated error in the caller.
    raise FileNotFoundError(f"No '*_vectors.npz' file found in {vector_path}")
            
def create_and_save_faiss_index(vector_path, output_path):
    """Build a flat inner-product FAISS index and write it to disk.

    The vectors are obtained from ``load_vectors(vector_path)`` and added to
    the index exactly as loaded.

    NOTE(review): no L2 normalization happens here, so inner product equals
    cosine similarity only if the stored vectors are already unit-length — confirm.

    Parameters:
    - vector_path: Directory handed to ``load_vectors``
    - output_path: Destination file for the serialized index

    Returns:
    - None (saves the index to disk and prints its location)
    """
    dense_vectors = load_vectors(vector_path)

    # The index dimensionality follows the width of the loaded matrix.
    flat_index = faiss.IndexFlatIP(dense_vectors.shape[1])
    flat_index.add(dense_vectors)

    faiss.write_index(flat_index, output_path)
    print(f"FAISS index saved to {output_path}")

In [None]:
# Read the OCR vectors from the `ocr` sub-directory of the encoded metadata
# and write the resulting index into the notebook's working directory.
vector_path = f"{metadata_encoded_path}/ocr"
output_path = f"{dir_path}/{ocr_bin_name}"

create_and_save_faiss_index(vector_path, output_path)

FAISS index saved to /home/heigatvu/my-project/competition/HCMC-AI/pipeline-hcm-ai/notebooks/indexing/faiss_ocr_cosine.bin


# Indexing multi-tag

In [None]:
def load_vectors(vector_path):
    """Load the first ``*_vectors.npz`` file in vector_path as a dense float32 array.

    Directory entries are scanned in sorted order so the chosen file does not
    depend on the file system's listing order. A file matches when the part of
    its name after the last underscore is exactly ``vectors.npz``.

    Parameters:
    - vector_path: Directory expected to contain the saved vectors

    Returns:
    - numpy.ndarray of dtype float32 holding the loaded vectors

    Raises:
    - FileNotFoundError: if the directory contains no matching file
    """
    for item in sorted(os.listdir(vector_path)):
        if item.split('_')[-1] != "vectors.npz":
            continue

        # Load the vectors from the .npz file
        vectors = load_npz(os.path.join(vector_path, item))

        # Convert to dense numpy array if it's a sparse matrix
        if isinstance(vectors, np.ndarray):
            return vectors.astype('float32')
        return vectors.toarray().astype('float32')

    # Falling off the loop used to return None silently, which only surfaced
    # later as an unrelated error in the caller.
    raise FileNotFoundError(f"No '*_vectors.npz' file found in {vector_path}")
            
# def create_faiss_index(vectors, nlist=100):
#     # vectors = vectors.astype('float32').toarray()
#     dimension = vectors.shape[1]
    
#     quantizer = faiss.IndexFlatIP(dimension)
#     index = faiss.IndexIVFFlat(quantizer, dimension, nlist, faiss.METRIC_INNER_PRODUCT)
    
#     faiss.normalize_L2(vectors)
    
#     index.train(vectors)
#     index.add(vectors)
#     return index

def create_faiss_index_flat(vectors, output_path):
    """Build a cosine-similarity FAISS index from vectors and write it to disk.

    Rows are L2-normalized and added to a flat inner-product index, so inner
    product between stored rows and a unit-length query is cosine similarity.

    Parameters:
    - vectors: 2-D array-like of row vectors; it is not modified
    - output_path: Destination file for the serialized index

    Returns:
    - None (saves the index to disk and prints its location)
    """
    # normalize_L2 rewrites its argument in place, so work on a private
    # float32, C-contiguous copy instead of clobbering the caller's array.
    normalized = np.array(vectors, dtype=np.float32, order='C')
    faiss.normalize_L2(normalized)

    index = faiss.IndexFlatIP(normalized.shape[1])
    index.add(normalized)

    # Save the index
    faiss.write_index(index, output_path)
    print(f"FAISS index saved to {output_path}")

In [None]:
# Create and save FAISS index
# Vectors come from the `multi_tag` sub-directory of the encoded metadata; the
# index file is written into the notebook's working directory.
vector_path = f"{metadata_encoded_path}/multi_tag"
output_path = f"{dir_path}/{multi_tag_bin_name}"

vectors = load_vectors(vector_path)
create_faiss_index_flat(vectors, output_path)