In [2]:
import h5py
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer

In [4]:
def load_embeddings_in_chunks(h5_file_path, chunk_size, dataset_name='clean_embeddings'):
    """Lazily yield consecutive row-slices of a dataset stored in an HDF5 file.

    Args:
        h5_file_path: Path of the HDF5 file to open (read-only).
        chunk_size: Maximum number of rows per yielded chunk. Must be positive.
        dataset_name: Key of the dataset inside the file. Defaults to
            'clean_embeddings', which was previously hard-coded.

    Returns:
        A generator yielding ``dataset[start:end]`` for successive windows of
        ``chunk_size`` rows along the first axis. The last chunk may be
        shorter. The file is opened on the first iteration and is closed when
        the generator is exhausted or closed.

    Raises:
        ValueError: If ``chunk_size`` is not positive. Raised at call time
            rather than on first iteration, so a bad value is reported where
            the generator is created.
    """
    # A negative step would make range() empty and silently yield nothing;
    # a zero step only fails later with an opaque range() error.
    if chunk_size <= 0:
        raise ValueError(
            f"chunk_size must be a positive integer, got {chunk_size!r}"
        )

    def _iter_chunks():
        with h5py.File(h5_file_path, 'r') as h5_file:
            # Look the dataset up once instead of on every iteration.
            dataset = h5_file[dataset_name]
            total_size = dataset.shape[0]
            for start in range(0, total_size, chunk_size):
                end = min(start + chunk_size, total_size)
                yield dataset[start:end]

    return _iter_chunks()

# Number of embedding rows to load per chunk.
chunk_size = 100_000

# Generator over the stored embeddings; chunks are read as it is iterated.
embeddings_generator = load_embeddings_in_chunks(
    '/data/shared/nlp/clean_embeddings.h5',
    chunk_size,
)

In [5]:
def normalize_embeddings(embeddings):
    """Scale each row of a 2-D array to unit L2 norm.

    Args:
        embeddings: 2-D array-like; one vector per row.

    Returns:
        An array of the same shape where every non-zero row has L2 norm 1.
        Rows that are entirely zero are returned unchanged (all zeros)
        instead of becoming NaN from a division by zero.
    """
    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    # `norms` is a fresh array, so it is safe to patch in place. Dividing a
    # zero row by 1 keeps it zero and avoids NaNs.
    norms[norms == 0] = 1
    return embeddings / norms

# Embedding dimensionality the index is built for.
d = 384

# IVF index configuration.
nlist = 1000  # number of clusters (inverted lists)
# Cosine similarity is computed as an inner product on L2-normalised vectors,
# so both the coarse quantizer and the IVF index use the inner-product metric.
quantizer = faiss.IndexFlatIP(d)
index = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_INNER_PRODUCT)

# Train the index on the first chunk only.
# NOTE(review): this assumes the first chunk is representative of the whole
# dataset — confirm the rows are not stored in a sorted/grouped order.
first_chunk = next(embeddings_generator, None)
if first_chunk is None:
    # Happens when the dataset is empty, or when this cell is re-run after the
    # generator has already been consumed.
    raise ValueError(
        "embeddings_generator yielded no data: the dataset is empty or the "
        "generator was already consumed (re-create it before re-running this cell)."
    )

first_chunk_shape = np.shape(first_chunk)
if len(first_chunk_shape) != 2 or first_chunk_shape[1] != d:
    raise ValueError(
        f"expected embeddings of shape (n, {d}), got {first_chunk_shape}"
    )

# FAISS works on C-contiguous float32 matrices; this is a no-op when the
# normalised chunk already has that layout and dtype.
first_chunk_normalized = np.ascontiguousarray(
    normalize_embeddings(first_chunk), dtype=np.float32
)
index.train(first_chunk_normalized)
index.add(first_chunk_normalized)

# Add the remaining chunks.
for embeddings_chunk in embeddings_generator:
    embeddings_chunk_normalized = np.ascontiguousarray(
        normalize_embeddings(embeddings_chunk), dtype=np.float32
    )
    index.add(embeddings_chunk_normalized)

faiss.write_index(index, '/data/shared/nlp/clean_index.ivf')