In [4]:
import random
import uuid

import duckdb
import numpy as np
import pandas as pd
from tqdm import tqdm

# Load the precomputed embeddings; later cells read its 'filename' and 'emb' columns.
embeddings = pd.read_parquet('embeddings.parquet')
# Alternative source kept for reference:
#embeddings = np.load('embeddings.npy')

# Connect to the database file (it is created if it does not exist yet)
conn = duckdb.connect('embeddings_2.db')

# Install and load the vss (vector similarity search) extension.
# INSTALL does nothing when the extension is already present, but without it
# LOAD fails on a machine where vss has never been installed.
conn.execute("""INSTALL vss;""")
conn.execute("""LOAD vss;""")
# Per the vss documentation, HNSW indexes on a file-backed database need this
# experimental persistence flag.
conn.execute("""SET hnsw_enable_experimental_persistence = True""")

<duckdb.duckdb.DuckDBPyConnection at 0x1b7c826b830>

In [5]:
# Query the catalog function that lists DuckDB extensions, keeping only the
# name, the installed flag and the description, and fetch it as a DataFrame.
extensions_sql = """SELECT extension_name, installed, description
                FROM duckdb_extensions();"""
extensoes_duckdb = conn.execute(extensions_sql).fetchdf()

In [7]:
# Display the version string of the imported duckdb package.
duckdb.__version__

'1.0.0'

In [6]:
# Display the DataFrame of extensions fetched from duckdb_extensions().
extensoes_duckdb

Unnamed: 0,extension_name,installed,description
0,arrow,False,A zero-copy data integration between Apache Ar...
1,autocomplete,False,Adds support for autocomplete in the shell
2,aws,False,Provides features that depend on the AWS SDK
3,azure,False,Adds a filesystem abstraction for Azure blob s...
4,delta,False,Adds support for Delta Lake
5,excel,False,Adds support for Excel-like format strings
6,fts,True,Adds support for Full-Text Search Indexes
7,httpfs,False,Adds support for reading and writing files ove...
8,iceberg,False,Adds support for Apache Iceberg
9,icu,True,Adds support for time zones and collations usi...


In [2]:
# DDL for the vector table: a string primary key plus a fixed-size array of
# 512 floats. IF NOT EXISTS makes the statement safe to run again.
create_table_sql = '''
CREATE TABLE IF NOT EXISTS embeddings (
    id VARCHAR PRIMARY KEY,
    emb FLOAT[512]
)
'''
conn.execute(create_table_sql)

<duckdb.duckdb.DuckDBPyConnection at 0x28e2b67ef70>

In [3]:
# Parameterised insert statement (id, emb); a row whose id is already in the
# table is skipped instead of raising a primary-key violation.
insert_query = (
    'INSERT INTO embeddings (id, emb) VALUES (?, ?) '
    'ON CONFLICT (id) DO NOTHING'
)

In [4]:
# Spot-check the data: print the file name of three randomly sampled rows.
sampled_rows = embeddings.sample(3)
for file_name in sampled_rows['filename']:
    print(file_name)

Janica_Kostelic_0001.jpg
Tim_Floyd_0002.jpg
Benazir_Bhutto_0004.jpg


In [5]:
# Insert every (filename, embedding) pair into the embeddings table, using the
# file name as the primary key.
#
# All inserts run inside ONE explicit transaction. Without it every execute()
# is committed on its own, which is what made the row-by-row load slow.
# If any insert fails the whole batch is rolled back and the error re-raised,
# so the table is never left half-loaded by this cell.
rows = zip(embeddings['filename'], embeddings['emb'])

conn.begin()
try:
    for filename, emb in tqdm(rows, total=len(embeddings), desc="Processando embeddings"):
        # .tolist() converts the stored array into a plain Python list, which
        # is bound to the FLOAT[512] column.
        conn.execute(insert_query, (filename, emb.tolist()))
except Exception:
    conn.rollback()
    raise
else:
    conn.commit()

Processando embeddings:   0%|          | 0/13233 [00:00<?, ?it/s]

Processando embeddings: 100%|██████████| 13233/13233 [03:47<00:00, 58.23it/s]


In [6]:
# Peek at the first five stored rows as a DataFrame to confirm the load.
preview_sql = """SELECT * FROM embeddings LIMIT 5"""
conn.execute(preview_sql).fetchdf()

Unnamed: 0,id,emb
0,Aaron_Eckhart_0001.jpg,"[-0.2617725, 0.13214485, 0.45549777, 0.5045976..."
1,Aaron_Guiel_0001.jpg,"[-0.12853914, 0.22124381, 0.6593986, 0.3849791..."
2,Aaron_Patterson_0001.jpg,"[-0.3216693, -0.09038943, 0.6965564, 0.2071240..."
3,Aaron_Peirsol_0001.jpg,"[-0.25898868, 0.5890372, 0.4740734, 0.07177776..."
4,Aaron_Peirsol_0002.jpg,"[-0.29221955, 0.22037442, 0.3486757, 0.3206531..."


In [7]:
# Create the HNSW index on the embedding column.
# IF NOT EXISTS keeps this cell re-runnable: the database is a persistent file,
# so on a second run the index already exists and a plain CREATE INDEX fails.
conn.execute("""CREATE INDEX IF NOT EXISTS idx ON embeddings USING HNSW (emb);""")

<duckdb.duckdb.DuckDBPyConnection at 0x28e2b67ef70>

In [12]:
# Pick a random row to use as the query vector.
# randrange excludes the upper bound, so the position is always valid for
# iloc (randint(0, len) could return len itself and raise IndexError).
aleatorio = random.randrange(len(embeddings))
consulta = embeddings.iloc[aleatorio]

# Double-quoted f-string: reusing the same quote type inside the braces only
# parses on Python 3.12+.
print(f"Realizando busca com {consulta['filename']}")

# Run the search: order the table by distance to the query vector and keep
# the five closest rows. The vector is interpolated as a list literal cast to
# FLOAT[512]; it comes from the local embeddings frame, not from user input.
conn.execute(f"""SELECT * 
                FROM embeddings 
                ORDER BY array_distance(emb, {consulta['emb'].tolist()}::FLOAT[512]) 
                LIMIT 5;""").fetch_df()

Realizando busca com Marco_Archer_Cardoso_Moreira_0001.jpg


Unnamed: 0,id,emb
0,Marco_Archer_Cardoso_Moreira_0001.jpg,"[-0.3044981, 0.121498756, 0.5377461, 0.4247412..."
1,Odilia_Collazo_0001.jpg,"[-0.36626455, -0.032252192, 0.39912647, 0.3220..."
2,Dyab_Abou_Jahjah_0001.jpg,"[-0.23786123, -0.08731351, 0.4422093, 0.335136..."
3,Colin_Farrell_0001.jpg,"[-0.22449528, 0.15800536, 0.37670103, 0.302245..."
4,Charlie_Sheen_0001.jpg,"[-0.30399513, -0.036129907, 0.5045782, 0.36948..."


In [5]:
# By default the HNSW index is created using Euclidean distance, l2sq
# (L2-norm squared). A different metric can be chosen with the `metric`
# option; the available options are l2sq, cosine and ip (inner product).
#
# The index is built on `emb`, the vector column of the embeddings table
# (the table has no `vec` column). IF NOT EXISTS keeps the cell re-runnable
# against the persistent database file.
conn.execute("""CREATE INDEX IF NOT EXISTS my_hnsw_cosine_index
                ON embeddings
                USING HNSW (emb)
                WITH (metric = 'cosine');""")

<duckdb.duckdb.DuckDBPyConnection at 0x1a3546e44b0>