In [3]:
from fastembed import TextEmbedding
from tagmatch.vec_db import Embedder
import numpy as np
from tagmatch.vec_db import VecDB
from pydantic_settings import BaseSettings
import pickle
from sklearn.decomposition import PCA
from create_dummy_tags import tags
import time
class Settings(BaseSettings):
    """Notebook configuration loaded by pydantic-settings from the environment / ``.env``.

    In the cells of this notebook, ``model_name`` and ``cache_dir`` are passed
    to ``Embedder`` and ``reduced_embed_dim`` is used as the PCA component
    count and as the vector size of the reduced collection. The ``qdrant_*``
    fields and ``embed_dim`` are declared but not referenced in the cells
    visible here.
    """

    model_name: str
    cache_dir: str
    qdrant_host: str
    qdrant_port: int
    qdrant_collection: str
    reduced_embed_dim: int
    # A dimension is a count: validate it as an int, consistent with
    # reduced_embed_dim (it was previously declared as str, which would hand
    # callers the raw text from the environment instead of a number).
    embed_dim: int

    class Config:
        # Values not present in the process environment are read from this file.
        env_file = ".env"


# Instantiating triggers loading and validation of every field above.
settings = Settings()





In [9]:
# Build the embedder from the configured model, embed every tag in order,
# and collect the results into a single NumPy array.
embedder = Embedder(
    model_name=settings.model_name,
    cache_dir=settings.cache_dir,
)
tag_embeddings = list(map(embedder.embed, tags))
embed_vec = np.asarray(tag_embeddings)

Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 27165.18it/s]


In [10]:
# Fit a PCA with settings.reduced_embed_dim components on the tag embeddings,
# then display the total fraction of variance those components retain.
# random_state is pinned so the fit stays reproducible in case scikit-learn's
# automatic solver selection chooses the randomized SVD solver; with the exact
# solver the seed has no effect.
pca = PCA(n_components=settings.reduced_embed_dim, random_state=0)
pca.fit(embed_vec)
sum(pca.explained_variance_ratio_)

0.9513890146045014

In [12]:
# Project the full tag embeddings onto the fitted PCA components and display
# the shape of the reduced array as a sanity check.
red_embed_vecs = pca.transform(embed_vec)
red_embed_vecs.shape

(189, 112)

In [16]:
# Construct a VecDB for the collection that will hold the PCA-reduced vectors;
# vector_size equals the number of PCA components.
# NOTE(review): host and port are hard-coded here although Settings declares
# qdrant_host / qdrant_port — confirm whether they should come from settings.
vec_db = VecDB(
    host="http://localhost",
    port=6333,
    collection="pca_train_test_reduced",
    vector_size=settings.reduced_embed_dim,
)

In [17]:
# Store every reduced vector together with its tag name as the payload.
for vec, name in zip(red_embed_vecs, tags):
    payload = {"name": name}
    vec_db.store(vec, payload)

In [19]:

# Benchmark the reduced collection: run 100 passes, each querying vec_db with
# every reduced vector and asking for 5 matches, then print the mean
# wall-clock seconds per pass.
# time.perf_counter() replaces time.time(): it is monotonic and uses the
# highest-resolution clock available, so measured intervals are not distorted
# by system clock adjustments.
measured_times = []
for i in range(100):
    start = time.perf_counter()
    reduced_vector_matches = [vec_db.find_closest(query_vector, 5) for query_vector in red_embed_vecs]
    end = time.perf_counter()
    measured_times.append(end - start)
print(np.mean(measured_times))

0.5253142952919007


In [60]:
# Show the matches found for the first reduced vector (results of the last timing pass).
reduced_vector_matches[0]

[ScoredPoint(id=10356360874192234358, version=0, score=0.99999994, payload={'name': 'Apollo 11'}, vector=None, shard_key=None, order_value=None),
 ScoredPoint(id=9793711092478664242, version=46, score=0.3996141, payload={'name': 'Moon Mission'}, vector=None, shard_key=None, order_value=None),
 ScoredPoint(id=12207098595657543863, version=1, score=0.3779359, payload={'name': 'Moon'}, vector=None, shard_key=None, order_value=None),
 ScoredPoint(id=12894050092224864253, version=80, score=0.37239692, payload={'name': 'Lunar Base'}, vector=None, shard_key=None, order_value=None),
 ScoredPoint(id=11172108735014080718, version=49, score=0.28158426, payload={'name': 'Saturn Mission'}, vector=None, shard_key=None, order_value=None)]

In [24]:
# Construct a second VecDB for the collection holding the full-size
# (non-reduced) embeddings; vector_size is taken from embedder.embedding_dim.
# NOTE(review): host and port are hard-coded here although Settings declares
# qdrant_host / qdrant_port — confirm whether they should come from settings.
vec_db_full = VecDB(
    host="http://localhost",
    port=6333,
    collection="train_test",
    vector_size=embedder.embedding_dim,
)

In [25]:
# Store every full-size embedding together with its tag name as the payload.
for vec, name in zip(embed_vec, tags):
    payload = {"name": name}
    vec_db_full.store(vec, payload)

In [28]:

# Benchmark the full-size collection: run 100 passes, each querying
# vec_db_full with every full embedding and asking for 5 matches, then print
# the mean wall-clock seconds per pass.
# time.perf_counter() replaces time.time(): it is monotonic and uses the
# highest-resolution clock available, so measured intervals are not distorted
# by system clock adjustments.
measured_times = []
for i in range(100):
    start = time.perf_counter()
    vector_matches = [vec_db_full.find_closest(query_vector, 5) for query_vector in embed_vec]
    end = time.perf_counter()
    measured_times.append(end - start)
print(np.mean(measured_times))


0.5317504286766053


In [30]:
# Show the matches found for the first full-size embedding (results of the last timing pass).
vector_matches[0]

[ScoredPoint(id=9878243588891342145, version=0, score=0.9999999, payload={'name': 'Apollo 11'}, vector=None, shard_key=None, order_value=None),
 ScoredPoint(id=11749326104151005753, version=46, score=0.7471009, payload={'name': 'Moon Mission'}, vector=None, shard_key=None, order_value=None),
 ScoredPoint(id=9865584330471821811, version=1, score=0.72500336, payload={'name': 'Moon'}, vector=None, shard_key=None, order_value=None),
 ScoredPoint(id=11089151293137662142, version=80, score=0.7157799, payload={'name': 'Lunar Base'}, vector=None, shard_key=None, order_value=None),
 ScoredPoint(id=12676724818710015134, version=14, score=0.6981933, payload={'name': 'NASA'}, vector=None, shard_key=None, order_value=None)]

In [61]:
# Serialise the fitted PCA to pca.pkl. The context manager guarantees the
# file is flushed and closed even if pickling raises; the previous bare
# open() left the handle to be closed by garbage collection.
with open("pca.pkl", "wb") as pca_file:
    pickle.dump(pca, pca_file)