In [1]:
!pip install vectordb

Collecting vectordb
  Downloading vectordb-0.0.19.tar.gz (27 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting jina>=3.20.0
  Downloading jina-3.22.1-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m115.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting docarray[hnswlib]>=0.34.0
  Downloading docarray-0.39.0-py3-none-any.whl (265 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.1/265.1 kB[0m [31m85.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting orjson>=3.8.2
  Downloading orjson-3.9.9-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (138 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m138.6/138.6 kB[0m [31m55.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pydantic>=1.10.8
  Downloading pydantic-2.4.2-py3-none-any.whl (395 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━

In [11]:
import numpy as np
from collections import defaultdict
from typing import List, Tuple

import pandas as pd

In [12]:
class VectorDatabase:
    """Minimal in-memory vector store with brute-force cosine-similarity search.

    Vectors are kept in a dict keyed by string; inserting under an existing
    key overwrites the previous vector.
    """

    def __init__(self):
        # A plain dict on purpose: ``defaultdict(np.ndarray)`` cannot build a
        # default value (``np.ndarray()`` requires a shape), so indexing a
        # missing key raised a confusing TypeError instead of a KeyError.
        self.vectors = {}

    def insert(self, key: str, vector: np.ndarray) -> None:
        """Store ``vector`` under ``key``, replacing any existing entry."""
        self.vectors[key] = vector

    def search(self, query_vector: np.ndarray, k: int) -> List[Tuple[str, float]]:
        """Return the ``k`` stored vectors most similar to ``query_vector``.

        Every stored vector is scored against the query (linear scan), so the
        cost grows with the number of stored vectors.

        Args:
            query_vector: 1-D vector to compare against the stored vectors.
            k: Maximum number of results to return.

        Returns:
            ``(key, similarity)`` pairs sorted by descending cosine similarity.
            The list is empty when ``k <= 0`` or nothing has been inserted.
        """
        if k <= 0:
            # A negative k would otherwise slice off the *worst* matches.
            return []
        scored = [
            (key, self._cosine_similarity(query_vector, vector))
            for key, vector in self.vectors.items()
        ]
        scored.sort(key=lambda pair: pair[1], reverse=True)
        return scored[:k]

    def retrieve(self, key: str) -> np.ndarray:
        """Return the vector stored under ``key``, or ``None`` if it is absent."""
        return self.vectors.get(key)

    @staticmethod
    def _cosine_similarity(v1: np.ndarray, v2: np.ndarray) -> float:
        """Cosine similarity of two 1-D vectors; 0.0 if either has zero norm."""
        # Kept inside the class so ``search`` does not depend on a module-level
        # helper being defined by some other notebook cell.
        denominator = np.linalg.norm(v1) * np.linalg.norm(v2)
        if denominator == 0:
            # Avoid 0/0 -> NaN, which would also corrupt the sort order.
            return 0.0
        return float(np.dot(v1, v2) / denominator)

In [13]:
def cosine(v1: np.ndarray, v2: np.ndarray) -> float:
    """Compute the cosine similarity between two 1-D vectors.

    Args:
        v1: First vector.
        v2: Second vector, same length as ``v1``.

    Returns:
        ``dot(v1, v2) / (||v1|| * ||v2||)``. When either vector has zero norm
        the similarity is undefined; 0.0 is returned instead of NaN.
    """
    denominator = np.linalg.norm(v1) * np.linalg.norm(v2)
    if denominator == 0:
        # Guard against 0/0, which would yield NaN and a RuntimeWarning.
        return 0.0
    return np.dot(v1, v2) / denominator

In [23]:
# Load the embeddings from the CSV file.
# header=None makes pandas treat the first line as data and label the columns 0, 1, 2, ...
df = pd.read_csv("cord_19_embeddings_2022-06-02_test.csv", header=None)

In [28]:
# Preview the first rows; in the recorded output column 0 is the document key
# and the remaining columns are the embedding values.
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,759,760,761,762,763,764,765,766,767,768
0,ug7v899j,-2.939984,-6.312201,-1.045903,5.164163,-0.325646,-2.507413,1.735609,1.936357,0.622501,...,-0.741917,3.394711,3.163691,1.37289,-4.107113,-3.817415,1.897619,5.811529,-2.932384,0.305205
1,02tnwd4m,4.688466,-6.28446,1.228246,5.5699,5.224824,-0.406792,-2.296234,-4.276381,-0.781381,...,-0.850256,2.700679,-1.659213,-1.057281,-1.801275,0.276088,0.880289,4.176544,-5.686561,0.512474
2,ejv2xln0,0.55992,-6.754057,2.271945,7.644496,3.592816,-1.320622,-0.752549,1.384198,0.892751,...,2.043988,4.330732,-0.018087,4.812413,-2.565279,-2.131349,1.669153,4.521635,-2.847652,-1.536697
3,2b73a28n,2.35254,-3.997074,2.399774,4.594238,2.343638,1.867916,-3.466859,-3.148963,-0.073157,...,0.109116,3.49289,-3.039974,1.304383,-1.309967,-2.441383,-1.25418,-0.573785,-1.402024,3.922583
4,9785vg6d,-1.290922,-5.849864,1.557572,7.989901,5.805941,-4.537519,1.914945,-1.324861,0.861168,...,1.049299,1.902446,2.64463,1.695888,-2.38642,-3.247509,2.203239,4.226255,1.244299,-4.117685


In [17]:
# l = []

# for idx, row in df.iterrows():
#     l.append(np.asarray(row[1:].tolist()))

In [19]:
# Create the empty in-memory store that the embeddings are inserted into.
vector_db = VectorDatabase()

In [26]:
# insert embeddings into the database
# Column 0 holds the key; every remaining column is one embedding component.
# The numeric columns are converted to a single float matrix up front instead of
# using DataFrame.iterrows, which builds an object-dtype Series for every row.
keys = df[0]
# np.array(...) copies, so the stored vectors never alias the DataFrame's data;
# order="C" keeps each row contiguous for the dot products done during search.
embeddings = np.array(df.iloc[:, 1:].to_numpy(), dtype=float, order="C")
for key, embedding in zip(keys, embeddings):
    vector_db.insert(key, embedding)

In [29]:
# Sanity check: look up the stored vector for the first key shown in the preview.
# retrieve() yields None for a key that was never inserted.
vector_db.retrieve("ug7v899j")

array([-2.93998361e+00, -6.31220055e+00, -1.04590309e+00,  5.16416264e+00,
       -3.25646371e-01, -2.50741339e+00,  1.73560870e+00,  1.93635666e+00,
        6.22501016e-01,  1.56131625e+00,  2.74769139e+00, -6.93097711e-01,
       -3.71040940e-01, -5.88572502e+00, -1.09748483e+00, -4.65481567e+00,
       -2.31646371e+00,  2.87495947e+00,  3.87578082e+00, -2.95981139e-01,
        1.77025485e+00,  4.40059328e+00, -4.05934751e-01, -1.71624398e+00,
       -4.76663411e-01, -2.57071376e+00,  4.59941769e+00,  1.63527846e-01,
       -1.83473134e+00,  2.55786443e+00, -3.79167843e+00, -6.22489691e+00,
       -3.53383869e-01, -1.08378506e+00, -1.04406953e-01, -2.09846401e+00,
       -1.63153899e+00, -5.29200658e-02, -4.06286144e+00, -1.28650916e+00,
       -1.26636600e+00, -1.17296696e+00,  4.73816156e+00, -3.18683219e+00,
       -1.56449735e+00,  7.11834431e-01, -3.77992725e+00,  8.74405861e-01,
        3.15015078e+00, -1.44715142e+00,  1.34749025e-01,  1.89116776e+00,
        2.67670965e+00, -