# Load data (from previous notebook)

In [None]:
# Read the sentence corpus (produced by the previous notebook) and split it on
# the "@@@" separator. The context manager closes the file handle right away
# instead of leaving it open until garbage collection.
# NOTE(review): the platform default encoding is used, as before — confirm it
# matches the encoding the file was written with.
with open("sentences.txt") as f:
    sentences = f.read().split("@@@")

In [None]:
# Sanity check: how many sentences were loaded from sentences.txt.
len(sentences)

In [None]:
import numpy as np
# Load the stored embedding matrix (presumably one row per sentence, computed
# in the previous notebook). np.load opens and closes the file itself when it
# is given a path.
sembeddings = np.load("sentences-mqa.npy")

# Vector DB

In [None]:
from pymilvus import MilvusClient

In [None]:
# Client for the vector database stored in the local file "un-77.db".
# NOTE(review): with a file path pymilvus presumably runs the embedded
# Milvus Lite engine and creates the file if it is missing — confirm.
client = MilvusClient("un-77.db")

In [None]:
# One record per embedding: the row index is the primary key, and the sentence
# at the same index is stored alongside the vector.
data = [
    {"id": idx, "vector": vec, "text": sentences[idx]}
    for idx, vec in enumerate(sembeddings)
]

We could store many more fields here, such as `country`. Such fields could then be used to filter the search results.

In [None]:
# Remove any existing "mqa" collection so that the create/insert cell below
# starts from an empty collection when the notebook is re-run.
client.drop_collection(collection_name="mqa")

In [None]:
# Create the "mqa" collection; the vector dimension is the length of the
# first embedding in `sembeddings`.
client.create_collection(collection_name="mqa", dimension=sembeddings[0].shape[0])
# Insert all records built in `data`; `res` keeps the client's insert result.
res = client.insert(collection_name="mqa", data=data)

In [None]:
# The model is needed to embed new query strings at search time.
# NOTE(review): presumably this is the same model that produced
# sentences-mqa.npy in the previous notebook — confirm, otherwise query and
# stored vectors are not comparable.
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')

In [None]:
import pandas as pd
def search(query, client, collection, model, query_prompt_name=None, top=20):
    """Embed ``query`` with ``model`` and look up the closest entries in a collection.

    Parameters
    ----------
    query : str
        Text to search for.
    client
        Vector-database client. Its ``search`` method is called with
        ``collection_name``, ``data``, ``limit`` and ``output_fields``.
    collection : str
        Name of the collection to search.
    model
        Embedding model. Its ``encode`` method is called with
        ``normalize_embeddings=True`` and ``prompt_name=query_prompt_name``.
    query_prompt_name : str, optional
        Forwarded unchanged to ``model.encode`` as ``prompt_name``.
    top : int, default 20
        Maximum number of hits to request (passed to the client as ``limit``).

    Returns
    -------
    pandas.DataFrame
        One row per hit with the columns ``id``, ``text`` and ``score``, where
        ``score`` is the ``distance`` value reported by the client. The three
        columns are present even when there are no hits.
    """
    # Turn the query text into an embedding vector.
    question_embedding = model.encode(
        query, normalize_embeddings=True, prompt_name=query_prompt_name
    )

    # A single query vector is sent, so only the first result list is relevant.
    hits = client.search(
        collection_name=collection,
        data=[question_embedding],
        limit=top,
        output_fields=["text"],
    )
    # Tolerate a completely empty response instead of raising IndexError.
    rows = hits[0] if len(hits) > 0 else []

    records = [
        {"id": r["id"], "text": r["entity"]["text"], "score": r["distance"]}
        for r in rows
    ]
    # Naming the columns explicitly keeps the schema stable for empty results.
    return pd.DataFrame(records, columns=["id", "text", "score"])

In [None]:
# Show the full text of each cell instead of truncating it at the default
# column width. `None` is the documented "no limit" value for this option.
pd.set_option('display.max_colwidth', None)

In [None]:
# Query the "mqa" collection with the default settings (no prompt name, top 20).
# NOTE(review): the later cells query "... is worse for poorer countries." —
# the wording differs slightly here; align it if the results are meant to be
# compared across models.
search("The climate crisis is worse in poorer countries", client, "mqa", model)

In [None]:
# Second embedding model, plus the stored embedding matrix from
# "sentences-mbread.npy" (np.load handles opening and closing the file).
model2 = SentenceTransformer('mixedbread-ai/mxbai-embed-large-v1')
sembeddings2 = np.load("sentences-mbread.npy")

In [None]:
# Records for the second collection: row index as id, the embedding from
# `sembeddings2` as vector, and the sentence at the same index as text.
data = [
    {"id": idx, "vector": vec, "text": sentences[idx]}
    for idx, vec in enumerate(sembeddings2)
]

In [None]:
# Drop a leftover "mxbai" collection first (as is done for "mqa"), so that
# re-running the notebook against the persistent database file does not insert
# the same ids a second time.
if client.has_collection(collection_name="mxbai"):
    client.drop_collection(collection_name="mxbai")

# The vector dimension is the length of the first embedding in `sembeddings2`.
client.create_collection(collection_name="mxbai", dimension=sembeddings2[0].shape[0])
res = client.insert(collection_name="mxbai", data=data)

In [None]:
# Query the "mxbai" collection, embedding the query with model2.
# prompt_name "query" is forwarded to model2.encode — presumably the
# retrieval-query prompt defined in the model's configuration; confirm.
search("The climate crisis is worse for poorer countries.", 
       client, "mxbai", model2, query_prompt_name="query")

In [None]:
# Third embedding model, plus the stored embedding matrix from
# "sentences-stella.npy" (np.load handles opening and closing the file).
# NOTE(review): trust_remote_code=True lets the model repository supply Python
# code that is executed locally — only keep it for repositories you trust.
model3 = SentenceTransformer("NovaSearch/stella_en_1.5B_v5", trust_remote_code=True)
sembeddings3 = np.load("sentences-stella.npy")

In [None]:
# Records for the third collection: row index as id, the embedding from
# `sembeddings3` as vector, and the sentence at the same index as text.
data = [
    {"id": idx, "vector": vec, "text": sentences[idx]}
    for idx, vec in enumerate(sembeddings3)
]

In [None]:
# Drop a leftover "stella" collection first (as is done for "mqa"), so that
# re-running the notebook against the persistent database file does not insert
# the same ids a second time.
if client.has_collection(collection_name="stella"):
    client.drop_collection(collection_name="stella")

# The vector dimension is the length of the first embedding in `sembeddings3`.
client.create_collection(collection_name="stella", dimension=sembeddings3[0].shape[0])
res = client.insert(collection_name="stella", data=data)

In [None]:
# Query the "stella" collection, embedding the query with model3.
# prompt_name "s2p_query" is forwarded to model3.encode — presumably the
# sentence-to-passage query prompt defined in the model's configuration; confirm.
search("The climate crisis is worse for poorer countries.", 
       client, "stella", model3, query_prompt_name="s2p_query")