Install required packages

In [18]:
%pip install -q -r requirements.txt

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


Import packages and show versions

In [None]:
import numpy as np
import json
import sys

print("Python ", sys.version)
print("Numpy ", np.__version__)

PYTHON  3.12.6 (MAIN, SEP  9 2024, 21:33:51) [CLANG 18.1.8 ]
Numpy  1.26.4


Set up transformer

"SentenceTransformer", is the SentenceTransformer class is part of the SentenceTransformers library, which is designed to simplify the process of generating semantically meaningful embeddings for sentences, paragraphs, or larger texts.


_all-MiniLM-L6-v2_ is a MiniLM model fine tuned on a large dataset of over 1 billion training pairs. WHAT OTHERS ARE THERE AND pros and cons of each

In [20]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")

In [21]:
# Experiment with the model
# The model takes a list of strings as input
# It then returns a numpy array with the embeddings for each sentence
sentences = [
    "The Yankees, the first mechanicians in the world, are engineers - just as the Italians are musicians and the Germans metaphysicians - by right of birth. Nothing is more natural, therefore, than to perceive them applying their audacious ingenuity to the science of gunnery.",
    "If Providence has created the stars and the planets, man has called the cannonball into existence",
    "Imagine a society in which there were neither rich nor poor",
    "We may brave human laws, but we cannot resist natural ones.",
]

# The embeddings are 384-dimensional vectors
embeddings = model.encode(sentences)

print(embeddings.shape)



(4, 384)


In [22]:
# Now lets try with a single question

user_question = "What is the meaning of life?"

# NOTE: convert to list to avoid numpy serialization issues!!!
question_embedding = model.encode(user_question).tolist()

print(question_embedding)

[-0.058717917650938034, 0.09474466741085052, -0.062446754425764084, 0.022712193429470062, -0.05048515647649765, 0.003738639410585165, 0.09869316220283508, -0.015877489000558853, 0.0985545814037323, 0.004240574315190315, 0.037245701998472214, -0.06093671917915344, -0.016290348023176193, -0.008019939996302128, -0.004419141449034214, -0.037102941423654556, -0.07821599394083023, -0.03404517099261284, -0.0024612615816295147, -0.020916618406772614, 0.04273824021220207, 0.011293352581560612, -0.03769345209002495, 0.05371250584721565, -0.09862415492534637, 0.11139770597219467, -0.005809531081467867, 0.023701783269643784, -0.0002125901955878362, 0.02244630455970764, 0.02845384180545807, 0.05352574959397316, 0.08441056311130524, -0.02425570599734783, 0.021382782608270645, 0.035087231546640396, 0.09516579657793045, -0.03466906026005745, 0.020812030881643295, 0.018211036920547485, -0.03947436064481735, -0.024760212749242783, -0.005441810470074415, -0.003152210731059313, 0.028805026784539223, -0.00

Next step,

build query/vector search.

mongodb vectorSearch, pipeline, match, group, sort and limit

In [23]:
DOC_LIMIT = 10
ATLAS_VECTOR_INDEX = "context"
DOCUMENT_EMBEDDINGS_FIELD = "embeddings"
CLIENT_ID="client_id" # USed to segregate data

def format_mql_query(question: list):
    return [
        {
            "$vectorSearch": {
                "index": ATLAS_VECTOR_INDEX,
                "queryVector": question,
                "path": DOCUMENT_EMBEDDINGS_FIELD,
                "numCandidates": 180,
                "limit": 120,
                "filter": {"rating": {"$gte": 3}},
            }
        },
        {
            "$project": {
                "text": 1,  # review  field
                "date_upload": 1,  # date
                "client_id": CLIENT_ID,  # locator for restaurant
                "score": {"$meta": "vectorSearchScore"},  # score
            }
        },
        {"$match": {"$expr": {"$gt": [{"$strLenCP": "$text"}, 100]}}},
        {
            "$group": {
                "_id": "$gmap_id",
                "context": {"$push": {"text": "$text", "date_upload": "$date_upload"}},
                "n": {"$count": {}},
                "max_score": {"$max": "$score"},
                "avg_score": {"$avg": "$score"},
            }
        },
        {"$sort": {"n": -1}},
        {"$limit": DOC_LIMIT},
    ]

In [24]:
# Take the users (now transformed/embedded) question and format it for the MQL query
vector_search = format_mql_query(question_embedding)

# print(vector_search)

In [25]:
# Here we will create a DB connection, send the query and get the results
# For now we will just print the query
print(json.dumps(vector_search, indent=2))

[
  {
    "$vectorSearch": {
      "index": "context",
      "queryVector": [
        -0.058717917650938034,
        0.09474466741085052,
        -0.062446754425764084,
        0.022712193429470062,
        -0.05048515647649765,
        0.003738639410585165,
        0.09869316220283508,
        -0.015877489000558853,
        0.0985545814037323,
        0.004240574315190315,
        0.037245701998472214,
        -0.06093671917915344,
        -0.016290348023176193,
        -0.008019939996302128,
        -0.004419141449034214,
        -0.037102941423654556,
        -0.07821599394083023,
        -0.03404517099261284,
        -0.0024612615816295147,
        -0.020916618406772614,
        0.04273824021220207,
        0.011293352581560612,
        -0.03769345209002495,
        0.05371250584721565,
        -0.09862415492534637,
        0.11139770597219467,
        -0.005809531081467867,
        0.023701783269643784,
        -0.0002125901955878362,
        0.02244630455970764,
        0.0284538

In [26]:
def craft_prompt(user_prompt: str, vector_search_documents: list) -> str:
    # Only take the first 10 documents from the list
    docs_to_consider = "\n\n".join(
        [json.dumps(doc) for doc in vector_search_documents[:10]]
    )

    llm_prompt = (
        f"Using these documents answer the user's question in the style of Donald Trump\n\n"
        f"User Question: {user_prompt}\nDocuments: {docs_to_consider}."
    )

    return llm_prompt

In [27]:
print(craft_prompt(user_question, vector_search))

Using these documents answer the user's question in the style of Donald Trump

User Question: What is the meaning of life?
Documents: {"$vectorSearch": {"index": "context", "queryVector": [-0.058717917650938034, 0.09474466741085052, -0.062446754425764084, 0.022712193429470062, -0.05048515647649765, 0.003738639410585165, 0.09869316220283508, -0.015877489000558853, 0.0985545814037323, 0.004240574315190315, 0.037245701998472214, -0.06093671917915344, -0.016290348023176193, -0.008019939996302128, -0.004419141449034214, -0.037102941423654556, -0.07821599394083023, -0.03404517099261284, -0.0024612615816295147, -0.020916618406772614, 0.04273824021220207, 0.011293352581560612, -0.03769345209002495, 0.05371250584721565, -0.09862415492534637, 0.11139770597219467, -0.005809531081467867, 0.023701783269643784, -0.0002125901955878362, 0.02244630455970764, 0.02845384180545807, 0.05352574959397316, 0.08441056311130524, -0.02425570599734783, 0.021382782608270645, 0.035087231546640396, 0.095165796577930