In [11]:
from dotenv import load_dotenv
# Load variables from a .env file into the process environment.
# NOTE(review): presumably this supplies the API keys for the hosted embedding
# providers configured in later cells — confirm which variables are required.
load_dotenv()

True

In [12]:
from langchain.schema import Document
import json

def prepare_qa_documents(file_path):
    """Load a JSON file of Q&A records and wrap each record in a ``Document``.

    Args:
        file_path: Path to a JSON file holding a list of objects, each with
            a "question" and an "answer" key.

    Returns:
        list[Document]: One document per record. The answer becomes
        ``page_content`` and the question is stored under
        ``metadata["question"]``.

    Raises:
        KeyError: If a record lacks the "question" or "answer" key.
    """
    # Decode explicitly as UTF-8 instead of the platform default: the data
    # contains non-ASCII characters (e.g. typographic apostrophes), which would
    # be garbled or rejected under a different locale encoding.
    with open(file_path, "r", encoding="utf-8") as f:
        qa_data = json.load(f)

    return [
        Document(
            page_content=item["answer"],
            metadata={"question": item["question"]},
        )
        for item in qa_data
    ]

# Document corpus that the FAISS indexes in a later cell are built from.
test_documents = prepare_qa_documents("../data/home0001qa.json")

In [13]:
def prepare_qa_texts(file_path):
    """Load a JSON file of Q&A records and flatten each record to one string.

    Args:
        file_path: Path to a JSON file holding a list of objects, each with
            a "question" and an "answer" key.

    Returns:
        list[str]: One ``"Q: <question> A: <answer>"`` string per record, in
        file order.

    Raises:
        KeyError: If a record lacks the "question" or "answer" key.
    """
    # Decode explicitly as UTF-8 instead of the platform default: the data
    # contains non-ASCII characters (e.g. typographic apostrophes), which would
    # be garbled or rejected under a different locale encoding.
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    return [f"Q: {item['question']} A: {item['answer']}" for item in data]

# Flattened "Q: ... A: ..." strings from the same data file; print the first
# entry as a quick sanity check of the format.
test_texts = prepare_qa_texts("../data/home0001qa.json")
print(test_texts[:1])

['Q: Do i own my 0001 home outright? A: When you buy a 0001 home, you own the title in the traditional way. If you need, we’ll help you find the right mortgage and can recommend real estate lawyers. You keep full legal ownership of your home, with the added benefit that you can spend time in other locations whenever you want.']


In [14]:
from langchain_openai import OpenAIEmbeddings

# OpenAI embedding client for "text-embedding-3-large", requesting 1024-dimensional vectors.
# NOTE(review): presumably authenticates via an API key loaded from the
# environment by load_dotenv() — confirm.
openai_embeddings = OpenAIEmbeddings(
    model="text-embedding-3-large",
    dimensions=1024  # size of the embeddings you want returned.
)

In [15]:
from langchain_huggingface import HuggingFaceEmbeddings

def get_hf_embeddings(model_name):
    """Create a ``HuggingFaceEmbeddings`` wrapper for the given model.

    Args:
        model_name: Model identifier forwarded as ``model_name``.

    Returns:
        The constructed ``HuggingFaceEmbeddings`` instance.
    """
    return HuggingFaceEmbeddings(model_name=model_name)

# MPNet sentence-transformers model, built through the helper defined in this cell.
mpnet_embeddings = get_hf_embeddings("sentence-transformers/all-mpnet-base-v2")

In [16]:
from langchain_community.embeddings import HuggingFaceBgeEmbeddings

def get_bge_embeddings(model_name, model_kwargs, encode_kwargs):
    """Create a ``HuggingFaceBgeEmbeddings`` wrapper from the given settings.

    Args:
        model_name: Model identifier forwarded as ``model_name``.
        model_kwargs: Dict forwarded as ``model_kwargs``.
        encode_kwargs: Dict forwarded as ``encode_kwargs``.

    Returns:
        The constructed ``HuggingFaceBgeEmbeddings`` instance.
    """
    settings = {
        "model_name": model_name,
        "model_kwargs": model_kwargs,
        "encode_kwargs": encode_kwargs,
    }
    return HuggingFaceBgeEmbeddings(**settings)

# BGE small (English) encoder; encode_kwargs asks for normalized embeddings.
# NOTE(review): "device": "cuda" is hard-coded — presumably this cell fails on a
# machine without a GPU; confirm, or fall back to "cpu" where needed.
model_name = "BAAI/bge-small-en"
model_kwargs = {"device": "cuda"}
encode_kwargs = {"normalize_embeddings": True}
bge_embeddings = get_bge_embeddings(model_name, model_kwargs, encode_kwargs)

### Nomic
https://huggingface.co/nomic-ai/nomic-embed-text-v1.5

Important: the text prompt must include a task instruction prefix, instructing the model which task is being performed.  

For example, if you are implementing a RAG application, you embed your documents as `search_document: <text here>` and embed your user queries as `search_query: <text here>`.  

Embed texts as documents:  
`documents = ['search_document: TSNE is a dimensionality reduction algorithm created by Laurens van Der Maaten']`  

Embed texts as queries:  
`queries = ['search_query: Who is Laurens van Der Maaten?']`  


In [None]:
model_name = "nomic-ai/nomic-embed-text-v1.5"
model_kwargs = {'device': 'cuda', 'trust_remote_code': True}
encode_kwargs = {'normalize_embeddings': True}
# Nomic requires a task prefix on every input. The wrapper is built directly
# rather than through get_bge_embeddings(), because that helper cannot pass the
# instruction fields: queries would get the BGE default instruction and
# documents would get no prefix at all.
nomic_embeddings = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
    query_instruction="search_query: ",      # prepended by embed_query()
    embed_instruction="search_document: ",   # prepended by embed_documents()
)

<All keys matched successfully>


In [1]:
from langchain_nomic.embeddings import NomicEmbeddings

# Alternative Nomic client from the langchain_nomic package, configured with
# inference_mode='local'.
# NOTE(review): no other cell visible here references this object — confirm it is still needed.
nomic_embeddings_alt = NomicEmbeddings(model='nomic-embed-text-v1.5', inference_mode='local')

### Stella
https://huggingface.co/dunzhang/stella_en_1.5B_v5

The models have multiple dimensions: 512, 768, 1024, 2048, 4096, 6144 and 8192.

The higher the dimension, the better the performance. Generally speaking, 1024d is good enough. The MTEB score of 1024d is only 0.001 lower than 8192d.

Important: queries need one of two prompts: "s2p_query" for sentence-to-passage tasks (e.g. retrieval) or "s2s_query" for sentence-to-sentence tasks (e.g. semantic textual similarity).  
Documents don't need prompts.

Prompt for the s2p task (e.g. retrieval):

`Instruct: Given a web search query, retrieve relevant passages that answer the query.\nQuery: {query}`  

Prompt for the s2s task (e.g. semantic textual similarity):  

`Instruct: Retrieve semantically similar text.\nQuery: {query}`  


Both prompts are defined in the model's `config_sentence_transformers.json`.  



In [None]:
### EXAMPLE ###
# Illustrative snippet: encode queries with a prompt selected by name
# ("s2p_query" is the sentence-to-passage prompt described above).
# NOTE(review): `model` is not defined as an encoder by any earlier cell —
# presumably a sentence_transformers.SentenceTransformer loaded with a Stella
# checkpoint. As written this cell cannot run on a fresh kernel; confirm and
# define `model`, or keep the cell as reference only.
query_prompt_name = "s2p_query"
queries = [
    "What are some ways to reduce stress?",
    "What are the benefits of drinking green tea?",
]
query_embeddings = model.encode(queries, prompt_name=query_prompt_name)

In [22]:
model_name = "dunzhang/stella_en_1.5B_v5"
model_kwargs = {'device': 'cuda', 'trust_remote_code': True}
encode_kwargs = {'normalize_embeddings': True}
# Stella queries need the s2p (retrieval) prompt and documents need none. The
# wrapper is built directly rather than through get_bge_embeddings(), because
# that helper cannot pass the instruction fields and the BGE default query
# instruction would be used instead.
stella_15_embeddings = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
    query_instruction=(
        "Instruct: Given a web search query, retrieve relevant passages "
        "that answer the query.\nQuery: "
    ),
    embed_instruction="",  # documents are embedded without a prompt
)

In [23]:
model_name = "dunzhang/stella_en_400M_v5"
model_kwargs = {'device': 'cuda', 'trust_remote_code': True}
encode_kwargs = {'normalize_embeddings': True}
# Stella queries need the s2p (retrieval) prompt and documents need none. The
# wrapper is built directly rather than through get_bge_embeddings(), because
# that helper cannot pass the instruction fields and the BGE default query
# instruction would be used instead.
stella_400_embeddings = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
    query_instruction=(
        "Instruct: Given a web search query, retrieve relevant passages "
        "that answer the query.\nQuery: "
    ),
    embed_instruction="",  # documents are embedded without a prompt
)

Some weights of the model checkpoint at dunzhang/stella_en_400M_v5 were not used when initializing NewModel: ['new.pooler.dense.bias', 'new.pooler.dense.weight']
- This IS expected if you are initializing NewModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing NewModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
from langchain_cohere import CohereEmbeddings

# Cohere embedding client for the "embed-english-v3.0" model.
# NOTE(review): presumably authenticates via an API key from the environment — confirm.
cohere_embeddings = CohereEmbeddings(
    model="embed-english-v3.0"
)

In [9]:
from langchain_mistralai import MistralAIEmbeddings

# Mistral embedding client for the "mistral-embed" model.
# NOTE(review): presumably authenticates via an API key from the environment — confirm.
mistral_embeddings = MistralAIEmbeddings(
    model="mistral-embed",
)

In [None]:
from langchain_community.vectorstores import FAISS

def basic_retriever(documents, embeddings):
    """Index ``documents`` in a FAISS vector store and return a retriever over it.

    Args:
        documents: Documents passed to ``FAISS.from_documents``.
        embeddings: Embedding object passed to ``FAISS.from_documents``.

    Returns:
        The result of ``as_retriever()`` on the new store, called with no
        arguments.
    """
    # FAISS is the backend in use; a Chroma variant existed only as a
    # commented-out alternative.
    return FAISS.from_documents(documents, embeddings).as_retriever()

In [23]:
# Embedding models under comparison, keyed by a short name. The key is reused
# as the FAISS index name when the indexes are saved and reloaded, and the
# insertion order determines the order of the retrievers built later.
test_embeddings = {
    "openai": openai_embeddings,
    "mpnet": mpnet_embeddings,
    "bge": bge_embeddings
}

In [28]:
print(test_embeddings.keys())

# Build one FAISS index per embedding model and persist it under ./FAISS, using
# the dict key as the index name so each index can be reloaded by name.
# The loop variable is deliberately not called `model`: a leaked loop variable
# of that name would leave a plain string in the notebook namespace and shadow
# the `model` object that other cells call methods on.
for embedding_name, embedding_fn in test_embeddings.items():
    vectorstore = FAISS.from_documents(test_documents, embedding_fn)
    vectorstore.save_local("./FAISS", index_name=embedding_name)


dict_keys(['openai', 'mpnet', 'bge'])


In [29]:
# Reload each persisted index and expose it as an MMR retriever returning 4
# documents. Retrievers are appended in test_embeddings order, so list
# positions line up with the dict's keys.
test_retrievers = []

# The loop variable is deliberately not called `model`, so the loop does not
# leave a plain string bound to a name that other cells call methods on.
for embedding_name, embedding_fn in test_embeddings.items():
    vectorstore = FAISS.load_local(
        folder_path="./FAISS",
        embeddings=embedding_fn,
        index_name=embedding_name,
        # SECURITY: loading unpickles data from disk. This is acceptable only
        # because the index files were written by this notebook; never enable
        # it for index files from an untrusted source.
        allow_dangerous_deserialization=True,
    )
    retriever = vectorstore.as_retriever(
        search_type="mmr",
        search_kwargs={"k": 4},
    )
    test_retrievers.append(retriever)



In [34]:
# Index 0 is the retriever built from the first test_embeddings entry ("openai").
test_retrievers[0].invoke("what is home0001")

[Document(metadata={'question': 'Who is behind home0001?'}, page_content='Home0001 is initiated by a multi-disciplinary collective working across art, architecture, technology, and design, and currently based in los angeles, new york, paris, berlin, and london. Designed together with world renowned architects, 0001 homes are fully equipped and furnished and are part of an expanding network.'),
 Document(metadata={'question': 'Can i change the design of my home?'}, page_content="Legally you own your home and are free to do what you want with it. However, to maintain access to home0001's network in other locations, your home does need to meet our standards and our team can support you in making changes where desired."),
 Document(metadata={'question': 'How do i book an 0001 home somewhere else?'}, page_content="Whenever you want to spend time in other home0001 locations, just text us your dates and we'll confirm availability right away. You cover one cleaning fee each time you swap homes

In [31]:
# Index 1 is the retriever built from the second test_embeddings entry ("mpnet").
test_retrievers[1].invoke("what is home0001")

[Document(metadata={'question': 'What are the perks of joining the home0001 network?'}, page_content='Home0001 is a distributed housing collective: in addition to community dinners and events, homeowners get access to 0001 homes in other cities for free. No nightly rate; just a cleaning fee each time. Own one home, live in many places. '),
 Document(metadata={'question': 'Are 0001 homes move-in ready?'}, page_content='Developed in collaboration with world-renowned architects, every single thing in an 0001 home is thoughtfully designed with a focus on simplicity and functionality, so homebuyers can literally move in with nothing but their suitcase.'),
 Document(metadata={'question': 'Can i change the design of my home?'}, page_content="Legally you own your home and are free to do what you want with it. However, to maintain access to home0001's network in other locations, your home does need to meet our standards and our team can support you in making changes where desired."),
 Document(

In [33]:
# Index 2 is the retriever built from the third test_embeddings entry ("bge").
test_retrievers[2].invoke("what is home0001")

[Document(metadata={'question': 'How does the home0001 network function?'}, page_content='Home0001 is a distributed housing collective: in addition to community dinners and events, homeowners get access to 0001 homes in other cities for free. No nightly rate; just a cleaning fee each time. Own one home; live flexibly between multiple locations.'),
 Document(metadata={'question': 'Who founded home0001?'}, page_content='Home0001 is a new form of housing initiated by a collective of architects, artists, technologists, and designers currently based in los angeles, new york, paris, rotterdam, berlin, and london.'),
 Document(metadata={'question': 'What is home0001?'}, page_content='Home0001 is a global housing network. Each 0001 home is fully-equipped and furnished. Move in with just your suitcase. Swap cities whenever you like.'),
 Document(metadata={'question': 'Can i buy a home as a non-us citizen?'}, page_content="The process for buying an 0001 home is the same wherever you’re from. The

In [None]:
# Prompt template for the LLM judge used by query_and_validate(): it is filled
# via str.format with `expected_response` and `actual_response`, and asks the
# judge to answer 'true' or 'false'.
EVAL_PROMPT = """
Expected Response: {expected_response}
Actual Response: {actual_response}
---
(Answer with 'true' or 'false') Does the actual response match the expected response? 
"""

def test_ticket_to_ride_rules():
    """Assert that the RAG answer to a fixed question is judged to match "10 points".

    NOTE(review): the question concerns a board game rather than the home0001
    Q&A data indexed in this notebook — presumably carried over from a
    template; confirm it is the intended test case.
    """
    question = "How many points does the longest continuous train get in Ticket to Ride? (Answer with the number only)"
    expected = "10 points"
    assert query_and_validate(question=question, expected_response=expected)


def query_and_validate(question: str, expected_response: str, evaluator=None):
    """Ask the RAG pipeline a question and let an LLM judge the answer.

    Args:
        question: Question passed to ``query_rag``.
        expected_response: Reference answer placed in the judge prompt.
        evaluator: Optional judge object exposing ``invoke(prompt)``. When
            omitted, the notebook-level ``model`` is used, as before.

    Returns:
        bool: True if the judge's verdict is 'true', False if it is 'false'.

    Raises:
        ValueError: If the judge's reply contains neither 'true' nor 'false'
            as a standalone word. The message includes the reply.
    """
    import re

    response_text = query_rag(question)
    prompt = EVAL_PROMPT.format(
        expected_response=expected_response, actual_response=response_text
    )

    # Accepting the judge as an argument avoids relying on whatever happens to
    # be bound to the global name `model` when this runs.
    judge = model if evaluator is None else evaluator
    raw_result = judge.invoke(prompt)
    # Chat models return a message object carrying its text in `.content`;
    # plain LLMs return a str. Normalize both to lowercase text.
    verdict_text = str(getattr(raw_result, "content", raw_result)).strip().lower()

    print(prompt)

    # Take the first standalone 'true'/'false' as the verdict, so that words
    # such as "untrue" or a later mention of the other word cannot flip it.
    verdict = re.search(r"\b(true|false)\b", verdict_text)
    if verdict is None:
        raise ValueError(
            "Invalid evaluation result. Cannot determine if 'true' or 'false': "
            f"{verdict_text!r}"
        )

    is_match = verdict.group(1) == "true"
    # Green when the judge accepts the answer, red when it rejects it.
    color = "\033[92m" if is_match else "\033[91m"
    print(color + f"Response: {verdict_text}" + "\033[0m")
    return is_match
    