# 2.3 Vectorstores and Embeddings - part 2

## Using other embedding models

## Setup

### Install dependencies

In [1]:
# Install pinned dependencies into the kernel's environment (%pip targets the
# running kernel, unlike !pip). Each "~=" pin allows compatible patch/minor
# upgrades only.
# Environment loading, document handling and PDF parsing:
%pip install python-dotenv~=1.0 docarray~=0.40.0 pypdf~=5.1 --upgrade --quiet
# Vector store and local embedding models.
# NOTE: if ragatouille 0.0.8.post4 is also installed, pip reports a conflict
# because it requires sentence-transformers<3.0.0.
%pip install chromadb~=0.5.18 sentence-transformers~=3.3 --upgrade --quiet 
# LangChain core plus the OpenAI, community and HuggingFace integrations:
%pip install langchain~=0.3.7 langchain_openai~=0.2.6 langchain_community~=0.3.5 langchain-huggingface~=0.1.2 --upgrade --quiet
# Markdown support for the Unstructured document loaders:
%pip install unstructured[md]~=0.16.5 --upgrade --quiet

# If running locally, you can do this instead:
#%pip install -r ../requirements.txt


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
ragatouille 0.0.8.post4 requires sentence-transformers<3.0.0,>=2.2.2, but you have sentence-transformers 3.3.1 which is incompatible.[0m[31m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.

[1m[[0m[34;49mnotice[0m[1;39;49m]

### Load environment variables

In [2]:
import os
from dotenv import find_dotenv, load_dotenv

# Locate a .env file first, then load its variables into the process
# environment. The result is assigned to `_` so the cell does not echo it.
dotenv_path = find_dotenv()
_ = load_dotenv(dotenv_path)

# Google Colab alternative: read the secrets from the Colab user data store.
# from google.colab import userdata
# os.environ["AZURE_OPENAI_API_KEY"] = userdata.get("AZURE_OPENAI_API_KEY")
# os.environ["AZURE_OPENAI_ENDPOINT"] = userdata.get("AZURE_OPENAI_ENDPOINT")

### Setup Models

In [3]:
from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings

# API version string passed to the Azure OpenAI client below.
api_version = "2024-10-01-preview"

oai_embedding_model = AzureOpenAIEmbeddings(
    model="text-embedding-3-large",
    openai_api_version=api_version,
)

# Embed a throwaway string once; the length of the returned vector is the
# embedding dimension of the model.
oai_probe_vector = oai_embedding_model.embed_query("test")
print(f"Dimension in OpenAI embedding model: {len(oai_probe_vector)}")

Dimension in OpenAI embedding model: 3072


### Setup path to data 

In [4]:
# Directory holding the input files, relative to the notebook's working
# directory. Used to build the file paths passed to the document loaders.
data_path = "../data"

## Setup HuggingFace Embedding Model

In [5]:
from langchain_core.embeddings import Embeddings
from langchain_huggingface.embeddings import HuggingFaceEmbeddings

# Open-source embedding model from HuggingFace.
# Candidates are listed on https://huggingface.co/spaces/mteb/leaderboard
# Keep exactly one `hf_model_name` assignment uncommented.
#hf_model_name = "avsolatorio/GIST-all-MiniLM-L6-v2"  # 23M params, 0.08GB mem use, 384 dim, 512 tokens, 59 avg score
#hf_model_name = "intfloat/multilingual-e5-large-instruct"  # 560M params, 2.09GB mem use, 1024 dim, 514 tokens, 63.61 avg score
hf_model_name = "HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1"  # 494M params, 1.84GB mem use, 896 dim, 131k tokens, 64.74 avg score
#hf_model_name = "Salesforce/SFR-Embedding-2_R"  # 7B params, 26GB mem use, 4096 dim, 32k tokens, 70.32 avg score
#hf_model_name = "nvidia/NV-Embed-v2"  # 7B params, 29GB mem use, 4096 dim, 32k tokens, 72.31 avg score

hf_embedding_model: Embeddings = HuggingFaceEmbeddings(model_name=hf_model_name)

# Embed a throwaway string once; the length of the returned vector is the
# embedding dimension of the selected model.
hf_probe_vector = hf_embedding_model.embed_query("test")
print(f"Dimension in HF model: {len(hf_probe_vector)}")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/208 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/600k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/55.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/833 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.63k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/80.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/370 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

Dimension in HF model: 896


### Load the documents

In [6]:
from langchain_community.document_loaders import UnstructuredMarkdownLoader

# Markdown listing files to ingest, resolved against `data_path`.
listing_files = ["listing1.md", "listing2.md", "listing3.md"]

# One loader per file, in the order given above.
loaders = [
    UnstructuredMarkdownLoader(f"{data_path}/{file_name}")
    for file_name in listing_files
]

# Flatten everything the loaders return into a single list, preserving
# loader order.
documents = [doc for loader in loaders for doc in loader.load()]


In [8]:
# Display the loaded documents (metadata and full page content) to check what
# the loaders produced before splitting.
documents

[Document(metadata={'source': '../data/listing1.md'}, page_content="Luxurious 2-Bedroom Apartment in Downtown Vista Verde\n\nPrice: $450,000 Property Type: Apartment Year Built: 2018 Square Footage: 1,200 sqft Bedrooms: 2 Bathrooms: 2 Parking: 1 dedicated parking spot in underground garage HOA Fees: $300/month Location: 1200 Grand Avenue, Downtown Vista Verde\n\nOverview\n\nExperience the epitome of urban living in this meticulously designed 2-bedroom apartment located in the heart of Downtown Vista Verde. Boasting a spacious 1,200 square feet of modern living space, this apartment offers a seamless blend of luxury, comfort, and convenience. With its prime location, you're just steps away from the city's finest dining, shopping, and entertainment options. The building's amenities include a state-of-the-art fitness center, a rooftop terrace with panoramic city views, and a private underground parking garage.\n\nLiving Room\n\nThe expansive living room serves as the heart of the apartmen

### Split

In [9]:
from langchain.text_splitter import CharacterTextSplitter

# Split the loaded documents into chunks. The splitter is configured with
# chunk_size=1000 and chunk_overlap=32 (presumably measured in characters;
# confirm against the splitter's length function).
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=32)
splitDocs = text_splitter.split_documents(documents)

# Optional: embed each chunk by hand to inspect the raw vectors. This is not
# needed for the rest of the notebook, since the vector stores receive the
# embedding model directly.
# embeddings = [oai_embedding_model.embed_query(sp.page_content) for sp in splitDocs]

# Report how many chunks the split produced.
print(f"splitDocs count: {len(splitDocs)}")

splitDocs count: 9


### Setup vector stores

In [10]:
from langchain_community.vectorstores import Chroma


def build_vectorstore(collection_name, embedding):
    """Build a fresh Chroma collection containing the chunks in `splitDocs`.

    Without a persist directory, Chroma keeps its collections in memory for
    the lifetime of the kernel process. Re-running this cell would therefore
    append the same chunks again to the existing collection (duplicate search
    hits), or fail if the embedding model was switched to one with a different
    vector dimension. Dropping the collection first makes the cell safe to
    re-run.

    Args:
        collection_name: Name of the Chroma collection to (re)create.
        embedding: Embedding model used to vectorize chunks and queries.

    Returns:
        A Chroma vector store populated with `splitDocs`.
    """
    # Opening the store creates the collection if it is missing, so the
    # delete below also works on the very first run.
    Chroma(collection_name=collection_name, embedding_function=embedding).delete_collection()
    return Chroma.from_documents(
        collection_name=collection_name,
        documents=splitDocs,
        embedding=embedding,
    )


print('Loading the vector store(s)...')
# One collection per embedding model so their search results can be compared.
oai_vectorstore = build_vectorstore("listings_oai", oai_embedding_model)
hf_vectorstore = build_vectorstore("listings_hf", hf_embedding_model)

Loading the vector store(s)...


### Query time (similarity search)!

In [11]:
# Choose one question by leaving exactly one assignment uncommented.
question = "I'm looking for a 2-bedroom apartment"
#question = "I'm looking for an apartment with a laundry closet and preferably a stackable washer and dryer."
#question = "I'm looking for an electric car with autopilot"
# TODO: Write your own questions

print("Similarity search...")
# Compare results from different embeddings by switching the active store.
#active_vectorstore = oai_vectorstore
active_vectorstore = hf_vectorstore
docs = active_vectorstore.similarity_search(question, k=1)

# Number of hits returned (at most k).
length = len(docs)
print(f"Result: {length}")

# Show where each hit came from, followed by the matched chunk text.
for hit in docs:
    print(hit.metadata)
    print(f'Content: \n"{hit.page_content}"')
    
    

Similarity search...
Result: 1
{'source': '../data/listing1.md'}
Content: 
"Luxurious 2-Bedroom Apartment in Downtown Vista Verde

Price: $450,000 Property Type: Apartment Year Built: 2018 Square Footage: 1,200 sqft Bedrooms: 2 Bathrooms: 2 Parking: 1 dedicated parking spot in underground garage HOA Fees: $300/month Location: 1200 Grand Avenue, Downtown Vista Verde

Overview

Experience the epitome of urban living in this meticulously designed 2-bedroom apartment located in the heart of Downtown Vista Verde. Boasting a spacious 1,200 square feet of modern living space, this apartment offers a seamless blend of luxury, comfort, and convenience. With its prime location, you're just steps away from the city's finest dining, shopping, and entertainment options. The building's amenities include a state-of-the-art fitness center, a rooftop terrace with panoramic city views, and a private underground parking garage.

Living Room"
