In [1]:
# Import the env_vars module
import sys
import os

from modules.env_vars import set_os_env_vars, check_missing_vars
from modules.neon_db import run_neon_query, load_sql_query
from modules.date_functions import get_current_date
from modules.reference_extraction import create_content_from_df
from modules.prompt_templates import one_shot_example, system_message_example

set_os_env_vars() # This will execute the code in env_vars.py and put the environment variables in os

In [2]:
from modules.langchain_config import set_langsmith_client, get_langsmith_tracer, get_llm_model, load_model_costs

set_langsmith_client()
tracer = get_langsmith_tracer()

In [3]:
# Load the costs
MODEL_COSTS = load_model_costs()
MODEL_COSTS


{'claude-3-sonnet-20240229': {'provider': 'anthropic',
  'input': 0.003,
  'output': 0.015},
 'claude-3-5-sonnet-20241022': {'provider': 'anthropic',
  'input': 0.003,
  'output': 0.015},
 'gpt-4o-mini': {'provider': 'openai'}}

In [4]:
# Initialize the language model
model_name = "gpt-4o-mini"
streaming = True # Streaming is when the LLM returns a token at a time, instead of the entire response at once

# Initialize the language model
llm = get_llm_model(model_name, streaming, MODEL_COSTS)


In [5]:
query = load_sql_query("web_pages.sql")
df = run_neon_query(query)

print("Number of rows:", len(df.index))
df.head(1)

Number of rows: 60


Unnamed: 0,id,url,media_type,status,created_at,title,description,summary,author,published_at
0,b0762d1d-a825-4427-a785-cb52229f4c67,https://aidanmclaughlin.notion.site/reasoners-...,web-page,completed,2024-11-29 07:51:53.011015,Notion – The all-in-one workspace for your not...,A new tool that blends your everyday work apps...,The article discusses the limitations of curre...,,NaT


In [6]:
# Print out the results (summary, titles, etc.)
all_content, all_content_list, all_content_dict = create_content_from_df(df)

print(len(all_content_list))
print(all_content_list[0])


60

<START Article Number: 1>
Title: Notion – The all-in-one workspace for your notes, tasks, wikis, and databases.
URL: https://aidanmclaughlin.notion.site/reasoners-problem
Summary: The article discusses the limitations of current reasoning models, particularly OpenAI's o1, which utilize reinforcement learning (RL) to enhance reasoning capabilities. While these models show promise in structured environments with clear rewards, they struggle with open-ended tasks that lack frequent feedback, such as creative writing or philosophical reasoning. The author argues that despite the advancements in RL, these models do not generalize well beyond their training domains, leading to subpar performance in tasks requiring nuanced understanding. The piece highlights the challenges of scaling model size and the potential stagnation in AI development if the focus remains solely on improving reasoning without addressing the need for larger, more capable models. Key insights include the importance of

In [7]:
from openai import OpenAI

openai_client = OpenAI()

In [8]:
def emb_text(text):
    return (
        openai_client.embeddings.create(input=text, model="text-embedding-3-small")
        .data[0]
        .embedding
    )

In [9]:
test_embedding = emb_text("This is a test")
embedding_dim = len(test_embedding)
print(embedding_dim)
print(test_embedding[:10])

1536
[0.009889289736747742, -0.005578675772994757, 0.00683477520942688, -0.03805781528353691, -0.01824733428657055, -0.04121600463986397, -0.007636285852640867, 0.03225184231996536, 0.018949154764413834, 9.352207416668534e-05]


### As for the argument of MilvusClient:
- Setting the uri as a local file, e.g../milvus.db, is the most convenient method, as it automatically utilizes Milvus Lite to store all data in this file.
- If you have large scale of data, you can set up a more performant Milvus server on docker or kubernetes. In this setup, please use the server uri, e.g.http://localhost:19530, as your uri.
- If you want to use Zilliz Cloud, the fully managed cloud service for Milvus, adjust the uri and token, which correspond to the Public Endpoint and Api key in Zilliz Cloud.

In [45]:
# import modules.milvus_helper
import importlib

# import modules.milvus_wrapper
# importlib.reload(modules.milvus_helper)

# from modules.milvus_helper import (
#     get_milvus_client, create_milvus_collection, create_demo_hybrid_milvus_schema, get_dense_embedding_details, create_demo_hybrid_milvus_indices
# )

import modules.milvus_wrapper
importlib.reload(modules.milvus_wrapper)
from modules.milvus_wrapper import MilvusLiteClient, MilvusFullClient, get_dense_embedding_details
from pymilvus import utility

milvus_lite_client = MilvusLiteClient()
milvus_full_client = MilvusFullClient()


### Basic Vector Database Implementation with Milvus Lite

In [46]:
milvus_lite_client.create_collection(dimension=embedding_dim,
                                     collection_name="my_rag_collection",
                                     metric_type="IP", consistency_level="Strong", drop_if_exists=True
                                     )

### Hybrid Search Vector Database Implementation with Milvus


In [47]:
dense_dim, dense_embedding_function = get_dense_embedding_details(use_fp16=False, device="cpu")
schema = milvus_full_client.create_demo_hybrid_schema(embedding_dim=dense_dim)
milvus_hybrid_collection = milvus_full_client.create_collection(collection_name="my_hybrid_collection",
                         schema=schema, consistency_level="Strong", drop_if_exists=True)
milvus_full_client.create_demo_hybrid_indices(milvus_hybrid_collection)

Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

In [48]:
# Check the load state of the collections
res = milvus_lite_client.client.get_load_state(
    collection_name="my_hybrid_collection"
)
print("my_hybrid_collection:", res)

res = utility.load_state(
    collection_name="my_rag_collection"
)
print("my_rag_collection:", res)

my_hybrid_collection: {'state': <LoadState: Loaded>}
my_rag_collection: Loaded


In [15]:
# Generate embeddings using BGE-M3 model
docs_embeddings = dense_embedding_function(all_content_list)
# docs_embeddings = ef.encode_documents(docs)
print(docs_embeddings)

You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'dense': [array([-0.06493555,  0.00202186, -0.0297969 , ..., -0.01162049,
        0.00279936, -0.03092   ], dtype=float32), array([-0.02462117,  0.00991646, -0.05236292, ..., -0.01837842,
       -0.00888776,  0.01576928], dtype=float32), array([-0.02958542,  0.00172268, -0.02052973, ..., -0.04473894,
        0.06390611, -0.00569023], dtype=float32), array([-0.02300773, -0.03120776, -0.01790264, ...,  0.00014177,
       -0.00694255, -0.02852612], dtype=float32), array([-0.0784234 , -0.03567975, -0.01384762, ..., -0.039869  ,
        0.07898825,  0.00345745], dtype=float32), array([-0.02693632, -0.03140344, -0.02048412, ...,  0.02268932,
        0.06205763,  0.00993566], dtype=float32), array([-0.02342726, -0.01784593, -0.05326441, ..., -0.00431715,
        0.02822074, -0.01705653], dtype=float32), array([-0.05189113, -0.0021922 , -0.01734922, ..., -0.03891531,
        0.00068981, -0.00937664], dtype=float32), array([-0.01532753, -0.00075699, -0.03951575, ..., -0.00957837,
       -0.011

### Iterate through the text lines, create embeddings, and then insert the data into Milvus.
- Here is a new field text, which is a non-defined field in the collection schema. It will be automatically added to the reserved JSON dynamic field, which can be treated as a normal field at a high level.

In [49]:
# Insert the embeddings into the Milvus collection
for i in range(0, len(all_content_list), 50): # Batch the embeddings to insert into Milvus to avoid memory issues in sets of 50
    batched_entities = [
        docs_embeddings["dense"][i : i + 50], # Dense embeddings
        docs_embeddings["sparse"][i : i + 50], # Sparse embeddings
        all_content_list[i : i + 50], # Raw content in text format
    ]
    milvus_hybrid_collection.insert(batched_entities)
print("Number of entities inserted:", milvus_hybrid_collection.num_entities)


Number of entities inserted: 60


In [50]:
from tqdm import tqdm

data = []

for i, line in enumerate(tqdm(all_content_list, desc="Creating embeddings")):
    data.append({"id": i, "vector": emb_text(line), "text": line})

milvus_lite_client.client.insert(collection_name='my_rag_collection', data=data)

{'insert_count': 60, 'ids': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59], 'cost': 0}

### Build a simple RAG pipeline

In [40]:
question = "Which vector database should I use?"

In [51]:
# Search for the question in the collection and retrieve the semantic top-3 matches
search_res = milvus_lite_client.client.search(
    collection_name='my_rag_collection',
    data=[
        emb_text(question)
    ],  # Use the `emb_text` function to convert the question to an embedding vector
    limit=3,  # Return top 3 results
    search_params={"metric_type": "IP", "params": {}},  # Inner product distance
    output_fields=["text"],  # Return the text field
)

In [52]:
import json

# Print the retrieved lines with distances
retrieved_lines_with_distances = [
    (res["entity"]["text"], res["distance"]) for res in search_res[0]
]
print(json.dumps(retrieved_lines_with_distances, indent=4))

[
    [
        "\n<START Article Number: 40>\nTitle: Binary vector embeddings are so cool\nURL: https://emschwartz.me/binary-vector-embeddings-are-so-cool/\nSummary: Binary quantized vector embeddings represent a significant advancement in the field of machine learning, particularly in natural language processing. These embeddings can achieve over 95% retrieval accuracy while compressing data by 32 times and accelerating retrieval speed by approximately 25 times. By converting 32-bit floating point weights to single bits, binary quantization retains essential information, allowing for efficient similarity searches using Hamming distance instead of cosine similarity. This technique, when combined with Matryoshka embeddings\u2014which prioritize important information at the beginning of the vector\u2014further enhances performance. The results show that binary embeddings not only reduce storage costs but also improve computational efficiency, making them a compelling choice for applicat

In [44]:
# Generate embeddings for the query
query_embeddings = dense_embedding_function([question])
query_embeddings

You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'dense': [array([-0.03993049, -0.02827392, -0.05900438, ...,  0.01967705,
          0.00615428,  0.00818304], dtype=float32)],
 'sparse': <Compressed Sparse Row sparse array of dtype 'float64'
 	with 8 stored elements and shape (1, 250002)>}

In [61]:
dense_results = milvus_full_client.dense_search(milvus_hybrid_collection, query_embeddings["dense"][0], limit=3)
sparse_results = milvus_full_client.sparse_search(milvus_hybrid_collection, query_embeddings["sparse"]._getrow(0), limit=3)
hybrid_results = milvus_full_client.hybrid_search(
    milvus_hybrid_collection,
    query_embeddings["dense"][0],
    query_embeddings["sparse"]._getrow(0),
    sparse_weight=0.7,
    dense_weight=1.0,
    limit=3,
)

results_dict = {
    "dense": dense_results,
    "sparse": sparse_results,
    "hybrid": hybrid_results,
}
results_dict

{'dense': ["\n<START Article Number: 41>\nTitle: Understanding the BM25 full text search algorithm\nURL: https://emschwartz.me/understanding-the-bm25-full-text-search-algorithm/\nSummary: BM25 (Best Match 25) is a prominent full-text search algorithm utilized in systems like Lucene, Elasticsearch, and SQLite, known for its effectiveness in ranking documents based on their relevance to a query. It operates on the principle of probabilistic ranking, leveraging components such as Inverse Document Frequency (IDF), term frequency, and document length normalization to compute scores for documents. The algorithm's clever design allows it to rank documents without needing to calculate exact probabilities, making it practical for real-world applications. Notably, BM25 scores can only be compared within the same document collection, as they depend on the specific characteristics of that collection. This understanding is crucial for developers looking to implement or enhance search functionalitie

In [62]:
def doc_text_formatting(ef, query, docs):
    tokenizer = ef.model.tokenizer
    query_tokens_ids = tokenizer.encode(query, return_offsets_mapping=True)
    query_tokens = tokenizer.convert_ids_to_tokens(query_tokens_ids)
    formatted_texts = []

    for doc in docs:
        ldx = 0
        landmarks = []
        encoding = tokenizer.encode_plus(doc, return_offsets_mapping=True)
        tokens = tokenizer.convert_ids_to_tokens(encoding["input_ids"])[1:-1]
        offsets = encoding["offset_mapping"][1:-1]
        for token, (start, end) in zip(tokens, offsets):
            if token in query_tokens:
                if len(landmarks) != 0 and start == landmarks[-1]:
                    landmarks[-1] = end
                else:
                    landmarks.append(start)
                    landmarks.append(end)
        close = False
        formatted_text = ""
        for i, c in enumerate(doc):
            if ldx == len(landmarks):
                pass
            elif i == landmarks[ldx]:
                if close:
                    formatted_text += "</span>"
                else:
                    formatted_text += "<span style='color:red'>"
                close = not close
                ldx = ldx + 1
            formatted_text += c
        if close is True:
            formatted_text += "</span>"
        formatted_texts.append(formatted_text)
    return formatted_texts


In [64]:
from IPython.display import Markdown, display

display(Markdown("**Dense Search Results:**"))
formatted_results = doc_text_formatting(dense_embedding_function, question, dense_results)
for result in formatted_results:
    display(Markdown(result))

display(Markdown("\n**Sparse Search Results:**"))
formatted_results = doc_text_formatting(dense_embedding_function, question, sparse_results)
for result in formatted_results:
    display(Markdown(result))

display(Markdown("\n**Hybrid Search Results:**"))
formatted_results = doc_text_formatting(dense_embedding_function, question, hybrid_results)
for result in formatted_results:
    display(Markdown(result))


**Dense Search Results:**


<START Article Number: 41>
Title: Understanding the BM25 full text search algorithm
URL: https://emschwartz.me/understanding-the-bm25-full-text-search-algorithm/
Summary: BM25 (Best Match 25) is a prominent full-text search algorithm utilized in systems like Lucene, Elasticsearch, and SQLite, known for its effectiveness in ranking documents based on their relevance to a query. It operates on the principle of probabilistic ranking, leveraging components such as Inverse Document Frequency (IDF), term frequency, and document length normalization to compute scores for documents. The algorithm's clever design allows it to rank documents without needing to calculate exact probabilities, making it practical for real-world applications. Notably, BM25 scores can only be compared within the same document collection, as they depend on the specific characteristics of that collection. This understanding is crucial for developers looking to implement or enhance search functionalities in applications, particularly when integrating full-text search with<span style='color:red'> vector</span> similarity search for improved content retrieval.

### Key Points:
- **BM25 Algorithm**: A widely adopted full-text search algorithm that ranks documents based on relevance.
- **Components**: Utilizes query terms,<span style='color:red'> I</span>DF, term frequency, and document length normalization.
- **Probabilistic Ranking**: Focuses on the order of documents rather than exact relevance probabilities.
- **Score Comparisons**: BM25 scores can only be compared within the same document collection due to dependency on collection-specific metrics.
- **Hybrid Search**: Increasingly used in conjunction with<span style='color:red'> vector</span> similarity search to enhance search capabilities.
Description: BM25 is a widely used algorithm for full text search.<span style='color:red'> I</span> wanted to understand how it works, so here is my attempt at understanding by re-explaining.
Created: 2024-11-20
Type: web-page
<END Article Number: 41>



<START Article Number: 40>
Title: Binary<span style='color:red'> vector</span> embeddings are so cool
URL: https://emschwartz.me/binary-ve<span style='color:red'>ctor</span>-embeddings-are-so-cool/
Summary: Binary quantized<span style='color:red'> vector</span> embeddings represent a significant advancement in the field of machine learning, particularly in natural language processing. These embeddings can achieve over 95% retrieval accuracy while compressing data by 32 times and accelerating retrieval speed by approximately 25 times. By converting 32-bit floating point weights to single bits, binary quantization retains essential information, allowing for efficient similarity searches using Hamming distance instead of cosine similarity. This technique, when combined with Matryoshka embeddings—which prioritize important information at the beginning of the<span style='color:red'> vector</span>—further enhances performance. The results show that binary embeddings not only reduce storage costs but also improve computational efficiency, making them a compelling choice for applications requiring fast and accurate<span style='color:red'> vector</span> similarity searches. Key insights include the effectiveness of binary quantization in maintaining high accuracy with minimal data size, and the potential for significant speed improvements in distance calculations. 

- Utilize binary quantized embeddings to achieve high retrieval accuracy with reduced data size.
- Implement Hamming distance for faster similarity searches compared to traditional cosine similarity.
- Explore the combination of binary quantization with Matryoshka embeddings for enhanced performance.
- Consider the computational efficiency of binary embeddings in applications requiring rapid data processing.
Description: Ve<span style='color:red'>ctor</span> embeddings by themselves are pretty neat. Binary quantized<span style='color:red'> vector</span> embeddings are extra impressive. In short, they can retain 95+% retrieval accuracy with 32x compression 🤯.
Created: 2024-11-20
Type: web-page
<END Article Number: 40>



<START Article Number: 21>
Title: GitHub - pingcap/autoflow: pingcap/autoflow is a Graph RAG based and conversational knowledge base tool built with TiDB Serverless Ve<span style='color:red'>ctor</span> Storage. Demo: https://tidb.ai
URL: https://github.com/pingcap/autoflow
Summary: pingcap/autoflow is an open-source conversational knowledge base tool leveraging Graph RAG (Retrieval-Augmented Generation) architecture, built on TiDB Serverless Ve<span style='color:red'>ctor</span> Storage. It integrates advanced features such as a perplexity-style conversational search, a built-in website crawler for comprehensive documentation coverage, and an embeddable JavaScript snippet for seamless integration into existing websites. The tech stack includes TiDB for data storage, LlamaIndex for RAG framework, and DSPy for programming foundation models. Key functionalities include the ability to edit the knowledge graph for accuracy, support for multiple knowledge bases, and performance enhancements in CI processes. This tool is particularly relevant for developers looking to implement conversational AI solutions and enhance user interaction through intelligent search capabilities.

- Utilize Graph RAG architecture for enhanced conversational AI.
- Implement a built-in website crawler for improved documentation search.
- Integrate an embeddable JavaScript snippet for user-friendly interfaces.
- Leverage TiDB for efficient data management and storage.
- Contribute to the project by following community guidelines.
Description: pingcap/autoflow is a Graph RAG based and conversational knowledge base tool built with TiDB Serverless Ve<span style='color:red'>ctor</span> Storage. Demo: https://tidb.ai - pingcap/autoflow
Created: 2024-11-22
Type: web-page
<END Article Number: 21>



**Sparse Search Results:**


<START Article Number: 40>
Title: Binary<span style='color:red'> vector</span> embeddings are so cool
URL: https://emschwartz.me/binary-ve<span style='color:red'>ctor</span>-embeddings-are-so-cool/
Summary: Binary quantized<span style='color:red'> vector</span> embeddings represent a significant advancement in the field of machine learning, particularly in natural language processing. These embeddings can achieve over 95% retrieval accuracy while compressing data by 32 times and accelerating retrieval speed by approximately 25 times. By converting 32-bit floating point weights to single bits, binary quantization retains essential information, allowing for efficient similarity searches using Hamming distance instead of cosine similarity. This technique, when combined with Matryoshka embeddings—which prioritize important information at the beginning of the<span style='color:red'> vector</span>—further enhances performance. The results show that binary embeddings not only reduce storage costs but also improve computational efficiency, making them a compelling choice for applications requiring fast and accurate<span style='color:red'> vector</span> similarity searches. Key insights include the effectiveness of binary quantization in maintaining high accuracy with minimal data size, and the potential for significant speed improvements in distance calculations. 

- Utilize binary quantized embeddings to achieve high retrieval accuracy with reduced data size.
- Implement Hamming distance for faster similarity searches compared to traditional cosine similarity.
- Explore the combination of binary quantization with Matryoshka embeddings for enhanced performance.
- Consider the computational efficiency of binary embeddings in applications requiring rapid data processing.
Description: Ve<span style='color:red'>ctor</span> embeddings by themselves are pretty neat. Binary quantized<span style='color:red'> vector</span> embeddings are extra impressive. In short, they can retain 95+% retrieval accuracy with 32x compression 🤯.
Created: 2024-11-20
Type: web-page
<END Article Number: 40>



<START Article Number: 58>
Title: voyage-multimodal-3: all-in-one embedding model for interleaved text, images, and screenshots
URL: https://blog.voyageai.com/2024/11/12/voyage-multimodal-3/
Summary: Voyage AI has launched `voyage-multimodal-3`, a cutting-edge multimodal embedding model that integrates interleaved text and images, significantly enhancing retrieval accuracy for documents containing both visual and textual data. This model outperforms existing solutions like OpenAI CLIP and Cohere multimodal v3 by an average of 19.63% across various multimodal retrieval tasks, including table/figure retrieval and document screenshot retrieval. Unlike traditional models that process text and images separately, `voyage-multimodal-3` utilizes a unified transformer architecture, allowing for more effective<span style='color:red'> vector</span>ization of complex layouts without the need for heuristic parsing. This innovation addresses the modality gap issue prevalent in CLIP-like models, ensuring robust performance in mixed-modality searches. The model is evaluated across 20 multimodal datasets and demonstrates superior capabilities in capturing semantic content from screenshots and documents, making it a valuable tool for semantic search and retrieval-augmented generation (RAG) applications.

### Key Points:
- `voyage-multimodal-3` integrates text and image data for improved retrieval accuracy.
- Achieves an average of 19.63% better performance than leading multimodal models.
- Utilizes a unified transformer architecture for<span style='color:red'> vector</span>ization, enhancing flexibility and accuracy.
- Addresses the modality gap, improving mixed-modality search results.
- Evaluated across 20 multimodal datasets, showcasing superior capabilities in semantic content retrieval.
Description: TL;DR — We are excited to announce voyage-multimodal-3, a new state-of-the-art for multimodal embeddings and a big step forward towards seamless RAG and semantic search for documents rich with both…
Created: 2024-11-18
Type: web-page
<END Article Number: 58>



<START Article Number: 17>
Title: The AI agents stack  | Letta
URL: https://www.letta.com/blog/ai-agents-stack
Summary: The AI agents stack has evolved significantly, reflecting advancements in memory, tool usage, and deployment strategies. This stack is categorized into three layers: model serving, storage, and agent frameworks. The transition from LLMs to LLM agents highlights the complexity of state management and tool execution, which are critical for developing autonomous systems. Key players in model serving include OpenAI and Anthropic for closed APIs, while vLLM and Ollama cater to local inference needs. Storage solutions like Chroma and Pinecone support the stateful nature of agents, enabling them to retain conversation histories and external data. The ability to call tools through structured outputs distinguishes agents from traditional chatbots, necessitating secure execution environments. Frameworks like Letta and LangChain manage agent state and context, with varying approaches to memory management and cross-agent communication. The future of agent deployment is anticipated to shift towards service-oriented architectures, emphasizing REST APIs for scalability and state normalization. As the ecosystem matures, the choice of frameworks will become increasingly critical for developers building complex agent applications.

- Understand the three layers of the AI agents stack: model serving, storage, and agent frameworks.
- Recognize the importance of state management and tool execution in developing LLM agents.
- Explore model serving options, including both closed APIs and local inference solutions.
- Utilize<span style='color:red'> vector database</span>s for effective storage of agent state and conversation history.
- Implement secure execution environments for tool calls made by agents.
- Choose frameworks based on their state management, memory handling, and support for open models.
- Prepare for a shift towards service-oriented architectures for agent deployment, focusing on REST APIs.
Description: Understanding the AI agents stack landscape.
Created: 2024-11-23
Type: web-page
<END Article Number: 17>



**Hybrid Search Results:**


<START Article Number: 40>
Title: Binary<span style='color:red'> vector</span> embeddings are so cool
URL: https://emschwartz.me/binary-ve<span style='color:red'>ctor</span>-embeddings-are-so-cool/
Summary: Binary quantized<span style='color:red'> vector</span> embeddings represent a significant advancement in the field of machine learning, particularly in natural language processing. These embeddings can achieve over 95% retrieval accuracy while compressing data by 32 times and accelerating retrieval speed by approximately 25 times. By converting 32-bit floating point weights to single bits, binary quantization retains essential information, allowing for efficient similarity searches using Hamming distance instead of cosine similarity. This technique, when combined with Matryoshka embeddings—which prioritize important information at the beginning of the<span style='color:red'> vector</span>—further enhances performance. The results show that binary embeddings not only reduce storage costs but also improve computational efficiency, making them a compelling choice for applications requiring fast and accurate<span style='color:red'> vector</span> similarity searches. Key insights include the effectiveness of binary quantization in maintaining high accuracy with minimal data size, and the potential for significant speed improvements in distance calculations. 

- Utilize binary quantized embeddings to achieve high retrieval accuracy with reduced data size.
- Implement Hamming distance for faster similarity searches compared to traditional cosine similarity.
- Explore the combination of binary quantization with Matryoshka embeddings for enhanced performance.
- Consider the computational efficiency of binary embeddings in applications requiring rapid data processing.
Description: Ve<span style='color:red'>ctor</span> embeddings by themselves are pretty neat. Binary quantized<span style='color:red'> vector</span> embeddings are extra impressive. In short, they can retain 95+% retrieval accuracy with 32x compression 🤯.
Created: 2024-11-20
Type: web-page
<END Article Number: 40>



<START Article Number: 41>
Title: Understanding the BM25 full text search algorithm
URL: https://emschwartz.me/understanding-the-bm25-full-text-search-algorithm/
Summary: BM25 (Best Match 25) is a prominent full-text search algorithm utilized in systems like Lucene, Elasticsearch, and SQLite, known for its effectiveness in ranking documents based on their relevance to a query. It operates on the principle of probabilistic ranking, leveraging components such as Inverse Document Frequency (IDF), term frequency, and document length normalization to compute scores for documents. The algorithm's clever design allows it to rank documents without needing to calculate exact probabilities, making it practical for real-world applications. Notably, BM25 scores can only be compared within the same document collection, as they depend on the specific characteristics of that collection. This understanding is crucial for developers looking to implement or enhance search functionalities in applications, particularly when integrating full-text search with<span style='color:red'> vector</span> similarity search for improved content retrieval.

### Key Points:
- **BM25 Algorithm**: A widely adopted full-text search algorithm that ranks documents based on relevance.
- **Components**: Utilizes query terms,<span style='color:red'> I</span>DF, term frequency, and document length normalization.
- **Probabilistic Ranking**: Focuses on the order of documents rather than exact relevance probabilities.
- **Score Comparisons**: BM25 scores can only be compared within the same document collection due to dependency on collection-specific metrics.
- **Hybrid Search**: Increasingly used in conjunction with<span style='color:red'> vector</span> similarity search to enhance search capabilities.
Description: BM25 is a widely used algorithm for full text search.<span style='color:red'> I</span> wanted to understand how it works, so here is my attempt at understanding by re-explaining.
Created: 2024-11-20
Type: web-page
<END Article Number: 41>



<START Article Number: 21>
Title: GitHub - pingcap/autoflow: pingcap/autoflow is a Graph RAG based and conversational knowledge base tool built with TiDB Serverless Ve<span style='color:red'>ctor</span> Storage. Demo: https://tidb.ai
URL: https://github.com/pingcap/autoflow
Summary: pingcap/autoflow is an open-source conversational knowledge base tool leveraging Graph RAG (Retrieval-Augmented Generation) architecture, built on TiDB Serverless Ve<span style='color:red'>ctor</span> Storage. It integrates advanced features such as a perplexity-style conversational search, a built-in website crawler for comprehensive documentation coverage, and an embeddable JavaScript snippet for seamless integration into existing websites. The tech stack includes TiDB for data storage, LlamaIndex for RAG framework, and DSPy for programming foundation models. Key functionalities include the ability to edit the knowledge graph for accuracy, support for multiple knowledge bases, and performance enhancements in CI processes. This tool is particularly relevant for developers looking to implement conversational AI solutions and enhance user interaction through intelligent search capabilities.

- Utilize Graph RAG architecture for enhanced conversational AI.
- Implement a built-in website crawler for improved documentation search.
- Integrate an embeddable JavaScript snippet for user-friendly interfaces.
- Leverage TiDB for efficient data management and storage.
- Contribute to the project by following community guidelines.
Description: pingcap/autoflow is a Graph RAG based and conversational knowledge base tool built with TiDB Serverless Ve<span style='color:red'>ctor</span> Storage. Demo: https://tidb.ai - pingcap/autoflow
Created: 2024-11-22
Type: web-page
<END Article Number: 21>


### Use a LLM to answer the question using the retrieved lines

In [67]:
# Join the retrieved lines with a newline character
basic_context = "\n".join(
    [line_with_distance[0] for line_with_distance in retrieved_lines_with_distances]
)
print(basic_context)


<START Article Number: 40>
Title: Binary vector embeddings are so cool
URL: https://emschwartz.me/binary-vector-embeddings-are-so-cool/
Summary: Binary quantized vector embeddings represent a significant advancement in the field of machine learning, particularly in natural language processing. These embeddings can achieve over 95% retrieval accuracy while compressing data by 32 times and accelerating retrieval speed by approximately 25 times. By converting 32-bit floating point weights to single bits, binary quantization retains essential information, allowing for efficient similarity searches using Hamming distance instead of cosine similarity. This technique, when combined with Matryoshka embeddings—which prioritize important information at the beginning of the vector—further enhances performance. The results show that binary embeddings not only reduce storage costs but also improve computational efficiency, making them a compelling choice for applications requiring fast and accurat

In [73]:
hybrid_context = "\n".join(
    [hybrid_match for hybrid_match in hybrid_results]
)
print(hybrid_context)



<START Article Number: 40>
Title: Binary vector embeddings are so cool
URL: https://emschwartz.me/binary-vector-embeddings-are-so-cool/
Summary: Binary quantized vector embeddings represent a significant advancement in the field of machine learning, particularly in natural language processing. These embeddings can achieve over 95% retrieval accuracy while compressing data by 32 times and accelerating retrieval speed by approximately 25 times. By converting 32-bit floating point weights to single bits, binary quantization retains essential information, allowing for efficient similarity searches using Hamming distance instead of cosine similarity. This technique, when combined with Matryoshka embeddings—which prioritize important information at the beginning of the vector—further enhances performance. The results show that binary embeddings not only reduce storage costs but also improve computational efficiency, making them a compelling choice for applications requiring fast and accurat

In [82]:
def prompt_template(context, question):
    SYSTEM_PROMPT = """
Human: You are an AI assistant. You are able to find answers to the questions from the contextual passage snippets provided."""
    USER_PROMPT = """
Use the following pieces of information enclosed in <context> tags to provide an answer to the question enclosed in <question> tags.
<context>
{context}
</context>
<question>
{question}
</question>""".format(context=context, question=question)
    return SYSTEM_PROMPT, USER_PROMPT

In [75]:
SYSTEM_PROMPT, USER_PROMPT = prompt_template(basic_context, question)
basic_response = openai_client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": USER_PROMPT},
    ],
)
print(basic_response.choices[0].message.content)

Based on the provided information, you might consider using TiDB Serverless Vector Storage if you're looking for a vector database that supports conversational AI and advanced features like a built-in website crawler and a flexible architecture (Graph RAG). This database is particularly suited for developers aiming to implement conversational knowledge bases and enhance user interaction through intelligent search capabilities. Alternatively, if you're focused solely on the performance of vector embeddings, particularly in natural language processing tasks, you may want to explore binary quantized vector embeddings due to their efficiency, high retrieval accuracy, and significant speed improvements for similarity searches.


In [76]:
print('Question:', question)
print('Basic context derived from the vector database:', basic_context)
print('Answer:', basic_response.choices[0].message.content)


Question: Which vector database should I use?
Basic context derived from the vector database: 
<START Article Number: 40>
Title: Binary vector embeddings are so cool
URL: https://emschwartz.me/binary-vector-embeddings-are-so-cool/
Summary: Binary quantized vector embeddings represent a significant advancement in the field of machine learning, particularly in natural language processing. These embeddings can achieve over 95% retrieval accuracy while compressing data by 32 times and accelerating retrieval speed by approximately 25 times. By converting 32-bit floating point weights to single bits, binary quantization retains essential information, allowing for efficient similarity searches using Hamming distance instead of cosine similarity. This technique, when combined with Matryoshka embeddings—which prioritize important information at the beginning of the vector—further enhances performance. The results show that binary embeddings not only reduce storage costs but also improve computa

In [85]:
SYSTEM_PROMPT, USER_PROMPT = prompt_template(hybrid_context, question)
hybrid_response = openai_client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": USER_PROMPT},
    ],
)
print(hybrid_response.choices[0].message.content)

Based on the provided context, you should consider using TiDB Serverless Vector Storage, as mentioned in the article about pingcap/autoflow. This platform is designed for efficient data management and storage, and it integrates well with conversational AI solutions, making it suitable for applications requiring advanced search capabilities and interaction. Additionally, it can support technologies like Retrieval-Augmented Generation (RAG), enhancing the performance of your vector-based tasks.


In [88]:
print('Question:', question)
print('Hybrid context derived from the vector database:', hybrid_context)
print('Answer:', hybrid_response.choices[0].message.content)


Question: Which vector database should I use?
Hybrid context derived from the vector database: 
<START Article Number: 40>
Title: Binary vector embeddings are so cool
URL: https://emschwartz.me/binary-vector-embeddings-are-so-cool/
Summary: Binary quantized vector embeddings represent a significant advancement in the field of machine learning, particularly in natural language processing. These embeddings can achieve over 95% retrieval accuracy while compressing data by 32 times and accelerating retrieval speed by approximately 25 times. By converting 32-bit floating point weights to single bits, binary quantization retains essential information, allowing for efficient similarity searches using Hamming distance instead of cosine similarity. This technique, when combined with Matryoshka embeddings—which prioritize important information at the beginning of the vector—further enhances performance. The results show that binary embeddings not only reduce storage costs but also improve comput