In [4]:
# Notebook setup: standard-library imports plus the project helper modules
# (environment loading, Neon DB access, date helpers, content extraction, prompt templates).
import sys
import os

from modules.env_vars import set_os_env_vars, check_missing_vars
from modules.neon_db import run_neon_query, load_sql_query
from modules.date_functions import get_current_date
from modules.reference_extraction import create_content_from_df
from modules.prompt_templates import one_shot_example, system_message_example

# Must run before any client below is created.
set_os_env_vars() # Per the original note: executes the code in env_vars.py and puts the environment variables in os (presumably os.environ -- TODO confirm)

In [5]:
# Project helpers for LangSmith/LLM configuration. Judging by their names they set up a
# LangSmith client and return a tracer -- TODO confirm against modules/langchain_config.py.
from modules.langchain_config import set_langsmith_client, get_langsmith_tracer, get_llm_model, load_model_costs

set_langsmith_client()
# NOTE(review): `tracer` is not referenced by any other cell shown in this notebook.
tracer = get_langsmith_tracer()

In [6]:
# Load the per-model cost table; the bare name on the last line displays it as the cell output.
MODEL_COSTS = load_model_costs()
MODEL_COSTS


{'claude-3-sonnet-20240229': {'provider': 'anthropic',
  'input': 0.003,
  'output': 0.015},
 'claude-3-5-sonnet-20241022': {'provider': 'anthropic',
  'input': 0.003,
  'output': 0.015},
 'gpt-4o-mini': {'provider': 'openai'}}

In [7]:
# Model selection for the LLM wrapper.
# NOTE(review): in the recorded MODEL_COSTS output the 'gpt-4o-mini' entry only has a
# 'provider' key (no 'input'/'output' prices) -- confirm get_llm_model tolerates that.
model_name = "gpt-4o-mini"
streaming = True # Streaming is when the LLM returns a token at a time, instead of the entire response at once

# Initialize the language model via the project helper (model name, streaming flag, cost table).
llm = get_llm_model(model_name, streaming, MODEL_COSTS)


In [8]:
# Load the SQL text from "web_pages.sql" and run it against Neon; the result is kept in `df`.
query = load_sql_query("web_pages.sql")
df = run_neon_query(query)

# Report the row count, then display only the first row as the cell output.
print("Number of rows:", len(df.index))
df.head(1)

Number of rows: 60


Unnamed: 0,id,url,media_type,status,created_at,title,description,summary,author,published_at
0,b0762d1d-a825-4427-a785-cb52229f4c67,https://aidanmclaughlin.notion.site/reasoners-...,web-page,completed,2024-11-29 07:51:53.011015,Notion – The all-in-one workspace for your not...,A new tool that blends your everyday work apps...,The article discusses the limitations of curre...,,NaT


In [9]:
# Turn the query result into text content. The helper returns two values; the second
# (`all_content_list`) is indexed like a list below, one entry per article in the recorded output.
all_content, all_content_list = create_content_from_df(df)

# Sanity check: number of entries and a preview of the first one.
print(len(all_content_list))
print(all_content_list[0])


60

<START Article Number: 1>
Title: Notion – The all-in-one workspace for your notes, tasks, wikis, and databases.
URL: https://aidanmclaughlin.notion.site/reasoners-problem
Summary: The article discusses the limitations of current reasoning models, particularly OpenAI's o1, which utilize reinforcement learning (RL) to enhance reasoning capabilities. While these models show promise in structured environments with clear rewards, they struggle with open-ended tasks that lack frequent feedback, such as creative writing or philosophical reasoning. The author argues that despite the advancements in RL, these models do not generalize well beyond their training domains, leading to subpar performance in tasks requiring nuanced understanding. The piece highlights the challenges of scaling model size and the potential stagnation in AI development if the focus remains solely on improving reasoning without addressing the need for larger, more capable models. Key insights include the importance of

In [10]:
from openai import OpenAI

# Client used for both embeddings and chat completions below.
# No arguments are passed, so credentials are presumably picked up from the
# environment populated during setup -- TODO confirm.
openai_client = OpenAI()

In [11]:
def emb_text(text, model="text-embedding-3-small"):
    """Return the OpenAI embedding vector for ``text``.

    Args:
        text: Value passed as ``input`` to the embeddings endpoint.
        model: Name of the embedding model. Defaults to
            ``"text-embedding-3-small"``, the model previously hard-coded here,
            so existing one-argument calls behave exactly as before.

    Returns:
        The ``embedding`` attribute of the first item in the response's ``data``.
    """
    response = openai_client.embeddings.create(input=text, model=model)
    # Only the first entry of `data` is read.
    return response.data[0].embedding

In [12]:
# Probe call: embed a throwaway sentence to discover the vector length of the embedding model.
test_embedding = emb_text("This is a test")
# `embedding_dim` is later passed as `dimension` when the basic collection is created.
embedding_dim = len(test_embedding)
print(embedding_dim)
# Preview only the first 10 components to keep the output short.
print(test_embedding[:10])

1536
[0.009889289736747742, -0.005578675772994757, 0.00683477520942688, -0.03805781528353691, -0.01824733428657055, -0.04121600463986397, -0.007636285852640867, 0.03225184231996536, 0.018949154764413834, 9.352207416668534e-05]


### About the `uri` argument of `MilvusClient`
- Setting the `uri` to a local file, e.g. `./milvus.db`, is the most convenient method, as it automatically uses Milvus Lite to store all data in that file.
- If you have a large amount of data, you can set up a more performant Milvus server on Docker or Kubernetes. In this setup, use the server URI, e.g. `http://localhost:19530`, as your `uri`.
- If you want to use Zilliz Cloud, the fully managed cloud service for Milvus, adjust the `uri` and `token`, which correspond to the Public Endpoint and API key in Zilliz Cloud.

In [105]:
# Legacy (disabled): earlier setup based on modules.milvus_helper, kept for reference.
# import modules.milvus_helper
import importlib

# import modules.milvus_wrapper
# importlib.reload(modules.milvus_helper)

# from modules.milvus_helper import (
#     get_milvus_client, create_milvus_collection, create_demo_hybrid_milvus_schema, get_dense_embedding_details, create_demo_hybrid_milvus_indices
# )

# Current setup: import the wrapper module and reload it so that edits made to
# modules/milvus_wrapper.py are picked up without restarting the kernel.
import modules.milvus_wrapper
importlib.reload(modules.milvus_wrapper)
from modules.milvus_wrapper import MilvusLiteClient, MilvusFullClient, get_dense_embedding_details
from pymilvus import utility

# Two project wrappers: one used for the basic collection, one for the hybrid-search collection.
milvus_lite_client = MilvusLiteClient()
milvus_full_client = MilvusFullClient()


### Basic Vector Database Implementation with Milvus Lite

In [91]:
# Name of the basic RAG collection. Later cells (insert / search) refer to it through
# `collection_name`, so it is defined once here instead of relying on leftover kernel state.
collection_name = "my_rag_collection"

# Later cells call `milvus_client.insert(...)` / `milvus_client.search(...)`.
# The load-state cell uses `milvus_lite_client.client.get_load_state(...)`, so the wrapper's
# `.client` attribute is presumably the underlying pymilvus MilvusClient -- TODO confirm
# against modules/milvus_wrapper.py.
milvus_client = milvus_lite_client.client

# (Re)create the collection: vectors have the dimensionality of the OpenAI embedding probed
# earlier, and similarity is inner product ("IP") to match the search parameters used later.
milvus_lite_client.create_collection(
    dimension=embedding_dim,
    collection_name=collection_name,
    metric_type="IP",
    consistency_level="Strong",
    drop_if_exists=True,
)

### Hybrid Search Vector Database Implementation with Milvus


In [101]:
# Get the dense embedding size and embedding callable from the project helper (CPU, no fp16).
# The recorded output ("Fetching 30 files") suggests this downloads model files on first use.
dense_dim, dense_embedding_function = get_dense_embedding_details(use_fp16=False, device="cpu")
# Build the demo hybrid schema sized to the dense embedding dimension.
schema = milvus_full_client.create_demo_hybrid_schema(embedding_dim=dense_dim)
# (Re)create the hybrid collection; drop_if_exists=True is passed so re-running starts from scratch
# (presumably dropping any existing collection of that name -- TODO confirm in the wrapper).
milvus_hybrid_collection = milvus_full_client.create_collection(collection_name="my_hybrid_collection",
                         schema=schema, consistency_level="Strong", drop_if_exists=True)
# Create the demo indices on the new collection.
milvus_full_client.create_demo_hybrid_indices(milvus_hybrid_collection)

Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

In [107]:
# Check the load state of the collections
# NOTE(review): "my_hybrid_collection" is created through `milvus_full_client`, yet its state is
# read here through the Lite wrapper's `.client`; confirm both handles point at the same Milvus
# instance.
res = milvus_lite_client.client.get_load_state(
    collection_name="my_hybrid_collection"
)
print("my_hybrid_collection:", res)

# NOTE(review): "my_rag_collection" is created through `milvus_lite_client`, but is queried here
# through the pymilvus `utility` module -- confirm which connection `utility` uses.
# `res` is reused for the second result.
res = utility.load_state(
    collection_name="my_rag_collection"
)
print("my_rag_collection:", res)

my_hybrid_collection: {'state': <LoadState: Loaded>}
my_rag_collection: Loaded


In [57]:
import pandas as pd

# Build a small, de-duplicated corpus of questions from the Quora duplicate-questions TSV.
# NOTE: `df` is rebound here and replaces the web-pages frame loaded earlier in the notebook.
file_path = "quora_duplicate_questions.tsv"
df = pd.read_csv(file_path, sep="\t")
questions = set()
# Iterate the two question columns directly instead of materialising a dict per row.
for question1, question2 in zip(df["question1"], df["question2"]):
    for question_text in (question1, question2):
        # pandas parses empty cells as float NaN; slicing a float raises TypeError,
        # so only real strings are kept.
        if isinstance(question_text, str):
            questions.add(question_text[:512])  # cap each question at 512 characters
    if len(questions) > 500:  # Skip this if you want to use the full dataset
        break

docs = list(questions)

# example question
print(docs[0])

How I can speak English fluently?


In [59]:
from milvus_model.hybrid import BGEM3EmbeddingFunction

# Instantiate the BGE-M3 embedding function on CPU without fp16.
ef = BGEM3EmbeddingFunction(use_fp16=False, device="cpu")
# `dense_dim` is reassigned here (it was also set when the hybrid schema was built).
dense_dim = ef.dim["dense"]

# NOTE(review): in the recorded run this call ended with
# "TypeError: M3Embedder.encode() missing 1 required positional argument: 'sentences'".
# Presumably a version mismatch between milvus_model and its embedding backend -- confirm the
# installed versions before relying on this cell.
docs_embeddings = ef(docs)


Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

TypeError: M3Embedder.encode() missing 1 required positional argument: 'sentences'

In [56]:
# Generate embeddings using BGE-M3 model
# Disabled alternative: embed the article content with the helper-provided function instead.
# docs_embeddings = dense_embedding_function(all_content_list)
# NOTE(review): the recorded run of this cell also failed with
# "TypeError: M3Embedder.encode() missing 1 required positional argument: 'sentences'",
# so `docs_embeddings` is not produced by the saved run.
docs_embeddings = ef.encode_documents(docs)
print(docs_embeddings)

TypeError: M3Embedder.encode() missing 1 required positional argument: 'sentences'

In [None]:
# Insert the documents and their sparse/dense embeddings into the hybrid collection in batches.
# The original cell used an undefined name `col`; the hybrid collection handle created earlier in
# this notebook is `milvus_hybrid_collection`.
# NOTE(review): this assumes the handle is a pymilvus Collection exposing `.insert()` and
# `.num_entities`, and that the schema's field order is (text, sparse vector, dense vector) --
# TODO confirm against modules/milvus_wrapper.py.
INSERT_BATCH_SIZE = 50  # rows sent per insert call

for i in range(0, len(docs), INSERT_BATCH_SIZE):
    batch = slice(i, i + INSERT_BATCH_SIZE)
    batched_entities = [
        docs[batch],
        docs_embeddings["sparse"][batch],
        docs_embeddings["dense"][batch],
    ]
    milvus_hybrid_collection.insert(batched_entities)
print("Number of entities inserted:", milvus_hybrid_collection.num_entities)


In [22]:
from tqdm import tqdm

# Build one record per article: a sequential integer id, the OpenAI embedding of the text,
# and the text itself. Each iteration makes one embeddings API call.
data = []
progress = tqdm(all_content_list, desc="Creating embeddings")

for i, line in enumerate(progress):
    record = {"id": i, "vector": emb_text(line), "text": line}
    data.append(record)


Creating embeddings: 100%|██████████| 60/60 [00:29<00:00,  2.01it/s]


In [64]:
# Insert the prepared rows; relies on `milvus_client` existing in the kernel from an earlier cell.
# NOTE(review): the rows in `data` carry "id"/"vector"/"text" keys, built for the basic
# collection -- confirm they match the schema of "my_hybrid_collection" before running this.
milvus_client.insert(collection_name="my_hybrid_collection", data=data)

{'insert_count': 33, 'ids': [454319645845618754, 454319645845618755, 454319645845618756, 454319645845618757, 454319645845618758, 454319645845618759, 454319645845618760, 454319645845618761, 454319645845618762, 454319645845618763, 454319645845618764, 454319645845618765, 454319645845618766, 454319645845618767, 454319645845618768, 454319645845618769, 454319645845618770, 454319645845618771, 454319645845618772, 454319645845618773, 454319645845618774, 454319645845618775, 454319645845618776, 454319645845618777, 454319645845618778, 454319645845618779, 454319645845618780, 454319645845618781, 454319645845618782, 454319645845618783, 454319645845618784, 454319645845618785, 454319645845618786], 'cost': 0}

### Iterate through the text lines, create embeddings, and insert the data into Milvus
- Each record includes a `text` field that is not defined in the collection schema. Milvus automatically stores it in the reserved JSON dynamic field, and it can be used like a normal field at a high level.

In [13]:
from tqdm import tqdm

# Embed every article (one embeddings API call each) and insert the resulting rows.
# Relies on `milvus_client` and `collection_name` existing in the kernel from an earlier cell.
data = []
progress = tqdm(all_content_list, desc="Creating embeddings")

for i, line in enumerate(progress):
    row = {"id": i, "vector": emb_text(line), "text": line}
    data.append(row)

# The insert result is the last expression, so it is displayed as the cell output.
milvus_client.insert(collection_name=collection_name, data=data)

Creating embeddings: 100%|██████████| 33/33 [00:12<00:00,  2.67it/s]


{'insert_count': 33, 'ids': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32], 'cost': 0}

### Build a simple RAG pipeline

In [18]:
# The user question: it is embedded for retrieval and later interpolated into the LLM prompt.
question = "Which vector database should I use?"

In [19]:
# Search for the question in the collection and retrieve the semantic top-3 matches
# Relies on `milvus_client` and `collection_name` existing in the kernel from an earlier cell.
search_res = milvus_client.search(
    collection_name=collection_name,
    data=[
        emb_text(question)
    ],  # Use the `emb_text` function to convert the question to an embedding vector
    limit=3,  # Return top 3 results
    search_params={"metric_type": "IP", "params": {}},  # Inner product distance
    output_fields=["text"],  # Return the text field
)

In [20]:
import json

# One query vector was submitted, so only the first result list is read.
top_hits = search_res[0]

# Pair each hit's stored text with its distance, then pretty-print the pairs as JSON.
retrieved_lines_with_distances = [
    (hit["entity"]["text"], hit["distance"]) for hit in top_hits
]
print(json.dumps(retrieved_lines_with_distances, indent=4))

[
    [
        "\n<START Article Number: 13>\nTitle: Binary vector embeddings are so cool\nURL: https://emschwartz.me/binary-vector-embeddings-are-so-cool/\nSummary: Binary quantized vector embeddings represent a significant advancement in the field of machine learning, particularly in natural language processing. These embeddings can achieve over 95% retrieval accuracy while compressing data by 32 times and accelerating retrieval speed by approximately 25 times. By converting 32-bit floating point weights to single bits, binary quantization retains essential information, allowing for efficient similarity searches using Hamming distance instead of cosine similarity. This technique, when combined with Matryoshka embeddings\u2014which prioritize important information at the beginning of the vector\u2014further enhances performance. The results show that binary embeddings not only reduce storage costs but also improve computational efficiency, making them a compelling choice for applicat

### Use an LLM to answer the question using the retrieved lines

In [21]:
# Build the prompt context: keep only the text of each (text, distance) pair and
# separate the passages with a newline character.
context = "\n".join(
    text for text, _distance in retrieved_lines_with_distances
)
print(context)


<START Article Number: 13>
Title: Binary vector embeddings are so cool
URL: https://emschwartz.me/binary-vector-embeddings-are-so-cool/
Summary: Binary quantized vector embeddings represent a significant advancement in the field of machine learning, particularly in natural language processing. These embeddings can achieve over 95% retrieval accuracy while compressing data by 32 times and accelerating retrieval speed by approximately 25 times. By converting 32-bit floating point weights to single bits, binary quantization retains essential information, allowing for efficient similarity searches using Hamming distance instead of cosine similarity. This technique, when combined with Matryoshka embeddings—which prioritize important information at the beginning of the vector—further enhances performance. The results show that binary embeddings not only reduce storage costs but also improve computational efficiency, making them a compelling choice for applications requiring fast and accurat

In [22]:
# System message sent with role "system" in the chat request.
# NOTE(review): the text begins with a "Human:" prefix even though it is used as the system
# message -- confirm this is intended.
SYSTEM_PROMPT = """
Human: You are an AI assistant. You are able to find answers to the questions from the contextual passage snippets provided.
"""
# User message. This is an f-string, so `context` and `question` are interpolated when this cell
# runs; re-run the cell after changing either of them.
USER_PROMPT = f"""
Use the following pieces of information enclosed in <context> tags to provide an answer to the question enclosed in <question> tags.
<context>
{context}
</context>
<question>
{question}
</question>
"""

In [23]:
# Assemble the conversation: system instructions first, then the context-augmented user prompt.
chat_messages = [
    {"role": "system", "content": SYSTEM_PROMPT},
    {"role": "user", "content": USER_PROMPT},
]

# Request a completion and print the text of the first choice.
response = openai_client.chat.completions.create(
    model="gpt-4o-mini",
    messages=chat_messages,
)
print(response.choices[0].message.content)

Based on the provided context, if you're considering a vector database focused on efficient similarity searches and high retrieval accuracy, you should look into platforms that support binary quantized vector embeddings. These embeddings can achieve over 95% retrieval accuracy while compressing data significantly and accelerating retrieval speed. Additionally, combining binary quantization with techniques like Matryoshka embeddings can further enhance performance for applications that require fast and accurate vector similarity comparisons.

On the other hand, if you're interested in full-text search capabilities with sophisticated relevance scoring, consider exploring systems that implement algorithms like BM25, as it provides better search quality than traditional methods like TF-IDF or basic PostgreSQL full-text search.

For real-time adaptability and dynamic data management, the Fast GraphRAG framework mentioned could be a compelling option, especially for implementations that are 

In [24]:
# Recap of the whole RAG round trip: the question, the retrieved context, and the model's answer.
answer = response.choices[0].message.content

print('Question:', question)
print('Context derived from the vector database:', context)
print('Answer:', answer)

Question: Which vector database should I use?
Context derived from the vector database: 
<START Article Number: 13>
Title: Binary vector embeddings are so cool
URL: https://emschwartz.me/binary-vector-embeddings-are-so-cool/
Summary: Binary quantized vector embeddings represent a significant advancement in the field of machine learning, particularly in natural language processing. These embeddings can achieve over 95% retrieval accuracy while compressing data by 32 times and accelerating retrieval speed by approximately 25 times. By converting 32-bit floating point weights to single bits, binary quantization retains essential information, allowing for efficient similarity searches using Hamming distance instead of cosine similarity. This technique, when combined with Matryoshka embeddings—which prioritize important information at the beginning of the vector—further enhances performance. The results show that binary embeddings not only reduce storage costs but also improve computational