In [1]:
# Import the project helper modules (environment variables, Neon DB access, dates,
# content extraction, prompt templates) and load the environment variables.
import sys
import os

from modules.env_vars import set_os_env_vars, check_missing_vars
from modules.neon_db import run_neon_query, load_sql_query
from modules.date_functions import get_current_date
from modules.reference_extraction import create_content_from_df
from modules.prompt_templates import one_shot_example, system_message_example

set_os_env_vars() # Runs the setup in env_vars.py so the environment variables are available through os (presumably os.environ; confirm in modules/env_vars.py)

In [2]:
from modules.langchain_config import set_langsmith_client, get_langsmith_tracer, get_llm_model, load_model_costs

# LangSmith setup through the project helpers; the tracer handle is kept in `tracer`.
# NOTE(review): `tracer` is not referenced by any later cell visible in this notebook; confirm whether it is still needed.
set_langsmith_client()
tracer = get_langsmith_tracer()

In [3]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_milvus.retrievers import MilvusCollectionHybridSearchRetriever
from langchain_milvus.utils.sparse import BM25SparseEmbedding
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from pymilvus import (
    Collection,
    CollectionSchema,
    DataType,
    FieldSchema,
    WeightedRanker,
    connections,
)

In [4]:
# Load the model cost table (keyed by model name) and display it.
# NOTE(review): in the saved output, 'gpt-4o-mini' (the model used for the RAG chain later on) has only a
# 'provider' entry and no 'input'/'output' values; confirm get_llm_model tolerates that.
MODEL_COSTS = load_model_costs()
MODEL_COSTS


{'claude-3-sonnet-20240229': {'provider': 'anthropic',
  'input': 0.003,
  'output': 0.015},
 'claude-3-5-sonnet-20241022': {'provider': 'anthropic',
  'input': 0.003,
  'output': 0.015},
 'gpt-4o-mini': {'provider': 'openai'}}

# Ingest data

In [5]:
# Sample corpus of ten fictional novel blurbs.
# NOTE(review): `texts` is not referenced by any later cell visible in this notebook; the embeddings and the
# Milvus collection are built from `all_content_list` instead. Presumably a leftover sample; confirm before removing.
texts = [
    "In 'The Whispering Walls' by Ava Moreno, a young journalist named Sophia uncovers a decades-old conspiracy hidden within the crumbling walls of an ancient mansion, where the whispers of the past threaten to destroy her own sanity.",
    "In 'The Last Refuge' by Ethan Blackwood, a group of survivors must band together to escape a post-apocalyptic wasteland, where the last remnants of humanity cling to life in a desperate bid for survival.",
    "In 'The Memory Thief' by Lila Rose, a charismatic thief with the ability to steal and manipulate memories is hired by a mysterious client to pull off a daring heist, but soon finds themselves trapped in a web of deceit and betrayal.",
    "In 'The City of Echoes' by Julian Saint Clair, a brilliant detective must navigate a labyrinthine metropolis where time is currency, and the rich can live forever, but at a terrible cost to the poor.",
    "In 'The Starlight Serenade' by Ruby Flynn, a shy astronomer discovers a mysterious melody emanating from a distant star, which leads her on a journey to uncover the secrets of the universe and her own heart.",
    "In 'The Shadow Weaver' by Piper Redding, a young orphan discovers she has the ability to weave powerful illusions, but soon finds herself at the center of a deadly game of cat and mouse between rival factions vying for control of the mystical arts.",
    "In 'The Lost Expedition' by Caspian Grey, a team of explorers ventures into the heart of the Amazon rainforest in search of a lost city, but soon finds themselves hunted by a ruthless treasure hunter and the treacherous jungle itself.",
    "In 'The Clockwork Kingdom' by Augusta Wynter, a brilliant inventor discovers a hidden world of clockwork machines and ancient magic, where a rebellion is brewing against the tyrannical ruler of the land.",
    "In 'The Phantom Pilgrim' by Rowan Welles, a charismatic smuggler is hired by a mysterious organization to transport a valuable artifact across a war-torn continent, but soon finds themselves pursued by deadly assassins and rival factions.",
    "In 'The Dreamwalker's Journey' by Lyra Snow, a young dreamwalker discovers she has the ability to enter people's dreams, but soon finds herself trapped in a surreal world of nightmares and illusions, where the boundaries between reality and fantasy blur.",
]

In [20]:
# Read the SQL text from web_pages.sql, then execute it to get a DataFrame.
query = load_sql_query("web_pages.sql")
df = run_neon_query(query)

# Report how many rows came back and preview the first one.
row_count = len(df.index)
print("Number of rows:", row_count)
df.head(1)

Number of rows: 60


Unnamed: 0,id,url,media_type,status,created_at,title,description,summary,author,published_at
0,b0762d1d-a825-4427-a785-cb52229f4c67,https://aidanmclaughlin.notion.site/reasoners-...,web-page,completed,2024-11-29 07:51:53.011015,Notion – The all-in-one workspace for your not...,A new tool that blends your everyday work apps...,The article discusses the limitations of curre...,,NaT


In [22]:
# create_content_from_df returns three values; only the list form (`all_content_list`)
# is used by the later cells in this notebook.
all_content, all_content_list, all_content_dict = create_content_from_df(df)

# Sanity check: number of formatted articles, then the first formatted article.
print(len(all_content_list))
first_article = all_content_list[0]
print(first_article)

60

<START Article Number: 1>
Title: Notion – The all-in-one workspace for your notes, tasks, wikis, and databases.
URL: https://aidanmclaughlin.notion.site/reasoners-problem
Summary: The article discusses the limitations of current reasoning models, particularly OpenAI's o1, which utilize reinforcement learning (RL) to enhance reasoning capabilities. While these models show promise in structured environments with clear rewards, they struggle with open-ended tasks that lack frequent feedback, such as creative writing or philosophical reasoning. The author argues that despite the advancements in RL, these models do not generalize well beyond their training domains, leading to subpar performance in tasks requiring nuanced understanding. The piece highlights the challenges of scaling model size and the potential stagnation in AI development if the focus remains solely on improving reasoning without addressing the need for larger, more capable models. Key insights include the importance of

# Prepare dense and sparse embedding functions

In [23]:
# Dense embeddings come from OpenAI, using the library's default embedding model.
dense_embedding_func = OpenAIEmbeddings()

# Measure the embedding size with a short fixed probe string instead of all_content_list[1]:
# the vector length does not depend on the input text, so this avoids an IndexError when fewer
# than two articles were loaded and avoids sending a full article just to read the dimension.
dense_dim = len(dense_embedding_func.embed_query("dimension probe"))
dense_dim

1536

In [24]:
# Build the BM25 sparse embedder from the full list of formatted articles.
sparse_embedding_func = BM25SparseEmbedding(corpus=all_content_list)

# Display the sparse vector for the second article as a quick sanity check.
sample_sparse_vector = sparse_embedding_func.embed_query(all_content_list[1])
sample_sparse_vector

{0: 0.7702225,
 1: 1.540445,
 2: 1.540445,
 4: 0.7702225,
 10: 0.83173335,
 13: 0.7702225,
 14: 0.7702225,
 16: 0.7702225,
 19: 2.5301633,
 21: 1.540445,
 22: 0.9921288,
 24: 0.9105601,
 30: 0.7702225,
 31: 0.7702225,
 49: 0.19735943,
 55: 1.69029,
 57: 0.13119456,
 62: 1.2580401,
 63: 3.38058,
 68: 6.9320025,
 77: 5.0603266,
 79: 1.0768723,
 80: 0.7702225,
 85: 0.75527894,
 92: 0.9921288,
 101: 0.7702225,
 110: 0.7702225,
 111: 2.799022,
 112: 0.7702225,
 113: 0.7702225,
 114: 0.7702225,
 115: 5.598044,
 116: 2.799022,
 117: 1.9647787,
 118: 3.6393967,
 119: 2.5301633,
 120: 3.6805112,
 121: 2.5301633,
 122: 1.9647787,
 123: 0.7702225,
 124: 1.4596256,
 125: 0.7702225,
 126: 2.5301633,
 127: 6.305472,
 128: 6.305472,
 129: 3.1411963,
 130: 3.8511124,
 131: 5.3843617,
 132: 7.3610225,
 133: 3.38058,
 134: 0.394008,
 135: 7.3610225,
 136: 6.305472,
 137: 4.62327,
 138: 1.5705981,
 139: 2.799022,
 140: 6.305472,
 141: 3.641494,
 142: 7.3610225,
 143: 3.152736,
 144: 1.5705981,
 145: 3.68

# Create Milvus Collection and load data

In [25]:
# Initialize connection URI and establish connection.
# The URI is a file path relative to the notebook's working directory
# (presumably Milvus Lite's local file-backed mode; confirm).
connections.connect(uri="./milvus_demo.db")

In [26]:
# Field names, reused by the insert and retriever cells further down.
pk_field = "doc_id"
dense_field = "dense_vector"
sparse_field = "sparse_vector"
text_field = "text"

# Primary key: string id, generated automatically (auto_id=True).
pk_schema = FieldSchema(
    name=pk_field,
    dtype=DataType.VARCHAR,
    is_primary=True,
    auto_id=True,
    max_length=100,
)
# Dense vector sized to the embedding dimension measured earlier.
dense_schema = FieldSchema(name=dense_field, dtype=DataType.FLOAT_VECTOR, dim=dense_dim)
# Sparse vector for the BM25 embeddings.
sparse_schema = FieldSchema(name=sparse_field, dtype=DataType.SPARSE_FLOAT_VECTOR)
# Raw article text, capped at 65,535 characters/bytes by max_length.
text_schema = FieldSchema(name=text_field, dtype=DataType.VARCHAR, max_length=65_535)

fields = [pk_schema, dense_schema, sparse_schema, text_schema]

In [27]:
# Build the schema from the field list (dynamic fields disabled) and obtain the collection handle.
# NOTE(review): the collection name is novel-themed although the stored texts are web-page
# summaries; renaming it would point at a different collection, so it is kept as-is here.
collection_name = "IntroductionToTheNovels"
schema = CollectionSchema(fields=fields, enable_dynamic_field=False)
collection = Collection(name=collection_name, schema=schema, consistency_level="Strong")

In [28]:
# Define the index for the dense and sparse vector fields. Both use inner product (IP).
# The fields are addressed through `dense_field` / `sparse_field` (the same variables the schema
# and the retriever use) instead of repeating the literal names, so a rename in one place cannot
# leave the index pointing at a stale field name.
dense_index = {"index_type": "FLAT", "metric_type": "IP"}
collection.create_index(dense_field, dense_index)
sparse_index = {"index_type": "SPARSE_INVERTED_INDEX", "metric_type": "IP"}
collection.create_index(sparse_field, sparse_index)

# Flush the collection to make the changes persistent
collection.flush()

In [35]:
# Embed all articles with one batched call per embedding function (instead of one call per
# article), then pair every text with its dense and sparse vector.
dense_vectors = dense_embedding_func.embed_documents(all_content_list)
sparse_vectors = sparse_embedding_func.embed_documents(all_content_list)

entities = [
    {dense_field: dense_vec, sparse_field: sparse_vec, text_field: text}
    for text, dense_vec, sparse_vec in zip(all_content_list, dense_vectors, sparse_vectors)
]

# Insert the entities and load the collection so it can be searched.
# NOTE(review): primary keys are auto-generated, so re-running this cell would presumably insert
# the same articles a second time; confirm and drop/recreate the collection before a re-run if so.
collection.insert(entities)
collection.load()

# Instantiation

In [36]:
# Search parameters for each vector field; both use inner product (IP), like the indexes.
sparse_search_params = {"metric_type": "IP"}
dense_search_params = {"metric_type": "IP", "params": {}}

# Reranker that weights the dense and sparse result lists equally.
hybrid_ranker = WeightedRanker(0.5, 0.5)

# Hybrid retriever: searches the dense and sparse fields of the collection with their matching
# embedding functions, reranks with the weighted ranker, and returns the top 3 documents.
retriever = MilvusCollectionHybridSearchRetriever(
    collection=collection,
    rerank=hybrid_ranker,
    anns_fields=[dense_field, sparse_field],
    field_embeddings=[dense_embedding_func, sparse_embedding_func],
    field_search_params=[dense_search_params, sparse_search_params],
    top_k=3,
    text_field=text_field,
)

# Usage

In [39]:
# Alternative sample question, kept for reference (disabled):
# retriever.invoke("What are the story about ventures?")
# Run the hybrid retriever directly and display the documents it returns.
retriever.invoke("Which vector database should I use?")

[Document(metadata={'doc_id': '454566879038537787'}, page_content='\n<START Article Number: 40>\nTitle: Binary vector embeddings are so cool\nURL: https://emschwartz.me/binary-vector-embeddings-are-so-cool/\nSummary: Binary quantized vector embeddings represent a significant advancement in the field of machine learning, particularly in natural language processing. These embeddings can achieve over 95% retrieval accuracy while compressing data by 32 times and accelerating retrieval speed by approximately 25 times. By converting 32-bit floating point weights to single bits, binary quantization retains essential information, allowing for efficient similarity searches using Hamming distance instead of cosine similarity. This technique, when combined with Matryoshka embeddings—which prioritize important information at the beginning of the vector—further enhances performance. The results show that binary embeddings not only reduce storage costs but also improve computational efficiency, maki

# Use within a chain

In [40]:
# Prompt with two placeholders: the retrieved context and the user's question.
PROMPT_TEMPLATE = """
Human: You are an AI assistant, and provides answers to questions by using fact based and statistical information when possible.
Use the following pieces of information to provide a concise answer to the question enclosed in <question> tags.

<context>
{context}
</context>

<question>
{question}
</question>

A:"""

prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=PROMPT_TEMPLATE,
)

# Chat model obtained through the project helper (instead of instantiating ChatOpenAI directly),
# which also receives the model cost table.
model_name = "gpt-4o-mini"
streaming = False
llm = get_llm_model(model_name, streaming, MODEL_COSTS)

In [41]:
# Define a function for formatting documents
def format_docs(docs):
    """Join the text of several documents into one string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` attribute.

    Returns:
        The ``page_content`` values, in input order, separated by a blank line
        (``"\\n\\n"``). An empty iterable yields an empty string.
    """
    page_texts = [document.page_content for document in docs]
    separator = "\n\n"
    return separator.join(page_texts)

In [42]:
# Input mapping for the chain: the question is sent through the retriever and formatted into the
# context string, and is also passed through unchanged as the question.
chain_inputs = {"context": retriever | format_docs, "question": RunnablePassthrough()}

# Full RAG pipeline: inputs -> prompt -> chat model -> plain-string output.
rag_chain = chain_inputs | prompt | llm | StrOutputParser()

In [43]:
# Perform a query using the defined chain and display the answer string.
# Alternative sample question, kept for reference (disabled):
# rag_chain.invoke("What novels has Lila written and what are their contents?")
rag_chain.invoke("Which vector database should I use?")

'The choice of vector database depends on your specific needs, but popular options include Chroma and Pinecone. Both support the stateful nature of AI agents, enabling them to retain conversation histories and external data. If you require local inference capabilities, consider using vLLM or Ollama for model serving. Evaluate your requirements for scalability, state management, and integration with your existing systems to make the best decision.'

In [None]:
# Optional cleanup: uncomment the next line to drop the collection
# collection.drop()