In [21]:
# Standard library imports
import logging
import os

# Third-party imports
from dotenv import load_dotenv
from flask import Flask, Response, jsonify, request, send_from_directory
from flask_cors import CORS
from langchain.agents import AgentExecutor, create_openai_functions_agent
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.tools import tool
from langchain_openai import ChatOpenAI, AzureChatOpenAI
from langgraph.prebuilt import create_react_agent

# Local imports
import db
from config import AI_ASSISTANT_NAME, LOCAL_KNOWLEDGE_BASE_DESCRIPTION, VECTOR_STORE_SEARCH_TOP_K

# Configure logging
logging.basicConfig(level=logging.INFO)

# Load environment variables from .env file
load_dotenv()

True

In [2]:
vector_store = db.get_vector_store()

INFO:root:Loading vector store from /Users/gaopeng/workspace/github.com/gaopenghigh/ailocalsearch/server/data/chroma
INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: mps
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: BAAI/bge-large-en-v1.5
INFO:chromadb.telemetry.product.posthog:Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.


In [22]:

llm = ChatOpenAI(
    model="gpt-4o-mini", 
    streaming=True  # Changed to False since we don't need streaming
)

llm_think = ChatOpenAI(
    model="o3-mini", 
    streaming=False  # Changed to False since we don't need streaming
)

llm_azure = AzureChatOpenAI(
    model=os.getenv("AZURE_OPENAI_DEPLOYMENT"),
    azure_deployment=os.getenv("AZURE_OPENAI_DEPLOYMENT"),
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
)


In [23]:
r = llm_azure.invoke([("system", "You are a helpful assistant."), ("user", "What is the capital of France?")])
print(r)

INFO:httpx:HTTP Request: POST https://gaopengllm2.openai.azure.com/openai/deployments/gpt-4o/chat/completions?api-version=2024-10-21 "HTTP/1.1 200 OK"


content='The capital of France is **Paris**.' additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 9, 'prompt_tokens': 24, 'total_tokens': 33, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-2024-11-20', 'system_fingerprint': 'fp_a42ed5ff0c', 'prompt_filter_results': [{'prompt_index': 0, 'content_filter_results': {'hate': {'filtered': False, 'severity': 'safe'}, 'jailbreak': {'filtered': False, 'detected': False}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': False, 'severity': 'safe'}}}], 'finish_reason': 'stop', 'logprobs': None, 'content_filter_results': {'hate': {'filtered': False, 'severity': 'safe'}, 'protected_material_code': {'filtered': False, 'detected': False}, 'protected_mater

In [24]:
r.content

'The capital of France is **Paris**.'

In [4]:
@tool
def search(question: str) -> str:
    """
    This tool is used to answer questions by searching through available documentation from local knowledge base.
    Args:
        question: The question to answer
    Returns:
        A string containing the search results
    """
    return db.search(vector_store, question, VECTOR_STORE_SEARCH_TOP_K)

In [5]:
def simple_answer(question: str) -> str:
    """
    This tool is used to answer questions by searching through available documentation from local knowledge base.
    Args:
        question: The question to answer
    Returns:
        A string containing the search results
    """
    search_result = search(question)
    system_prompt = f"""Your are {AI_ASSISTANT_NAME}, an AI assistant to help to answer technical questions based on the provided search results.
The search results are from a local knowledge base.
{LOCAL_KNOWLEDGE_BASE_DESCRIPTION}

"""
    prompt = f"""Answer question based solely on the search results, the answer should be in 3 parts
- Summary
- Detail, be comprehensive, verbose, detailed and accurate, include examples if needed
- Sources (extract the "Source" part of the search results, it's a file path, not a link)

IMPORTANT: answer the question based on only the search results, don't make up information, if you can't find relevant information, say so.

SEARCH RESULTS:

{search_result}


QUESTION:
{question}
"""
    messages = [
        ("system", system_prompt),
        ("user", prompt)
    ]
    response = llm.invoke(messages)
    return response.content


In [47]:
def create_answer_agent():
    """
    Create a graph that can answer questions based on local knowledge base.
    """
    prompt = f"""Your are {AI_ASSISTANT_NAME}, an AI assistant to help to answer questions based on local knowledge base.
{LOCAL_KNOWLEDGE_BASE_DESCRIPTION}

Always use tools to answer technical questions.
Do not make up information.
If you can't find relevant information, say so.

To answer a question:
1. Understand what the user is asking
2. Use the search tool to find relevant information, try to call it multiple times with different queries if needed
3. Generate a comprehensive answer

Provide answer based solely on the search results, in 3 parts
- Summary
- Detail, be comprehensive, verbose, detailed and accurate, include examples if needed
- Sources (extract the "Source" part of the search results, it's a file path, not a link)
"""
    tools = [search]
    return create_react_agent(llm, tools, prompt=prompt)

In [48]:
@tool
def answer(question: str) -> str:
    """
    This tool is used to answer questions by searching through available documentation from local knowledge base.
    Args:
        question: The question to answer
    Returns:
        A string containing the search results
    """
    logging.info(f"answering question: {question}")
    agent = create_answer_agent()
    return agent.invoke({"messages": [("human", question)]})

In [49]:
s = answer("how hcp works")

INFO:root:answering question: how hcp works


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:root:Searching for how hcp works in AKS
INFO:root:Found 37 references
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


In [51]:
print(s["messages"][-1]).content

content='### Summary\n\nThe Hosted Control Plane (HCP) in Azure Kubernetes Service (AKS) serves as a critical component by acting as a database abstraction layer for the AKS Resource Provider. It manages crucial entities related to customer control planes such as subscriptions, usage, and billing, enabling CRUD operations through a microservices architecture. HCP is instrumental in scaling, sharding, and caching, providing the necessary infrastructure to support the operations of customer control planes (CCPs).\n\n### Detail\n\nThe Hosted Control Plane (HCP) in AKS is designed to handle metadata and CRUD operations, freeing the customer control plane from direct database interactions. It leverages a microservices architecture with components like NGINX Ingress for request routing, an Underlay Cache Service for storing capacity statistics, and API Servers for interacting with Azure services such as SQL DB and Key Vault. Communication between services utilizes gRPC for synchronous reques

In [46]:
i = 0
for chunk in chunks:
    print("--- chunk", i, "---")
    print("type of chunk", type(chunk))
    # print("keys of chunk", chunk.keys())
    # for k, v in chunk.items():
    #     print("    key: ", k, "type of value: ", type(v))
    #     for k2, v2 in v.items():
    #         print("        key: ", k2, ", type of value: ", type(v2))
    #         print("        value: ", v2)
    #         for v3 in v2:
    #             print("            type of value: ", type(v3))
    #             print("            value: ", v3)
    print(chunk)
    print("--- end of chunk", i, "---")
    i += 1


--- chunk 0 ---
type of chunk <class 'tuple'>
(AIMessageChunk(content='', additional_kwargs={'tool_calls': [{'index': 0, 'id': 'call_xARgJfzaCGhal0eRuFmt74de', 'function': {'arguments': '', 'name': 'search'}, 'type': 'function'}]}, response_metadata={}, id='run-40a7e298-f105-4256-a55f-c7cd0a26db8a', tool_calls=[{'name': 'search', 'args': {}, 'id': 'call_xARgJfzaCGhal0eRuFmt74de', 'type': 'tool_call'}], tool_call_chunks=[{'name': 'search', 'args': '', 'id': 'call_xARgJfzaCGhal0eRuFmt74de', 'index': 0, 'type': 'tool_call_chunk'}]), {'langgraph_step': 1, 'langgraph_node': 'agent', 'langgraph_triggers': ['start:agent'], 'langgraph_path': ('__pregel_pull', 'agent'), 'langgraph_checkpoint_ns': 'agent:e51bfab7-b145-5d60-ad30-cdbc95a671e5', 'checkpoint_ns': 'agent:e51bfab7-b145-5d60-ad30-cdbc95a671e5', 'ls_provider': 'openai', 'ls_model_name': 'gpt-4o', 'ls_model_type': 'chat', 'ls_temperature': None})
--- end of chunk 0 ---
--- chunk 1 ---
type of chunk <class 'tuple'>
(AIMessageChunk(content

In [10]:
def create_think_agent():
    """
    Create a graph that can think about a question and break it down into smaller questions.
    """
    prompt = f"""Your are {AI_ASSISTANT_NAME}, an AI assistant to help to answer questions based on local knowledge base.
{LOCAL_KNOWLEDGE_BASE_DESCRIPTION}

lways use tools to answer technical questions.
Do not make up information.
If you can't find relevant information, say so.

To answer a question:
1. Understand what the user is asking
2. Use the search tool to find relevant information, try to call it multiple times with different questions if needed
3. Generate a comprehensive answer

Provide answer based solely on the search results, in 3 parts
- Summary
- Detail, be comprehensive, detailed, verbose and accurate, include examples if needed
- Sources (extract the "Source" part of the search results, it's a file path, not a link)
"""
    tools = [answer]
    return create_react_agent(llm_think, tools, prompt=prompt)