In [1]:
# Built-in library
import asyncio
import json
import logging
import re
import warnings
from pathlib import Path
from pprint import pprint
from typing import Annotated, Any, Iterable, Literal, Optional, Union

# Standard imports
import nest_asyncio
import numpy as np
import numpy.typing as npt
import pandas as pd
import polars as pl
from rich.console import Console
from rich.theme import Theme

# Style names -> hex colours for the notebook-wide rich console.
custom_theme = Theme(
    styles=dict(
        white="#FFFFFF",  # Bright white
        info="#00FF00",  # Bright green
        warning="#FFD700",  # Bright gold
        error="#FF1493",  # Deep pink
        success="#00FFFF",  # Cyan
        highlight="#FF4500",  # Orange-red
    )
)
# Shared console object; later cells log through it.
console = Console(theme=custom_theme)

# Visualization
# import matplotlib.pyplot as plt

# NumPy settings
# Print array values with 4 digits of precision.
np.set_printoptions(precision=4)

# Pandas settings
# Raise the display limits so long/wide frames and long strings are not truncated.
pd.options.display.max_rows = 1_000
pd.options.display.max_columns = 1_000
pd.options.display.max_colwidth = 600

# Polars settings
# Same idea for polars: long strings and many columns are shown in full.
pl.Config.set_fmt_str_lengths(1_000)
pl.Config.set_tbl_cols(n=1_000)

# NOTE: this silences every warning category for the rest of the session.
warnings.filterwarnings("ignore")

# Black code formatter (Optional)
%load_ext lab_black

# auto reload imports
%load_ext autoreload
%autoreload 2

In [2]:
def go_up_from_current_directory(*, go_up: int = 1) -> None:
    """Put an ancestor of the current working directory first on ``sys.path``.

    Params:
    -------
    go_up: int, default=1
        Number of directory levels to climb from the current working directory.
        Zero (or a negative value) resolves to the current working directory.

    Returns:
    --------
    None

    Notes:
    ------
    The resolved absolute path is printed. The function is safe to re-run:
    an existing entry for the same path is moved to the front of ``sys.path``
    instead of being added a second time.
    """
    import os
    import sys

    # "../" repeated `go_up` times; an empty string when go_up <= 0.
    relative_hops: str = "../" * go_up

    # Resolve against the working directory explicitly. The previous
    # `os.path.dirname(__name__)` always evaluated to "" (a module name has no
    # directory part), so the result was implicitly cwd-relative anyway.
    target_directory = os.path.abspath(os.path.join(os.getcwd(), relative_hops))

    # Drop earlier copies so repeated cell executions do not grow sys.path.
    while target_directory in sys.path:
        sys.path.remove(target_directory)

    # Front of the list => this directory wins import resolution.
    sys.path.insert(0, target_directory)
    print(target_directory)

In [3]:
# Add the directory two levels above the working directory to sys.path so the
# project-local packages below can be imported.
# NOTE(review): assumes the notebook lives two levels below the project root — confirm.
go_up_from_current_directory(go_up=2)


# Project-local imports; they have to come after the sys.path change above,
# hence the E402 suppressions.
from schemas import ModelEnum  # noqa: E402
from settings import refresh_settings  # noqa: E402
from utilities.client_utils import check_rate_limit  # noqa: E402

# Settings object read by later cells (API key, base URLs).
settings = refresh_settings()

/Users/neidu/Desktop/Projects/Personal/My_Projects/AI-Tutorials


## LiteLLM

In [4]:
from litellm import acompletion
from pydantic import BaseModel, Field


# Schema passed to the completion call as `response_format`.
# Documented with `#` comments: a pydantic class docstring would be copied
# into the generated JSON schema.
class Response(BaseModel):
    content: str = Field(description="The content of the response.")


# Chat history for the request: one system instruction plus one user turn.
# NOTE(review): "assstance" in the system prompt looks like a typo for "assistant".
messages: list[dict[str, Any]] = [
    {
        "role": "system",
        "content": "<inst>/no_think You're an expert AI assstance that replies to "
        "questions in a very polite and concise manner.</inst>",
    },
    {
        "role": "user",
        "content": "When it rains it ....",
    },
]
# Async completion through LiteLLM against an OpenRouter-hosted model.
# temperature=0.0 and seed=0 are fixed to make the output as repeatable as possible.
response = await acompletion(
    model=f"openrouter/{ModelEnum.BASE_REMOTE_MODEL_1_8B.value}",
    messages=messages,
    max_tokens=700,
    max_retries=5,
    temperature=0.0,
    seed=0,
    response_format=Response,
)

# Log the full completion object through the rich console.
console.log(response)

In [5]:
# Show the API key's current usage, remaining credit and rate limit
# (helper imported from utilities.client_utils).
check_rate_limit()

{
  "data": {
    "label": "sk-or-v1-902...c45",
    "limit": 2,
    "usage": 0.1977190685,
    "is_provisioning_key": false,
    "limit_remaining": 1.8022809315,
    "is_free_tier": false,
    "rate_limit": {
      "requests": 20,
      "interval": "10s"
    }
  }
}


In [17]:
from outlines import models

# Outlines model wrapper over an OpenAI-compatible endpoint. It currently targets
# the Ollama URL; the commented-out lines are the OpenRouter alternative.
# NOTE(review): the enum member is passed as-is here while the LiteLLM cells use
# `.value` — presumably ModelEnum is a str-based enum; confirm.
# NOTE(review): the OpenRouter API key is sent to the Ollama base URL — confirm
# this is intended.
model = models.openai(
    # ModelEnum.BASE_REMOTE_MODEL_1_8B,
    ModelEnum.BASE_MODEL_LOCAL_1,
    api_key=settings.OPENROUTER_API_KEY.get_secret_value(),
    # base_url=settings.OPENROUTER_URL,
    base_url=settings.OLLAMA_URL,
)

In [18]:
import outlines.models as models
from outlines import generate
from pydantic import BaseModel, ConfigDict


# Target schema for the JSON-constrained generation below.
# Documented with `#` comments: a pydantic class docstring would be copied
# into the generated JSON schema.
class Person(BaseModel):
    model_config = ConfigDict(extra="forbid")  # required for openai
    first_name: str
    last_name: str
    age: int


# JSON generation constrained to the `Person` schema.
generator = generate.json(model, Person)
# Only the last expression of a cell is displayed automatically, so the result
# of this mid-cell call has to be printed explicitly or it is silently dropped.
print(repr(generator("current indian prime minister on january 1st 2023")))
# Person(first_name='Narendra', last_name='Modi', age=72)

# Generation constrained to one of a fixed set of string choices.
generator = generate.choice(model, ["Chicken", "Egg"])
print(generator("Which came first?"))
# Chicken

Chicken


In [None]:
# NOTE: rebinds the name `Person`; an earlier cell defines a different model
# (first_name / last_name / age) under the same name.
class Person(BaseModel):
    """A schema for a person."""

    name: str = Field(description="The name of the person.")
    # Age is validated to lie in the inclusive range 5..100.
    age: int = Field(description="The age of the person.", ge=5, le=100)


# Container for several `Person` entries. The field carries the alias
# "engineers"; with pydantic's default configuration that alias is the key
# expected in incoming data — NOTE(review): confirm this is intended.
# Documented with `#` comments: a class docstring would be copied into the
# generated JSON schema.
class Persons(BaseModel):
    persons: list[Person] = Field(description="A list of persons.", alias="engineers")


# Extraction prompt: the system turn asks for plain JSON without backticks, the
# user turn supplies the text to extract two people from.
# NOTE(review): "assstance" in the system prompt looks like a typo for "assistant".
messages: list[dict[str, Any]] = [
    {
        "role": "system",
        "content": "<inst>/no_think You're an expert AI assstance that replies to "
        "questions in a very polite and concise manner. When you respond, reply "
        "with a cleanly formatted JSON without including backticks.</inst>",
    },
    {
        "role": "user",
        "content": "<user>There are two engineers working at Fred AI. Kunle, 28 years old is "
        "a Python developer while Francis is a Golang guru at a ripe age of 32. Extract "
        "their information in a JSON format.</user>",
    },
]

# Same LiteLLM call shape as before, now with `Persons` as the response format.
# The result is kept as `raw_response` so the next cell can validate its text.
raw_response = await acompletion(
    model=f"openrouter/{ModelEnum.GEMMA_3p0_12B_REMOTE_FREE.value}",
    messages=messages,
    max_tokens=700,
    max_retries=5,
    temperature=0.0,
    seed=0,
    response_format=Persons,
)

# Log the unvalidated completion object.
console.log(raw_response)

In [None]:
# Validate the completion text against `Persons`. Any failure (missing choice,
# invalid JSON, schema mismatch) is logged rather than raised so the notebook
# keeps running.
try:
    # Text of the first choice; expected to be a JSON document.
    response = raw_response.choices[0].message.content
    console.log(Persons.model_validate_json(response).model_dump())
except Exception as e:
    console.log(e)

### Comment

- The `response_format` helps to validate the response.
- It is not guaranteed to always work.
- Another alternative is to use the `Instructor` library to validate the response.

<hr><br>

## Instructor


In [None]:
import instructor

# Wrap LiteLLM's async completion function with Instructor in JSON mode.
aclient = instructor.from_litellm(acompletion, mode=instructor.Mode.JSON)

# `create_with_completion` returns a pair: the object parsed into
# `response_model` and the underlying raw completion.
response, raw_response = await aclient.chat.completions.create_with_completion(
    response_model=Persons,
    model=f"openrouter/{ModelEnum.GEMMA_3p0_12B_REMOTE_FREE.value}",
    messages=messages,
    max_tokens=700,
    max_retries=5,
    temperature=0.0,
    seed=0,
)
# Raw provider response first ...
console.log(raw_response)

# ... then the validated model as a plain dict.
print("Validated response: ")
console.log(response.model_dump())

In [None]:
# Using Iterable does NOT work with create_with_completion
# Ask Instructor for several `Person` objects instead of one wrapper model.
# NOTE(review): this cell uses GEMMA_3p0_12B_REMOTE while the cells above use
# GEMMA_3p0_12B_REMOTE_FREE — confirm the switch is deliberate.
response = await aclient.chat.completions.create(
    response_model=Iterable[Person],
    model=f"openrouter/{ModelEnum.GEMMA_3p0_12B_REMOTE.value}",
    messages=messages,
    max_tokens=700,
    max_retries=5,
    temperature=0.0,
    seed=0,
)
# NOTE(review): `async for` needs an async iterator; confirm `create` returns one
# here without `stream=True` (otherwise a plain `for` is required).
result = [person.model_dump() async for person in response]
console.log(result)

<hr><br>

# Add Memory To LangGraph Workflow

- The previous chatbot can use tools, but it can't remember earlier turns of the conversation.
- LangGraph solves this by using `persistent checkpointing`.
- This can be achieved by providing a `checkpointer` when compiling the graph and a `thread_id` when running the graph.
- LangGraph automatically saves the state after each step and when the graph is invoked later using the same `thread_id`, the graph loads up its saved state.
- LangGraph claims `checkpoint` is more powerful than using a `simple chat memory`.

In [None]:
from langgraph.checkpoint.memory import MemorySaver

# A simple in-memory saver for this tutorial. In production,
# it's recommended to use SqliteSaver or PostgresSaver.
# This object is passed as the `checkpointer` when the graph is compiled.
memory = MemorySaver()

In [None]:
from typing import Annotated, TypedDict

from langchain.chat_models import init_chat_model
from langchain_tavily import TavilySearch
from langgraph.graph import END, START, StateGraph
from langgraph.graph.message import add_messages
from langgraph.prebuilt import ToolNode, tools_condition


# Graph state schema: a single `messages` channel.
class State(TypedDict):
    # This appends messages instead of overwriting
    # (`add_messages` is attached as the reducer through `Annotated`).
    messages: Annotated[list, add_messages]


# Chat model selected by a "provider:model" string.
llm = init_chat_model("mistralai:ministral-8b-latest")
# Web-search tool, limited to two results per query.
tavily_search = TavilySearch(max_results=2)
tools = [tavily_search]
# Model variant that is allowed to emit calls to the tools above.
llm_with_tools = llm.bind_tools(tools)

In [None]:
# llm = init_chat_model("mistralai:ministral-8b-latest")
response = await llm.ainvoke("Sup ma nigga?")
response.content

In [None]:
async def chatbot(state: State) -> dict[str, Any]:
    """Run one LLM turn over the conversation stored in the graph state.

    Parameters
    ----------
    state : State
        Graph state; its ``"messages"`` entry is handed to the tool-bound LLM.

    Returns
    -------
    dict[str, Any]
        ``{"messages": [reply]}`` where ``reply`` is the single message the
        LLM returned for this turn.
    """
    conversation = state["messages"]
    reply = await llm_with_tools.ainvoke(conversation)
    return {"messages": [reply]}


# Init the state graph
# The builder is parameterised with `State`; nodes and edges are attached to
# it in the following cell before it is compiled.
graph_builder: StateGraph = StateGraph(State)

In [None]:
# Create the nodes
# NOTE: run this cell once per builder; re-create `graph_builder` before
# re-running, since the same node names would be registered again.
tool_node = ToolNode(tools=tools)
graph_builder.add_node("chatbot", chatbot)
graph_builder.add_node("tools", tool_node)

# Connect the nodes
graph_builder.add_edge(START, "chatbot")
# `tools_condition` sends the flow to "tools" when the last AI message
# requests a tool call and to END otherwise, so no separate unconditional
# chatbot -> END edge is needed (it would only duplicate that route).
graph_builder.add_conditional_edges("chatbot", tools_condition)
# After a tool has run, hand its result back to the chatbot.
graph_builder.add_edge("tools", "chatbot")

### Compile The Graph

- Add the memory saver.
- Add [LangFuse](https://github.com/langfuse/langfuse) callback handler for observability.

In [None]:
from langfuse.callback import CallbackHandler

# langfuse_handler = CallbackHandler()

# Create a callback handler with a session and user id
langfuse_handler = CallbackHandler(
    session_id="chatbot_with_tools",
    user_id="user_123",
)

# Compile the graph
# The checkpointer enables per-thread persistence; `with_config` attaches the
# Langfuse handler as a default callback for every run of the compiled graph.
graph = graph_builder.compile(checkpointer=memory).with_config({"callbacks": [langfuse_handler]})


# Visualize the graph
from IPython.display import Image, display  # noqa: E402

# Render the graph structure as a Mermaid-generated PNG inline.
display(Image(graph.get_graph().draw_mermaid_png()))
# or display(Image(graph.get_graph().draw_png()))

### Interact With Chatbot

- Add a `thread` to use the graph.
- Add [LangFuse](https://github.com/langfuse/langfuse) callback handler for observability.

In [None]:
# Adding the langfuse handler using this approach didn't work for some reason
# NOTE(review): the commented-out key below is spelled "callback", whereas the
# compile cell uses "callbacks" — that mismatch is the likely reason; confirm.
# The thread_id identifies the conversation whose checkpointed state is reused.
config = {
    "configurable": {"thread_id": "1"},
    # "callback": [langfuse_handler],
}

### Call The Chatbot


In [None]:
# First turn of the conversation: the user introduces themselves.
user_input: str = "'Sup! I'm Neidu."

# NB: config variable is the 2nd positional argument
# (it is passed by keyword here). It carries the thread_id for the checkpointer.
events = graph.astream(
    {"messages": [{"role": "user", "content": user_input}]},
    config=config,
    stream_mode="values",
)

# Print the most recent message of each streamed state.
async for event in events:
    event["messages"][-1].pretty_print()

In [None]:
# Ask a follow up question
# Same `config` (same thread_id) as the first turn, so the saved state is reused.
user_input: str = "Yo! You remember my name?"

# NB: config variable is the 2nd positional argument
# (it is passed by keyword here).
events = graph.astream(
    {"messages": [{"role": "user", "content": user_input}]},
    config=config,
    stream_mode="values",
)

# Print the most recent message of each streamed state.
async for event in events:
    event["messages"][-1].pretty_print()

#### Verify The Persistence

- Change the thread id

In [None]:
# Ask the same question again, but on a different thread_id ("2" instead of "1")
# to check that the remembered state is tied to the thread.
user_input: str = "Yo! You remember my name?"

# NB: config variable is the 2nd positional argument
# (it is passed by keyword here, with an inline config for the new thread).
events = graph.astream(
    {"messages": [{"role": "user", "content": user_input}]},
    config={"configurable": {"thread_id": "2"}},
    stream_mode="values",
)

# Print the most recent message of each streamed state.
async for event in events:
    event["messages"][-1].pretty_print()

### Inspect The State

In [None]:
# Fetch the checkpointed state for the thread named in `config` (thread_id "1")
# and log it.
snapshot = graph.get_state(config)
console.log(snapshot)

In [None]:
# Show the snapshot's `next` attribute.
# NOTE(review): presumably the node(s) scheduled to run next, empty once the
# run has finished — confirm against the LangGraph docs.
snapshot.next

## Add Human-in-the-loop Controls
- LangGraph's persistence layer supports human-in-the-loop workflows to handle unreliable agents needing human input or approval. 
- The `interrupt` function pauses execution for user feedback, which is then provided via a Command to resume, similar to Python's `input()`.

In [None]:
from langchain_core.tools import tool
from langgraph.types import Command, interrupt


@tool
def human_assistance(query: str) -> str:
    """Request human assistance for a given query.

    Parameters
    ----------
    query : str
        The question or request to be handled by a human.

    Returns
    -------
    str
        The response data provided by the human assistant.
    """
    # NOTE: the docstring above is left untouched because `@tool` uses it as the
    # tool description.
    # Pause the graph with the query as payload; the value supplied on resume
    # is what `interrupt` hands back.
    human_reply = interrupt({"query": query})
    # The resume payload is expected to carry the answer under "data".
    return human_reply["data"]

In [None]:
# Extend the tool list with the human-assistance tool and re-bind the model.
# NOTE(review): the `ToolNode` created earlier was built from the previous
# `tools` list; the graph has to be rebuilt for it to see the new tool —
# confirm this happens in a later cell.
tools = [tavily_search, human_assistance]
llm_with_tools = llm.bind_tools(tools)

In [None]:
# Direct call to the tool-bound model (outside the graph) and log of the raw
# message, e.g. to inspect which tool call, if any, it requests.
response = await llm_with_tools.ainvoke("Who won the FA Cup final match today?")
console.log(response)

In [None]:
async def chatbot(state: State) -> dict[str, Any]:
    """Run one LLM turn and require at most one tool call in the reply.

    Parameters
    ----------
    state : State
        Graph state; its ``"messages"`` entry is sent to the tool-bound LLM.

    Returns
    -------
    dict[str, Any]
        ``{"messages": [ai_message]}`` with the single message produced by
        the LLM.

    Raises
    ------
    AssertionError
        If the reply contains more than one tool call.

    Notes
    -----
    The graph is interrupted for human input and later resumed, so repeated
    tool invocations must be avoided. The check below does not switch
    parallel tool calling off; it only fails when the model requests several
    tools in a single message.
    """
    ai_message = await llm_with_tools.ainvoke(state["messages"])
    requested_calls = len(ai_message.tool_calls)
    # Guard for the human-in-the-loop flow: never more than one tool per turn.
    assert requested_calls <= 1
    return {"messages": [ai_message]}