In [1]:
from dotenv import load_dotenv

load_dotenv()

True

In [5]:
from typing import List
import json

from pydantic import BaseModel, Field

class CDSDataset(BaseModel):
    id: str
    filename: str
    description: str

class DatasetsFile(BaseModel):
    cds_files: List[CDSDataset] = Field(alias="cds-files")

with open("./datasets.json", "r") as fp:
    datasets_file = DatasetsFile(**json.load(fp))

datasets = datasets_file.cds_files

In [12]:
import os

from pinecone import Pinecone
from llama_index.core.llms import LLM
from llama_index.llms.openai import OpenAI
from llama_index.core.indices import VectorStoreIndex
from llama_index.core.query_engine import RouterQueryEngine
from llama_index.vector_stores.pinecone import PineconeVectorStore
from llama_index.core.tools import QueryEngineTool, ToolMetadata
from llama_index.embeddings.openai import OpenAIEmbedding, OpenAIEmbeddingModelType
from llama_index.core.selectors import LLMMultiSelector
from llama_index.core.vector_stores import (
    MetadataFilter,
    MetadataFilters,
    FilterOperator,
)

OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
PINECONE_API_KEY = os.environ["PINECONE_API_KEY"]

llm = OpenAI(api_key=OPENAI_API_KEY, model="gpt-4-turbo")
pc = Pinecone(api_key=PINECONE_API_KEY)
embed_model = OpenAIEmbedding(
    api_key=OPENAI_API_KEY, model=OpenAIEmbeddingModelType.TEXT_EMBED_3_LARGE
)

index = VectorStoreIndex.from_vector_store(
    vector_store=PineconeVectorStore(pc.Index("cds-index-test")),
    embed_model=embed_model,
)

def get_dataset_query_engine(dataset_id: str, llm: LLM, index: VectorStoreIndex):
    vector_filter = MetadataFilters(
        filters=[
            MetadataFilter(
                key="dataset_id",
                operator=FilterOperator.EQ,
                value=dataset_id,
            ),
        ]
    )

    return index.as_query_engine(
        llm=llm,
        verbose=True,
        filters=vector_filter,
    )

query_engine_tools = []

for dataset in datasets:
    query_engine = get_dataset_query_engine(
        dataset_id=dataset.id, llm=llm, index=index
    )

    tool = QueryEngineTool(
        query_engine=query_engine,
        metadata=ToolMetadata(
            name=dataset.id,
            description=(dataset.description),
        ),
    )

    query_engine_tools.append(tool)

cds_query_engine = RouterQueryEngine(
    selector=LLMMultiSelector.from_defaults(llm=llm),
    query_engine_tools=query_engine_tools,
    verbose=True,
)

In [21]:
cds_query_engine.query("give a breakdown of importance of what academic and nonacademic factors the admissions commitee considers for admission at NYU")
# cds_query_engine.as_query_component

[1;3;38;5;200mSelecting query engine 7: This item specifically refers to admissions data for New York University, which is directly relevant to the query about the factors considered by the admissions committee at NYU..
[0m

Response(response="For admission decisions at NYU, the following academic and nonacademic factors are considered with varying degrees of importance:\n\n**Academic Factors:**\n- Rigor of secondary school record: Very Important\n- Academic GPA: Very Important\n- Recommendations: Very Important\n- Standardized test scores: Important\n- Class rank: Not Considered\n\n**Nonacademic Factors:**\n- Character/personal qualities: Very Important\n- Extracurricular activities: Considered\n- Talent/ability: Considered\n- Volunteer work: Considered\n- Work experience: Considered\n- Level of applicant's interest: Considered\n- First generation: Considered\n- Geographical residence: Considered\n- Interview: Not Considered\n- Alumni/ae relation: Not Considered\n- State residency: Not Considered\n- Religious affiliation/commitment: Not Considered\n\nAdditionally, for programs requiring a portfolio or audition, artistic talent is considered Very Important.", source_nodes=[NodeWithScore(node=TextNode(id_='

In [91]:
# TODO: create pipeline for determining chances of getting into a school
from llama_index.core.query_pipeline import RouterComponent, QueryPipeline, InputComponent
from llama_index.core import PromptTemplate
import nest_asyncio; nest_asyncio.apply()

# vector_chain = QueryPipeline(chain=[vector_query_engine])
# summary_chain = QueryPipeline(
#     chain=[summary_qrewrite_prompt, llm, summary_query_engine], verbose=True
# )

# what are my chances pipeline:
# determine school(s)
# determine student's information relevant to admission
# determine admissions data for each school
# determine weightings for each school
# ask LLM to make decisions

determine_schools_prompt_tmpl = PromptTemplate(
    "Based on this input, determine which schools the user would like to know their chances of admission at:\n"
    "\n\n"
    "{input}"
)

determine_factors_tmpl = PromptTemplate(
    "Give a breakdown of importance of what academic and nonacademic factors the admissions commitee considers for admission at the following schools:"
    "\n\n"
    "{schools}"
)

determine_test_scores_tmpl = PromptTemplate(
    "Give a breakdown of standardized test scores for admitted applicants at the following schools:"
    "\n\n"
    "{schools}"
)

determine_gpa_tmpl = PromptTemplate(
    "Give a breakdown of GPAs and class rank of admitted applicants at the following schools:"
    "\n\n"
    "{schools}"
)

determine_chances_tmpl = PromptTemplate(
    "Based on what factors the schools deem important, the GPAs and class ranks of the schools,\n"
    "the test scores at the schools, and what the student has shared about themselves in the input\n"
    "and chat history, make your best judgement at whether or not the student has a good chance of admission\n"
    "at these institutions.\n"
    "\n\n"
    "Input:\n"
    "{input}"
    "\n\n"
    "Chat History:\n"
    "{chat_history}"
    "\n\n"
    "Admission Factor Information:\n"
    "{factors}"
    "\n\n"
    "GPA and Class Rank Information:\n"
    "{gpas}"
    "\n\n"
    "Standardized Test Score Information:\n"
    "{test_scores}"
    "\n\n"
)

input_component = InputComponent()

chances_query_modules = {
    "input": input_component,
    "determine_schools_prompt": determine_schools_prompt_tmpl,
    "determine_factors_tmpl": determine_factors_tmpl,
    "determine_test_scores_tmpl": determine_test_scores_tmpl,
    "determine_gpa_tmpl": determine_gpa_tmpl,
    "determine_schools": llm,
    "determine_factors": cds_query_engine,
    "determine_test_scores": cds_query_engine,
    "determine_gpa": cds_query_engine,
    "determine_chances_tmpl": determine_chances_tmpl,
    "determine_chances": llm,
}

chances_query_chain = QueryPipeline(verbose=True)

chances_query_chain.add_modules(chances_query_modules)
chances_query_chain.add_link("input", "determine_schools_prompt", src_key="input", dest_key="input")
chances_query_chain.add_link("determine_schools_prompt", "determine_schools")
chances_query_chain.add_link("determine_schools", "determine_factors_tmpl")
chances_query_chain.add_link("determine_schools", "determine_test_scores_tmpl")
chances_query_chain.add_link("determine_schools", "determine_gpa_tmpl")
chances_query_chain.add_link("determine_factors_tmpl", "determine_factors")
chances_query_chain.add_link("determine_test_scores_tmpl", "determine_test_scores")
chances_query_chain.add_link("determine_gpa_tmpl", "determine_gpa")
chances_query_chain.add_link("input", "determine_chances_tmpl", src_key="input", dest_key="input")
chances_query_chain.add_link("input", "determine_chances_tmpl", src_key="chat_history", dest_key="chat_history")
chances_query_chain.add_link("determine_factors", "determine_chances_tmpl", dest_key="factors")
chances_query_chain.add_link("determine_test_scores", "determine_chances_tmpl", dest_key="gpas")
chances_query_chain.add_link("determine_gpa", "determine_chances_tmpl", dest_key="test_scores")
chances_query_chain.add_link("determine_chances_tmpl", "determine_chances")

await chances_query_chain.arun(input="what are my chances of getting in to penn state", chat_history=["I have a 3.75 gpa and a 1490 on my sat"])

[1;3;38;2;155;135;227m> Running modules and inputs in parallel: 
Module key: input. Input: 
input: what are my chances of getting in to penn state
chat_history: ['I have a 3.75 gpa and a 1490 on my sat']


[0m[1;3;38;2;155;135;227m> Running modules and inputs in parallel: 
Module key: determine_schools_prompt. Input: 
input: what are my chances of getting in to penn state


[0m[1;3;38;2;155;135;227m> Running modules and inputs in parallel: 
Module key: determine_schools. Input: 
messages: Based on this input, determine which schools the user would like to know their chances of admission at:


what are my chances of getting in to penn state


[0m[1;3;38;2;155;135;227m> Running modules and inputs in parallel: 
Module key: determine_factors_tmpl. Input: 
schools: assistant: Based on your input, you are interested in knowing your chances of admission at Penn State University. If you have specific details about your academic profile, extracurricular activities, ...

Module key: deter

ChatResponse(message=ChatMessage(role=<MessageRole.ASSISTANT: 'assistant'>, content="Based on the information provided and the admissions criteria at Penn State University, you appear to have a good chance of admission. Here's a breakdown of how your credentials match up with Penn State's admissions profile:\n\n1. **GPA**: Your GPA of 3.75 is competitive and should position you well, particularly since Penn State values academic credentials highly. While specific information about the average GPA of admitted students isn't provided, a 3.75 GPA generally indicates strong academic performance.\n\n2. **SAT Scores**: Your SAT score of 1490 is significantly above the 75th percentile range for Penn State, where the top 25% of admitted students score below 1390. This places you well within the competitive range for both the Evidence-Based Reading and Writing and Math sections.\n\n3. **Class Rank**: Although you did not mention your class rank, your high GPA and SAT score suggest that you are 

In [88]:
from typing import Dict, Any, Optional, Tuple, List, cast, Set, Optional

from llama_index.core.agent.react.types import (
    ActionReasoningStep,
    ObservationReasoningStep,
    ResponseReasoningStep,
)
from llama_index.core.agent.react.output_parser import ReActOutputParser
from llama_index.core.agent import Task, AgentChatResponse, ReActChatFormatter, QueryPipelineAgentWorker
from llama_index.core.agent.types import Task
from llama_index.core.query_pipeline import (
    AgentInputComponent,
    AgentFnComponent,
    CustomAgentComponent,
    QueryComponent,
    ToolRunnerComponent,
    InputComponent,
    Link,
)
from llama_index.core.llms import MessageRole, ChatResponse, ChatMessage
from llama_index.core.tools import BaseTool, FunctionTool
from llama_index.core.callbacks import CallbackManager


## Agent Input Component
## This is the component that produces agent inputs to the rest of the components
## Can also put initialization logic here.
def agent_input_fn(task: Task, state: Dict[str, Any]) -> Dict[str, Any]:
    """Agent input function.

    Returns:
        A Dictionary of output keys and values. If you are specifying
        src_key when defining links between this component and other
        components, make sure the src_key matches the specified output_key.

    """
    # initialize current_reasoning
    if "current_reasoning" not in state:
        state["current_reasoning"] = []
    reasoning_step = ObservationReasoningStep(observation=task.input)
    state["current_reasoning"].append(reasoning_step)
    return {"input": task.input}

agent_input_component = AgentInputComponent(fn=agent_input_fn)

## define prompt function
def react_prompt_fn(
    task: Task, state: Dict[str, Any], input: str, tools: List[BaseTool]
) -> List[ChatMessage]:
    # Add input to reasoning
    # TODO: may need to redo prompts
    print("state:", state)
    print("memory:", task.memory.get())
    chat_formatter = ReActChatFormatter.from_defaults(system_header="Answer questions like you are hulk hogan")
    return chat_formatter.format(
        tools,
        chat_history=task.memory.get() + state["memory"].get_all(),
        current_reasoning=state["current_reasoning"],
    )

react_prompt_component = AgentFnComponent(
    fn=react_prompt_fn, partial_dict={"tools": [tool]}
)


def parse_react_output_fn(
    task: Task, state: Dict[str, Any], chat_response: ChatResponse
):
    """Parse ReAct output into a reasoning step."""
    output_parser = ReActOutputParser()
    reasoning_step = output_parser.parse(chat_response.message.content)
    return {"done": reasoning_step.is_done, "reasoning_step": reasoning_step}


parse_react_output = AgentFnComponent(fn=parse_react_output_fn)

def determine_chances(query_str: str, chat_history: List[str]):
    response = chances_query_chain.run(
        input=query_str,
        chat_history=chat_history,
    )
    return response

tool = FunctionTool.from_defaults(
    determine_chances,
    # async_fn=aget_weather,  # optional!
)


def run_tool_fn(
    task: Task, state: Dict[str, Any], reasoning_step: ActionReasoningStep
):
    """Run tool and process tool output."""
    # TODO: take tool retriever from args
    tool_runner_component = ToolRunnerComponent(
        [tool], callback_manager=task.callback_manager
    )
    tool_output = tool_runner_component.run_component(
        tool_name=reasoning_step.action,
        tool_input=reasoning_step.action_input,
    )
    observation_step = ObservationReasoningStep(observation=str(tool_output))
    state["current_reasoning"].append(observation_step)

    return {"response_str": observation_step.get_content(), "is_done": False}


run_tool = AgentFnComponent(fn=run_tool_fn)


def process_response_fn(
    task: Task, state: Dict[str, Any], response_step: ResponseReasoningStep
):
    """Process response."""
    state["current_reasoning"].append(response_step)
    response_str = response_step.response
    # Now that we're done with this step, put into memory
    state["memory"].put(ChatMessage(content=task.input, role=MessageRole.USER))
    state["memory"].put(
        ChatMessage(content=response_str, role=MessageRole.ASSISTANT)
    )

    return {"response_str": response_str, "is_done": True}


process_response = AgentFnComponent(fn=process_response_fn)


def process_agent_response_fn(
    task: Task, state: Dict[str, Any], response_dict: dict
):
    """Process agent response."""
    return (
        AgentChatResponse(response_dict["response_str"]),
        response_dict["is_done"],
    )


process_agent_response = AgentFnComponent(fn=process_agent_response_fn)

chat_agent_query_pipeline = QueryPipeline(verbose=True)

chat_agent_query_pipeline.add_modules(
    {
        "agent_input": agent_input_component,
        "react_prompt": react_prompt_component,
        "llm": llm,
        "react_output_parser": parse_react_output,
        "run_tool": run_tool,
        "process_response": process_response,
        "process_agent_response": process_agent_response,
    }
)

# link input to react prompt to parsed out response (either tool action/input or observation)
chat_agent_query_pipeline.add_chain(["agent_input", "react_prompt", "llm", "react_output_parser"])

# add conditional link from react output to tool call (if not done)
chat_agent_query_pipeline.add_link(
    "react_output_parser",
    "run_tool",
    condition_fn=lambda x: not x["done"],
    input_fn=lambda x: x["reasoning_step"],
)
# add conditional link from react output to final response processing (if done)
chat_agent_query_pipeline.add_link(
    "react_output_parser",
    "process_response",
    condition_fn=lambda x: x["done"],
    input_fn=lambda x: x["reasoning_step"],
)

# whether response processing or tool output processing, add link to final agent response
chat_agent_query_pipeline.add_link("process_response", "process_agent_response")
chat_agent_query_pipeline.add_link("run_tool", "process_agent_response")

agent_worker = QueryPipelineAgentWorker(chat_agent_query_pipeline)
agent = agent_worker.as_agent(
    callback_manager=CallbackManager([]),
    chat_history=[ChatMessage.from_str("I have a 3.7 gpa and a 1490 on my sat")],
    # chat_history=[],
    verbose=True,
)

# start task
await agent.achat(
    "What are my chances at uw madison?"
)

> Running step df4aaadb-23df-4845-b64f-64f7130411c5. Step input: What are my chances at uw madison?
[1;3;38;2;155;135;227m> Running modules and inputs in parallel: 
Module key: agent_input. Input: 
state: {'sources': [], 'memory': ChatMemoryBuffer(chat_store=SimpleChatStore(store={}), chat_store_key='chat_history', token_limit=3000, tokenizer_fn=functools.partial(<bound method Encoding.encode of <Encod...
task: task_id='be901196-d01c-4dc4-aaa3-a10e0a575df6' input='What are my chances at uw madison?' memory=ChatMemoryBuffer(chat_store=SimpleChatStore(store={'chat_history': [ChatMessage(role=<MessageRole.USER:...


[0m[1;3;38;2;155;135;227m> Running modules and inputs in parallel: 
Module key: react_prompt. Input: 
input: What are my chances at uw madison?


[0mstate: {'sources': [], 'memory': ChatMemoryBuffer(chat_store=SimpleChatStore(store={}), chat_store_key='chat_history', token_limit=3000, tokenizer_fn=functools.partial(<bound method Encoding.encode of <Encoding 'cl100k_base'>>

AgentChatResponse(response="Well let me tell you something, brother! With a 3.7 GPA and a 1490 on your SAT, you're already running wild on the academics, just like Hulkamania runs wild on the wrestling world! The University of Wisconsin-Madison is a great school, and they're looking for top-notch students just like you, dude.\n\nYour GPA and SAT scores are definitely strong, and they put you in a good position for consideration. But remember, brother, UW-Madison also looks at your extracurricular activities, your personal statement, and letters of recommendation. They want to see the whole package, not just the academic stats.\n\nSo, make sure you highlight all your strengths, brother! Show them what makes you unique, just like Hulk Hogan shows his charisma in the ring. Give it everything you've got, say your prayers, eat your vitamins, and I believe you've got a good shot at getting into UW-Madison. Keep pushing yourself, train hard, and stay positive, brother!", sources=[], source_no