## 1. Import the required libraries

In [20]:
%pip install cohere weave --q --disable-pip-version-check

Note: you may need to restart the kernel to use updated packages.


## 2. Utils

In [21]:
import weave # < --- notice

In [22]:
def display_tool_calls(tool_calls):
    """Print a human-readable summary of the tool calls proposed by the model.

    Args:
        tool_calls: Sequence of objects exposing ``.name`` (str) and
            ``.parameters`` (dict). ``None`` or an empty sequence is accepted.

    Returns:
        None. Everything is written to stdout.
    """
    # Nothing to show: say so instead of announcing "a single tool call"
    if not tool_calls:
        print("The model suggests no tool calls.")
        return

    is_parallel = len(tool_calls) > 1

    # Determine the message based on the number of tool calls
    if is_parallel:
        print("The model suggests making Parallel Queries:")
    else:
        print("The model suggests making a single tool call:")

    for call_number, tool_call in enumerate(tool_calls, start=1):
        # If there's more than one tool call, separate each with a header
        if is_parallel:
            print(f"== Parallel Tool Call #{call_number}")

        if tool_call.name == 'python_interpreter':
            # Print the tool name, then the code indented by two spaces
            print(f"{tool_call.name} with this code:")
            code = tool_call.parameters.get('code', '')
            print("\n".join(f"  {line}" for line in code.splitlines()))
        else:
            # For non-python_interpreter tool calls, just print the parameters
            print(f"{tool_call.name}")
            print(f"{tool_call.parameters}")

## 3. Define Tools for Agentic RAG: a web search engine and a spreadsheet

### 3.1. A web search engine, accessible through an API

In [23]:
%pip install tavily-python --q --disable-pip-version-check
from tavily import TavilyClient
tavily_client = TavilyClient(api_key="...")

Note: you may need to restart the kernel to use updated packages.


In [24]:
# here's a web search engine
@weave.op() # < ----- notice
def web_search(query: str) -> list[dict]:
  """Search the web through the notebook's Tavily client.

  Args:
    query: Text to search for.

  Returns:
    The value stored under the 'results' key of the Tavily search response
    (the search is requested with max_results=3).
  """
  search_payload = tavily_client.search(query, max_results=3)
  return search_payload['results']

In [25]:
# Tool schema handed to the LLM so it knows the web search engine exists and how to call it
web_search_tool = dict(
  name="web_search",
  description="Returns a list of relevant document snippets for a textual query retrieved from the internet",
  parameter_definitions=dict(
    # the only argument: the search text
    query=dict(
      description="Query to search the internet with",
      type="str",
      required=True,
    ),
  ),
)

### 3.2. A spreadsheet, accessible through a Python interpreter

In [26]:
# here's a python console, which can be used to access the spreadsheet, but also more generally to code and plot stuff
import io, contextlib


@weave.op() # < --- notice
def python_interpreter(code: str) -> list[dict]:
    """Execute `code` with `exec` and report what it printed.

    The code runs in this module's global namespace (`globals()`), so names it
    defines persist between calls.

    Args:
        code: Python source to execute.

    Returns:
        A one-element list holding a dict with "executed_code" and
        "console_output" (everything written to stdout). When the code raises
        an `Exception`, the dict also has "error", formatted as
        "<ExceptionType>: <message>"; output printed before the failure is kept.
    """
    output = io.StringIO()
    try:
        # Redirect stdout to capture print statements
        with contextlib.redirect_stdout(output):
            # SECURITY: this executes model-generated code directly in the
            # kernel process with full access to the notebook's globals.
            exec(code, globals())
    except Exception as e:
        return [{
            "executed_code": code,
            # keep whatever was printed before the failure
            "console_output": output.getvalue(),
            # include the exception type: str(KeyError('x')) alone is just "'x'"
            "error": f"{type(e).__name__}: {e}",
        }]
    # Get the output value
    return [{
        "console_output": output.getvalue(),
        "executed_code": code,
    }]

In [27]:
# Tool schema handed to the LLM so it knows the python console exists and how to call it
python_interpreter_tool = dict(
  name="python_interpreter",
  description="Executes python code and returns the result. The code runs in a static sandbox without internet access and without interactive mode, so print output or save output to a file.",
  parameter_definitions=dict(
    # the only argument: the source code to run
    code=dict(
      description="Python code to execute",
      type="str",
      required=True,
    ),
  ),
)

In [28]:
# Registry used to dispatch a tool call (by the tool's name) to the Python function implementing it
functions_map = dict(
    web_search=web_search,
    python_interpreter=python_interpreter,
)

## 4. Try Agentic RAG with a Cohere model

In [29]:
# < --- notice: the Weave Cohere integration is patched in before any client is created
import cohere
from weave.integrations.cohere import cohere_patcher

# NOTE(review): presumably this instruments Cohere client calls so Weave can trace them —
# confirm against the Weave integration docs. The recorded output of this cell was `True`.
cohere_patcher.attempt_patch()

True

In [30]:
# Install python-dotenv, which provides `load_dotenv` for reading the API key from a .env file
%pip install python-dotenv

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.1.2 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip




### 4.1 Create the Cohere model and integrate Weave

In [31]:
import os
from dotenv import load_dotenv
import cohere

# Integrate cohere with weave
# (this import and patch call repeat an earlier cell, so this cell also works on its own)
from weave.integrations.cohere import cohere_patcher

cohere_patcher.attempt_patch()


# Load the .env file
load_dotenv()

# Read the Cohere API key from the COHERE_API_KEY environment variable;
# os.getenv returns None when the variable is not set
api_key = os.getenv('COHERE_API_KEY')

# Debug aid, left disabled: printing the key would leak the secret into the saved notebook output
#print(f"Cohere API Key: {api_key}") 


# Cohere client used by the co.chat calls in the cells below
co = cohere.Client(api_key=api_key)


# Model name passed to co.chat in the cells below
model = "command-r-plus-08-2024"

### 4.2 Let's look at a complex user query

In [32]:
# Preamble sent with every chat call: tells the model which data sources exist and which to prefer
preamble="""You have access to the internet.
You also have access to a dataset with information about Spotify songs from the past 10 years, located at ./spotify_dataset.csv.
Remember to inspect the dataset to understand its structure before querying it.
Use the dataset when you can. Otherwise use the internet.
"""

In [33]:
# User question for the walkthrough; it is passed as `message` to the first co.chat call
message = "What's the age and citizenship of the artists who had the top 3 most streamed songs on Spotify in 2023"

### 4.3. Get the model plan

In [34]:
# Start Weave for the "cohere-weave-project" project; the returned client is kept because
# later cells use `weave_client.call(...)` to look up recorded calls by id
weave_client = weave.init("cohere-weave-project") # < --- notice

In [35]:
# First call: only the question and the tool schemas are sent, no tool results yet
response = co.chat(
    model=model,
    preamble=preamble,
    message=message,
    tools=[web_search_tool, python_interpreter_tool],
    temperature=0,
    prompt_truncation="OFF"
)

# Show the text part of the reply (in the recorded run this was the model's plan)
print(response.text)

🍩 https://wandb.ai/dastech1998-ozyegin-university/cohere-weave-project/r/call/0191a3d6-6848-79e0-9d29-353b20817ed7
I will first check the dataset to see if it contains information about the top 3 most streamed songs on Spotify in 2023. If it does, I will then find the age and citizenship of the artists of those songs.


### 4.4. RAG system iterations toward the final solution

In [36]:
# Safety cap: a model that keeps requesting tools must not loop (and call the API) forever
MAX_STEPS = 10
step = 0

# Keep executing the requested tools and feeding the results back until the model
# stops asking for tools or the step budget is used up
while response.tool_calls and step < MAX_STEPS:
    print(f"\n\n\033[1;34m == STEP {step+1} \033[0m\n")  # Blue bold for step header

    print(f"\033[1;32mThe plan is:\033[0m \033[1m{response.text}\033[0m\n")  # Green bold for "The plan is" and bold for the plan text

    # Tool calls suggested by the model
    print(f"\033[1;33mTool calls suggested by the model:\033[0m")  # Yellow bold for tool calls header
    display_tool_calls(response.tool_calls)

    # Execute the tool calls
    tool_results = []
    for tool_call in response.tool_calls:
        tool_fn = functions_map.get(tool_call.name)
        if tool_fn is None:
            # The model asked for a tool that is not registered: report that back
            # to the model instead of crashing the loop with a KeyError
            outputs = [{"error": f"Unknown tool: {tool_call.name}"}]
        else:
            outputs = tool_fn(**tool_call.parameters)
        tool_result = {"call": tool_call, "outputs": outputs}
        tool_results.append(tool_result)
        print(f"\033[1;35mtool_results:\033[0m \033[1m{tool_result['outputs']}\033[0m")  # Magenta bold for "tool_results" and bold for outputs

    # call chat again with tool results
    response = co.chat(
        model=model,
        preamble=preamble,
        message="",
        chat_history=response.chat_history,
        tools=[web_search_tool,python_interpreter_tool],
        tool_results=tool_results,
        temperature=0,
    )

    step+=1

# The loop ended because of the step budget, not because the model finished
if response.tool_calls:
    print(f"Stopped after {MAX_STEPS} steps while the model was still requesting tools.")

# print final answer
print(f"\n\n \033[1;32mThe final answer is:\033[0m \033[1m{response.text}\033[0m\n")  # Green bold for "The final answer is" and bold for the answer text



[1;34m == STEP 1 [0m

[1;32mThe plan is:[0m [1mI will first check the dataset to see if it contains information about the top 3 most streamed songs on Spotify in 2023. If it does, I will then find the age and citizenship of the artists of those songs.[0m

[1;33mTool calls suggested by the model:[0m
The model suggests making a single tool call:
python_interpreter with this code:
  import pandas as pd
  
  df = pd.read_csv("spotify_dataset.csv")
  
  # Check if the dataset contains information about the top 3 most streamed songs on Spotify in 2023
  if "2023" in df["year"]: 
      print(f"The dataset contains information about the top 3 most streamed songs on Spotify in 2023")
  else:
      print(f"The dataset does not contain information about the top 3 most streamed songs on Spotify in 2023")
🍩 https://wandb.ai/dastech1998-ozyegin-university/cohere-weave-project/r/call/0191a3d6-894b-7570-9a6b-49b0c0d428e5
[1;35mtool_results:[0m [1m[{'executed_code': 'import pandas as pd\n\

InvalidAPIKeyError: The provided API key is invalid.

## 5. Better Agentic workflow with weave.Model

In [None]:
class SimpleSpotifyAgent(weave.Model):
    """Agent that answers a query by alternating Cohere chat calls with tool execution.

    Relies on notebook-level names: the `co` client, the `functions_map`
    registry and the `display_tool_calls` helper.
    """
    # Preamble sent with every chat call
    preamble: str
    # Tool schemas offered to the model
    tools: list
    # Sampling temperature forwarded to co.chat
    temperature: float = 0.0
    # Cohere model name
    model: str = "command-r-plus-08-2024"
    # Maximum number of tool-execution rounds before giving up
    max_steps: int = 10
    # When True, print the plan, tool calls and tool results of every step
    debug: bool = False

    @weave.op()
    def run_spotify_agent(self, query: str) -> str:
        """Run the plan / call tools / re-prompt loop for one query.

        Args:
            query: The user question.

        Returns:
            The text of the final model response. If the model is still
            requesting tools after `max_steps` rounds, the text of its last
            response prefixed with "[MAX STEP REACHED]: ".
        """
        # Use LLM for planning
        response = co.chat(
            model=self.model,
            preamble=self.preamble,
            message=query,
            tools=self.tools,
            temperature=self.temperature,
            prompt_truncation="OFF"
        )

        step = 0
        while response.tool_calls:
            # Check the budget only while tools are still being requested, so an
            # answer produced on the last allowed round is returned as a normal
            # answer rather than labelled as a max-step failure.
            if step >= self.max_steps:
                print(f"Could not find the answer in {self.max_steps} steps.")
                print(f"The final response before ending execution: {response.text}")
                return f"[MAX STEP REACHED]: {response.text}"

            if self.debug:
                print(f"\n\n\033[1;34m == STEP {step+1} \033[0m\n")
                print(f"\033[1;32mThe plan is:\033[0m \033[1m{response.text}\033[0m\n")
                print(f"\033[1;33mTool calls suggested by the model:\033[0m")
                display_tool_calls(response.tool_calls)

            tool_results = []
            for tool_call in response.tool_calls:
                tool_fn = functions_map.get(tool_call.name)
                if tool_fn is None:
                    # Unregistered tool name: report it to the model instead of raising KeyError
                    outputs = [{"error": f"Unknown tool: {tool_call.name}"}]
                else:
                    outputs = tool_fn(**tool_call.parameters)
                tool_result = {"call": tool_call, "outputs": outputs}
                tool_results.append(tool_result)
                if self.debug:
                    print(f"\033[1;35mtool_results:\033[0m \033[1m{tool_result['outputs']}\033[0m")

            # Feed the tool results back and let the model continue
            response = co.chat(
                model=self.model,
                preamble=self.preamble,
                message="",
                chat_history=response.chat_history,
                tools=self.tools,
                tool_results=tool_results,
                temperature=self.temperature,
            )
            step += 1

        if self.debug:
            print(f"\n\n \033[1;32mThe final answer is:\033[0m \033[1m{response.text}\033[0m\n")

        return response.text

    @weave.op()
    def infer(self, query: str) -> str:
        """Entry point used by the notebook and by the evaluations; delegates to `run_spotify_agent`."""
        return self.run_spotify_agent(query)

In [None]:
# Preamble sent with every chat call: tells the model which data sources exist and which to prefer
preamble="""You have access to the internet.
You also have access to a dataset with information about Spotify songs from the past 10 years, located at ./spotify_dataset.csv.
Remember to inspect the dataset to understand its structure before querying it.
Use the dataset when you can. Otherwise use the internet.
"""

# Agent instance with step-by-step printing enabled
spotify_agent = SimpleSpotifyAgent(
    preamble=preamble,
    tools=[web_search_tool, python_interpreter_tool],
    debug=True,
)

In [None]:
# Run the agent on the multi-hop question and keep the returned answer text in `output`
output = spotify_agent.infer(
    "What's the age and citizenship of the artists who had the top 3 most streamed songs on Spotify in 2023?"
)

In [None]:
# Second example query; the returned answer is shown as the cell output
spotify_agent.infer(
    "What's the most danceable song on Spotify in 2023?"
)

## 6. Evaluation

**We can evaluate:**

- How accurately does the LLM come up with the function name and function arguments?
- Is function calling happening correctly?
- Is the final answer correct based on the content retrieved from the web?
- Is the final answer based on tool X's result?

In [None]:
# prompt: open spotify_dataset and print head

import pandas as pd
# Load the same CSV the agent's preamble points to, then preview the first rows
df = pd.read_csv('./spotify_dataset.csv')
df.head()

In [None]:
## Initial e2e evaluation set based off the dataset
# Each row holds the `query` given to the agent and the expected `ground_truth` answer
dataset_based_eval = [
    {"query": "Which song is the most danceable on Spotify in 2023?", "ground_truth": "Gol Bolinha, Gol Quadrado 2"},
    {"query": "Which song has the most number of artists collaborating?", "ground_truth": "Los del Espacio"},
    {"query": "How many song did Taylor Swift release in the last 5 years?", "ground_truth": "28"},
]

# Wrap the rows in a Weave dataset named "response_eval" and publish it
e2e_eval_set = weave.Dataset(name="response_eval", rows=dataset_based_eval)
weave.publish(e2e_eval_set)

In [None]:
# wrote quick one liners to get the answer; an example below
# (track name of the row with the highest danceability among rows with released_year == 2023)
df.loc[df.loc[df.released_year==2023].danceability.idxmax()].track_name

In [None]:
import json
import asyncio

# NOTE(review): presumably applied so that `asyncio.run(...)` can be called from inside the
# notebook's already-running event loop — confirm against the nest_asyncio docs
import nest_asyncio
nest_asyncio.apply()

## Correctness scorer
# Preamble for the LLM judge. The JSON example has no trailing comma, so a reply
# that copies its shape can be parsed by json.loads.
llm_correctness_judge = """You are responsible to evaluate the response of a system against some ground truth.
Return your judgement in a valid JSON format:

```json
{
  "judgement": "CORRECT" | "INCORRECT",
  "reasoning": "Reasoning for your judgement"
}
```
"""

# Message template; filled with str.format(ground_truth=..., generated_text=...)
llm_correctness_judge_message = """The provided ground truth and generated response are:

GROUND TRUTH:
{ground_truth}

GENERATED RESPONSE:
{generated_text}
"""


@weave.op()
def correctness_evaluator(ground_truth: str, model_output):
    """Ask the LLM judge whether `model_output` agrees with `ground_truth`.

    Args:
        ground_truth: Expected answer.
        model_output: Answer produced by the system under evaluation.

    Returns:
        A dict with "score" (True only when the judge's "judgement" is
        "CORRECT") and "reasoning" (the judge's reasoning, or the raw reply
        when it could not be parsed).
    """
    response = co.chat(
        model="command-r-plus-08-2024",
        preamble=llm_correctness_judge,
        message=llm_correctness_judge_message.format(ground_truth=ground_truth, generated_text=model_output),
        temperature=0,
        prompt_truncation="OFF"
    )

    # The judge prompt shows its JSON inside a ```json fence, so the reply may be
    # fenced or surrounded by prose: parse only the outermost {...} span.
    raw_reply = response.text
    start, end = raw_reply.find("{"), raw_reply.rfind("}")
    candidate = raw_reply[start:end + 1] if start != -1 and end > start else raw_reply

    # ideally run with retries or use structured output parsing
    try:
        verdict = json.loads(candidate)
        return {
            "score": verdict["judgement"] == "CORRECT",
            "reasoning": verdict["reasoning"]
        }
    except (ValueError, KeyError, TypeError):
        # Not JSON, or JSON without the expected keys: count it as incorrect
        # and keep the raw reply for inspection
        return {
            "score": False,
            "reasoning": response.text
        }


# End-to-end evaluation: every row of `e2e_eval_set` is scored by `correctness_evaluator`
evaluation = weave.Evaluation(
    name="Response Evaluation",
    dataset=e2e_eval_set,
    scorers=[correctness_evaluator],
)

In [None]:
# Run the evaluation against a fresh agent; debug is off so the per-step prints are suppressed
asyncio.run(
    evaluation.evaluate(
        SimpleSpotifyAgent(
          preamble=preamble,
          tools=[web_search_tool,python_interpreter_tool],
          debug=False,
        )
    )
)

## 7. Build offline trajectories

In [None]:
# @title GROUND_TRUTH_TRAJECTORY
# @markdown The ground truth trajectory can be created manually or by including the trajectories of the best agentic runs.

# Each row pairs an `id` with the expected `trajectory`: a list of
# (depth, op name, call id) tuples, the same layout `traverse_children` produces.
# The trajectory scorers compare only depth and op name, not the call ids.
GROUND_TRUTH_TRAJECTORY = [
    {
        "id": 0,
        "trajectory": [
            (0, 'SimpleSpotifyAgent.infer', '01918f12-6dc5-74f0-9f41-38eca92ca325'),
            (1, 'SimpleSpotifyAgent.run_spotify_agent', '01918f12-6dc5-74f0-9f41-38f4177a0c9a'),
            (2, 'cohere.Client.chat', '01918f12-6dc8-7693-9acc-c949ab590947'),
            (2, 'python_interpreter', '01918f12-7d98-73a0-ae7e-ce6761b838db'),
            (2, 'cohere.Client.chat', '01918f12-7db3-7d31-a25e-da2f9fb17f3a'),
            (2, 'python_interpreter', '01918f12-999a-7b20-b5cd-17af9b85c70e'),
            (2, 'cohere.Client.chat', '01918f12-99a5-7ec2-a684-632f2cfc2ab9')
        ]
    },
    {
        "id": 1,
        "trajectory": [
            (0, 'SimpleSpotifyAgent.infer', '01918f12-6dca-78e2-96c7-d8f843776ec1'),
            (1, 'SimpleSpotifyAgent.run_spotify_agent', '01918f12-6dcb-7231-ba86-f870c5dfe992'),
            (2, 'cohere.Client.chat', '01918f12-6dcd-7f61-9be2-b94e051862a8'),
            (2, 'python_interpreter', '01918f12-7e4a-7dc2-9943-47a0ffbb9d4a'),
            (2, 'cohere.Client.chat', '01918f12-7e54-7930-9695-ecf98cfce0e0'),
            (2, 'python_interpreter', '01918f12-9968-7371-bfa1-0b8e55888cc6'),
            (2, 'cohere.Client.chat', '01918f12-9973-7da0-a50e-4fcfec42e3aa')
        ]
    },
    {
        "id": 2,
        "trajectory": [
            (0, 'SimpleSpotifyAgent.infer', '01918f12-6dd1-7e12-ad34-042fec1bba62'),
            (1, 'SimpleSpotifyAgent.run_spotify_agent', '01918f12-6dd2-7e63-9bba-5218ec198526'),
            (2, 'cohere.Client.chat', '01918f12-6dd4-74f3-8ba2-bdb3c0b7e1ac'),
            (2, 'python_interpreter', '01918f12-7e01-76b2-b3b8-e6cdd1b6ad1a'),
            (2, 'cohere.Client.chat', '01918f12-7e17-7542-9697-82301d29d4c9'),
            (2, 'python_interpreter', '01918f12-95d3-7173-b518-f747ef2bae21'),
            (2, 'cohere.Client.chat', '01918f12-95e2-7792-a783-3a0642673af4')
        ]
    },
]

In [None]:
# Look up the recorded evaluation call by its id
_eval_calls = weave_client.call("01919262-59bd-7e22-b7eb-8817d3ca1d02")


def _op_short_name(traced_call):
    """Return the op name without the leading path and without the ':...' suffix."""
    return traced_call.op_name.split("/")[-1].split(":")[0]


# Map call id -> call for every `SimpleSpotifyAgent.infer` call that sits directly
# under an `Evaluation.predict_and_score` child of the evaluation call
eval_call_trace_ids = {
    infer_call.id: infer_call
    for scored_call in _eval_calls.children()
    if _op_short_name(scored_call) == "Evaluation.predict_and_score"
    for infer_call in scored_call.children()
    if _op_short_name(infer_call) == "SimpleSpotifyAgent.infer"
}

eval_call_trace_ids.keys()

In [None]:
# Helper that flattens a call tree into a trajectory, parent before children
def traverse_children(call, depth=0):
    """
    Walk `call` and all of its descendants depth-first and return a flat list of
    (depth, short op name, call.id) tuples. The short op name is the part of
    `op_name` after the last "/" and before the first ":". `depth` is the level
    assigned to `call` itself; each level of children adds one.
    """
    steps = []

    def _visit(node, level):
        # Record the node first, then its children in the order children() yields them
        short_name = node.op_name.rsplit("/", 1)[-1].partition(":")[0]
        steps.append((level, short_name, node.id))
        for descendant in node.children():
            _visit(descendant, level + 1)

    _visit(call, depth)
    return steps

# Hard-coded id of one `SimpleSpotifyAgent.infer` call; it must be a key of `eval_call_trace_ids`
trace_id = "01919262-5ab5-7521-94e9-6efa0ae9c794"
root_call = eval_call_trace_ids[trace_id]

# Flatten that single run and show it as the cell output
trajectory = traverse_children(root_call)
trajectory

In [None]:
# One flattened trajectory per collected `infer` call, in the dict's insertion order
all_trajectories = [
    traverse_children(infer_root)
    for infer_root in eval_call_trace_ids.values()
]

In [None]:
# Display the collected trajectories as the cell output
all_trajectories

In [None]:
from difflib import SequenceMatcher


@weave.op()
def exact_match(model_output: list[tuple], trajectory: list[tuple]) -> float:
    """Fraction of ground-truth steps matched position-by-position.

    A step matches when both its depth (index 0) and op name (index 1) are
    equal; call ids are ignored. Steps are paired with `zip`, so predicted
    steps beyond the length of `trajectory` are not counted.

    Args:
        model_output: Predicted trajectory; may be empty or None.
        trajectory: Ground-truth trajectory.

    Returns:
        Matched steps divided by `len(trajectory)`. When `trajectory` is empty,
        1.0 if the prediction is empty too, otherwise 0.0.
    """
    if not trajectory:
        # Avoid dividing by zero: an empty target is matched only by an empty prediction
        return 1.0 if not model_output else 0.0
    if not model_output:
        # Nothing was predicted (empty or None), so nothing can match
        return 0.0
    correct = sum(1 for p, g in zip(model_output, trajectory) if p[0] == g[0] and p[1] == g[1])
    return correct / len(trajectory)


@weave.op()
def levenshtein_distance(model_output: list[tuple], trajectory: list[tuple]) -> float:
    """Similarity between the predicted and ground-truth trajectories.

    Despite the name, the value returned is `difflib.SequenceMatcher.ratio()`
    computed over the (depth, op name) pairs of both sequences; call ids are
    ignored.
    """
    def _depth_and_name(steps):
        # keep only the first two fields of every step
        return [(step[0], step[1]) for step in steps]

    predicted_pairs = _depth_and_name(model_output)
    expected_pairs = _depth_and_name(trajectory)
    return SequenceMatcher(None, predicted_pairs, expected_pairs).ratio()


@weave.op()
def get_predicted_trajectory(id: int) -> list[tuple]:
    """Return the recorded trajectory stored at position `id` of `all_trajectories`."""
    predicted = all_trajectories[id]
    return predicted


# Compare recorded trajectories against GROUND_TRUTH_TRAJECTORY with both trajectory scorers
trajectory_evaluation = weave.Evaluation(
    name="Trajectory Evaluation",
    dataset=GROUND_TRUTH_TRAJECTORY,
    scorers=[exact_match, levenshtein_distance],
)

# The "model" here is the lookup function: it returns an already-recorded trajectory for a row id
asyncio.run(
    trajectory_evaluation.evaluate(get_predicted_trajectory)
)

## 8. Evaluate Factfulness

In [None]:
## factfulness scorer
# Preamble for the LLM judge. The JSON example has no trailing comma, so a reply
# that copies its shape can be parsed by json.loads.
llm_factful_judge = """You are an expert checker of facts. Given the context you can find if the generated text is based out of the context.
If the generated text is coming from the provided context return "CORRECT", otherwise return "INCORRECT".

Return your judgement in a valid JSON format:

{
  "judgement": "CORRECT" | "INCORRECT",
  "reasoning": "Reasoning for your judgement"
}
"""

# Message template; filled with str.format(context=..., generated_text=...)
llm_factful_judge_message = """The provided context and generated text are:

CONTEXT:
{context}

GENERATED TEXT:
{generated_text}
"""

@weave.op()
def factfulness_evaluator(model_output: dict):
    """Ask the LLM judge whether the generated text is supported by its context.

    Args:
        model_output: Dict with "context" (list of strings, joined with " \\n")
            and "generated_text".

    Returns:
        A dict with "score" (True only when the judge's "judgement" is
        "CORRECT") and "reasoning" (the judge's reasoning, or the raw reply
        when it could not be parsed).
    """
    response = co.chat(
        model="command-r-plus-08-2024",
        preamble=llm_factful_judge,
        message=llm_factful_judge_message.format(
            context=" \n".join(model_output["context"]), generated_text=model_output["generated_text"]
        ),
        temperature=0,
        prompt_truncation="OFF"
    )

    # The reply may wrap its JSON in a code fence or in prose: parse only the
    # outermost {...} span.
    raw_reply = response.text
    start, end = raw_reply.find("{"), raw_reply.rfind("}")
    candidate = raw_reply[start:end + 1] if start != -1 and end > start else raw_reply

    # ideally run with retries or use structured output parsing
    try:
        verdict = json.loads(candidate)
        return {
            "score": verdict["judgement"] == "CORRECT",
            "reasoning": verdict["reasoning"]
        }
    except (ValueError, KeyError, TypeError):
        # Not JSON, or JSON without the expected keys: count it as incorrect
        # and keep the raw reply for inspection
        return {
            "score": False,
            "reasoning": response.text
        }


@weave.op()
def get_context_and_answer(id: int) -> dict:
    """Collect the last tool output and the final answer of a recorded run.

    Args:
        id: Position of the run in `all_trajectories`.

    Returns:
        A dict with "context" (list of strings taken from the second-to-last
        step when it is a `python_interpreter` or `web_search` call, otherwise
        an empty list) and "generated_text" (the `.text` of the last step's
        output).
    """
    # we get the context from the 2nd last component of the trajectory
    trajectory = all_trajectories[id]
    tool_step, answer_step = trajectory[-2], trajectory[-1]
    tool_name, tool_call_id = tool_step[1], tool_step[-1]

    # Default to no context so the name is always bound, even when the
    # second-to-last step is not one of the two tools
    context = []
    if tool_name == "python_interpreter":
        interpreter_result = weave_client.call(tool_call_id).output[0]
        # The interpreter's error result may carry no "console_output";
        # fall back to its "error" text
        context = [interpreter_result.get("console_output") or interpreter_result.get("error") or ""]
    elif tool_name == "web_search":
        context = [web_result["content"] for web_result in weave_client.call(tool_call_id).output]

    generated_text = weave.ref(weave_client.call(answer_step[-1]).output).get().text

    return {
        "context": context,
        "generated_text": generated_text
    }

In [None]:
# Score each recorded run with the factfulness judge.
# NOTE(review): presumably only the `id` field of each GROUND_TRUTH_TRAJECTORY row is used here,
# since `get_context_and_answer` takes `id` and the scorer takes only `model_output` — confirm
factfulness_evaluation = weave.Evaluation(
    name="Factfulness Evaluation",
    dataset=GROUND_TRUTH_TRAJECTORY,
    scorers=[factfulness_evaluator],
)

# The "model" here is the lookup function that returns the context and answer of a recorded run
asyncio.run(
    factfulness_evaluation.evaluate(get_context_and_answer)
)