In [217]:
# !pip uninstall httpx
!pip install --upgrade openai
!pip install httpx==0.27.2




In [218]:
def json_print(data):
    """Pretty print JSON data"""
    print(json.dumps(data, indent=2))

In [219]:
from google.colab import userdata

import openai
from openai import OpenAI

# Now your API key is available as an environment variable
openai_api_key = userdata.get('OPENAI_API_KEY')
mistral_api_key = userdata.get('MISTRAL_API_KEY')

# if openai_api_key is None:
    # raise ValueError("OpenAI API key not found in the .env file.")

print("API key loaded successfully!")

client = openai.OpenAI(api_key=openai_api_key)

API key loaded successfully!


# Langchain


In [220]:
!pip install langchain openai mistralai requests
# You pacakages installations
!pip install langchain_openai google-search-results langchain
!pip install -U langchain-community
!pip install langchain-core
!pip install langchain_mistralai




In [221]:
import json
import requests
from langchain.chat_models import ChatOpenAI
from langchain.schema import HumanMessage
from langchain.prompts import PromptTemplate
from langchain.llms import OpenAI
from langchain_mistralai import ChatMistralAI

# Load JSON data from URLs
import urllib.request

In [222]:
def load_json_data():
    """Load JSON data from remote URLs."""
    spec_url = "https://github.com/IBM/NESTFUL/raw/main/data/executable/executable-spec.json"
    data_url = "https://github.com/IBM/NESTFUL/raw/main/data/executable/executable-data.json"

    with urllib.request.urlopen(spec_url) as spec_resp:
        exec_spec = json.load(spec_resp)

    with urllib.request.urlopen(data_url) as data_resp:
        exec_data = json.load(data_resp)

    # Extract the first index as the test case
    exec_spec_0 = exec_spec[0]
    exec_data_0 = exec_data[0]

    print("Loaded JSON data and extracted the first index.")
    return exec_spec, exec_data, exec_spec_0, exec_data_0

In [223]:
exec_spec, exec_data, exec_spec_0, exec_data_0 = load_json_data()

Loaded JSON data and extracted the first index.


In [224]:
# exec_spec_0

In [225]:

def make_api_call(api_key, query_string, api_url, host):
  # https://www.weatherapi.com/docs/#apis-s

  headers = {
    "x-rapidapi-key": api_key,
    "x-rapidapi-host": host
  }

  response = requests.get(api_url, headers=headers, params=query_string)
  return response.json(), query_string

In [226]:
from langchain_core.prompts import ChatPromptTemplate
def make_llm_api_call(api, query, llm_type, schema, additional_info):
  llm = None
  if llm_type == "mistral":
    llm = ChatMistralAI(
        model="mistral-large-latest",
        mistral_api_key=mistral_api_key,
        temperature=0,
        max_retries=2,
      )
  elif llm_type == "openai":
    llm = ChatOpenAI(model="gpt-3.5-turbo", openai_api_key=openai_api_key)

  schema_info = ""
  if schema:
    required_keys = [key for key, value in schema.items() if value == "required"]
    optional_keys = [key for key, value in schema.items() if value == "optional"]

    schema_info = f"""These are a list of the required parameters {required_keys} and optional parameters {optional_keys}"""

  schema_prompt = f"""
      Given the following api {api} and the query {query} to use to determine
      what parameters need to be set in the API call. {schema_info}.

      Return the parameters you would need to make this API call in a response as a dictionary of API query parameters.
      Only return this dictionary as your result and nothing else as I want to easily extract this object from your response.
      {additional_info}
      """
  prompt = ChatPromptTemplate.from_messages(
      [
      (
          "system",
          "You are a helpful assistant that creates an API call user request.",
      ),
      ("human", schema_prompt),
      ]
  )

  chain = prompt | llm
  result = chain.invoke(
      {
          "api": api,
          "query":query,
          "api_key":api_key,
          # "schema": escaped_params,
          "additional_info" :additional_info
      }
  )

  if result.content:
    print("DEBUG:", result.content)
    generated_params = result.content
    if '`' in generated_params:
      cleaned_response = result.content.strip('`').replace('json\n', '')
      print("DEBUG:", cleaned_response)
      generated_params = json.loads(cleaned_response)
  else:
    print("Error getting response!")
    return {"error": "Failed to generate API call"}

  return generated_params

In [275]:
# Evaluate responses
def evaluate_responses(ground_truth_params, generated_response_params):
    """Evaluate the responses using OpenAI GPT-3.5."""
    evaluator = ChatOpenAI(model="gpt-3.5-turbo", openai_api_key=openai_api_key)

    # Construct evaluation prompt
    eval_prompt = PromptTemplate(
        input_variables=["ground_truth_params", "generated_response_params"],
        template="""
        Compare the following two API query parameters:

        Ground Truth Response:
        {ground_truth_params}

        Generated Response:
        {generated_response_params}

        Instructions:
        - Check if the two responses are identical.
        - If identical, return a score of 100%.
        - If not, calculate the accuracy of the generated response as follows:
        -- accuracy = 100 - [100 * [0.7 * (# of parameters that exist in ground_truth_params but not generated_response_params / total # of parameters in ground_truth_params) +
                     0.3 * (# of parameters that exist in generated_response_params and not ground_truth_params / total # of parameters in ground_truth_params)]]
        - Return a summary score and justification.
        """
    )

    chain = eval_prompt | evaluator

    result = chain.invoke({
        "ground_truth_params": ground_truth_params,
        "generated_response_params": generated_response_params,
    })

    return result.content

In [285]:
def calculate_accuracy(ground_truth_params, generated_response_params):

  # Replace single quotes with double quotes
  if type(generated_response_params) == str:
      cleaned_response = generated_response_params.replace("'", '"')
      cleaned_generated_response_params = json.loads(cleaned_response)
  else:
      cleaned_generated_response_params = generated_response_params
  # Convert dictionaries into sets of keys
  ground_truth_keys = set(ground_truth_params.keys())
  generated_response_keys = set(cleaned_generated_response_params.keys())

  # Calculate missing parameters (in ground_truth but not in generated_response)
  missing_params = ground_truth_keys - generated_response_keys

  # Calculate extra parameters (in generated_response but not in ground_truth)
  extra_params = generated_response_keys - ground_truth_keys

  # Total parameters in ground_truth
  total_params = len(ground_truth_keys)

  # Avoid division by zero
  if total_params == 0:
      return 100.0

  # Calculate the accuracy using the formula
  accuracy = 100 - (100 *((0.7 * (len(missing_params) / total_params)) +
                    (0.3 * (len(extra_params) / total_params))))

  return accuracy

## Example 1

In [229]:
api_key = "808c3bd978mshd588ab1b321f903p1f78aejsnfe670b2df825"
query_string = {"q":"London","days":"1","aqi":"yes"}
api_url = "https://weatherapi-com.p.rapidapi.com/forecast.json"
host = "weatherapi-com.p.rapidapi.com"

all_weather_params = {
    "q" : "required",
    "days" : "required",
    "hour" : "optional",
    "aqi" : "optional",
    "tp" : "optional",
    "lang" : "optional",
}


In [230]:
ground_truth_res, ground_truth_params = make_api_call(api_key, query_string, api_url, host)


In [231]:
json_print(ground_truth_res)

{
  "location": {
    "name": "London",
    "region": "City of London, Greater London",
    "country": "United Kingdom",
    "lat": 51.5171,
    "lon": -0.1062,
    "tz_id": "Europe/London",
    "localtime_epoch": 1734077583,
    "localtime": "2024-12-13 08:13"
  },
  "current": {
    "last_updated_epoch": 1734076800,
    "last_updated": "2024-12-13 08:00",
    "temp_c": 6.0,
    "temp_f": 42.8,
    "is_day": 1,
    "condition": {
      "text": "Light rain",
      "icon": "//cdn.weatherapi.com/weather/64x64/day/296.png",
      "code": 1183
    },
    "wind_mph": 3.4,
    "wind_kph": 5.4,
    "wind_degree": 108,
    "wind_dir": "ESE",
    "pressure_mb": 1026.0,
    "pressure_in": 30.3,
    "precip_mm": 0.03,
    "precip_in": 0.0,
    "humidity": 100,
    "cloud": 50,
    "feelslike_c": 5.1,
    "feelslike_f": 41.1,
    "windchill_c": 5.5,
    "windchill_f": 41.9,
    "heatindex_c": 6.4,
    "heatindex_f": 43.5,
    "dewpoint_c": 3.2,
    "dewpoint_f": 37.8,
    "vis_km": 4.5,
    "vis_m

In [232]:
api_endpoint = "https://weatherapi-com.p.rapidapi.com/forecast.json"
api_query = "What's the weather in London today? I don't know if I can go out this afternoon with my allergies, they get bad with the air quality"
api_key = "808c3bd978mshd588ab1b321f903p1f78aejsnfe670b2df825"
mistral_generated_response = make_llm_api_call(api_endpoint, api_query, "mistral", {}, "")

DEBUG: ```json
{
  "q": "London",
  "days": "1",
  "aqi": "yes"
}
```
DEBUG: {
  "q": "London",
  "days": "1",
  "aqi": "yes"
}



In [233]:
mistral_generated_response

{'q': 'London', 'days': '1', 'aqi': 'yes'}

In [234]:
chatgpt_generated_response = make_llm_api_call(api_endpoint, api_query, "openai", {}, "")

DEBUG: {
    "query": "London",
    "day": "today"
}


In [235]:
chatgpt_generated_response

'{\n    "query": "London",\n    "day": "today"\n}'

In [246]:
evaluation_result = evaluate_responses(ground_truth_params, mistral_generated_response)
print("Evaluation Result:", evaluation_result)

Evaluation Result: The two responses are identical, so the accuracy score is 100%. 
Justification: Both responses have the same query parameters ('q', 'days', 'aqi') with the same values, so the accuracy score is maximum.


In [278]:
evaluation_result = evaluate_responses(ground_truth_params, chatgpt_generated_response)
print("Evaluation Result:", evaluation_result)

Evaluation Result: The ground truth response has 3 parameters: 'q', 'days', and 'aqi'.
The generated response has 2 parameters: 'query' and 'day'.

Calculating the accuracy:
- Parameters in ground truth but not in generated response: 1 ('days')
- Parameters in generated response but not in ground truth: 2 ('query', 'day')

Accuracy = 100 - [100 * [0.7 * (1/3) + 0.3 * (2/3)]]
Accuracy = 100 - [100 * [0.7 * 0.33 + 0.3 * 0.67]]
Accuracy = 100 - [100 * (0.231 + 0.201)]
Accuracy = 100 - [100 * 0.432]
Accuracy = 100 - 43.2
Accuracy = 56.8%

Therefore, the accuracy of the generated response is 56.8%. The generated response is not identical to the ground truth response.


In [286]:
chatgpt_weather_actual_acc = calculate_accuracy(ground_truth_params, chatgpt_generated_response)
mistral_weather_actual_acc = calculate_accuracy(ground_truth_params, mistral_generated_response)

print("ChatGPT Flights Actual Accuracy:", chatgpt_weather_actual_acc)
print("Mistral Flights Actual Accuracy:", mistral_weather_actual_acc)

ChatGPT Flights Actual Accuracy: 10.000000000000014
Mistral Flights Actual Accuracy: 100.0


## Example 2

In [254]:
api_key = "808c3bd978mshd588ab1b321f903p1f78aejsnfe670b2df825"
query_string = {"fromId":"JFK.AIRPORT","toId":"STO.AIRPORT","departDate":"2024-12-13","adults":"2", "cabinClass": "ECONOMY"}
api_url = "https://booking-com15.p.rapidapi.com/api/v1/flights/searchFlights"
host = "booking-com15.p.rapidapi.com"
all_params = {
    "fromId" : "required",
    "toId" : "required",
    "departDate" : "required",
    "returnDate" : "optional",
    "pageNo" : "optional",
    "adults" : "optional",
    "children" : "optional",
    "sort" : "optional",
    "cabinClass" : "optional",
    "currency_code" : "optional",
}


In [255]:
flights_resp, flights_true_params = make_api_call(api_key, query_string, api_url, host)
json_print(flights_resp)

{
  "status": true,
  "message": "Success",
  "timestamp": 1734079225021,
  "data": {
    "aggregation": {
      "totalCount": 107,
      "filteredTotalCount": 107,
      "stops": [
        {
          "numberOfStops": 1,
          "count": 53,
          "minPrice": {
            "currencyCode": "USD",
            "units": 1547,
            "nanos": 290000000
          },
          "minPriceRound": {
            "currencyCode": "USD",
            "units": 1548,
            "nanos": 0
          }
        },
        {
          "numberOfStops": 2,
          "count": 107,
          "minPrice": {
            "currencyCode": "USD",
            "units": 752,
            "nanos": 380000000
          },
          "minPriceRound": {
            "currencyCode": "USD",
            "units": 753,
            "nanos": 0
          }
        }
      ],
      "airlines": [
        {
          "name": "Icelandair",
          "logoUrl": "https://r-xx.bstatic.com/data/airlines_logo/FI.png",
          "iat

In [256]:
flights_true_params

{'fromId': 'JFK.AIRPORT',
 'toId': 'STO.AIRPORT',
 'departDate': '2024-12-13',
 'adults': '2',
 'cabinClass': 'ECONOMY'}

In [257]:
api_query = "I need an affordable flight for today's date from NYC airport to Stockholm for my husband and I"
additional_info = "Note that the airport code for fromId and toId must be in the format <Airport code in caps>.AIRPORT"
mistral_flights_generated_response = make_llm_api_call(api_url, api_query, "mistral", all_params, additional_info)

DEBUG: ```json
{
  "fromId": "NYC.AIRPORT",
  "toId": "STO.AIRPORT",
  "departDate": "2024-08-07",
  "adults": "2",
  "currency_code": "USD"
}
```
DEBUG: {
  "fromId": "NYC.AIRPORT",
  "toId": "STO.AIRPORT",
  "departDate": "2024-08-07",
  "adults": "2",
  "currency_code": "USD"
}



In [258]:
mistral_flights_generated_response

{'fromId': 'NYC.AIRPORT',
 'toId': 'STO.AIRPORT',
 'departDate': '2024-08-07',
 'adults': '2',
 'currency_code': 'USD'}

In [259]:
chatgpt_flights_generated_response = make_llm_api_call(api_url, api_query, "openai", all_params, additional_info)

DEBUG: {
    'fromId': 'NYC.AIRPORT',
    'toId': 'STO.AIRPORT',
    'departDate': 'today'
}


In [260]:
chatgpt_flights_generated_response


"{\n    'fromId': 'NYC.AIRPORT',\n    'toId': 'STO.AIRPORT',\n    'departDate': 'today'\n}"

In [276]:
evaluation_result = evaluate_responses(flights_true_params, mistral_flights_generated_response)
print("Evaluation Result:", evaluation_result)

Evaluation Result: Summary:
The two API query parameters are not identical. The generated response is missing the 'fromId' parameter and has an additional 'currency_code' parameter that is not present in the ground truth response. 

Calculation:
Total parameters in ground truth response: 5
Parameters missing in generated response: 1 (fromId)
Parameters additional in generated response: 1 (currency_code)

Accuracy = 100 - [100 * [0.7 * (1/5) + 0.3 * (1/5)]]
Accuracy = 100 - [100 * (0.7*0.2 + 0.3*0.2)]
Accuracy = 100 - [100 * (0.14 + 0.06)]
Accuracy = 100 - [100 * 0.2]
Accuracy = 100 - 20
Accuracy = 80%

Justification:
The accuracy of the generated response is 80% based on the calculation above. The missing 'fromId' parameter and additional 'currency_code' parameter contribute to the deviation from the ground truth response. Further adjustments may be needed to improve the accuracy of the generated response.


In [277]:
evaluation_result = evaluate_responses(flights_true_params, chatgpt_flights_generated_response)
print("Evaluation Result:", evaluation_result)

Evaluation Result: The ground truth response and generated response have the following parameters:

Ground Truth Response:
- fromId: JFK.AIRPORT
- toId: STO.AIRPORT
- departDate: 2024-12-13
- adults: 2
- cabinClass: ECONOMY

Generated Response:
- fromId: NYC.AIRPORT
- toId: STO.AIRPORT
- departDate: today

Based on the comparison, we can see that the generated response is missing the 'adults' and 'cabinClass' parameters from the ground truth response. It also replaced the specific departure date with 'today'.

Calculating the accuracy using the provided formula:
- Parameters in ground truth but not in generated response: 2 (adults, cabinClass)
- Parameters in generated response but not in ground truth: 0
- Total number of parameters in ground truth: 5

Accuracy = 100 - [100 * [0.7 * (2/5) + 0.3 * (0/5)]]
Accuracy = 100 - [100 * (0.4 + 0)]
Accuracy = 100 - 40
Accuracy = 60%

Therefore, the accuracy of the generated response is 60%. The summary score is 60% and the justification is that 

In [287]:
chat_gpt_flights_actual_acc = calculate_accuracy(flights_true_params, chatgpt_flights_generated_response)
mistral_flights_actual_acc = calculate_accuracy(flights_true_params, mistral_flights_generated_response)

print("ChatGPT Flights Actual Accuracy:", chat_gpt_flights_actual_acc)
print("Mistral Flights Actual Accuracy:", mistral_flights_actual_acc)

ChatGPT Flights Actual Accuracy: 72.0
Mistral Flights Actual Accuracy: 80.0
