# Obtaining TLM Responses for SimpleQA Dataset

In [None]:
# Expose the Cleanlab TLM API key to this process through an environment variable.
# Replace the "<API key>" placeholder with your own key from https://tlm.cleanlab.ai/
import os

os.environ.update({"CLEANLAB_TLM_API_KEY": "<API key>"})

In [1]:
import pandas as pd
from cleanlab_tlm import TLM

In [2]:
# Read the SimpleQA test-set CSV straight from its public URL and preview the first rows.
url = "https://openaipublic.blob.core.windows.net/simple-evals/simple_qa_test_set.csv"
data = pd.read_csv(filepath_or_buffer=url)
data.head(n=5)

Unnamed: 0,metadata,problem,answer
0,"{'topic': 'Science and technology', 'answer_ty...",Who received the IEEE Frank Rosenblatt Award i...,Michio Sugeno
1,"{'topic': 'Science and technology', 'answer_ty...",Who was awarded the Oceanography Society's Jer...,Annick Bricaud
2,"{'topic': 'Geography', 'answer_type': 'Place',...",What's the name of the women's liberal arts co...,Radcliffe College
3,"{'topic': 'Sports', 'answer_type': 'Person', '...",In whose honor was the Leipzig 1877 tournament...,Adolf Anderssen
4,"{'topic': 'Art', 'answer_type': 'Person', 'url...","According to Karl Küchler, what did Empress El...",Poet Henrich Heine.


## Prompt TLM
Using TLM with the `medium` quality preset returns the same responses as querying OpenAI directly, and additionally returns a trustworthiness score for each response.

In [4]:
# Build a TLM client on the "medium" preset backed by gpt-4o, then send every
# question in the "problem" column in a single batched call.
tlm = TLM(quality_preset="medium", options={"model": "gpt-4o"})
tlm_results = tlm.try_prompt(data["problem"].to_list())

Querying TLM... 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████|


In [5]:
# Place the TLM output columns next to the original dataset columns
# (aligned on the row index) and preview the combined table.
results_df = pd.concat([data, pd.DataFrame(tlm_results)], axis="columns")
results_df.head(n=5)

Unnamed: 0,metadata,problem,answer,response,trustworthiness_score
0,"{'topic': 'Science and technology', 'answer_ty...",Who received the IEEE Frank Rosenblatt Award i...,Michio Sugeno,The IEEE Frank Rosenblatt Award was establishe...,0.59795
1,"{'topic': 'Science and technology', 'answer_ty...",Who was awarded the Oceanography Society's Jer...,Annick Bricaud,"In 2018, the Oceanography Society's Jerlov Awa...",0.27572
2,"{'topic': 'Geography', 'answer_type': 'Place',...",What's the name of the women's liberal arts co...,Radcliffe College,"The women's liberal arts college in Cambridge,...",0.597658
3,"{'topic': 'Sports', 'answer_type': 'Person', '...",In whose honor was the Leipzig 1877 tournament...,Adolf Anderssen,The Leipzig 1877 chess tournament was organize...,0.35243
4,"{'topic': 'Art', 'answer_type': 'Person', 'url...","According to Karl Küchler, what did Empress El...",Poet Henrich Heine.,Empress Elizabeth of Austria's favorite sculpt...,0.970854


In [6]:
# Persist the unfiltered baseline responses. pandas does not create missing
# parent directories, so make sure "results/" exists before writing; otherwise
# to_csv fails with an OSError on a fresh checkout.
import os

os.makedirs("results", exist_ok=True)
# index=False is the documented way to omit the row index from the CSV.
results_df.to_csv("results/gpt-4o-baseline-responses.csv", index=False)

## Filter results using TLM Trustworthiness Score

In [7]:
def filter_responses(results_df: pd.DataFrame, threshold: float) -> pd.DataFrame:
    """Replace low-trust responses with a fixed "I don't know" fallback.

    Args:
        results_df: DataFrame with a "response" column and a numeric
            "trustworthiness_score" column. It is not modified.
        threshold: Rows whose score is strictly below this value have their
            response replaced by the fallback sentence.

    Returns:
        A copy of ``results_df`` in which the flagged responses are overwritten;
        all other rows and columns are unchanged.
    """
    filtered_results = results_df.copy()
    # Select rows with a boolean mask instead of round-tripping through index
    # labels: a label lookup would also overwrite every other row that shares a
    # label with a low-score row whenever the index contains duplicates.
    # A missing (NaN) score compares as False, so rows without a score keep
    # their existing response.
    low_trust = filtered_results["trustworthiness_score"] < threshold
    # NOTE: the fallback text is kept byte-for-byte (including the typographic
    # apostrophe in "don’t") in case downstream steps match on the exact string.
    filtered_results.loc[low_trust, "response"] = "I'm sorry, I don’t know the answer to that question."

    return filtered_results

In [8]:
# Lenient cut-off: only responses scoring below 0.25 are replaced by the fallback answer.
filter_25 = filter_responses(results_df, threshold=0.25)

# Strict cut-off: every response scoring below 0.8 is replaced by the fallback answer.
filter_80 = filter_responses(results_df, threshold=0.8)

# Write each filtered variant to its own CSV, without the row index.
filter_25.to_csv("results/gpt-4o-baseline-25-responses.csv", index=False)
filter_80.to_csv("results/gpt-4o-baseline-80-responses.csv", index=False)

## Repeat the same process for `best` quality preset
Using TLM with the `best` quality preset improves the LLM responses.

In [9]:
# Query the same questions again, this time through a TLM client on the "best" preset.
tlm_best = TLM(quality_preset="best", options={"model": "gpt-4o"})
tlm_best_results = tlm_best.try_prompt(data["problem"].to_list())

# Attach the TLM output columns to the dataset rows and save the unfiltered table.
results_best_df = pd.concat([data, pd.DataFrame(tlm_best_results)], axis="columns")
results_best_df.to_csv("results/gpt-4o-best-responses.csv", index=False)

Querying TLM... 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████|


In [10]:
# Lenient cut-off: only responses scoring below 0.25 are replaced by the fallback answer.
filter_best_25 = filter_responses(results_best_df, threshold=0.25)

# Strict cut-off: every response scoring below 0.8 is replaced by the fallback answer.
filter_best_80 = filter_responses(results_best_df, threshold=0.8)

# Write each filtered variant to its own CSV, without the row index.
filter_best_25.to_csv("results/gpt-4o-best-25-responses.csv", index=False)
filter_best_80.to_csv("results/gpt-4o-best-80-responses.csv", index=False)