# Real-time Eval for every LLM response with Cleanlab TLM

In [2]:
#@title Install Cleanlab
# Install (or upgrade) the cleanlab-studio package, which provides the
# `cleanlab_studio` module imported in the next cell.
%pip install --upgrade cleanlab-studio

In [3]:
from cleanlab_studio import Studio

# Create the Cleanlab Studio client. Get your API key from https://tlm.cleanlab.ai
studio = Studio("<YOUR_API_KEY>")

# TLM settings: log an explanation with every result and use gpt-4o-mini as the
# underlying model (supports GPT, Claude, etc).
tlm_options = {"log": ["explanation"], "model": "gpt-4o-mini"}
tlm = studio.TLM(options=tlm_options)

# Prompt TLM the same way you would prompt an LLM. The result is indexed below for
# the response text, its trustworthiness score, and the logged explanation.
question = "Who wrote Harry Potter? Respond with only their name."
out = tlm.prompt(question)

# The trailing "\n" argument leaves a blank line after each printed field.
print("TLM model used: ", tlm.get_model_name(), "\n")
print("TLM response: ", out["response"], "\n")
print("TLM trustworthiness score: ", out["trustworthiness_score"], "\n")
print("TLM Explanation: ", out["log"]["explanation"], "\n")

TLM model used:  gpt-4o-mini 

TLM response:  J.K. Rowling 

TLM trustworthiness score:  0.9973282399885858 

TLM Explanation:  Did not find a reason to doubt trustworthiness. 



## You can also use TLM to score the trustworthiness of any response to a given prompt.
Use `tlm.get_trustworthiness_score`, which returns a result whose `trustworthiness_score` is a numerical value between 0 and 1 (along with the logged explanation).
 - Enables you to use TLM with responses from your own custom LLM or LLM in production.


In [9]:
# Score a response that answers the prompt correctly; TLM is expected to rate it
# highly (the recorded output below shows a score of about 0.99).
question = "Who wrote Harry Potter? Respond with only their name."
response = tlm.get_trustworthiness_score(
    question,
    response="J.K. Rowling",
)
print("Trustworthiness Score: ", response["trustworthiness_score"])
print("Explanation: ", response["log"]["explanation"])

Trustworthiness Score:  0.9936905078946259
Explanation:  Did not find a reason to doubt trustworthiness.


In [7]:
# Score a response that answers the prompt incorrectly; TLM is expected to rate it
# low (the recorded output below shows a score of about 0.001).
question = "Who wrote Harry Potter? Respond with only their name."
response = tlm.get_trustworthiness_score(
    question,
    response="Albus Dumbledore",
)
print("Trustworthiness Score: ", response["trustworthiness_score"])
print("Explanation: ", response["log"]["explanation"])

Trustworthiness Score:  0.0012339607050982507
Explanation:  The proposed answer "Albus Dumbledore" is incorrect because the actual author of the Harry Potter series is J.K. Rowling. Albus Dumbledore is a fictional character within the series, not the author. Therefore, the answer does not accurately respond to the question about who wrote Harry Potter. 
This response is untrustworthy due to lack of consistency in possible responses from the model. Here's one inconsistent alternate response that the model considered (which may not be accurate either): 
J.K. Rowling.




# How to interpret the TLM trustworthiness score:
 - `1.0 >= trustworthiness > 0.9` -- highly reliable response you can fully trust
 - `0.9 >= trustworthiness > 0.7` -- sometimes a bad retrieval (if RAG), hallucination, or wrong response
 - `0.7 >= trustworthiness > 0.3` -- likely a bad retrieval (if RAG), hallucination, or wrong response
 - `0.3 >= trustworthiness >= 0.0` -- near-always a bad retrieval (if RAG), hallucination, or wrong response

In [None]:
# Try TLM on your own prompt: replace the placeholder text, then run the cell.
# The call is left as the last expression so the notebook displays its result.
user_prompt = "ENTER_YOUR_OWN_PROMPT_HERE"
tlm.prompt(user_prompt)