# Real-time Eval for every LLM response with Cleanlab TLM

In [2]:
#@title Install Cleanlab
# Install (or upgrade to) the newest cleanlab-studio client package.
# `%pip` (rather than `!pip`) installs into the environment of the running kernel.
%pip install --upgrade cleanlab-studio

In [8]:
import os
from getpass import getpass

from cleanlab_studio import Studio

# Get your API key from https://tlm.cleanlab.ai
# The key is read from the CLEANLAB_API_KEY environment variable when it is set;
# otherwise you are prompted for it (input hidden). This keeps the secret out of
# the saved notebook instead of pasting it into the source as a string literal.
api_key = os.environ.get("CLEANLAB_API_KEY") or getpass("Cleanlab API key: ")
studio = Studio(api_key)

# "log": ["explanation"] requests the explanation that is read from out['log'] below;
# "model" selects the underlying LLM.
tlm = studio.TLM(options={"log": ["explanation"], "model": "gpt-4o-mini"}) # supports GPT, Claude, etc

# Use TLM like GPT (with more accurate results). Returns response, trustworthiness score, explanation
out = tlm.prompt("What's the third month of the year alphabetically?")
print("TLM model used: ", tlm.get_model_name(), "\n")
print("TLM response: ", out['response'], "\n")
print("TLM trustworthiness score: ", out['trustworthiness_score'], "\n")
print("TLM Explanation: ", out['log']['explanation'], "\n")

TLM model used:  gpt-4o-mini 

TLM response:  The third month of the year alphabetically is "March." The months in alphabetical order are:

1. April
2. August
3. December
4. February
5. January
6. July
7. June
8. March
9. May
10. November
11. October
12. September 

TLM trustworthiness score:  0.4979648802626605 

TLM Explanation:  The answer provided states that "March" is the third month of the year alphabetically. However, when listing the months in alphabetical order, "March" is actually the eighth month. Therefore,  incorrect. 
This response is untrustworthy due to lack of consistency in possible responses from the model. Here's one inconsistent alternate response that the model considered (which may not be accurate either): 
December. 



## You can also use TLM to score the trustworthiness of any response to a given prompt.
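Use `tlm.get_trustworthiness_score`, which returns a dictionary whose `trustworthiness_score` entry is a numerical value between 0 and 1 (and, when requested via the `log` option, a `log` entry containing the explanation).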
 - Enables you to use TLM with responses from your own custom LLM or LLM in production.


In [9]:
# Score an existing (prompt, response) pair.
# TLM returns a high score when the LLM/RAG/Agent answer is accurate.
question = "What's the first month of the year?"
result = tlm.get_trustworthiness_score(question, response="January")

trust_score = result["trustworthiness_score"]
explanation = result["log"]["explanation"]
print("Trustworthiness Score: ", trust_score)
print("Explanation: ", explanation)

Trustworthiness Score:  0.9997738711822411
Explanation:  Did not find a reason to doubt trustworthiness.


In [10]:
# Score an existing (prompt, response) pair.
# TLM returns a low score when the LLM/RAG/Agent answer is untrustworthy.
question = "What's the first month of the year?"
result = tlm.get_trustworthiness_score(question, response="February")

trust_score = result["trustworthiness_score"]
explanation = result["log"]["explanation"]
print("Trustworthiness Score: ", trust_score)
print("Explanation: ", explanation)

Trustworthiness Score:  0.04739682241488771
Explanation:  The first month of the year is January, not February. Therefore,  factually incorrect. 
This response is untrustworthy due to lack of consistency in possible responses from the model. Here's one inconsistent alternate response that the model considered (which may not be accurate either): 
January.




# How to interpret the TLM trustworthiness score:
 - `1.0 >= trustworthiness > 0.9` -- highly reliable response you can fully trust
 - `0.9 >= trustworthiness > 0.7` -- sometimes a bad retrieval (if RAG), hallucination, or wrong response
 - `0.7 >= trustworthiness > 0.3` -- likely a bad retrieval (if RAG), hallucination, or wrong response
 - `0.3 >= trustworthiness >= 0.0` -- near-always a bad retrieval (if RAG), hallucination, or wrong response

In [None]:
# Run your own real-time evaluation of LLM outputs with TLM here:
# replace the placeholder text with any prompt; the result of the call is
# displayed as this cell's output.
user_prompt = "ENTER_YOUR_OWN_PROMPT_HERE"
tlm.prompt(user_prompt)