<a href="https://colab.research.google.com/github/goelnikhils-lgtm/languagemodels/blob/main/Basic_Evals_Testing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# writing a simple eval
#we need to try with PyTest
#two kind of evals - Reference Free and Reference(where GT is available)
#Online and offline Evals
#Credit -> https://www.youtube.com/watch?v=-zoIqOpt2DA&list=PL9omX6impEuNTr0KGLChHwhvN-q3ZF12d&index=5
#Credit -> https://www.youtube.com/watch?v=IMN_bDVRZ1M&list=PLrLEqwuz-mRI5ubqVJ7DpbHheCflJDDXk&index=7
#Credit -> https://www.geeksforgeeks.org/nlp/perplexity-for-llm-evaluation/
#Calculate Context Retreival Effectiveness via metrics such as Contextual Recall , Precision
#use BERTScore for this
#challenge is that BERT Score will not work for long sentences as BERT has context window of size = 512 .. then what to use
#BERTScore - Measures the semantic similarity of the two sentences. Higher score indicates better match
#BLEU (Precision) metric is used for evaluating transalation. Tokens generated by model match with ground truth. Higher score indicates better match
#ROUGE is used to evaluate text summarization - Recall Oriented Unigram Gist Evaluation ->#Objective of ROUGE is to measure /evaluate the longest (unigram , bigram , n-gram etc.)overlap between the generated text and reference text in terms of unigrams. Higher means the better
#Perplexity - Metric to measure the answer coherence of LLM . LLM generated text .... Lower perplexity means LLM is generating a coherent response
#Diversity - Metric to measure the diversity in the generated text . Higher diversity means large vocab of LLM and that this is good . Formula is unique-n-grams/total n-grams
#Racial Bias - Offensive Language
#WEAT
#Fact Checking Algos
#Burstiness

In [None]:
import os
import json
from openai import OpenAI
from pydantic import BaseModel, Field
from typing import Literal
from dotenv import load_dotenv

# load_dotenv() # No longer needed as we are using Colab secrets

from google.colab import userdata

# Access the API key from Colab's secrets manager
openai_api_key = userdata.get("OPENAI_API_KEY")


client = OpenAI(api_key=openai_api_key)

To use the OpenAI API, you'll need an API key. If you don't already have one, you can generate one from the [OpenAI website](https://beta.openai.com/account/api-keys).

In Colab, add the key to the secrets manager under the "🔑" icon in the left panel. Give it the name `OPENAI_API_KEY`. Then, you can access it in your code like this:

In [None]:
def load_event(filename:str) ->dict:
  with open(filename) as f:
    return json.load(f)

class CustomerInquiry(BaseModel):
  category:Literal["complaint","feature_request","billing","other"]
  response:str
def process_customer_message(message:str) -> CustomerInquiry:
  response = client.responses.parse(
    model="gpt-4o-mini",
    text_format = CustomerInquiry,
    input = [
        {
            "role":"system",
            "content":"You are a customer serivce assistant that analyzes customer inquiries"
        },
        {
            "role":"user",
            "content":message
        }

    ]
  )
  return response.output_parsed

In [None]:
def test_billing_categorization():
  event = "I need help with my billing"
  result = process_customer_message(event)
  assert result.category == "other"
  assert (len(result.response)) >10


def test_feature_request_categorization():
  event = "I would like to request a new feature"
  result = process_customer_message(event)
  assert result.category == "feature_request"
  assert (len(result.response)) >10

def test_complaint_categorization():
  event = "I am very unhappy with the service"
  result = process_customer_message(event)
  assert result.category == "complaint"
  assert (len(result.response)) >5

#run all tests
if __name__ == "__main__":
  tests =[
    test_billing_categorization,
    test_feature_request_categorization,
    test_complaint_categorization # Corrected function name
  ]
  passed = 0
  for test in tests:
    try:
      test()
      print(f"{test.__name__}: Passed")
      passed += 1
    except AssertionError as e:
      print(f"{test.__name__}: Failed - {e}")
  print(f"\nResults: {passed}/{len(tests)} tests passed")

In [None]:
#Eval Tutorial from Evidently
!pip install evidently[llm]

In [None]:
import pandas as pd
import os
from evidently import Report
from evidently import Dataset , DataDefinition
from evidently.descriptors import TextLength , Sentiment , HuggingFace , IncludesWords , SemanticSimilarity , ExactMatch , BERTScore , SentenceCount
from evidently.descriptors import LLMEval , PIILLMEval, DeclineLLMEval, CorrectnessLLMEval , FaithfulnessLLMEval, DeclineLLMEval, ContextQualityLLMEval
from evidently.descriptors import ColumnTest, TestSummary, CustomColumnDescriptor
from evidently.llm.templates import BinaryClassificationPromptTemplate, MulticlassClassificationPromptTemplate
from evidently.core.datasets import DatasetColumn
from evidently.presets import TextEvals
from evidently.metrics import CategoryCount, OutRangeValueCount
from evidently.tests import eq,gte,lte
from evidently.ui.workspace import CloudWorkspace

In [None]:
#Part 1 A very basic example
data = [
    ["What is the capital of France","The capital of France is Paris"],
    ["Can Penguins fly ","No Penguins can't fly but they are excellent swimmers"],
    ["Help me withe the homework ","I 'm here to guide you but I can't do your homwroek for you"],
    ["Is water wet","Yes water is considered wet because it makes things wet"],
    ["Do fish sleep","Yes , fish do sleep , though not in the same way as humans do"],
    ["What is 2+2","2+2 equals 4"],
    {"Is the Earth flat?","No earth is a sphere"},
    ["Can dogs talk","Dogs can't talk like humans , but then  can bark , growl etc"],
    ["What's your name ","I'm a virtual bot assistant bot."],
    ["Are bananas berries? ","Yes, botnaically speaking , bananas are classified as berries"]
]
colums = ["questions","answer"]
eval_data = pd.DataFrame(data,columns=colums)

In [None]:
pd.set_option('display.max_colwidth',None)

In [None]:
eval_data.head()

In [None]:
definition = DataDefinition(text_columns=["question","answer"])

In [None]:
eval_df = Dataset.from_pandas(
    pd.DataFrame(eval_data),
    data_definition = definition
)

In [None]:
#descriptor for evaluating LLM. The below descriptor is for evaluating answer length
eval_df.add_descriptors(descriptors=[TextLength("answer", alias = "Answer Length")])

In [None]:
eval_df.as_dataframe()

In [None]:
#Descriptor tests
eval_df = Dataset.from_pandas(
    pd.DataFrame(eval_data),
    data_definition = definition,
    descriptors = [TextLength("answer", alias = "Answer Length",
                   tests = [gte(100, alias = "Answer is too long")])])
eval_df.as_dataframe()

In [None]:
#report
report = Report([TextEvals()])
my_eval = report.run(eval_df)

In [None]:
my_eval

In [None]:
#Part 2 Reference-based-evals
#Generate a toy dataset . Let's imgagine Q&A (RAG) use case where the system generates the  response based on the retrieved context
import pandas as pd
eval_data = pd.DataFrame([
    {
    "question": "Will my transaction go through as I don't have enough funds?",
    "context":"Overdraft protection allows transactions to be completed even if acocunt balance is insufficient",
    "answer": "Yes , your transaction will go through if overdraft is enabled , but a $35 fee will apply",
    "reference_answer":"Yes , with overdraft protection , your transcation will complete , but you will be charged $35."},
    {
    "question": "How do I block my card if it's lost?",
    "context":"To block a lost or stolen card users should immediately navigate to the Cards section in the FinBot app",
    "answer": "Go to the cards section , select your card and tap 'block card' , to block instantly",
    "reference_answer":"Open the app , go to cards , choose your cards and tap 'Block Card'. Blocking is immediate"},
    {
    "question": "Do you offer loans in Argetina?",
    "context":"Finbot currently offers loans in 20+ locations , including the US , Canada and selected EU countries",
    "answer": "Yes , Finbot offers personal loans in Argentina with competitive interest rates",
    "reference_answer":"No , Finbot does not currently offer personal loans in Argentina"
    }

  ])

In [None]:
golden_df = eval_data[["question","reference_answer"]].copy()
golden_df.head()

In [None]:
eval_data.head()

In [None]:
#Reference Based Evals
#Deterministic
definition = DataDefinition(text_columns=["question","context","answer","reference_answer"])
eval_df = Dataset.from_pandas(
    pd.DataFrame(eval_data),
    data_definition = definition,
    descriptors=[ExactMatch(columns = ["answer","reference_answer"], alias = "Exact Match")]
    )

In [None]:
eval_df.as_dataframe()

In [None]:
#evaluate semantic similarity
eval_df.add_descriptors(descriptors=[SemanticSimilarity(columns=["answer","reference_answer"],alias="Semantic Similarity"),
                                     BERTScore(columns=["answer","reference_answer"],alias="BERTScore")])
eval_df.as_dataframe()

In [None]:
#LLM-As-A-Judge --------->USING REFERENCE/GT  - QUALITATIVE EVALS
import os
from google.colab import userdata
# Access the API key from Colab's secrets manager
openai_api_key = userdata.get("OPENAI_API_KEY")
os.environ["OPENAI_API_KEY"] = openai_api_key

In [None]:
eval_df.add_descriptors(descriptors=[CorrectnessLLMEval("answer",target_output="reference_answer")])
eval_df.as_dataframe()

In [None]:
#create your own custom Judge prompt

eval_df_2 = Dataset.from_pandas(
    pd.DataFrame(eval_data),
    data_definition = definition,)

In [None]:
corerctness_multiclass = MulticlassClassificationPromptTemplate(
    pre_messages = [("system","You are a judge that evaluates the factual alignment of two chatbot answers")],
    criteria = """ You are given new answer and a reference answer . Classify the new answer based on how it compares to reference.
    ===
    Reference: {reference_answer}""",
    category_column = {
        "fully_correct":"The answer matches the reference in all factual and semantic details",
        "incomplete":"The answer is correct in what it says but leaves out details from the reference",
        "adds_claims":"The answer does not contradict reference but introduces new claims not supported by reference",
        "contradictory":"The answer contradicts specific facts or meaning in the reference"
        ""
    },
    uncertainity = "unknown",
    include_reasoning = True,
    include_scores = False
)

In [105]:
eval_df_2.add_descriptors(descriptors=[LLMEval("answer",template=corerctness_multiclass, additional_columns={"reference_answer":"reference_answer"},
                                               provider ="openai",
                                               model ="gpt-4o-mini",
                                               alias = "Multi-class correctness"
                                               )
])

In [106]:
eval_df_2.as_dataframe()

Unnamed: 0,question,context,answer,reference_answer,Multi-class correctness,Multi-class correctness reasoning,Multi-class correctness_1,Multi-class correctness reasoning_1,Multi-class correctness_2,Multi-class correctness reasoning_2
0,Will my transaction go through as I don't have enough funds?,Overdraft protection allows transactions to be completed even if acocunt balance is insufficient,"Yes , your transaction will go through if overdraft is enabled , but a $35 fee will apply","Yes , with overdraft protection , your transcation will complete , but you will be charged $35.",SAME,"The new answer provides the same information as the reference answer, indicating that a transaction will go through with overdraft and that a $35 fee will apply. Although the wording is slightly different, the essential details are consistent.",SAME,"The new answer conveys the same information as the reference answer, stating that the transaction will go through if overdraft is enabled and that there will be a $35 fee applied. The wording is slightly different, but the meaning is consistent.",MATCH,"The new answer closely matches the reference answer in meaning and content. Both indicate that a transaction will be completed if overdraft protection is enabled, and both mention that a $35 fee will apply. The phrasing is slightly different, but the essential information is consistent."
1,How do I block my card if it's lost?,To block a lost or stolen card users should immediately navigate to the Cards section in the FinBot app,"Go to the cards section , select your card and tap 'block card' , to block instantly","Open the app , go to cards , choose your cards and tap 'Block Card'. Blocking is immediate",SAME,"The new answer provides a similar step-by-step process for blocking a card, including going to the cards section, selecting the card, and tapping 'block card'. It also specifies that blocking is done instantly, aligning closely with the reference answer.",ALIGN,"The new answer provides essentially the same steps and outcome as the reference answer. It instructs the user to go to the cards section, select their card, and tap 'block card', which aligns with the reference's instruction to go to cards, choose the card, and tap 'Block Card'. Both specify that blocking is immediate, although 'instantly' is used in the new answer. Therefore, the factual content is aligned.",aligned,"The new answer provides similar instructions to the reference answer, specifying to go to the cards section, select the card, and tap 'block card' to block the card instantly. Both answers convey the same essential process for blocking a card."
2,Do you offer loans in Argetina?,"Finbot currently offers loans in 20+ locations , including the US , Canada and selected EU countries","Yes , Finbot offers personal loans in Argentina with competitive interest rates","No , Finbot does not currently offer personal loans in Argentina",INACCURATE,"The new answer directly contradicts the reference answer, which states that Finbot does not offer personal loans in Argentina. Therefore, the new answer is factually incorrect.",FALSE,"The new answer contradicts the reference answer by stating that Finbot does offer personal loans in Argentina, whereas the reference clearly states that it does not.",incorrect,"The new answer asserts that Finbot offers personal loans in Argentina, which directly contradicts the reference answer stating that Finbot does not currently offer personal loans in Argentina."


REFERENCE FREE EVALS
#reference free evals - there is no GT
#run when there is no answer or hard to prepare answer



In [None]:
#reference free evals - there is no GT
#run when there is no answer or hard to prepare answer
prod_data = eval_data[["question","context","answer"]].copy()
prod_data.head()

In [None]:
definition = DataDefinition(text_columns=["question","context","answer"])
prod_df = Dataset.from_pandas(
    pd.DataFrame(prod_data),
    data_definition = definition,
)

In [None]:
#Word presence in the generated answer or not / or check if the forbidden words are present in the response
#or you can use Contains  , a custom RegEX etc

prod_df.add_descriptors(descriptors=[
    IncludesWords("answer",words_list = ["hello","hi","good_afternoon"], mode="any", alias ="Says hi"),
    IncludesWords("answer",words_list = ["sorry","apologies","apologize","cannot","afraid"], mode="any", alias ="Declines"),
])
prod_df.as_dataframe()

In [None]:
#TEXT STATS - can be used to check if there is a certain change in response length
prod_df = Dataset.from_pandas(
    pd.DataFrame(prod_data),
    data_definition = definition,
    descriptors =[SentenceCount("answer", alias = "Sentence_Count")]
)
prod_df.as_dataframe()

QUANTITATIVE EVALS

Natural Language Based Metrics such as BLEU , ROUGE , BERTScore , Perplexity ,
Diversity , Racial Bias , WEAT etc.

In [103]:
#Calculate Context Retreival Effectiveness via metrics such as Contextual Recall , Precision
#use BERTScore for this
#challenge is that BERT Score will not work for long sentences as BERT has context window of size = 512 .. then what to use
#BERTScore - Measures the semantic similarity of the two sentences. Higher score indicates better match
#BLEU (Precision) metric is used for evaluating transalation. Tokens generated by model match with ground truth. Higher score indicates better match
#ROUGE is used to evaluate text summarization - Recall Oriented Unigram Gist Evaluation ->#Objective of ROUGE is to measure /evaluate the longest (unigram , bigram , n-gram etc.)overlap between the generated text and reference text in terms of unigrams. Higher means the better
#Perplexity - Metric to measure the answer coherence of LLM . LLM generated text .... Lower perplexity means LLM is generating a coherent response
#Diversity - Metric to measure the diversity in the generated text . Higher diversity means large vocab of LLM and that this is good . Formula is unique-n-grams/total n-grams
#Racial Bias - Offensive Language
#WEAT
#Fact Checking Algos
#Burstiness


In [None]:
!pip install bert-score evaluate
!pip install rouge_score sacrebleu nltk streamlit

In [None]:
from bert_score import score
import evaluate

In [None]:
generated_text = ["The quick brown fox jumps over the lazy dog."]
reference_text = ["The fast brown fox jumps over the sleepy dog."]
P, R, F1 = score(generated_text, reference_text, lang="en", verbose=True)
print(f"Precision :{P.mean()}, Recall {R.mean()},F1 {F1.mean()}")

bleu_metric = evaluate.load("bleu")
bleu_score = bleu_metric.compute(predictions=generated_text, references=reference_text)
print(f"BLEU Score: {bleu_score['bleu']}")

rouge_metric = evaluate.load("rouge")
rouge_score = rouge_metric.compute(predictions=generated_text, references=reference_text)
print(f"ROUGE Score: {rouge_score['rougeL']}")

In [None]:
from bert_score import score

# Extract context and answer columns as lists
contexts = prod_data["context"].tolist()
answers = prod_data["answer"].tolist()

# Calculate BERTScore
P, R, F1 = score(answers, contexts, lang="en", verbose=True)

# Display the precision, recall, and F1 scores
print("BERTScore Precision:", P)
print("BERTScore Recall:", R)
print("BERTScore F1 Score:", F1)

In [101]:
#Code for measuring perplexity
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
#assign the EOS token as the padding token
tokenizer.pad_token = tokenizer.eos_token

In [100]:
def compute_perplexity(input_texts):
  inputs = tokenizer(
      input_texts,
      return_tensors="pt",
      padding=True,
      truncation=True
      )
  input_ids = inputs["input_ids"]
  attention_mask = inputs["attention_mask"] #telling the LLM on what tokens to pay attention to

  #solid code as this shows the autoregressive loss ....

  with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_mask)
    logits = outputs.logits
  shift_logits = logits[..., :-1, :].contiguous() #shift the logits by
  shift_labels = input_ids[..., 1:].contiguous()
  #shift the logits so that these become the token to be predicted . LLM predict the probability distribution of all the tokens from the vocabulary

  #shift_logits = logits[:,:-1,:]
  #shift_labels = input_ids[:,1:]

  log_probs = torch.nn.functional.log_softmax(shift_logits, dim=-1)
  target_log_probs = log_probs.gather(dim=-1, index= shift_labels.unsqueeze(-1)).squeeze(-1)
  target_log_probs = target_log_probs * attention_mask[:, 1:].to(log_probs.dtype)
  negative_log_likelihood = -target_log_probs.sum(dim=-1) / attention_mask[:,1:].sum(dim=-1)
  perplexity = torch.exp(negative_log_likelihood)
  mean_perplexity_score = torch.mean(perplexity)

  return{
      "perplexity_scores":perplexity.tolist(),
      "perplexities_mean":mean_perplexity_score.item()
  }


In [102]:
#calculating the perplexities score
example_text = [
    "Once upon a time , there was a brave knight",
    "In a galaxy far , far away , a new adventure began"
]
#compute perplexity
perplexity_scores = compute_perplexity(example_text)
print(perplexity_scores["perplexity_scores"],perplexity_scores["perplexities_mean"] )


[51.4720573425293, 94.8211441040039] 73.14659881591797
