In [1]:
!pip install datasets
!pip install --upgrade ragas
!pip install ragas llama_index


Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

In [2]:
# Standard library
import logging
import os
import re

# Third-party
import numpy as np
import pandas as pd
from datasets import Dataset
from google.colab import drive, userdata
from IPython.display import display, HTML

# Mount Google Drive at /content/drive; later cells read and write files under it.
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Configure the root logger to emit INFO-level records.
logging.basicConfig(level=logging.INFO)

# Look up the OpenAI key through Colab's `userdata` helper (imported above) ...
openai_api_key = userdata.get('OPENAI_API_KEY')

# ... and export it as an environment variable for the rest of the session.
os.environ["OPENAI_API_KEY"] = openai_api_key


In [4]:
def create_ragas_dataset(generated_answers, api_key, audience):
    """Assemble a ``Dataset`` with the question/ground_truth/contexts/answer columns.

    Args:
        generated_answers: Mapping whose values are dicts containing
            ``question``, ``context``, ``generated_answer`` and the gold-answer
            field selected by ``audience``. The mapping's keys are ignored.
        api_key: Value exported as the ``OPENAI_API_KEY`` environment variable.
        audience: ``'engineering'`` (gold answers come from
            ``gold_answer_research``) or ``'marketing'`` (gold answers come
            from ``gold_answer_marketing``).

    Returns:
        A ``Dataset`` built from four columns: ``question``, ``ground_truth``,
        ``contexts`` and ``answer``. Each ``contexts`` entry is a one-element
        list wrapping the record's ``context`` value.

    Raises:
        ValueError: If ``audience`` is neither of the two supported values.
            Logging setup and the environment export have already run by then.
    """
    # Side effects come first, regardless of whether the audience is valid.
    logging.basicConfig(level=logging.INFO)
    os.environ["OPENAI_API_KEY"] = api_key

    # (audience, gold-answer field) pairs, matched with == in declaration order.
    gold_fields = (
        ('engineering', 'gold_answer_research'),
        ('marketing', 'gold_answer_marketing'),
    )
    ground_truth = next(
        (field for name, field in gold_fields if audience == name),
        None,
    )
    if ground_truth is None:
        raise ValueError("Invalid audience")

    records = list(generated_answers.values())

    def column(key):
        # One value per record, in mapping order.
        return [record[key] for record in records]

    data = {
        "question": column("question"),
        "ground_truth": column(ground_truth),
        # Wrap every context in its own single-item list.
        "contexts": [[context] for context in column("context")],
        "answer": column("generated_answer"),
    }

    return Dataset.from_dict(data)

# Reads the OpenAI key from Colab `userdata` into the notebook-level `openai_api_key`
# (the same lookup the earlier setup cell already performs).
openai_api_key = userdata.get('OPENAI_API_KEY')

## Testing RAGAS

In [5]:
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
    answer_similarity,
    answer_correctness
)

# Placeholder strings standing in for a model-generated answer and its reference answer.
# NOTE(review): neither name is referenced by the other cells visible in this notebook —
# confirm they are unused before removing them.
generated_answer = "This is the answer generated by your model."
reference_answer = "This is the expected or correct answer."



In [6]:
from datasets import Dataset
import os
from ragas import evaluate
from ragas.metrics import faithfulness, answer_correctness, answer_similarity, answer_relevancy, context_recall, context_precision


# Two hand-written QA rows used to smoke-test the RAGAS pipeline.
sample_rows = [
    {
        'question': 'When was the first super bowl?',
        'answer': 'The first superbowl was held on Jan 15, 1967',
        'contexts': [
            'The First AFL–NFL World Championship Game was an American football game played on January 15, 1967, at the Los Angeles Memorial Coliseum in Los Angeles,',
        ],
        'ground_truth': 'The first superbowl was held on January 15, 1967',
    },
    {
        'question': 'Who won the most super bowls?',
        'answer': 'The most super bowls have been won by The New England Patriots',
        'contexts': [
            'The Green Bay Packers...Green Bay, Wisconsin.',
            'The Packers compete...Football Conference',
        ],
        'ground_truth': 'The New England Patriots have won the Super Bowl a record six times',
    },
]

# Pivot the rows into the column-oriented dict that Dataset.from_dict expects.
data_samples = {
    column: [row[column] for row in sample_rows]
    for column in ('question', 'answer', 'contexts', 'ground_truth')
}

dataset = Dataset.from_dict(data_samples)

# Metrics scored in this smoke test.
smoke_test_metrics = [
    faithfulness,
    answer_correctness,
    context_recall,
    context_precision,
    answer_similarity,
]
score = evaluate(dataset, metrics=smoke_test_metrics)
score.to_pandas()

Evaluating:   0%|          | 0/10 [00:00<?, ?it/s]

Unnamed: 0,user_input,retrieved_contexts,response,reference,faithfulness,answer_correctness,context_recall,context_precision,semantic_similarity
0,When was the first super bowl?,[The First AFL–NFL World Championship Game was...,"The first superbowl was held on Jan 15, 1967","The first superbowl was held on January 15, 1967",1.0,0.999093,1.0,1.0,0.996364
1,Who won the most super bowls?,"[The Green Bay Packers...Green Bay, Wisconsin....",The most super bowls have been won by The New ...,The New England Patriots have won the Super Bo...,0.0,0.981061,0.0,0.0,0.924246


## Evaluation on Actual PolicyText

In [6]:
import ast

from datasets import Dataset
from ragas import evaluate
from ragas.metrics import faithfulness, answer_correctness, answer_similarity, context_recall, context_precision, answer_relevancy
import pandas as pd


def _parse_contexts(raw):
    """Turn one stringified ``contexts`` CSV cell back into a list of strings.

    The cell is first parsed as a Python literal, which handles either quote
    style, embedded commas and escape sequences. If the text is not a valid
    list/tuple literal, fall back to the previous heuristic: strip the outer
    brackets and split on ``', '``.

    Args:
        raw: The cell text as read from the CSV.

    Returns:
        A list of context strings.
    """
    try:
        parsed = ast.literal_eval(raw)
    except (ValueError, SyntaxError):
        parsed = None
    if isinstance(parsed, (list, tuple)):
        return [str(item) for item in parsed]
    return raw.strip("[]").split("', '")


# Load your data
path = '/content/drive/MyDrive/compliance/validation_results.csv'
df = pd.read_csv(path)

# Fail early, with the file name in the message, if an expected column is absent.
required_columns = ['question', 'answer', 'ground_truth', 'contexts']
missing_columns = [name for name in required_columns if name not in df.columns]
if missing_columns:
    raise KeyError(f"{path} is missing required columns: {missing_columns}")

# Work on a copy so the frame read from disk stays untouched.
df_ragas = df.copy()
df_ragas['contexts'] = df_ragas['contexts'].apply(_parse_contexts)

dataset = Dataset.from_pandas(df_ragas)

metrics = [faithfulness, answer_correctness, answer_relevancy, context_recall, context_precision, answer_similarity]

score = evaluate(dataset, metrics=metrics)

score_df = score.to_pandas()

# Write the scores to their own file. Saving over `path` would replace the raw
# validation data, and this cell could then no longer be re-run against it.
output_path = '/content/drive/MyDrive/compliance/validation_results_scored.csv'
score_df.to_csv(output_path, index=False)

score_df.head()


Evaluating:   0%|          | 0/36 [00:00<?, ?it/s]

Unnamed: 0,user_input,retrieved_contexts,response,reference,faithfulness,answer_correctness,answer_relevancy,context_recall,context_precision,semantic_similarity
0,When and where does BitPay collect personal data?,[How we obtain personal data\nMeans of collect...,Main Answer:\nBitPay obtains personal data fro...,Main Answer:\nBitPay collects personal data du...,0.842105,0.981929,0.914853,0.8,1.0,0.927716
1,When does MoonPay require service consent?,[Global Privacy Policy\nEffective Date: 3rd Ma...,Main Answer:\nMoonPay requires service consent...,Main Answer:\nMoonPay initiates consent collec...,0.8,0.224263,0.921159,1.0,1.0,0.897051
2,What are the requirements for handling sensiti...,[under specific conditions and subject to appr...,"Main Answer:\nUnder GDPR regulations, handling...",Main Answer:\nOrganizations conducting researc...,1.0,0.46616,0.958711,0.7,1.0,0.931307
3,Why does Klarna process different categories o...,[Technical\ninformation\ngenerated through\nyo...,Main Answer:\nKlarna processes various categor...,Main Answer:\nKlarna processes distinct catego...,0.533333,0.985362,0.904217,1.0,1.0,0.941449
4,What types of personal data does Bilt collect ...,[e Receive from Third Parties:\nWe may collect...,Main Answer:\nBilt collects various types of p...,Main Answer:\nBilt collects multiple categorie...,1.0,0.76864,0.997571,0.4,1.0,0.941253
