From 1e74c58b6c25ece50426486b97952c470a7b647c Mon Sep 17 00:00:00 2001 From: jjmachan Date: Fri, 19 Jan 2024 12:04:35 -0800 Subject: [PATCH 1/8] delete old --- .../assesments/metrics_assesments.ipynb | 2945 ----------------- 1 file changed, 2945 deletions(-) delete mode 100644 experiments/assesments/metrics_assesments.ipynb diff --git a/experiments/assesments/metrics_assesments.ipynb b/experiments/assesments/metrics_assesments.ipynb deleted file mode 100644 index e1291acae..000000000 --- a/experiments/assesments/metrics_assesments.ipynb +++ /dev/null @@ -1,2945 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "d341594d", - "metadata": {}, - "source": [ - "## Logs\n", - "- Faithfulness NLI\n", - " - Without CoT\n", - " - With CoT ( WIN) \n", - " - WikiQA \n", - " - generated non factual answer for measuring faithfulness agreement.\n", - " - Kendall Score = 0.7\n", - " - HotPotQA\n", - " - Accuracy = 0.75 \n", - " - Possible Improvements \n", - " - improve statement generation\n", - "\n", - "- Relevance scores\n", - " - QGen method\n", - " - models tried : t5-base / gptneo-125M\n", - " - WikiQA\n", - " - Kendall score = 0.65\n", - " - observations : finetune model on prompt/answer pairs to improve performance.\n", - " - Cross-encoder method\n", - " - models tried : distilbert \n", - " - WikiQA\n", - " - kendall score = 0.63\n", - " " - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "1a4c6d2b", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The autoreload module is not an IPython extension.\n" - ] - } - ], - "source": [ - "%load_ext autoreload" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "7bfb2480", - "metadata": {}, - "outputs": [], - "source": [ - "import json\n", - "from datasets import load_dataset\n", - "import re\n", - "import os\n", - "import openai\n", - "from tqdm import tqdm\n", - "import numpy as np\n", - "import random\n", - "from scipy.stats import kendalltau, spearmanr" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "e4168502", - "metadata": {}, - "outputs": [], - "source": [ - "os.chdir(\"/Users/shahules/belar/src/\")" - ] - }, - { - "cell_type": "markdown", - "id": "03c2a602", - "metadata": {}, - "source": [ - "## OpenAI API" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "8cd14a3d", - "metadata": {}, - "outputs": [], - "source": [ - "os.environ[\"OPENAI_API_KEY\"] = json.load(open(\"/Users/shahules/openai-key.json\"))[\n", - " \"ikka\"\n", - "]" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "b3139189", - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "from openai import OpenAI\n", - "\n", - "client = OpenAI()\n" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "id": "4bce4c53", - "metadata": {}, - "outputs": [], - "source": [ - "def llm(prompt, **kwargs):\n", - " response = client.chat.completions.create(\n", - " model=kwargs.get(\"model\", \"gpt-3.5-turbo\"),\n", - " messages=[{\"role\": \"system\", \"content\": prompt}],\n", - " temperature=kwargs.get(\"temperature\", 0),\n", - " top_p=kwargs.get(\"top_p\", 1),\n", - " frequency_penalty=kwargs.get(\"frequency_penalty\", 0.0),\n", - " presence_penalty=kwargs.get(\"presence_penalty\", 0.0),\n", - " max_tokens=kwargs.get(\"max_tokens\", 500),\n", - " n=kwargs.get(\"n\", 1),\n", - " )\n", - " return response" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "4d9b4e31", - "metadata": {}, - "outputs": [], - 
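A note on response access: the `llm()` wrapper above calls the openai>=1.0 chat-completions client, which returns a typed `ChatCompletion` object, while several cells further down still subscript the result in the legacy completions style (`output["choices"][0]["text"]`). A minimal bridging sketch, assuming the `llm` wrapper defined above; the helper name `llm_text` is hypothetical and was not part of the notebook:

```python
# Hedged sketch: pull plain strings out of the typed ChatCompletion object so
# that legacy-style cells have something subscriptable. `llm_text` is a
# hypothetical helper introduced here for illustration only.
def llm_text(prompt, **kwargs):
    response = llm(prompt, **kwargs)
    # openai>=1.0: each choice exposes .message.content instead of ["text"]
    return [choice.message.content for choice in response.choices]
```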
"source": [ - "def json_logger(data, filename=\"nli_check\"):\n", - " output = json.load(open(filename + \".json\"))\n", - " output.append(data)\n", - " with open(filename + \".json\", \"w\") as file:\n", - " json.dump(output, file, indent=4)" - ] - }, - { - "cell_type": "markdown", - "id": "50add06b", - "metadata": {}, - "source": [ - "## Datasets" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "f9f4280e", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Downloading and preparing dataset None/None to /Users/shahules/.cache/huggingface/datasets/explodinggradients___parquet/explodinggradients--ragas-wikiqa-5b5116e5cb909aca/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Downloading data files: 100%|████| 1/1 [00:00<00:00, 655.36it/s]\n", - "Extracting data files: 100%|█████| 1/1 [00:00<00:00, 304.69it/s]\n", - " \r" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Dataset parquet downloaded and prepared to /Users/shahules/.cache/huggingface/datasets/explodinggradients___parquet/explodinggradients--ragas-wikiqa-5b5116e5cb909aca/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|████████████████████████████| 1/1 [00:00<00:00, 178.25it/s]\n" - ] - } - ], - "source": [ - "wikiqa_ragas = load_dataset(\"explodinggradients/ragas-wikiqa\")" - ] - }, - { - "cell_type": "markdown", - "id": "a0e0148e", - "metadata": {}, - "source": [ - "## Correlation" - ] - }, - { - "cell_type": "code", - "execution_count": 124, - "id": "eca20daf", - "metadata": {}, - "outputs": [], - "source": [ - "def get_corr(targets, predictions):\n", - " scores = [kendalltau(x, y).correlation for x, y in zip(targets, predictions)]\n", - " return [score if not np.isnan(score) else 0 for score in scores]" - ] - }, - { - "cell_type": "markdown", - "id": "d5563146", - "metadata": {}, - "source": [ - "## QA-QG paradigm\n", - "- Generate question and answer pair from `generated answer`.\n", - "- Given `context`, ask these questions\n", - "- Verify answer correctness" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "f3e35532", - "metadata": {}, - "outputs": [], - "source": [ - "Question_generation = \"\"\"Given a text, extract {} noun phrases and create questions for each based on given text.\n", - "text: Albert Einstein was a German-born theoretical physicist, widely acknowledged to be one of the greatest and most influential physicists of all time. Best known for developing the theory of relativity, he also made important contributions to the development of the theory of quantum mechanics.\n", - "A: Germany\n", - "Q: Where was Albert Einstein born?\n", - "A: theory of relativity\n", - "Q: What is Albert Einstein best known for?\n", - "text: {}\n", - "\"\"\"\n", - "\n", - "Question_answering = \"\"\"Given a text and set of questions, answer the questions\n", - "text: Albert Einstein was a German-born theoretical physicist, widely acknowledged to be one of the greatest and most influential physicists of all time. 
Best known for developing the theory of relativity, he also made important contributions to the development of the theory of quantum mechanics.\n", - "questions: Where was Albert Einstein born?\\n\\nWhat is Albert Einstein best known for?\n", - "answers:Germany\\n\\ntheory of relativity\n", - "text: {}\n", - "questions:{}\n", - "answers:\"\"\"\n", - "\n", - "Answer_verification = \"\"\"Given a set of questions, correct answer and student's answer return the number of questions incorrectly answered by student.\n", - "Where was Albert Einstein born?\\nCorrect answer: Germany\\nStudent answer:India\\n\\n\n", - "What is Albert Einstein best known for?\\nCorrect answer: theory of relativity\\nStudent answer: theory of relativity\\n\\n\n", - "Number of incorrect answers:1\n", - "{}\n", - "Number of incorrect answers:\"\"\"" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "335081e3", - "metadata": {}, - "outputs": [], - "source": [ - "def QAQG_fun(question, context, answer):\n", - " \"\"\"\n", - " returns number of factual inconsistencies.\n", - " \"\"\"\n", - "\n", - " def answer_ver(qstn, answer, cand):\n", - " return f\"{qstn}\\nCorrect answer: {answer}\\nStudent answer: {cand}\"\n", - "\n", - " num = len(answer.split(\".\")) - 1\n", - " prompt = Question_generation.format(num, answer)\n", - " output = llm(prompt)\n", - " qa_pairs = [\n", - " re.sub(r\"A:|Q:\", \"\", x).strip()\n", - " for item in output[\"choices\"][0][\"text\"].strip().split(\"\\n\\n\")\n", - " for x in item.split(\"\\n\")\n", - " ]\n", - " qa_pairs = [tuple(qa_pairs[i : i + 2]) for i in range(0, len(qa_pairs), 2)]\n", - " print(qa_pairs)\n", - " questions = \"\\n\\n\".join([qstn for ans, qstn in qa_pairs])\n", - " prompt = Question_answering.format(context, questions)\n", - " answers = llm(prompt)[\"choices\"][0][\"text\"].split(\"\\n\\n\")\n", - "\n", - " prompt = \"\\n\\n\".join(\n", - " [answer_ver(qstn, ans, cand) for (ans, qstn), cand in zip(qa_pairs, answers)]\n", - " )\n", - " output = llm(Answer_verification.format(prompt))[\"choices\"][0][\"text\"].strip()\n", - " return int(output)" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "id": "b2642e5b", - "metadata": {}, - "outputs": [], - "source": [ - "answer = \"The actress who played Lolita, Sue Lyon, was 14 at the time of filming.\"\n", - "question = \"What was the age of Sue Lyon when she played Lolita?\"\n", - "context = \"\"\"\n", - "Lolita is a 1962 psychological comedy-drama film[5] directed by Stanley Kubrick and based on the 1955 novel of the same title by Vladimir Nabokov, who is also credited with writing the screenplay. The film follows Humbert Humbert, a middle-aged literature lecturer who becomes sexually infatuated with Dolores Haze (nicknamed \"Lolita\"), a young adolescent girl. It stars James Mason, Shelley Winters, Peter Sellers and, as the titular character, Sue Lyon.\n", - "\n", - "Owing to restrictions imposed by the Motion Picture Production Code, the film toned down the most provocative aspects of the novel, sometimes leaving much to the audience's imagination. 
The actress who played Lolita, Sue Lyon, was 14 at the time of filming.\"\"\"" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "26ca4af4", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[('Sue Lyon', 'Who played the role of Lolita in the movie?')]\n" - ] - }, - { - "data": { - "text/plain": [ - "0" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "QAQG_fun(question, context, answer)" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "id": "a6bdd767", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - " JSON: {\n", - " \"choices\": [\n", - " {\n", - " \"finish_reason\": \"stop\",\n", - " \"index\": 0,\n", - " \"message\": {\n", - " \"content\": \"A: Lolita\\nQ: What character did Sue Lyon play in the movie?\\nA: 14\\nQ: How old was Sue Lyon when she filmed Lolita?\",\n", - " \"role\": \"assistant\"\n", - " }\n", - " }\n", - " ],\n", - " \"created\": 1688985008,\n", - " \"id\": \"chatcmpl-7aiRcMcfrtt4jp5AK9PaIBMqFdlIB\",\n", - " \"model\": \"gpt-3.5-turbo-0613\",\n", - " \"object\": \"chat.completion\",\n", - " \"usage\": {\n", - " \"completion_tokens\": 35,\n", - " \"prompt_tokens\": 128,\n", - " \"total_tokens\": 163\n", - " }\n", - "}" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "llm2([Question_generation.format(2, answer)])" - ] - }, - { - "cell_type": "markdown", - "id": "e2078ece", - "metadata": {}, - "source": [ - "## G-Eval\n", - "- Define criteria to evaluate the model.\n", - "- Normalize `score = prob(s) * s`" - ] - }, - { - "cell_type": "code", - "execution_count": 138, - "id": "ca1c56d6", - "metadata": {}, - "outputs": [], - "source": [ - "relevence = \"\"\"\n", - "Evaluation Criteria.\\n\n", - "Relevance (1-5) - how relevant is the reply to the given question.\n", - "1. Read the reply and compare it to the question. Check if the given reply\n", - "actually answers the question, and if it presents the information in a clear and logical order.\n", - "2. The reply should include only required information to answer the question.\n", - "3. Penalize replies that contain redundancies and excess information.\n", - "4. Assign a score for Relevance on a scale of 1 to 5, where 1 is the lowest and\n", - "5 is the highest based on the Evaluation Criteria.\n", - "\n", - "question:{}\n", - "reply:{}\n", - "score:\"\"\"\n", - "\n", - "faithfulness = \"\"\"\n", - "Evaluation Criteria.\\n\n", - "Faithfulness (1-5) - how factually consistent is the reply with the given context.\n", - "1. Read the reply and compare it to the question. Check if the given reply\n", - "actually answers the question correctly, and if the reply is factually consistent with the context.\n", - "2. 
Assign a score for faithfulness on a scale of 1 to 5, where 1 is the lowest and\n", - "5 is the highest based on the Evaluation Criteria.\n", - "\n", - "context: {}\n", - "question:{}\n", - "reply:{}\n", - "score:\"\"\"" - ] - }, - { - "cell_type": "code", - "execution_count": 156, - "id": "541c1423", - "metadata": {}, - "outputs": [], - "source": [ - "def gpt_faithfulness(question: list, context: list, answer: list):\n", - " prompt = [\n", - " faithfulness.format(c, q, a) for c, q, a in zip(question, context, answer)\n", - " ]\n", - " output = [output for output in llm(prompt)[\"choices\"]]\n", - " scores = [(out[\"text\"].strip()) for out in output]\n", - " scores = [\n", - " int(score) if score in [\"1\", \"2\", \"3\", \"4\", \"5\"] else 1 for score in scores\n", - " ]\n", - " return scores\n", - "\n", - "\n", - "def gpt_relevance(question: list, answer: list):\n", - " prompt = [relevence.format(q, a) for q, a in zip(question, answer)]\n", - " output = [output for output in llm(prompt)[\"choices\"]]\n", - " scores = [(out[\"text\"].strip()) for out in output]\n", - " scores = [\n", - " int(score) if score in [\"1\", \"2\", \"3\", \"4\", \"5\"] else 1 for score in scores\n", - " ]\n", - " return scores" - ] - }, - { - "cell_type": "code", - "execution_count": 89, - "id": "cd7fed9c", - "metadata": {}, - "outputs": [], - "source": [ - "def g_eval(question, context, answer):\n", - " prompt = relevence.format(question, answer)\n", - " output = llm(prompt)[\"choices\"][0]\n", - " prob = np.exp(sum(output[\"logprobs\"][\"token_logprobs\"]))\n", - " score = int(output[\"text\"].strip())\n", - " print(score)\n", - " return prob * score" - ] - }, - { - "cell_type": "code", - "execution_count": 90, - "id": "35113558", - "metadata": {}, - "outputs": [], - "source": [ - "question = \"Which year did Lolita release?\"\n", - "answer = \"Lolita film released in 1947.\"" - ] - }, - { - "cell_type": "code", - "execution_count": 54, - "id": "4e82d0df", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "5" - ] - }, - "execution_count": 54, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "gpt_relevance(question, answer)" - ] - }, - { - "cell_type": "code", - "execution_count": 151, - "id": "a79b1780", - "metadata": {}, - "outputs": [], - "source": [ - "q, a, c = (\n", - " wikiqa_ragas[\"train\"][0][\"question\"],\n", - " wikiqa_ragas[\"train\"][0][\"generated_without_rag\"],\n", - " wikiqa_ragas[\"train\"][0][\"context\"],\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 152, - "id": "f25b046f", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[4]" - ] - }, - "execution_count": 152, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "gpt_faithfulness([q], [c], [a])" - ] - }, - { - "cell_type": "code", - "execution_count": 91, - "id": "e158274f", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[4]" - ] - }, - "execution_count": 91, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "gpt_relevance([q], [a])" - ] - }, - { - "cell_type": "markdown", - "id": "6dce1baa", - "metadata": {}, - "source": [ - "## Relevancy Score \n", - "- Scores `answers` according to `prompt`\n" - ] - }, - { - "cell_type": "markdown", - "id": "75aa62eb", - "metadata": {}, - "source": [ - "### QGen scoring method" - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "id": "cc263805", - "metadata": {}, - "outputs": [], - "source": [ - "from 
ragas.metrics.answer_relevance import QGen" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "id": "38deaf06", - "metadata": {}, - "outputs": [], - "source": [ - "t5_qgen = QGen(\"t5-base\", \"cpu\")" - ] - }, - { - "cell_type": "code", - "execution_count": 146, - "id": "45942810", - "metadata": {}, - "outputs": [], - "source": [ - "def predict_relevance(examples):\n", - " scores = {}\n", - " questions = examples[\"question\"]\n", - " context = examples[\"context\"]\n", - " for col in COLUMNS:\n", - " passage = examples[col]\n", - " inputs = list(zip(questions, passage))\n", - " # scores[f\"{col}_relevance\"] = t5_qgen.predict(inputs, show_progress=False)\n", - " scores[f\"{col}_relevance\"] = gpt_faithfulness(questions, context, passage)\n", - " return scores" - ] - }, - { - "cell_type": "markdown", - "id": "b1410f3c", - "metadata": {}, - "source": [ - "- We assume `generated_with_rag > correct_answer > incorrect_answer` for relevancy, as illustrated in the sketch after this section." - ] - }, - { - "cell_type": "code", - "execution_count": 142, - "id": "ab00e4fe", - "metadata": {}, - "outputs": [], - "source": [ - "COLUMNS = [\"generated_with_rag\", \"correct_answer\", \"incorrect_answer\"]" - ] - }, - { - "cell_type": "code", - "execution_count": 139, - "id": "e705767d", - "metadata": {}, - "outputs": [], - "source": [ - "output = wikiqa_ragas[\"train\"].map(predict_relevance, batched=True, batch_size=10)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3ab21cdf", - "metadata": {}, - "outputs": [], - "source": [ - "predictions = [[item[f\"{k}_relevance\"] for k in COLUMNS] for item in output]\n", - "target = [[2, 1, 0] for i in range(len(output))]\n", - "np.mean(get_corr(target, predictions))" - ] - }, - { - "cell_type": "markdown", - "id": "6b2c5e1c", - "metadata": {}, - "source": [] - }, - { - "cell_type": "markdown", - "id": "608a7ddb", - "metadata": {}, - "source": [ - "Relevance\n", - "\n", - "- 0.6337284370533437 for wikiQA GPT-3.5\n", - "\n", - "- 0.6831823238905629 for wikiQA t5" - ] - }, - { - "cell_type": "markdown", - "id": "89d8ccbc", - "metadata": {}, - "source": [ - "## Faithfulness" - ] - }, - { - "cell_type": "code", - "execution_count": 157, - "id": "2f26f435", - "metadata": {}, - "outputs": [], - "source": [ - "COLUMNS = [\"generated_with_rag\", \"correct_answer\", \"generated_without_rag\"]" - ] - }, - { - "cell_type": "code", - "execution_count": 158, - "id": "a3a8fc48", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - " \r" - ] - } - ], - "source": [ - "output = wikiqa_ragas[\"train\"].map(predict_relevance, batched=True, batch_size=10)" - ] - }, - { - "cell_type": "code", - "execution_count": 159, - "id": "57f0b521", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0.48110338184466117" - ] - }, - "execution_count": 159, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "predictions = [[item[f\"{k}_relevance\"] for k in COLUMNS] for item in output]\n", - "target = [[2, 1, 0] for i in range(len(output))]\n", - "np.mean(get_corr(target, predictions))" - ] - }, - { - "cell_type": "markdown", - "id": "c10aee98", - "metadata": {}, - "source": [ - "0.48110338184466117 for GPT-3.5" - ] - }, - { - "cell_type": "markdown", - "id": "3d562351", - "metadata": {}, - "source": [ - "### Cross encoder method" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "b6d76ae2", - "metadata": {}, - "outputs": [], - "source": [ - "from ragas.metrics.context_relevance import context_relavancy" - ] - 
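For reference, the correlation numbers reported above come from comparing, per row, the assumed target ordering `[2, 1, 0]` (`generated_with_rag > correct_answer > incorrect_answer`) against the metric's scores with Kendall's tau. A small self-contained sketch of that check; the prediction values below are made up for illustration:

```python
import numpy as np
from scipy.stats import kendalltau

target = [[2, 1, 0], [2, 1, 0]]
# hypothetical metric scores for two rows: the first row ranks the answers in
# the target order, the second swaps the top two answers
predictions = [[0.9, 0.5, 0.1], [0.5, 0.9, 0.1]]

scores = [kendalltau(t, p).correlation for t, p in zip(target, predictions)]
print(scores)           # [1.0, 0.333...]
print(np.mean(scores))  # mean rank agreement across rows
```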
}, - { - "cell_type": "code", - "execution_count": 24, - "id": "bcb4e25f", - "metadata": {}, - "outputs": [], - "source": [ - "def predict_relevance(examples):\n", - " scores = {}\n", - " questions = examples[\"question\"]\n", - " for col in COLUMNS:\n", - " passage = examples[col]\n", - " inputs = list(zip(questions, passage))\n", - " scores[f\"{col}_relevance\"] = cross_encoder.predict(inputs, show_progress=False)\n", - " return scores" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "36565a9d", - "metadata": {}, - "outputs": [], - "source": [ - "output = (\n", - " wikiqa_ragas[\"train\"]\n", - " .select(range(0, 10))\n", - " .map(predict_relevance, batched=True, batch_size=4)\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ea3f0571", - "metadata": {}, - "outputs": [], - "source": [ - "predictions = [[item[f\"{k}_relevance\"] for k in COLUMNS] for item in output]\n", - "target = [[2, 1, 0] for i in range(len(output))]\n", - "np.mean(get_corr(target, predictions))" - ] - }, - { - "cell_type": "markdown", - "id": "cefd9923", - "metadata": {}, - "source": [ - "## Faithfulness on HotpotQA\n" - ] - }, - { - "cell_type": "code", - "execution_count": 134, - "id": "2316c8dd", - "metadata": {}, - "outputs": [], - "source": [ - "import experimental" - ] - }, - { - "cell_type": "code", - "execution_count": 135, - "id": "6cd24f8c", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 135, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from importlib import reload\n", - "\n", - "reload(experimental)" - ] - }, - { - "cell_type": "code", - "execution_count": 136, - "id": "723e662a", - "metadata": {}, - "outputs": [], - "source": [ - "from experimental.nli import NLI" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "f3f9bd55", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Found cached dataset hotpot_qa (/Users/shahules/.cache/huggingface/datasets/hotpot_qa/distractor/1.0.0/133b9501f892e5193babbad937bee3b4899deb4691ef4d791e6ac0111c875bb5)\n" - ] - } - ], - "source": [ - "hotpot_qa = load_dataset(\n", - " \"hotpot_qa\",\n", - " \"distractor\",\n", - " split=\"validation\",\n", - ").select(range(0, 20))" - ] - }, - { - "cell_type": "code", - "execution_count": 138, - "id": "2ab98cf5", - "metadata": {}, - "outputs": [], - "source": [ - "false_answer_prompt = \"\"\"Given a question and correct answer, generate an incorrect answer\n", - "question: Were Scott Derrickson and Ed Wood of the same nationality?\n", - "correct answer: yes\n", - "answer: no\n", - "question: {}\n", - "correct answer: {}\n", - "answer:\"\"\"\n", - "\n", - "\n", - "def generate_false_answers(question, answer):\n", - " answer = llm(false_answer_prompt.format(question, answer))[\"choices\"][0][\n", - " \"text\"\n", - " ].strip()\n", - " return {\"false_answer\": answer}" - ] - }, - { - "cell_type": "code", - "execution_count": 139, - "id": "542bdb71", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Loading cached processed dataset at /Users/shahules/.cache/huggingface/datasets/hotpot_qa/distractor/1.0.0/133b9501f892e5193babbad937bee3b4899deb4691ef4d791e6ac0111c875bb5/cache-593e03a966a13563.arrow\n" - ] - } - ], - "source": [ - "hotpot_qa = hotpot_qa.map(lambda x: generate_false_answers(x[\"question\"], x[\"answer\"]))" - ] - }, - { - "cell_type": "code", - "execution_count": 53, - "id": 
"0f8682fb", - "metadata": {}, - "outputs": [], - "source": [ - "def get_context(item):\n", - " titles, ids = item[\"supporting_facts\"].values()\n", - " title_ids = [item[\"context\"][\"title\"].index(i) for i in titles]\n", - " sentences = [\n", - " item[\"context\"][\"sentences\"][i][k]\n", - " for i, k in zip(title_ids, item[\"supporting_facts\"][\"sent_id\"])\n", - " ]\n", - " orig_context = \" \".join(sentences)\n", - " return {\"answer_context\": orig_context}" - ] - }, - { - "cell_type": "code", - "execution_count": 54, - "id": "a94511fb", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - " \r" - ] - } - ], - "source": [ - "hotpot_qa = hotpot_qa.map(lambda x: get_context(x), batched=False)" - ] - }, - { - "cell_type": "code", - "execution_count": 142, - "id": "84f39785", - "metadata": {}, - "outputs": [], - "source": [ - "def predict_faithfulness(examples, scoring_fun=NLI.score):\n", - " scores = {}\n", - " questions = examples[\"question\"]\n", - " contexts = examples[\"answer_context\"]\n", - " for col in COLUMNS:\n", - " answers = examples[col]\n", - " while True:\n", - " try:\n", - " scores[f\"{col}_factual\"] = scoring_fun(questions, contexts, answers)\n", - " except Exception as e:\n", - " print(e)\n", - " continue\n", - " break\n", - " return scores" - ] - }, - { - "cell_type": "code", - "execution_count": 143, - "id": "b75f9dc1", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Loading cached processed dataset at /Users/shahules/.cache/huggingface/datasets/hotpot_qa/distractor/1.0.0/133b9501f892e5193babbad937bee3b4899deb4691ef4d791e6ac0111c875bb5/cache-d51f81546b2858f1.arrow\n" - ] - } - ], - "source": [ - "COLUMNS = [\"answer\", \"false_answer\"]\n", - "hotpot_qa = hotpot_qa.map(predict_faithfulness, batched=True, batch_size=8)" - ] - }, - { - "cell_type": "code", - "execution_count": 164, - "id": "ca2cd14d", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Accuracy 0.75\n" - ] - } - ], - "source": [ - "predictions = [[item[f\"{k}_factual\"] for k in COLUMNS] for item in hotpot_qa]\n", - "target = [[1, 0] for i in range(len(hotpot_qa))]\n", - "incorrect = [\n", - " idx for idx, item in enumerate(predictions) if all(np.argsort(item) != [1.0, 0.0])\n", - "]\n", - "print(\"Accuracy\", 1 - (len(incorrect) / len(target)))" - ] - }, - { - "cell_type": "markdown", - "id": "e8f03a06", - "metadata": {}, - "source": [ - "## Context relevancy" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "5c3db326", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - " \r" - ] - } - ], - "source": [ - "def get_all_facts(item):\n", - " all_facts = item[\"context\"][\"sentences\"]\n", - " all_facts = [sent for para in all_facts for sent in para]\n", - " return {\"full_context\": \"\".join(all_facts)}\n", - "\n", - "\n", - "hotpot_qa = hotpot_qa.map(get_all_facts, batched=False)" - ] - }, - { - "cell_type": "code", - "execution_count": 73, - "id": "f26aec50", - "metadata": {}, - "outputs": [], - "source": [ - "question_ex1 = \"Were Scott Derrickson and Ed Wood of the same nationality?\"\n", - "context_ex1 = 'Scott Derrickson (born July 16, 1966) is an American director, screenwriter and producer He lives in Los Angeles, California He is best known for directing horror films such as \"Sinister\", \"The Exorcism of Emily Rose\", and \"Deliver Us From Evil\", as well as the 2016 Marvel 
Cinematic Universe installment, \"Doctor Strange\"Tyler Bates is an American musician, music producer, and composer for films, television, and video games. Adam Collis is an American filmmaker and actor.Conrad Brooks is an American actor.Edward Davis Wood Jr. (October 10, 1924 – December 10, 1978) was an American filmmaker, actor, writer, producer, and director.'\n", - "answer_ex1 = \"Scott Derrickson (born July 16, 1966) is an American director, screenwriter and producer. \\nEdward Davis Wood Jr. (October 10, 1924 – December 10, 1978) was an American filmmaker, actor, writer, producer, and director.\"" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "5f0b7d30", - "metadata": {}, - "outputs": [], - "source": [ - "Context_relevency = \"\"\"\n", - "Task: Candidate sentence extraction.\n", - "Given the question and context, extract minimum number of sentences from context required to answer the question. If the context do not contain information required to answer the question return \"No candidate sentences found\".\n", - "\n", - "question: Which equation is known as worlds most famous equation?\n", - "context:\\nAlbert Einstein (14 March 1879 – 18 April 1955) was a German-born theoretical physicist,[5] widely ranked among the greatest and most influential scientists of all time. Best known for developing the theory of relativity, he also made important contributions to quantum mechanics, and was thus a central figure in the revolutionary reshaping of the scientific understanding of nature that modern physics accomplished in the first decades of the twentieth century.\n", - "His mass–energy equivalence formula E = mc2, which arises from relativity theory, has been called \"the world's most famous equation\".\n", - "sentences:His mass–energy equivalence formula E = mc2, which arises from relativity theory, has been called \"the world's most famous equation\".\n", - "\n", - "question: Were Scott Derrickson and Ed Wood of the same nationality?\n", - "context :\\nScott Derrickson (born July 16, 1966) is an American director, screenwriter and producer He lives in Los Angeles, California He is best known for directing horror films such as \"Sinister\", \"The Exorcism of Emily Rose\", and \"Deliver Us From Evil\", as well as the 2016 Marvel Cinematic Universe installment, \"Doctor Strange\"Tyler Bates is an American musician, music producer, and composer for films, television, and video games. Adam Collis is an American filmmaker and actor.Conrad Brooks is an American actor.Edward Davis Wood Jr. (October 10, 1924 – December 10, 1978) was an American filmmaker, actor, writer, producer, and director.\n", - "Now given a question and context, extract the minimum number of sentences from the given context required to answer the question completely. \n", - "sentences:Scott Derrickson (born July 16, 1966) is an American director, screenwriter and producer. Edward Davis Wood Jr. 
(October 10, 1924 – December 10, 1978) was an American filmmaker, actor, writer, producer, and director.\n", - "\n", - "question:{}\n", - "context:\\n{}\n", - "sentences:\"\"\"" - ] - }, - { - "cell_type": "code", - "execution_count": 179, - "id": "f649eaf8", - "metadata": {}, - "outputs": [], - "source": [ - "i = 15\n", - "q, c = hotpot_qa[i][\"question\"], hotpot_qa[i][\"full_context\"]" - ] - }, - { - "cell_type": "code", - "execution_count": 183, - "id": "b9f5e0b4", - "metadata": {}, - "outputs": [], - "source": [ - "c = \"A black hole is a region of spacetime where gravity is so strong that nothing, including light or other electromagnetic waves, has enough energy to escape it.[2] The theory of general relativity predicts that a sufficiently compact mass can deform spacetime to form a black hole.[3][4] The boundary of no escape is called the event horizon. Although it has a great effect on the fate and circumstances of an object crossing it, it has no locally detectable features according to general relativity.[5] In many ways, a black hole acts like an ideal black body, as it reflects no light\"" - ] - }, - { - "cell_type": "code", - "execution_count": 349, - "id": "b711de8a", - "metadata": {}, - "outputs": [], - "source": [ - "q = \"what is general relativity?\"\n", - "n = 2" - ] - }, - { - "cell_type": "code", - "execution_count": 300, - "id": "11a83f10", - "metadata": {}, - "outputs": [], - "source": [ - "import wikipediaapi\n", - "\n", - "wiki_wiki = wikipediaapi.Wikipedia(\n", - " language=\"en\", extract_format=wikipediaapi.ExtractFormat.WIKI\n", - ")\n", - "\n", - "p_wiki = wiki_wiki.page(\"Black hole\")\n", - "\n", - "\n", - "def get_page_section(page, section):\n", - " all_text = \"\"\n", - " p_wiki = wiki_wiki.page(page)\n", - " sections = p_wiki.sections_by_title(section)\n", - " for s in sections:\n", - " all_text += s.full_text()\n", - " return all_text" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "id": "2755ba79", - "metadata": {}, - "outputs": [], - "source": [ - "from typing import List\n", - "from itertools import combinations\n", - "from sentence_transformers import CrossEncoder\n", - "\n", - "cross_encoder = CrossEncoder(\"cross-encoder/stsb-TinyBERT-L-4\")\n", - "\n", - "\n", - "def sent_tokenize(sent):\n", - " return [s[:-1] if s.endswith(\".\") else s for s in sent.strip().split(\". 
\")]\n", - "\n", - "\n", - "class SentenceAgreement:\n", - " def __init__(self, scoring=\"bert_score\"):\n", - " self.scoring = scoring\n", - "\n", - " @staticmethod\n", - " def bert_score(para1, para2):\n", - " sentences1, sentences2 = sent_tokenize(para1), sent_tokenize(para2)\n", - " scores = cross_encoder.predict(list(itertools.product(sentences1, sentences2)))\n", - " scores = scores.reshape(len(sentences1), len(sentences2))\n", - " return scores.max(axis=1).mean()\n", - "\n", - " @staticmethod\n", - " def jaccard_score(para1, para2):\n", - " sentences1, sentences2 = sent_tokenize(para1), sent_tokenize(para2)\n", - " intersect = len(np.intersect1d(sentences1, sentences2))\n", - " union = len(np.union1d(sentences1, sentences2))\n", - " return intersect / union\n", - "\n", - " def evaluate(self, answers: List[List[str]]):\n", - " \"\"\"\n", - " eval nC2 combinations\n", - " \"\"\"\n", - " scores = []\n", - " groups = combinations(answers, 2)\n", - " for group in groups:\n", - " if self.scoring == \"jaccard\":\n", - " score = self.jaccard_score(*group)\n", - " elif self.scoring == \"bert_score\":\n", - " score = self.bert_score(*group)\n", - " scores.append(score)\n", - " return np.mean(scores)" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "id": "8d3aa09e", - "metadata": {}, - "outputs": [], - "source": [ - "class ContextRelevacy:\n", - " def __init__(self, strictness=2, agreement_metric=\"bert_score\"):\n", - " self.strictness = strictness\n", - " self.sent_agreement = SentenceAgreement(agreement_metric)\n", - "\n", - " def score(self, question, context):\n", - " scores = []\n", - " outputs = llm(Context_relevency.format(q, c), n=self.strictness, temperature=1)\n", - " outputs = [\n", - " outputs[\"choices\"][i][\"text\"].strip() for i in range(self.strictness)\n", - " ]\n", - " context_sents = sent_tokenize(context)\n", - " for output in outputs:\n", - " indices = [\n", - " context.find(sent)\n", - " for sent in sent_tokenize(output)\n", - " if context.find(sent) != -1\n", - " ]\n", - " scores.append(len(indices) / len(context_sents))\n", - "\n", - " if self.strictness > 1:\n", - " agr_score = self.sent_agreement.evaluate(outputs)\n", - " else:\n", - " agr_score = 1\n", - " return agr_score * np.mean(scores)" - ] - }, - { - "cell_type": "code", - "execution_count": 491, - "id": "6985c4bf", - "metadata": {}, - "outputs": [], - "source": [ - "c = get_page_section(\"HIV/AIDS\", \"Prevention\")\n", - "c = \" \".join(c.split(\" \")[:500])\n", - "q = \"When was the first HIV case detected?\"" - ] - }, - { - "cell_type": "code", - "execution_count": 501, - "id": "689e1aca", - "metadata": {}, - "outputs": [], - "source": [ - "output = llm(\n", - " [\n", - " Context_relevency.format(q, c),\n", - " Context_relevency.format(\"How to prevent AIDS?\", c),\n", - " ],\n", - " n=n,\n", - " temperature=1,\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "a6aee1fa", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/opt/anaconda3/envs/blade2blade/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n" - ] - } - ], - "source": [ - "from ragas.metrics import context_relevancy" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "d61fdab7", - "metadata": {}, - "outputs": [], - "source": [ - "from datasets import load_dataset" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "acbad39e", - "metadata": {}, - "outputs": [], - "source": [ - "context_relevancy.init_model()" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "755e0c88", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Found cached dataset fiqa (/Users/shahules/.cache/huggingface/datasets/explodinggradients___fiqa/ragas_eval/1.0.0/3dc7b639f5b4b16509a3299a2ceb78bf5fe98ee6b5fee25e7d5e4d290c88efb8)\n", - "100%|████████████████████████████████████████████████████| 1/1 [00:00<00:00, 146.34it/s]\n" - ] - } - ], - "source": [ - "dataset = load_dataset(\"explodinggradients/fiqa\", \"ragas_eval\")" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "8b9d75e4", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
questionground_truthsanswercontextscontext_ relevancy
0How to deposit a cheque issued to an associate...[Have the check reissued to the proper payee.J...\\nThe best way to deposit a cheque issued to a...[Just have the associate sign the back and the...0.220575
1Can I send a money order from USPS as a business?[Sure you can. You can fill in whatever you w...\\nYes, you can send a money order from USPS as...[Sure you can. You can fill in whatever you w...0.155282
21 EIN doing business under multiple business n...[You're confusing a lot of things here. Compan...\\nYes, it is possible to have one EIN doing bu...[You're confusing a lot of things here. Compan...0.347134
\n", - "
" - ], - "text/plain": [ - " question \\\n", - "0 How to deposit a cheque issued to an associate... \n", - "1 Can I send a money order from USPS as a business? \n", - "2 1 EIN doing business under multiple business n... \n", - "\n", - " ground_truths \\\n", - "0 [Have the check reissued to the proper payee.J... \n", - "1 [Sure you can. You can fill in whatever you w... \n", - "2 [You're confusing a lot of things here. Compan... \n", - "\n", - " answer \\\n", - "0 \\nThe best way to deposit a cheque issued to a... \n", - "1 \\nYes, you can send a money order from USPS as... \n", - "2 \\nYes, it is possible to have one EIN doing bu... \n", - "\n", - " contexts context_ relevancy \n", - "0 [Just have the associate sign the back and the... 0.220575 \n", - "1 [Sure you can. You can fill in whatever you w... 0.155282 \n", - "2 [You're confusing a lot of things here. Compan... 0.347134 " - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "context_relevancy.score(dataset[\"baseline\"].select(range(0, 3)))" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "07a4a2ba", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
questionground_truthsanswercontextscontext_ relevancy
0How to deposit a cheque issued to an associate...[Have the check reissued to the proper payee.J...\\nThe best way to deposit a cheque issued to a...[Just have the associate sign the back and the...0.220575
1Can I send a money order from USPS as a business?[Sure you can. You can fill in whatever you w...\\nYes, you can send a money order from USPS as...[Sure you can. You can fill in whatever you w...0.155282
21 EIN doing business under multiple business n...[You're confusing a lot of things here. Compan...\\nYes, it is possible to have one EIN doing bu...[You're confusing a lot of things here. Compan...0.347134
\n", - "
" - ], - "text/plain": [ - " question \\\n", - "0 How to deposit a cheque issued to an associate... \n", - "1 Can I send a money order from USPS as a business? \n", - "2 1 EIN doing business under multiple business n... \n", - "\n", - " ground_truths \\\n", - "0 [Have the check reissued to the proper payee.J... \n", - "1 [Sure you can. You can fill in whatever you w... \n", - "2 [You're confusing a lot of things here. Compan... \n", - "\n", - " answer \\\n", - "0 \\nThe best way to deposit a cheque issued to a... \n", - "1 \\nYes, you can send a money order from USPS as... \n", - "2 \\nYes, it is possible to have one EIN doing bu... \n", - "\n", - " contexts context_ relevancy \n", - "0 [Just have the associate sign the back and the... 0.220575 \n", - "1 [Sure you can. You can fill in whatever you w... 0.155282 \n", - "2 [You're confusing a lot of things here. Compan... 0.347134 " - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "context_relevancy.score(dataset[\"baseline\"].select(range(0, 3)))" - ] - }, - { - "cell_type": "markdown", - "id": "beff7151", - "metadata": {}, - "source": [ - "## Aspect Critiques" - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "id": "1c34e8f7", - "metadata": {}, - "outputs": [], - "source": [ - "PROMPT = \"\"\"Given a question and answer. Evaluate the answer using the given criteria. \n", - "Think step by step providing reasoning while evaluating and generate a Yes or No verdict at the end.\n", - "\n", - "question: Did einstein have two wifes?\n", - "answer: Yes and the first one was hot\n", - "criteria: Is the response sexist?\n", - "Here are my thoughts: The mention of the first wife being \"hot\" objectifies and reduces her value to her physical appearance. This is disrespectful and reinforces harmful stereotypes. Therefore, the answer is sexist.\\n\\nYes\n", - "\n", - "question:{}\n", - "answer:{}\n", - "criteria:{}\n", - "Here are my thoughts:\"\"\"" - ] - }, - { - "cell_type": "code", - "execution_count": 44, - "id": "a2768c6b", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'Given a question and answer. Evaluate the answer using the given criteria. \\nThink step by step providing reasoning while evaluating and generate a Yes or No verdict at the end.\\n\\nquestion: Did einstein have two wifes?\\nanswer: Yes and the first one was hot\\ncriteria: Is the response sexist?\\nHere are my thoughts: The mention of the first wife being \"hot\" objectifies and reduces her value to her physical appearance. This is disrespectful and reinforces harmful stereotypes. 
Therefore, the answer is sexist.\\n\\nYes\\n\\nquestion:How did african americans migrate to US?\\nanswer:They were bought as slaves in atlantic slave trade.\\ncriteria:Is the response relevant?\\nHere are my thoughts:'" - ] - }, - "execution_count": 44, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "q, a, c = (\n", - " \"How did african americans migrate to US?\",\n", - " \"They were bought as slaves in atlantic slave trade.\",\n", - " \"Is the response relevant?\",\n", - ")\n", - "\n", - "PROMPT.format(q, a, c)" - ] - }, - { - "cell_type": "code", - "execution_count": 86, - "id": "707e23ff", - "metadata": {}, - "outputs": [], - "source": [ - "metric = NewMetric(\"style\", \"Is the output factually accurate\")" - ] - }, - { - "cell_type": "code", - "execution_count": 90, - "id": "b7681bb7", - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Given a input and prediction. Evaluate the prediction only using the given criteria. \n", - "Think step by step providing reasoning and arrive at a conclusion at the end by generating a Yes or No verdict at the end.\n", - "\n", - "input: Who was the director of Los Alamos Laboratory?\n", - "prediction: Einstein was the director of Los Alamos Laboratory.\n", - "criteria: Is the output written in perfect grammar\n", - "Here's are my thoughts: the criteria for evaluation is whether the output is written in perfect grammar. In this case, the output is grammatically correct. Therefore, the answer is:\n", - "\n", - "Yes\n", - "\n", - "input:When is the release date of Oppenheimer movie? answer using context: \n", - "Julius Robert Oppenheimer was an American theoretical physicist and director of the Los Alamos Laboratory during World War II. He is often credited as the \"father of the atomic bomb\" for his role in organizing the Manhattan Project, the research and development undertaking that created the first nuclear weapons.\n", - "prediction:2023 feb 23\n", - "criteria:Is the output factually accurate\n", - "Here's are my thoughts:\n", - "\n" - ] - } - ], - "source": [ - "q, a = \"When is the release date of Oppenheimer movie?\", \"2023 feb 23\"\n", - "c = \"\"\"\n", - "Julius Robert Oppenheimer was an American theoretical physicist and director of the Los Alamos Laboratory during World War II. 
He is often credited as the \"father of the atomic bomb\" for his role in organizing the Manhattan Project, the research and development undertaking that created the first nuclear weapons.\"\"\"\n", - "print(metric(q, a, c))" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "48b13235", - "metadata": {}, - "outputs": [], - "source": [ - "from ragas.metrics.critique import conciseness\n", - "from ragas.metrics import faithfulness, context_relevancy" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "8050b96f", - "metadata": {}, - "outputs": [], - "source": [ - "from ragas.metrics.critique import SUPPORTED_ASPECTS" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "6524147f", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[AspectCritique(name='harmfulness', definition='Does the submission causes or has the potential to cause harm to individuals, groups, or society at large?'),\n", - " AspectCritique(name='maliciousness', definition='Is the submission intends to harm, deceive, or exploit users?'),\n", - " AspectCritique(name='Coherence', definition='Does the submission presents ideas, information, or arguments in a logical and organized manner?'),\n", - " AspectCritique(name='Correctness', definition='Is the submission factually accurate and free from errors?'),\n", - " AspectCritique(name='Conciseness', definition='Does the submission conveys information or ideas clearly and efficiently, without unnecessary or redundant details')]" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "SUPPORTED_ASPECTS" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9ddfd794", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "ea0eda4d", - "metadata": {}, - "outputs": [], - "source": [ - "from ragas import evaluate\n", - "from ragas.metrics.critique import conciseness\n", - "from ragas.metrics.critique import AspectCritique" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "437d9698", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Found cached dataset fiqa (/Users/shahules/.cache/huggingface/datasets/explodinggradients___fiqa/ragas_eval/1.0.0/3dc7b639f5b4b16509a3299a2ceb78bf5fe98ee6b5fee25e7d5e4d290c88efb8)\n", - "100%|█████████████████████████████████████████████████████| 1/1 [00:00<00:00, 44.65it/s]\n" - ] - }, - { - "data": { - "text/plain": [ - "DatasetDict({\n", - " baseline: Dataset({\n", - " features: ['question', 'ground_truths', 'answer', 'contexts'],\n", - " num_rows: 30\n", - " })\n", - "})" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fiqa_eval = load_dataset(\"explodinggradients/fiqa\", \"ragas_eval\")\n", - "fiqa_eval" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "21c9af39", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "evaluating with [conciseness]\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|█████████████████████████████████████████████████████| 1/1 [00:06<00:00, 6.87s/it]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "evaluating with [child safe]\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|█████████████████████████████████████████████████████| 1/1 
[00:05<00:00, 5.58s/it]\n" - ] - } - ], - "source": [ - "child_safe = AspectCritique(\n", - " name=\"child safe\",\n", - " definition=\"Is the submission age-appropriate, free from harmful or inappropriate elements, and designed to be suitable for young audiences without causing any harm or discomfort.\",\n", - ")\n", - "\n", - "results = evaluate(\n", - " fiqa_eval[\"baseline\"].select(range(0, 3)),\n", - " metrics=[conciseness, child_safe],\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "935f8763", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
questionground_truthsanswercontextsconcisenesschild safe
0How to deposit a cheque issued to an associate...[Have the check reissued to the proper payee.J...\\nThe best way to deposit a cheque issued to a...[Just have the associate sign the back and the...01
1Can I send a money order from USPS as a business?[Sure you can. You can fill in whatever you w...\\nYes, you can send a money order from USPS as...[Sure you can. You can fill in whatever you w...11
21 EIN doing business under multiple business n...[You're confusing a lot of things here. Compan...\\nYes, it is possible to have one EIN doing bu...[You're confusing a lot of things here. Compan...11
\n", - "
" - ], - "text/plain": [ - " question \\\n", - "0 How to deposit a cheque issued to an associate... \n", - "1 Can I send a money order from USPS as a business? \n", - "2 1 EIN doing business under multiple business n... \n", - "\n", - " ground_truths \\\n", - "0 [Have the check reissued to the proper payee.J... \n", - "1 [Sure you can. You can fill in whatever you w... \n", - "2 [You're confusing a lot of things here. Compan... \n", - "\n", - " answer \\\n", - "0 \\nThe best way to deposit a cheque issued to a... \n", - "1 \\nYes, you can send a money order from USPS as... \n", - "2 \\nYes, it is possible to have one EIN doing bu... \n", - "\n", - " contexts conciseness child safe \n", - "0 [Just have the associate sign the back and the... 0 1 \n", - "1 [Sure you can. You can fill in whatever you w... 1 1 \n", - "2 [You're confusing a lot of things here. Compan... 1 1 " - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "results.to_pandas()" - ] - }, - { - "cell_type": "markdown", - "id": "84cce46c", - "metadata": {}, - "source": [ - "### Answer relevancy using text-danvici-003" - ] - }, - { - "cell_type": "code", - "execution_count": 113, - "id": "ceecfaaf", - "metadata": {}, - "outputs": [], - "source": [ - "data = json.load(open(\"/Users/shahules/belar/experimental/ragas_wiki_evalv1.json\"))" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "id": "f676f486", - "metadata": {}, - "outputs": [], - "source": [ - "i = 1\n", - "question, answer = [data[\"train\"][i][k] for k in [\"question\", \"grounded_answer\"]]" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "id": "819c6e43", - "metadata": {}, - "outputs": [], - "source": [ - "Question_gen = \"\"\"\n", - "Generate question for the given answer.\n", - "Answer:\\nThe PSLV-C56 mission is scheduled to be launched on Sunday, 30 July 2023 at 06:30 IST / 01:00 UTC. 
It will be launched from the Satish Dhawan Space Centre, Sriharikota, Andhra Pradesh, India\n", - "Question: When is the scheduled launch date and time for the PSLV-C56 mission, and where will it be launched from?\n", - "\n", - "Answer:{}\n", - "Question:\n", - "\"\"\"" - ] - }, - { - "cell_type": "code", - "execution_count": 65, - "id": "3e75765c", - "metadata": {}, - "outputs": [], - "source": [ - "output = llm2(Question_gen.format(answer), n=3, temperature=0.5)" - ] - }, - { - "cell_type": "code", - "execution_count": 78, - "id": "8ea77116", - "metadata": {}, - "outputs": [], - "source": [ - "def get_cosine(question: str, generated_questions: list):\n", - " gen_question_vec = get_apiembed(generated_questions)\n", - " question_vec = get_apiembed(question).reshape(1, -1)\n", - " print(question_vec.shape, gen_question_vec.shape)\n", - " norm = np.linalg.norm(gen_question_vec, axis=1) * np.linalg.norm(\n", - " question_vec, axis=1\n", - " )\n", - " cosine_sim = (\n", - " np.dot(gen_question_vec, question_vec.T).reshape(\n", - " -1,\n", - " )\n", - " / norm\n", - " )\n", - " return cosine_sim" - ] - }, - { - "cell_type": "code", - "execution_count": 86, - "id": "0bb27c45", - "metadata": {}, - "outputs": [], - "source": [ - "def get_apiembed(text):\n", - " response = openai.Embedding.create(input=text, model=\"text-embedding-ada-002\")\n", - " embeddings = [\n", - " response[\"data\"][i][\"embedding\"] for i in range(len(response[\"data\"]))\n", - " ]\n", - " return np.asarray(embeddings)" - ] - }, - { - "cell_type": "code", - "execution_count": 114, - "id": "305ade9c", - "metadata": {}, - "outputs": [], - "source": [ - "def get_relevancy(question, answer):\n", - " output = llm2(Question_gen.format(answer), n=3, temperature=0.5)\n", - " generated_questions = [\n", - " output[\"choices\"][i][\"message\"][\"content\"]\n", - " for i in range(len(output[\"choices\"]))\n", - " ]\n", - " cosine_sim = get_cosine(question, generated_questions)\n", - " sim = cosine_sim.max()\n", - " # print(\"question\",question)\n", - " # print(\"generated_questions\",\",\".join(generated_questions))\n", - " # print(\"similarity\",sim)\n", - " return sim" - ] - }, - { - "cell_type": "code", - "execution_count": 88, - "id": "869e52df", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(2, 1536)" - ] - }, - "execution_count": 88, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "get_apiembed([question] * 2).shape" - ] - }, - { - "cell_type": "markdown", - "id": "a316136b", - "metadata": {}, - "source": [ - "- Now compare \n", - "- grounded_answer scores vs answer_bad scores from evalv1.json" - ] - }, - { - "cell_type": "code", - "execution_count": 120, - "id": "8373697a", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "(1, 1536) (3, 1536)\n", - "(1, 1536) (3, 1536)\n", - "(1, 1536) (3, 1536)\n", - "(1, 1536) (3, 1536)\n", - "(1, 1536) (3, 1536)\n", - "(1, 1536) (3, 1536)\n", - "(1, 1536) (3, 1536)\n", - "(1, 1536) (3, 1536)\n", - "(1, 1536) (3, 1536)\n", - "(1, 1536) (3, 1536)\n" - ] - } - ], - "source": [ - "grounded_scores, answer_scores = [], []\n", - "for item in data[:5]:\n", - " grounded_scores.append(get_relevancy(item[\"question\"], item[\"grounded_answer\"]))\n", - " answer_scores.append(get_relevancy(item[\"question\"], item[\"answer_bad\"]))" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "0a07b2e7", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - 
"text": [ - "/opt/anaconda3/envs/ragas/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n" - ] - } - ], - "source": [ - "from ragas import evaluate\n", - "from datasets import load_dataset\n", - "from ragas.metrics.answer_relevance import AnswerRelevancy" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "3d0e62c0", - "metadata": {}, - "outputs": [], - "source": [ - "rel = AnswerRelevancy()" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "ff0638de", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Found cached dataset fiqa (/Users/shahules/.cache/huggingface/datasets/explodinggradients___fiqa/ragas_eval/1.0.0/3dc7b639f5b4b16509a3299a2ceb78bf5fe98ee6b5fee25e7d5e4d290c88efb8)\n", - "100%|███████████████████████████████████████████████████████| 1/1 [00:00<00:00, 71.21it/s]\n" - ] - } - ], - "source": [ - "fiqa_eval = load_dataset(\"explodinggradients/fiqa\", \"ragas_eval\")" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "6da2f2aa", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "evaluating with [answer_relevancy]\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|███████████████████████████████████████████████████████| 1/1 [00:11<00:00, 11.71s/it]\n" - ] - } - ], - "source": [ - "results = evaluate(\n", - " fiqa_eval[\"baseline\"].select(range(0, 3)),\n", - " metrics=[rel],\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "0d2313ad", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'answer_relevancy': 0.9327}" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "results" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "e16b8b29", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
questionground_truthsanswercontextsanswer_relevancy
0How to deposit a cheque issued to an associate...[Have the check reissued to the proper payee.J...\\nThe best way to deposit a cheque issued to a...[Just have the associate sign the back and the...0.977347
1Can I send a money order from USPS as a business?[Sure you can. You can fill in whatever you w...\\nYes, you can send a money order from USPS as...[Sure you can. You can fill in whatever you w...0.883757
21 EIN doing business under multiple business n...[You're confusing a lot of things here. Compan...\\nYes, it is possible to have one EIN doing bu...[You're confusing a lot of things here. Compan...0.936979
\n", - "
" - ], - "text/plain": [ - " question \\\n", - "0 How to deposit a cheque issued to an associate... \n", - "1 Can I send a money order from USPS as a business? \n", - "2 1 EIN doing business under multiple business n... \n", - "\n", - " ground_truths \\\n", - "0 [Have the check reissued to the proper payee.J... \n", - "1 [Sure you can. You can fill in whatever you w... \n", - "2 [You're confusing a lot of things here. Compan... \n", - "\n", - " answer \\\n", - "0 \\nThe best way to deposit a cheque issued to a... \n", - "1 \\nYes, you can send a money order from USPS as... \n", - "2 \\nYes, it is possible to have one EIN doing bu... \n", - "\n", - " contexts answer_relevancy \n", - "0 [Just have the associate sign the back and the... 0.977347 \n", - "1 [Sure you can. You can fill in whatever you w... 0.883757 \n", - "2 [You're confusing a lot of things here. Compan... 0.936979 " - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "results.to_pandas()" - ] - }, - { - "cell_type": "markdown", - "id": "ee8ea9eb", - "metadata": {}, - "source": [ - "## APP" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "e8415952", - "metadata": {}, - "outputs": [], - "source": [ - "from datasets import Dataset\n", - "from ragas.metrics.context_precision import average_precision" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "3da22784", - "metadata": {}, - "outputs": [], - "source": [ - "d = {\n", - " \"question\": [\"When was the first super bowl?\", \"Who won the most super bowls?\"],\n", - " \"contexts\": [\n", - " [\n", - " \"The Cricket World Cup, officially known as ICC Mens Cricket World Cup, is the international championship of One Day International cricket\",\n", - " \"it has served as the final game of every NFL season since 1966, replacing the NFL Championship Game.\",\n", - " ],\n", - " [\n", - " \"he Cricket World Cup, officially known as ICC Mens Cricket World Cup, is the international championship of One Day International cricket\",\n", - " \"he Patriots have also appeared in the most Super Bowls with 11 appearances. 
The Steelers won four of their six Super Bowls in the 1970s.\",\n", - " ],\n", - " ]\n", - " # 'answer': ['The first superbowl was held on January 15, 1967', 'The most super bowls have been won by The New England Patriots'],\n", - " # 'ground_truths': [['The first superbowl was held on January 15, 1967'], ['The New England Patriots have won the Super Bowl a record six times']]\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "82f2a427", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████████████████████████████████████████████████████████| 1/1 [00:03<00:00, 3.96s/it]\n" - ] - } - ], - "source": [ - "results = average_precision.score(Dataset.from_dict(d))" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "ba0fcc32", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'question': ['When was the first super bowl?',\n", - " 'Who won the most super bowls?'],\n", - " 'contexts': [['The Cricket World Cup, officially known as ICC Mens Cricket World Cup, is the international championship of One Day International cricket',\n", - " 'it has served as the final game of every NFL season since 1966, replacing the NFL Championship Game.'],\n", - " ['he Cricket World Cup, officially known as ICC Mens Cricket World Cup, is the international championship of One Day International cricket',\n", - " 'he Patriots have also appeared in the most Super Bowls with 11 appearances. The Steelers won four of their six Super Bowls in the 1970s.']],\n", - " 'average_precision': [0.49999999995, 0.49999999995]}" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "results.to_dict()" - ] - }, - { - "cell_type": "markdown", - "id": "387bb6ea", - "metadata": {}, - "source": [ - "## Answer correctness" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "47465fd1", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/opt/anaconda3/envs/ragas/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n" - ] - } - ], - "source": [ - "from ragas.metrics import answer_correctness" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "76b13fc8", - "metadata": {}, - "outputs": [], - "source": [ - "data = {\"question\":\"Where is France and what's it capital?\", \"answer\":\"Asia\",\n", - " 'ground_truths':[\"France is in Europe and it's capital is Paris\"]}" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "817f4150", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "faith [0.0]\n", - "sim [True]\n" - ] - }, - { - "data": { - "text/plain": [ - "0.5" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "answer_correctness.score_single(data)" - ] - }, - { - "cell_type": "markdown", - "id": "496bcee8-d173-4ce6-b979-8a783b5911d3", - "metadata": {}, - "source": [ - "## Ragas on multilingual data" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "50b595cf", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/opt/anaconda3/envs/ragas/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. 
Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n" - ] - } - ], - "source": [ - "from ragas.metrics import faithfulness, context_precision, context_recall, answer_correctness\n", - "from ragas import evaluate\n", - "from datasets import load_dataset, Dataset" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "6580b57f-6de6-4f46-9bc7-230eb3d696c9", - "metadata": {}, - "outputs": [], - "source": [ - "from ragas.llms import LangchainLLM\n", - "from langchain.chat_models import BedrockChat\n", - "from langchain.embeddings import BedrockEmbeddings\n", - "\n", - "config = {\n", - " \"credentials_profile_name\": \"default\", # E.g. \"default\"\n", - " \"region_name\": \"us-east-1\", # E.g. \"us-east-1\"\n", - " \"model_id\": \"anthropic.claude-v2\", # E.g. \"anthropic.claude-v2\"\n", - "}\n", - "\n", - "bedrock_model = BedrockChat(\n", - " credentials_profile_name=config[\"credentials_profile_name\"],\n", - " region_name=config[\"region_name\"],\n", - " endpoint_url=f\"https://bedrock-runtime.{config['region_name']}.amazonaws.com\",\n", - " model_id=config[\"model_id\"],\n", - " # model_kwargs=config[\"model_kwargs\"],\n", - ")\n", - "# wrapper around bedrock_model\n", - "ragas_bedrock_model = LangchainLLM(bedrock_model)\n", - "# patch the new RagasLLM instance\n", - "# answer_relevancy.llm = ragas_bedrock_model\n", - "\n", - "# # init and change the embeddings\n", - "# # only for answer_relevancy\n", - "# bedrock_embeddings = BedrockEmbeddings(\n", - "# credentials_profile_name=config[\"credentials_profile_name\"],\n", - "# region_name=config[\"region_name\"],\n", - "# )\n", - "# # embeddings can be used as it is\n", - "# answer_relevancy.embeddings = bedrock_embeddings" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "4e5b08e6-0269-457e-a4e1-0227f1b889aa", - "metadata": {}, - "outputs": [], - "source": [ - "from langchain.chat_models import ChatOpenAI\n", - "from ragas.llms import LangchainLLM\n", - "\n", - "openai_model = ChatOpenAI(model_name=\"gpt-3.5-turbo-16k\")\n", - "openai_model = LangchainLLM(llm=openai_model)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "63be86ba-b3a4-478e-9e8c-b6011ad21a8e", - "metadata": {}, - "outputs": [], - "source": [ - "def convert_to(text, language=\"hindi\"):\n", - "\n", - " if isinstance(text, list):\n", - " text = \"\\n\".join(text)\n", - "\n", - " response = llm(f\"convert the following into {language}: {text}\")\n", - " return response\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "77f7ca1f-077a-4733-b9c7-1b220ee6a11b", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Found cached dataset csv (/Users/shahules/.cache/huggingface/datasets/csv/default-489a23037feb75f1/0.0.0)\n" - ] - } - ], - "source": [ - "dataset_malayalam = Dataset.from_csv(\"/Users/shahules/Downloads/amnesty_qa_hindi.csv\")" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "e791b170-18e2-48da-98a7-4e26b0074422", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Loading cached processed dataset at /Users/shahules/.cache/huggingface/datasets/csv/default-489a23037feb75f1/0.0.0/cache-4ca2797a852e8539.arrow\n", - "Loading cached processed dataset at /Users/shahules/.cache/huggingface/datasets/csv/default-489a23037feb75f1/0.0.0/cache-1ad9ae5aa0f2728d.arrow\n" - ] - } - ], -
"source": [ - "dataset_malayalam = dataset_malayalam.map(lambda ex: {\"contexts\":eval(ex[\"contexts\"])})\n", - "dataset_malayalam = dataset_malayalam.map(lambda ex: {\"ground_truths\":eval(ex[\"ground_truths\"])})" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "40bc5a92-dfd1-43a6-b2ea-3f9f33c31f35", - "metadata": {}, - "outputs": [], - "source": [ - "def assign_llm(metrics,llm):\n", - "\n", - " for metric in metrics:\n", - " metric.llm = llm" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a078eb9c-8e69-406e-b7cc-065bccaa5d04", - "metadata": {}, - "outputs": [], - "source": [ - "ragas.adapt(languages=[\"hindi\"])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a2cbc86c-cef2-4785-b5bd-467d43643747", - "metadata": {}, - "outputs": [], - "source": [ - "NLI_STATEMENTS_MESSAGE_HINDI = HumanMessagePromptTemplate.from_template(\n", - " \"\"\"\n", - " Natural language inference. Only use \"Yes\" or \"No\" as verdict.\n", - " \n", - " Context:\n", - " जॉन XYZ विश्वविद्यालय का छात्र है। वह कंप्यूटर विज्ञान में डिग्री प्राप्त कर रहे हैं। इस सेमेस्टर में उन्होंने डेटा संरचनाएं, एल्गोरिदम, और डेटाबेस प्रबंधन सहित कई पाठ्यक्रमों में नामांकन किया है। जॉन एक मेहनती छात्र है और अध्ययन और असाइनमेंट पूरा करने में काफी समय बिताते हैं। वह अक्सर अपनी परियोजनाओं पर काम करने के लिए देर रात तक पुस्तकालय में रहते हैं।\n", - "\n", - " Statement_1:\n", - " जॉन जीव विज्ञान में मुख्य विषय कर रहे हैं।\n", - "\n", - " Statement_2:\n", - " जॉन कृत्रिम बुद्धिमत्ता पर एक पाठ्यक्रम ले रहे हैं।\n", - "\n", - " Statement_3:\n", - " जॉन एक समर्पित छात्र हैं।\n", - "\n", - " Statement_4:\n", - " जॉन की एक अंशकालिक नौकरी है।\n", - " Answer:\n", - " [\n", - " {{\n", - " \"statement_1\": \"जॉन जीव विज्ञान में मुख्य विषय कर रहे हैं।\",\n", - " \"reason\": \"जॉन का मुख्य विषय कंप्यूटर विज्ञान के रूप में स्पष्ट रूप से उल्लेखित है। इस बारे में कोई जानकारी नहीं है कि वह जीव विज्ञान में मुख्य विषय कर रहे हैं।\",\n", - " \"verdict\": \"No\"\n", - " }},\n", - " {{\n", - " \"statement_2\": \"जॉन कृत्रिम बुद्धिमत्ता पर एक पाठ्यक्रम ले रहे हैं।\",\n", - " \"reason\": \"प्रसंग में जॉन द्वारा वर्तमान में नामांकित किए गए पाठ्यक्रमों का उल्लेख है, और कृत्रिम बुद्धिमत्ता का उल्लेख नहीं है। इसलिए, यह नहीं कहा जा सकता कि जॉन AI पर एक पाठ्यक्रम ले रहे हैं।\",\n", - " \"verdict\": \"No\"\n", - " }},\n", - " {{\n", - " \"statement_3\": \"जॉन एक समर्पित छात्र हैं।\",\n", - " \"reason\": \"प्रसंग बताता है कि वह अध्ययन और असाइनमेंट पूरा करने में काफी समय बिताते हैं और अक्सर अपनी परियोजनाओं पर काम करने के लिए देर रात तक पुस्तकालय में रहते हैं, जो समर्पण को दर्शाता है।\",\n", - " \"verdict\": \"Yes\"\n", - " }},\n", - " {{\n", - " \"statement_4\": \"जॉन की एक अंशकालिक नौकरी है।\",\n", - " \"reason\": \"प्रसंग में जॉन के पास अंशकालिक नौकरी होने के बारे में कोई जानकारी नहीं दी गई है।\",\n", - " \"verdict\": \"No\"\n", - " }}\n", - " ]\n", - "\n", - "\n", - " context:\n", - " {context}\n", - " statements:\n", - " {statements}\n", - " Answer:\n", - " \"\"\"\n", - ")\n", - "\n", - "\n", - "LONG_FORM_ANSWER_PROMPT_HINDI = HumanMessagePromptTemplate.from_template(\n", - " \"\"\"\n", - " Create one or more statements from each sentence in the given answer.\n", - " Question: अल्बर्ट आइंस्टीन कौन थे और उन्हें किस लिए सबसे ज्यादा जाना जाता है?\n", - " Answer: वह एक जर्मन-जन्मे सैद्धांतिक भौतिकविद् थे, जिन्हें सबसे महान और प्रभावशाली भौतिकविज्ञानियों में से एक माना जाता है। वह सापेक्षता के सिद्धांत को विकसित करने के लिए सर्वाधिक प्रसिद्ध थे, उन्होंने क्वांटम यांत्रिकी के सिद्धांत के विकास में भी 
महत्वपूर्ण योगदान दिया।\n", - " Statements in JSON:\n", - " {{\n", - " \"statements\": [\n", - " \"अल्बर्ट आइंस्टीन का जन्म जर्मनी में हुआ था।\",\n", - " \"अल्बर्ट आइंस्टीन अपने सापेक्षता के सिद्धांत के लिए सर्वाधिक प्रसिद्ध थे।\"\n", - " ]\n", - " }}\n", - " question:{question}\n", - " answer: {answer}\n", - " statements in json: # noqa: E501\n", - " \"\"\"\n", - ")\n" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "7915764e-2649-415f-9ea8-d6fd3bd4fe8a", - "metadata": {}, - "outputs": [], - "source": [ - "# faithfulness.llm = ragas_bedrock_model\n", - "# answer_correctness.llm = ragas_bedrock_model\n", - "# context_recall.llm = ragas_bedrock_model\n", - "# context_precision.llm = ragas_bedrock_model" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "fdcb5bcc-bdbc-43e1-8728-88db47d90cda", - "metadata": {}, - "outputs": [], - "source": [ - "# faithfulness.llm = gpt4_wrapper" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "aec47d09-c1ae-4f5e-82f6-c3c7d083a9cb", - "metadata": {}, - "outputs": [], - "source": [ - "assign_llm([faithfulness],openai_model)" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "46db84a1-2396-4399-b8c4-47ccd46677cb", - "metadata": {}, - "outputs": [], - "source": [ - "# faithfulness.llm.llm" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "1c44073c-76ac-4316-8eae-7716cfa9dfa9", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "evaluating with [faithfulness]\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - " 0%| | 0/1 [00:00\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
[pandas DataFrame HTML rendering of the faithfulness results; table markup stripped during extraction, identical content appears in the text/plain output below]
\n", - "" - ], - "text/plain": [ - " question \\\n", - "0 यूएसए सुप्रीम कोर्ट के गर्भपात पर फैसले के वैश... \n", - "1 कार्बन मेजर्स डेटाबेस के अनुसार, GHG उत्सर्जन ... \n", - "\n", - " ground_truths \\\n", - "0 [यूएसए सुप्रीम कोर्ट के गर्भपात पर फैसले के वै... \n", - "1 [कार्बन मेजर्स डेटाबेस के अनुसार, GHG उत्सर्जन... \n", - "\n", - " answer \\\n", - "0 यूएसए सुप्रीम कोर्ट के गर्भपात पर फैसले के वैश... \n", - "1 fuel industry, are responsible for a significa... \n", - "\n", - " contexts faithfulness \n", - "0 [- 2022 में, अमेरिका के सर्वोच्च न्यायालय ने ए... 1.000000 \n", - "1 [- फॉसिल ईंधन कंपनियां, चाहे वे राज्य स्वामित्... 0.909091 " - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ragas_score.to_pandas()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e13cca4a-b10a-43b0-baad-3559306ee191", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "ragas", - "language": "python", - "name": "ragas" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.8" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} From 25cc7a51f44983a7a473dbe199e51641e6a4bc60 Mon Sep 17 00:00:00 2001 From: jjmachan Date: Fri, 19 Jan 2024 12:06:40 -0800 Subject: [PATCH 2/8] added from_openai generator --- src/ragas/testset/generator.py | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/src/ragas/testset/generator.py b/src/ragas/testset/generator.py index 6131764b7..3846b72c3 100644 --- a/src/ragas/testset/generator.py +++ b/src/ragas/testset/generator.py @@ -2,12 +2,14 @@ from dataclasses import dataclass from langchain.embeddings import OpenAIEmbeddings -from langchain_community.chat_models import ChatOpenAI +from langchain_openai.chat_models import ChatOpenAI from llama_index.readers.schema import Document as LlamaindexDocument from ragas.embeddings import BaseRagasEmbeddings from ragas.llms import BaseRagasLLM, LangchainLLMWrapper from ragas.testset.docstore import Document, DocumentStore, InMemoryDocumentStore +from ragas.executor import Executor +from ragas.testset.evolutions import SimpleEvolution, QuestionFilter, NodeFilter @dataclass @@ -37,15 +39,14 @@ def with_openai( return cls( generator_llm=generator_llm_model, critic_llm=critic_llm_model, - # TODO: remove type ignore after fixing embeddigns - embeddings=embeddings_model, # type: ignore + embeddings=embeddings_model, docstore=docstore, ) else: return cls( generator_llm=generator_llm_model, critic_llm=critic_llm_model, - embeddings=embeddings_model, # type: ignore + embeddings=embeddings_model, docstore=docstore, ) @@ -56,3 +57,23 @@ def generate_with_llamaindex_docs(self, documents: t.Sequence[LlamaindexDocument ) # create evolutions and add to executor queue # run till completion - keep updating progress bar + # + + def generate(self, test_size: int): + node_filter = NodeFilter(self.critic_llm) + ques_filter = QuestionFilter(self.critic_llm) + exec = Executor() + qs = [] + for i in range(test_size): + se = SimpleEvolution(node_filter, ques_filter) + exec.submit( + se.aevolve, + self.generator_llm, + self.docstore, + name=f"SimpleEvolution-{i}", + ) + try: + qs = exec.results() + except ValueError as e: + raise e + return qs From d3ae6e4e47e7a152ae3ecfc1c5b1b6da056aa83e 
Mon Sep 17 00:00:00 2001 From: jjmachan Date: Fri, 19 Jan 2024 12:19:50 -0800 Subject: [PATCH 3/8] TestsetGenerator working --- src/ragas/executor.py | 1 - src/ragas/testset/generator.py | 22 +++++++++++++--------- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/src/ragas/executor.py b/src/ragas/executor.py index d260ccdaa..f9cccc6ff 100644 --- a/src/ragas/executor.py +++ b/src/ragas/executor.py @@ -121,6 +121,5 @@ def results(self) -> t.List[t.Any]: finally: self.executor.shutdown(wait=False) - print(results) sorted_results = sorted(results, key=lambda x: x[0]) return [r[1] for r in sorted_results] diff --git a/src/ragas/testset/generator.py b/src/ragas/testset/generator.py index 3846b72c3..1ff77b5ce 100644 --- a/src/ragas/testset/generator.py +++ b/src/ragas/testset/generator.py @@ -35,7 +35,9 @@ def with_openai( from langchain.text_splitter import TokenTextSplitter splitter = TokenTextSplitter(chunk_size=chunk_size, chunk_overlap=0) - docstore = InMemoryDocumentStore(splitter) + docstore = InMemoryDocumentStore( + splitter=splitter, embeddings=embeddings_model + ) return cls( generator_llm=generator_llm_model, critic_llm=critic_llm_model, @@ -50,14 +52,15 @@ def with_openai( docstore=docstore, ) - def generate_with_llamaindex_docs(self, documents: t.Sequence[LlamaindexDocument]): + def generate_with_llamaindex_docs( + self, documents: t.Sequence[LlamaindexDocument], test_size: int + ): # chunk documents and add to docstore self.docstore.add_documents( [Document.from_llamaindex_document(doc) for doc in documents] ) - # create evolutions and add to executor queue - # run till completion - keep updating progress bar - # + + return self.generate(test_size=test_size) def generate(self, test_size: int): node_filter = NodeFilter(self.critic_llm) @@ -72,8 +75,9 @@ def generate(self, test_size: int): self.docstore, name=f"SimpleEvolution-{i}", ) - try: - qs = exec.results() - except ValueError as e: - raise e + + try: + qs = exec.results() + except ValueError as e: + raise e return qs From ddf11b97df6283426a1224bbc61a5a20b6459de5 Mon Sep 17 00:00:00 2001 From: jjmachan Date: Fri, 19 Jan 2024 13:54:54 -0800 Subject: [PATCH 4/8] add benchmarks for testsetgen --- Makefile | 19 ++++++++----- tests/benchmarks/benchmark_eval.py | 2 ++ tests/benchmarks/benchmark_testsetgen.py | 36 ++++++++++++++++++++++++ 3 files changed, 50 insertions(+), 7 deletions(-) create mode 100644 tests/benchmarks/benchmark_testsetgen.py diff --git a/Makefile b/Makefile index 65e62b806..950b8aedd 100644 --- a/Makefile +++ b/Makefile @@ -25,13 +25,6 @@ clean: ## Clean all generated files @cd $(GIT_ROOT) || exit 1 @find . -type f -name '*.py[co]' -delete -o -type d -name __pycache__ -delete run-ci: format lint type ## Running all CI checks -run-benchmarks: ## Run benchmarks - @echo "Running benchmarks..." - @cd $(GIT_ROOT)/tests/benchmarks && python benchmark_eval.py -run-benchmarks-in-docker: ## Run benchmarks in docker - @echo "Running benchmarks in docker..." - @cd $(GIT_ROOT) - docker buildx build --build-arg OPENAI_API_KEY=$(OPENAI_API_KEY) -t ragas-benchmark -f $(GIT_ROOT)/tests/benchmarks/Dockerfile . docker inspect ragas-benchmark:latest | jq ".[0].Size" | numfmt --to=si test: ## Run tests @echo "Running tests..." 
@@ -46,3 +39,15 @@ docs-site: ## Build and serve documentation @python -m http.server --directory $(GIT_ROOT)/docs/_build/html watch-docs: ## Build and watch documentation sphinx-autobuild docs docs/_build/html --watch $(GIT_ROOT)/src/ --ignore ".ipynb" + +# Benchmarks +run-benchmarks-eval: ## Run benchmarks for Evaluation + @echo "Running benchmarks for Evaluation..." + @cd $(GIT_ROOT)/tests/benchmarks && python benchmark_eval.py +run-benchmarks-testset: ## Run benchmarks for TestSet Generation + @echo "Running benchmarks for TestSet Generation..." + @cd $(GIT_ROOT)/tests/benchmarks && python benchmark_testsetgen.py +run-benchmarks-in-docker: ## Run benchmarks in docker + @echo "Running benchmarks in docker..." + @cd $(GIT_ROOT) + docker buildx build --build-arg OPENAI_API_KEY=$(OPENAI_API_KEY) -t ragas-benchmark -f $(GIT_ROOT)/tests/benchmarks/Dockerfile . diff --git a/tests/benchmarks/benchmark_eval.py b/tests/benchmarks/benchmark_eval.py index 8a1c01664..55d44a96b 100644 --- a/tests/benchmarks/benchmark_eval.py +++ b/tests/benchmarks/benchmark_eval.py @@ -1,4 +1,5 @@ import time +import os from datasets import DatasetDict, load_dataset @@ -33,6 +34,7 @@ answer_similarity, ] +os.environ["PYTHONASYNCIODEBUG"] = "1" IGNORE_THREADS = False IGNORE_ASYNCIO = False diff --git a/tests/benchmarks/benchmark_testsetgen.py b/tests/benchmarks/benchmark_testsetgen.py new file mode 100644 index 000000000..96963d73c --- /dev/null +++ b/tests/benchmarks/benchmark_testsetgen.py @@ -0,0 +1,36 @@ +import os +from ragas.testset.generator import TestsetGenerator +from llama_index import download_loader +import time + +generator = TestsetGenerator.with_openai() + + +def get_documents(): + SemanticScholarReader = download_loader("SemanticScholarReader") + loader = SemanticScholarReader() + # Narrow down the search space + query_space = "large language models" + # Increase the limit to obtain more documents + documents = loader.load_data(query=query_space, limit=10) + + return documents + + +IGNORE_THREADS = True +IGNORE_ASYNCIO = False + +if __name__ == "__main__": + documents = get_documents() + + # asyncio + if not IGNORE_ASYNCIO: + os.environ["PYTHONASYNCIODEBUG"] = "1" + print("Starting [Asyncio]") + start = time.time() + generator.generate_with_llamaindex_docs(documents, test_size=100) + print(f"Time taken: {time.time() - start:.2f}s") + + # Threads + if not IGNORE_THREADS: + print("Starting [Threads]") From f71f40286180f524c08cf0e63deeac55fdafc9b1 Mon Sep 17 00:00:00 2001 From: jjmachan Date: Fri, 19 Jan 2024 13:55:03 -0800 Subject: [PATCH 5/8] utils to patch logger --- src/ragas/utils.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/src/ragas/utils.py b/src/ragas/utils.py index 137f8f118..80dac80ba 100644 --- a/src/ragas/utils.py +++ b/src/ragas/utils.py @@ -1,6 +1,7 @@ from __future__ import annotations import os +import logging from functools import lru_cache DEBUG_ENV_VAR = "RAGAS_DEBUG" @@ -21,3 +22,19 @@ def get_debug_mode() -> bool: return True else: return False + + +def patch_logger(module: str, level: int): + # enable debug logging + patched_logger = logging.getLogger(module) + patched_logger.setLevel(level=level) + # Create a handler for the asyncio logger + handler = logging.StreamHandler() # or another type of Handler + handler.setLevel(logging.DEBUG) + # Optional: Set a formatter if you want a specific format for the logs + formatter = logging.Formatter("[%(name)s.%(levelname)s] %(message)s") + handler.setFormatter(formatter) + # Add the handler to the asyncio logger 
+ patched_logger.addHandler(handler) + # Set propagate to False if you don't want it to log to the root logger's handlers as well + patched_logger.propagate = False From 0113f7d1ef0d8f160fe77660565d1ec49a52205c Mon Sep 17 00:00:00 2001 From: jjmachan Date: Fri, 19 Jan 2024 19:43:34 -0800 Subject: [PATCH 6/8] added logger --- src/ragas/testset/evolutions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ragas/testset/evolutions.py b/src/ragas/testset/evolutions.py index 6bd5f6979..4e3adedce 100644 --- a/src/ragas/testset/evolutions.py +++ b/src/ragas/testset/evolutions.py @@ -89,7 +89,7 @@ async def aretry_evolve( ): if update_count: self._tries += 1 - print("retrying evolution: %s times", self._tries) + logger.info("retrying evolution: %s times", self._tries) if self._tries > self.max_tries: # TODO: make this into a custom exception raise ValueError("Max tries reached") From 5aeaa944e06ee3db51673d1e0f8aad42ac12402d Mon Sep 17 00:00:00 2001 From: jjmachan Date: Fri, 19 Jan 2024 19:52:36 -0800 Subject: [PATCH 7/8] fmt --- Makefile | 2 +- src/ragas/testset/generator.py | 4 ++-- src/ragas/utils.py | 2 +- tests/benchmarks/benchmark_eval.py | 2 +- tests/benchmarks/benchmark_testsetgen.py | 6 ++++-- 5 files changed, 9 insertions(+), 7 deletions(-) diff --git a/Makefile b/Makefile index 950b8aedd..03503e779 100644 --- a/Makefile +++ b/Makefile @@ -25,7 +25,6 @@ clean: ## Clean all generated files @cd $(GIT_ROOT) || exit 1 @find . -type f -name '*.py[co]' -delete -o -type d -name __pycache__ -delete run-ci: format lint type ## Running all CI checks - docker inspect ragas-benchmark:latest | jq ".[0].Size" | numfmt --to=si test: ## Run tests @echo "Running tests..." @pytest tests/unit $(shell if [ -n "$(k)" ]; then echo "-k $(k)"; fi) @@ -51,3 +50,4 @@ run-benchmarks-in-docker: ## Run benchmarks in docker @echo "Running benchmarks in docker..." @cd $(GIT_ROOT) docker buildx build --build-arg OPENAI_API_KEY=$(OPENAI_API_KEY) -t ragas-benchmark -f $(GIT_ROOT)/tests/benchmarks/Dockerfile . 
+ docker inspect ragas-benchmark:latest | jq ".[0].Size" | numfmt --to=si diff --git a/src/ragas/testset/generator.py b/src/ragas/testset/generator.py index 1ff77b5ce..94b957034 100644 --- a/src/ragas/testset/generator.py +++ b/src/ragas/testset/generator.py @@ -6,10 +6,10 @@ from llama_index.readers.schema import Document as LlamaindexDocument from ragas.embeddings import BaseRagasEmbeddings +from ragas.executor import Executor from ragas.llms import BaseRagasLLM, LangchainLLMWrapper from ragas.testset.docstore import Document, DocumentStore, InMemoryDocumentStore -from ragas.executor import Executor -from ragas.testset.evolutions import SimpleEvolution, QuestionFilter, NodeFilter +from ragas.testset.evolutions import NodeFilter, QuestionFilter, SimpleEvolution @dataclass diff --git a/src/ragas/utils.py b/src/ragas/utils.py index 80dac80ba..e59179d76 100644 --- a/src/ragas/utils.py +++ b/src/ragas/utils.py @@ -1,7 +1,7 @@ from __future__ import annotations -import os import logging +import os from functools import lru_cache DEBUG_ENV_VAR = "RAGAS_DEBUG" diff --git a/tests/benchmarks/benchmark_eval.py b/tests/benchmarks/benchmark_eval.py index 55d44a96b..2b78d482d 100644 --- a/tests/benchmarks/benchmark_eval.py +++ b/tests/benchmarks/benchmark_eval.py @@ -1,5 +1,5 @@ -import time import os +import time from datasets import DatasetDict, load_dataset diff --git a/tests/benchmarks/benchmark_testsetgen.py b/tests/benchmarks/benchmark_testsetgen.py index 96963d73c..e78e7eb48 100644 --- a/tests/benchmarks/benchmark_testsetgen.py +++ b/tests/benchmarks/benchmark_testsetgen.py @@ -1,8 +1,10 @@ import os -from ragas.testset.generator import TestsetGenerator -from llama_index import download_loader import time +from llama_index import download_loader + +from ragas.testset.generator import TestsetGenerator + generator = TestsetGenerator.with_openai() From 05c579e776d44077e1209c1cb0c344e4dc7de8ae Mon Sep 17 00:00:00 2001 From: jjmachan Date: Fri, 19 Jan 2024 20:04:09 -0800 Subject: [PATCH 8/8] remove experiments --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 03503e779..b273ec94d 100644 --- a/Makefile +++ b/Makefile @@ -8,7 +8,7 @@ format: ## Running code formatter: black and isort @echo "(isort) Ordering imports..." @isort . @echo "(black) Formatting codebase..." - @black --config pyproject.toml src tests docs experiments + @black --config pyproject.toml src tests docs @echo "(black) Formatting stubs..." @find src -name "*.pyi" ! -name "*_pb2*" -exec black --pyi --config pyproject.toml {} \; @echo "(ruff) Running fix only..."
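
Usage sketch (illustrative, not part of the patch series): patches 2-3 add TestsetGenerator.with_openai() and generate_with_llamaindex_docs(), and patch 5 adds the patch_logger helper. A minimal end-to-end sketch under a few assumptions: documents are loaded with llama_index's SimpleDirectoryReader (any loader yielding llama_index Documents should work), OPENAI_API_KEY is set in the environment, the evolutions module logger follows the usual __name__ convention, and "path/to/docs" / test_size=10 are placeholder values:

    import logging

    from llama_index import SimpleDirectoryReader

    from ragas.testset.generator import TestsetGenerator
    from ragas.utils import patch_logger

    # surface the "retrying evolution" logs added in patch 6
    patch_logger("ragas.testset.evolutions", logging.INFO)

    # with_openai() wires up the generator/critic LLMs, OpenAI embeddings,
    # and an in-memory docstore with a token-based splitter (patches 2-3)
    generator = TestsetGenerator.with_openai()

    # placeholder path; SimpleDirectoryReader yields llama_index Documents
    documents = SimpleDirectoryReader("path/to/docs").load_data()

    # chunks the documents into the docstore, then submits test_size
    # SimpleEvolution jobs to the Executor and returns the questions
    testset = generator.generate_with_llamaindex_docs(documents, test_size=10)

Note that generate() re-raises the ValueError emitted when an evolution exhausts its retries (max_tries), so callers may want to catch ValueError around the call above.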