From 5826734998e4185df5edf8df76fffff3a361896b Mon Sep 17 00:00:00 2001
From: Jithin James <jjmachan@pop-os.localdomain>
Date: Fri, 12 May 2023 23:57:50 +0530
Subject: [PATCH] fix: batching in Metric

---
 belar/metrics/__init__.py   |   1 +
 belar/metrics/base.py       |  23 ++-
 belar/metrics/similarity.py |  64 +++++++++
 belar/metrics/simple.py     |   2 +
 examples/quickstart.ipynb   | 276 +++++++++++++++++-------------------
 5 files changed, 208 insertions(+), 158 deletions(-)
 create mode 100644 belar/metrics/similarity.py

diff --git a/belar/metrics/__init__.py b/belar/metrics/__init__.py
index f3b63be9f..1dc56eba2 100644
--- a/belar/metrics/__init__.py
+++ b/belar/metrics/__init__.py
@@ -1,2 +1,3 @@
 from belar.metrics.base import Evaluation, Metric
+from belar.metrics.similarity import *
 from belar.metrics.simple import *
diff --git a/belar/metrics/base.py b/belar/metrics/base.py
index b900abce5..9662a8cf4 100644
--- a/belar/metrics/base.py
+++ b/belar/metrics/base.py
@@ -23,24 +23,21 @@ def is_batchable(self) -> bool:
     def score(self, ground_truth, generated_text) -> float | list[float]:
         ...
 
-    def __call__(self, row):
-        score = self.score(row["ground_truth"], row["generated_text"])
-        row[f"{self.name}_score"] = score
-
-        return row
-
 
 @dataclass
 class Evaluation:
     metrics: list[Metric]
+    batched: bool = False  # forwarded to Dataset.map; applies to every metric
 
-    def eval(
-        self, ground_truth: Dataset, generated_text: t.Sequence, batched: bool = False
-    ):
+    def eval(self, ground_truth: Dataset, generated_text: t.Sequence):
         ds = ground_truth.add_column("generated_text", generated_text)
-        scores_list = []
+        ds = ds.map(self._get_score, batched=self.batched)  # one column per metric
+
+        return ds
+
+    def _get_score(self, row):  # adds "<metric.name>_score" entries to row
         for metric in self.metrics:
-            scores = ds.map(metric, batched=batched)[f"{metric.name}_score"]
-            scores_list.append(scores)
+            score = metric.score(row["ground_truth"], row["generated_text"])
+            row[f"{metric.name}_score"] = score
 
-        return scores_list
+        return row
diff --git a/belar/metrics/similarity.py b/belar/metrics/similarity.py
new file mode 100644
index 000000000..06d7e9135
--- /dev/null
+++ b/belar/metrics/similarity.py
@@ -0,0 +1,64 @@
+from __future__ import annotations
+
+import typing as t
+from dataclasses import dataclass
+
+import numpy as np
+from numpy.linalg import norm
+from sentence_transformers import SentenceTransformer
+
+from belar.metrics.base import Metric
+
+SBERT_METRIC = t.Literal["cosine", "euclidean"]  # modes SBERTScore.score handles
+
+
+@dataclass
+class SBERTScore(Metric):
+    """Sentence-BERT cosine similarity or Euclidean distance per text pair.
+
+    The model named by ``model_path`` is loaded once, at construction time.
+    """
+
+    similarity_metric: SBERT_METRIC = "cosine"
+    model_path: str = "all-MiniLM-L6-v2"
+    batch_size: int = 1000
+
+    def __post_init__(self):
+        self.model = SentenceTransformer(self.model_path)
+
+    @property
+    def name(self):
+        return f"SBERT_{self.similarity_metric}"
+
+    @property
+    def is_batchable(self):
+        return True
+
+    def score(self, ground_truth: str | list[str], generated_text: str | list[str]):
+        """Return one score per pair, or a float when both inputs are strings."""
+        single = isinstance(ground_truth, str) and isinstance(generated_text, str)
+        if isinstance(ground_truth, str):
+            ground_truth = [ground_truth]
+        if isinstance(generated_text, str):
+            generated_text = [generated_text]
+
+        gndtruth_emb = self.model.encode(
+            ground_truth, batch_size=self.batch_size, convert_to_numpy=True
+        )
+        gentext_emb = self.model.encode(
+            generated_text, batch_size=self.batch_size, convert_to_numpy=True
+        )
+
+        if self.similarity_metric == "cosine":
+            # Row-wise: a full matrix product would mix unrelated pairs.
+            dots = np.sum(gndtruth_emb * gentext_emb, axis=1)
+            score = dots / (norm(gndtruth_emb, axis=1) * norm(gentext_emb, axis=1))
+        elif self.similarity_metric == "euclidean":
+            # axis=1: per-pair distance; without it ord=2 is a matrix norm.
+            score = norm(gndtruth_emb - gentext_emb, ord=2, axis=1)
+        else:
+            raise ValueError(f"Unknown metrics {self.similarity_metric}")
+        return float(score[0]) if single else score.tolist()
+
+
+__all__ = ["SBERTScore"]  # names exposed by `from belar.metrics.similarity import *`
diff --git a/belar/metrics/simple.py b/belar/metrics/simple.py
index 338506565..1ff7f9acc 100644
--- a/belar/metrics/simple.py
+++ b/belar/metrics/simple.py
@@ -20,9 +20,11 @@ def __post_init__(self):
             [self.type], use_stemmer=self.use_stemmer
         )
 
+    @property  # read as an attribute, e.g. f"{metric.name}_score"
     def name(self):
         return self.type
 
+    @property  # a bare method object would always be truthy when tested
     def is_batchable(self):
         return False
 
diff --git a/examples/quickstart.ipynb b/examples/quickstart.ipynb
index a4c85a8d8..6effae29b 100644
--- a/examples/quickstart.ipynb
+++ b/examples/quickstart.ipynb
@@ -3,7 +3,7 @@
   {
    "cell_type": "code",
    "execution_count": 7,
-   "id": "806b182e",
+   "id": "992c777a",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -13,8 +13,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
-   "id": "4dfd5a18",
+   "execution_count": 4,
+   "id": "5eaf4729",
    "metadata": {},
    "outputs": [
     {
@@ -27,7 +27,7 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "b3e986700a834fa6ab1e327b49e1fae3",
+       "model_id": "a2231863e61c4ffd8d695c8531a48139",
        "version_major": 2,
        "version_minor": 0
       },
@@ -42,7 +42,7 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "Loading cached processed dataset at /home/jjmachan/.cache/huggingface/datasets/eli5/LFQA_reddit/1.0.0/17574e5502a10f41bbd17beba83e22475b499fa62caa1384a3d093fc856fe6fa/cache-b6864f61633d1e41.arrow\n"
+      "Loading cached processed dataset at /home/jjmachan/.cache/huggingface/datasets/eli5/LFQA_reddit/1.0.0/17574e5502a10f41bbd17beba83e22475b499fa62caa1384a3d093fc856fe6fa/cache-f3427cd7a8a8674f.arrow\n"
      ]
     }
    ],
@@ -51,41 +51,70 @@
     "\n",
     "def format_for_belar(row):\n",
     "    row[\"context\"] = row[\"selftext\"]\n",
-    "    row[\"question\"] = row[\"title\"]\n",
-    "    row['answers'] = row[\"answers\"][\"text\"]\n",
+    "    row[\"prompt\"] = row[\"title\"]\n",
+    "    row['ground_truth'] = row[\"answers\"][\"text\"]\n",
     "    return row\n",
     "    \n",
     "d = load_dataset(\"eli5\")\n",
     "ds = d['test_eli5'].map(format_for_belar, batched=False)\n",
-    "ds = ds.select_columns([\"context\", \"question\", \"answers\"])"
+    "ds = ds.select_columns([\"context\", \"prompt\", \"ground_truth\"])"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
-   "id": "5c2974c5",
+   "execution_count": 5,
+   "id": "a501c296",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "(100, 3)"
+       "(500, 3)"
       ]
      },
-     "execution_count": 3,
+     "execution_count": 5,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "ds = ds.shuffle().select(range(100))\n",
+    "ds = ds.shuffle(seed=42).select(range(500))\n",
     "ds.shape"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
-   "id": "5546153b",
+   "execution_count": 8,
+   "id": "763335eb",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['context', 'prompt', 'ground_truth']"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "ds.column_names"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6aff2ae9",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "c32c39f5",
    "metadata": {},
    "outputs": [
     {
@@ -96,7 +125,7 @@
        "version_minor": 0
       },
       "text/plain": [
-       "Map:   0%|          | 0/100 [00:00<?, ? examples/s]"
+       "Map:   0%|          | 0/500 [00:00<?, ? examples/s]"
       ]
      },
      "metadata": {},
@@ -104,19 +133,34 @@
     }
    ],
    "source": [
+    "import concurrent.futures as f\n",
+    "from langchain.llms import OpenAI\n",
+    "\n",
+    "llm = OpenAI()\n",
+    "prompt = \"\"\"\n",
+    "{context}\n",
+    "with the above context explain like I'm five: {prompt}\n",
+    "\"\"\"\n",
+    "\n",
     "def get_answers(row):\n",
-    "    row[\"answer_generated\"] = llm(prompt.format(\n",
-    "        context=row['context'],\n",
-    "        question=row['question'])\n",
-    "       )\n",
+    "    # batched map: every column in `row` is a list of up to batch_size values\n",
+    "    qs, cs = row[\"prompt\"], row[\"context\"]\n",
+    "    prompts = [prompt.format(context=c, prompt=q) for c, q in zip(cs, qs)]\n",
+    "    \n",
+    "    # fan the LLM calls out over threads; executor.map keeps input order\n",
+    "    with f.ThreadPoolExecutor(max_workers=10) as executor:\n",
+    "        generated_answers = list(executor.map(llm, prompts))\n",
+    "    \n",
+    "    # must be \"generated_text\": the evaluation cell reads ds[\"generated_text\"]\n",
+    "    row[\"generated_text\"] = generated_answers\n",
     "    return row\n",
     "    \n",
-    "ds = ds.map(get_answers, batched=False)"
+    "ds = ds.map(get_answers, batched=True, batch_size=10)"
    ]
   },
   {
    "cell_type": "markdown",
-   "id": "5fd1de0f",
+   "id": "c5ddec2d",
    "metadata": {},
    "source": [
     "## Evalutate"
@@ -124,18 +168,18 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
-   "id": "fe5c5195",
+   "execution_count": 54,
+   "id": "2d0a7ba7",
    "metadata": {},
    "outputs": [],
    "source": [
-    "from belar.metrics import Rouge1, Evaluation"
+    "from belar.metrics import Rouge1, Evaluation, Rouge2, RougeL"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 9,
-   "id": "e4e5ee0a",
+   "id": "001ab431",
    "metadata": {},
    "outputs": [
     {
@@ -156,7 +200,7 @@
   {
    "cell_type": "code",
    "execution_count": 15,
-   "id": "8eb82c25",
+   "id": "ce9448b9",
    "metadata": {},
    "outputs": [
     {
@@ -197,158 +241,100 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 39,
-   "id": "c3d4543d",
+   "execution_count": 47,
+   "id": "49565367",
    "metadata": {},
    "outputs": [
     {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "WARNING:datasets.arrow_dataset:Loading cached processed dataset at /home/jjmachan/.cache/huggingface/datasets/eli5/LFQA_reddit/1.0.0/17574e5502a10f41bbd17beba83e22475b499fa62caa1384a3d093fc856fe6fa/cache-b775b691e85e97cb.arrow\n"
-     ]
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "a14aebe3baca44e386442688a86b26ad",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
     },
     {
      "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "ed53f7de6f7c4638ad664b9553e14789",
+       "version_major": 2,
+       "version_minor": 0
+      },
       "text/plain": [
-       "[[0.3235294117647059,\n",
-       "  0.3870967741935484,\n",
-       "  0.3372093023255814,\n",
-       "  0.296875,\n",
-       "  0.3785488958990536,\n",
-       "  0.19729206963249518,\n",
-       "  0.27807486631016043,\n",
-       "  0.25225225225225223,\n",
-       "  0.15196078431372548,\n",
-       "  0.1170212765957447,\n",
-       "  0.24509803921568626,\n",
-       "  0.1894736842105263,\n",
-       "  0.3106796116504854,\n",
-       "  0.3240223463687151,\n",
-       "  0.20689655172413796,\n",
-       "  0.15686274509803924,\n",
-       "  0.11363636363636362,\n",
-       "  0.27636363636363637,\n",
-       "  0.2524271844660194,\n",
-       "  0.380952380952381,\n",
-       "  0.233502538071066,\n",
-       "  0.1842105263157895,\n",
-       "  0.22900763358778622,\n",
-       "  0.1518987341772152,\n",
-       "  0.2631578947368421,\n",
-       "  0.3764705882352941,\n",
-       "  0.2301369863013699,\n",
-       "  0.2553191489361702,\n",
-       "  0.20689655172413796,\n",
-       "  0.20930232558139533,\n",
-       "  0.4035087719298246,\n",
-       "  0.24295774647887325,\n",
-       "  0.271604938271605,\n",
-       "  0.22307692307692306,\n",
-       "  0.18181818181818182,\n",
-       "  0.37818181818181823,\n",
-       "  0.15436241610738255,\n",
-       "  0.17500000000000002,\n",
-       "  0.30075187969924805,\n",
-       "  0.1257142857142857,\n",
-       "  0.3574144486692016,\n",
-       "  0.20408163265306123,\n",
-       "  0.36601307189542487,\n",
-       "  0.2637362637362637,\n",
-       "  0.21794871794871798,\n",
-       "  0.2684563758389262,\n",
-       "  0.23354564755838642,\n",
-       "  0.1518987341772152,\n",
-       "  0.12743362831858404,\n",
-       "  0.18461538461538463,\n",
-       "  0.23809523809523808,\n",
-       "  0.2580645161290323,\n",
-       "  0.32352941176470584,\n",
-       "  0.2891566265060241,\n",
-       "  0.24050632911392403,\n",
-       "  0.18888888888888888,\n",
-       "  0.2956521739130435,\n",
-       "  0.18840579710144928,\n",
-       "  0.3103448275862069,\n",
-       "  0.05714285714285715,\n",
-       "  0.23076923076923078,\n",
-       "  0.3003952569169961,\n",
-       "  0.24719101123595508,\n",
-       "  0.1072961373390558,\n",
-       "  0.180327868852459,\n",
-       "  0.11666666666666667,\n",
-       "  0.2268041237113402,\n",
-       "  0.2709677419354839,\n",
-       "  0.3373493975903615,\n",
-       "  0.22525597269624573,\n",
-       "  0.2654028436018957,\n",
-       "  0.2072538860103627,\n",
-       "  0.13953488372093026,\n",
-       "  0.1981132075471698,\n",
-       "  0.25842696629213485,\n",
-       "  0.11864406779661017,\n",
-       "  0.1834061135371179,\n",
-       "  0.2310756972111554,\n",
-       "  0.3469387755102041,\n",
-       "  0.28571428571428575,\n",
-       "  0.18055555555555555,\n",
-       "  0.24489795918367346,\n",
-       "  0.26519337016574585,\n",
-       "  0.27586206896551724,\n",
-       "  0.1282798833819242,\n",
-       "  0.2105263157894737,\n",
-       "  0.20905923344947733,\n",
-       "  0.3793103448275862,\n",
-       "  0.25000000000000006,\n",
-       "  0.17142857142857143,\n",
-       "  0.35251798561151076,\n",
-       "  0.16551724137931034,\n",
-       "  0.13333333333333333,\n",
-       "  0.14835164835164835,\n",
-       "  0.11764705882352941,\n",
-       "  0.4154589371980677,\n",
-       "  0.4647887323943662,\n",
-       "  0.125,\n",
-       "  0.16296296296296295,\n",
-       "  0.37007874015748027]]"
+       "Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]"
       ]
      },
-     "execution_count": 39,
      "metadata": {},
-     "output_type": "execute_result"
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "39b71f5f8c0742f3b90da786e910eef3",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
     }
    ],
    "source": [
-    "e = Evaluation(metrics=[Rouge1])\n",
-    "e.eval(ds.select_columns([\"ground_truth\"]),\n",
-    "                         ds[\"generated_text\"])"
+    "ds.push_to_hub(\"explodinggradients/eli5-test\")"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 36,
-   "id": "86981217",
+   "execution_count": 81,
+   "id": "1dcbdbd1",
    "metadata": {},
    "outputs": [
     {
      "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "",
+       "version_major": 2,
+       "version_minor": 0
+      },
       "text/plain": [
-       "list"
+       "Map:   0%|          | 0/100 [00:00<?, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/plain": [
+       "Dataset({\n",
+       "    features: ['ground_truth', 'generated_text', 'rouge1_score', 'rouge2_score', 'rougeL_score'],\n",
+       "    num_rows: 100\n",
+       "})"
       ]
      },
-     "execution_count": 36,
+     "execution_count": 81,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "type(ds[\"generated_text\"])"
+    "e = Evaluation(metrics=[Rouge1, Rouge2, RougeL])\n",
+    "e.eval(ds.select_columns([\"ground_truth\"]), ds[\"generated_text\"])"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "247af506",
+   "id": "27b0b9a7",
    "metadata": {},
    "outputs": [],
    "source": []