fix: Converted VG to hierarchical (embeddings-benchmark#694)
* Added hierarchical VG clustering tasks

* Added stratified subsampling for multilabel tasks to AbsTask

* Added stratified subsampling to VG clustering

* Fixed stratified subsampling for multilabel tasks

* fix: Converted VG to AbsTaskClusteringFast

* Added results for paraphrase model

* Removed debugging print statements

* Added 'not specified' license to VGHierarchical

* Added proper license from Norsk Aviskorpus

* Ran linting

* Replaced stratification with just regular subsampling

* fix: fixed subsampling

* Added results for VG

* Added points

* fix: Fixed JSON in 694.jsonl

---------

Co-authored-by: Kenneth Enevoldsen <kennethcenevoldsen@gmail.com>
2 people authored and dokato committed May 24, 2024
1 parent fc576aa commit b5b3823
Showing 9 changed files with 321 additions and 2 deletions.
2 changes: 2 additions & 0 deletions docs/mmteb/points/694.jsonl
@@ -0,0 +1,2 @@
{"GitHub": "x-tabdeveloping", "Bug fixes": 4}
{"GitHub": "KennethEnevoldsen", "Review PR": 2}
40 changes: 38 additions & 2 deletions mteb/abstasks/AbsTask.py
@@ -3,12 +3,14 @@
import logging
import random
from abc import ABC, abstractmethod
from typing import Any
from typing import Any, Sequence

import datasets
import numpy as np
import torch
from datasets import DatasetDict
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

from mteb.abstasks.TaskMetadata import TaskMetadata
from mteb.encoder_interface import Encoder, EncoderWithQueryCorpusEncode
@@ -19,6 +21,32 @@
logger = logging.getLogger(__name__)


def _multilabel_subsampling(
dataset_dict: datasets.DatasetDict,
seed: int,
splits: list[str] = ["test"],
label: str = "label",
n_samples: int = 2048,
) -> datasets.DatasetDict:
"""Startified subsampling for multilabel problems."""
for split in splits:
labels = dataset_dict[split][label]
encoded_labels = MultiLabelBinarizer().fit_transform(labels)
idxs = np.arange(len(labels))
try:
idxs, *_ = train_test_split(
idxs,
encoded_labels,
stratify=encoded_labels,
random_state=seed,
train_size=n_samples,
)
except ValueError:
logger.warning("Couldn't subsample, continuing with full split.")
dataset_dict.update({split: dataset_dict[split].select(idxs)})
return dataset_dict


class AbsTask(ABC):
metadata: TaskMetadata
superseeded_by: None | str = None
@@ -113,7 +141,15 @@ def stratified_subsampling(
"""
## Can only do this if the label column is of ClassLabel.
if not isinstance(dataset_dict[splits[0]].features[label], datasets.ClassLabel):
dataset_dict = dataset_dict.class_encode_column(label)
try:
dataset_dict = dataset_dict.class_encode_column(label)
except ValueError as e:
if isinstance(dataset_dict[splits[0]][label][0], Sequence):
return _multilabel_subsampling(
dataset_dict, seed, splits, label, n_samples
)
else:
raise e

for split in splits:
dataset_dict.update(
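
In effect, stratified_subsampling now falls back to a multilabel-aware path whenever the label column holds lists of labels rather than a single class. Below is a minimal, self-contained sketch of that fallback behaviour; the toy dataset and the helper name multilabel_subsample are illustrative only and not part of this commit.

import datasets
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

# Toy multilabel dataset: every row carries a list of labels.
toy = datasets.DatasetDict(
    {
        "test": datasets.Dataset.from_dict(
            {
                "sentences": [f"document {i}" for i in range(100)],
                "label": [["sports"] if i % 2 else ["sports", "politics"] for i in range(100)],
            }
        )
    }
)

def multilabel_subsample(dataset_dict, seed=42, split="test", label="label", n_samples=32):
    """Stratify on the binarized label matrix; keep the full split if stratification fails."""
    labels = dataset_dict[split][label]
    encoded = MultiLabelBinarizer().fit_transform(labels)
    idxs = np.arange(len(labels))
    try:
        idxs, *_ = train_test_split(
            idxs, encoded, stratify=encoded, random_state=seed, train_size=n_samples
        )
    except ValueError:
        pass  # e.g. a label combination occurs only once and cannot be stratified
    dataset_dict[split] = dataset_dict[split].select(idxs)
    return dataset_dict

print(len(multilabel_subsample(toy)["test"]))  # 32 rows, label combinations roughly preserved
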
1 change: 1 addition & 0 deletions mteb/tasks/Clustering/__init__.py
@@ -38,6 +38,7 @@
from .nob.snl_clustering import *
from .nob.SNLHierarchicalClustering import *
from .nob.vg_clustering import *
from .nob.VGHierarchicalClustering import *
from .pol.PolishClustering import *
from .rom.RomaniBibleClustering import *
from .spa.SpanishNewsClusteringP2P import *
99 changes: 99 additions & 0 deletions mteb/tasks/Clustering/nob/VGHierarchicalClustering.py
@@ -0,0 +1,99 @@
from __future__ import annotations

from mteb.abstasks import TaskMetadata
from mteb.abstasks.AbsTaskClusteringFast import AbsTaskClusteringFast

N_SAMPLES = 2048


def split_labels(record: dict) -> dict:
record["labels"] = record["labels"].split(",")[:2]
return record


class VGHierarchicalClusteringP2P(AbsTaskClusteringFast):
metadata = TaskMetadata(
name="VGHierarchicalClusteringP2P",
dataset={
"path": "navjordj/VG_summarization",
"revision": "d4c5a8ba10ae71224752c727094ac4c46947fa29",
},
description="Articles and their classes (e.g. sports) from VG news articles extracted from Norsk Aviskorpus.",
reference="https://huggingface.co/datasets/navjordj/VG_summarization",
type="Clustering",
category="p2p",
eval_splits=["test"],
eval_langs=["nob-Latn"],
main_score="v_measure",
date=("2020-01-01", "2024-12-31"), # best guess
form=["written"],
domains=["News", "Non-fiction"],
license="CC-BY-NC 4.0",
socioeconomic_status="mixed",
annotations_creators="derived",
dialect=[],
task_subtypes=["Thematic clustering"],
text_creation="found",
bibtex_citation="""@mastersthesis{navjord2023beyond,
title={Beyond extractive: advancing abstractive automatic text summarization in Norwegian with transformers},
author={Navjord, J{\o}rgen Johnsen and Korsvik, Jon-Mikkel Ryen},
year={2023},
school={Norwegian University of Life Sciences, {\AA}s}
}""",
n_samples={"test": N_SAMPLES},
avg_character_length={"test": 2670.3243084794544},
)

def dataset_transform(self) -> None:
self.dataset = self.dataset.rename_columns(
{"article": "sentences", "classes": "labels"}
)
self.dataset = self.dataset.map(split_labels)
# Subsampling the dataset
self.dataset["test"] = self.dataset["test"].train_test_split(
test_size=N_SAMPLES, seed=self.seed
)["test"]


class VGHierarchicalClusteringS2S(AbsTaskClusteringFast):
metadata = TaskMetadata(
name="VGHierarchicalClusteringS2S",
dataset={
"path": "navjordj/VG_summarization",
"revision": "d4c5a8ba10ae71224752c727094ac4c46947fa29",
},
description="Articles and their classes (e.g. sports) from VG news articles extracted from Norsk Aviskorpus.",
reference="https://huggingface.co/datasets/navjordj/VG_summarization",
type="Clustering",
category="p2p",
eval_splits=["test"],
eval_langs=["nob-Latn"],
main_score="v_measure",
date=("2020-01-01", "2024-12-31"), # best guess
form=["written"],
domains=["News", "Non-fiction"],
license="CC-BY-NC 4.0",
socioeconomic_status="mixed",
annotations_creators="derived",
dialect=[],
task_subtypes=["Thematic clustering"],
text_creation="found",
bibtex_citation="""@mastersthesis{navjord2023beyond,
title={Beyond extractive: advancing abstractive automatic text summarization in Norwegian with transformers},
author={Navjord, J{\o}rgen Johnsen and Korsvik, Jon-Mikkel Ryen},
year={2023},
school={Norwegian University of Life Sciences, {\AA}s}
}""",
n_samples={"test": N_SAMPLES},
avg_character_length={"test": 139.31247668283325},
)

def dataset_transform(self) -> None:
self.dataset = self.dataset.rename_columns(
{"ingress": "sentences", "classes": "labels"}
)
self.dataset = self.dataset.map(split_labels)
# Subsampling the dataset
self.dataset["test"] = self.dataset["test"].train_test_split(
test_size=N_SAMPLES, seed=self.seed
)["test"]
1 change: 1 addition & 0 deletions mteb/tasks/Clustering/nob/vg_clustering.py
@@ -21,6 +21,7 @@ def batched(iterable: Iterable[T], n: int) -> Iterable[tuple[T, ...]]:


class VGClustering(AbsTaskClustering):
superseeded_by = "VGHierarchicalClusteringP2P"
metadata = TaskMetadata(
name="VGClustering",
dataset={
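
With VGClustering now pointing at its hierarchical replacement, the new tasks can be run through the usual MTEB entry point. A sketch, assuming sentence-transformers is installed; the model name is only an example, since the commit mentions results for a paraphrase model without pinning the exact checkpoint here:

from mteb import MTEB
from sentence_transformers import SentenceTransformer

# Illustrative model choice; any SentenceTransformer-compatible encoder works.
model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")

evaluation = MTEB(tasks=["VGHierarchicalClusteringP2P", "VGHierarchicalClusteringS2S"])
evaluation.run(model, output_folder="results")
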
@@ -0,0 +1,45 @@
{
"dataset_revision": "d4c5a8ba10ae71224752c727094ac4c46947fa29",
"evaluation_time": 135.50683665275574,
"kg_co2_emissions": null,
"mteb_version": "1.11.6",
"scores": {
"test": [
{
"hf_subset": "default",
"languages": [
"nob-Latn"
],
"main_score": 0.42051163852735396,
"v_measure": 0.42051163852735396,
"v_measures": {
"Level 0": [
0.38323314310337875,
0.3989320487473865,
0.4105931986190897,
0.38677810276773084,
0.42705826820548526,
0.4407980498265106,
0.3785528238444032,
0.41625337873137075,
0.3989633525705215,
0.4140992664333277
],
"Level 1": [
0.43844921921634217,
0.42630343948954647,
0.40904538846014304,
0.43711022680518274,
0.4419996131929952,
0.4401999738405541,
0.4323128799875848,
0.43653274562159944,
0.4367445987697985,
0.4562730523141273
]
}
}
]
},
"task_name": "VGHierarchicalClusteringP2P"
}
@@ -0,0 +1,45 @@
{
"dataset_revision": "d4c5a8ba10ae71224752c727094ac4c46947fa29",
"evaluation_time": 23.324329137802124,
"kg_co2_emissions": null,
"mteb_version": "1.11.6",
"scores": {
"test": [
{
"hf_subset": "default",
"languages": [
"nob-Latn"
],
"main_score": 0.34103019423671366,
"v_measure": 0.34103019423671366,
"v_measures": {
"Level 0": [
0.3121116915411674,
0.3074277020644131,
0.34489975581262233,
0.3176749457376003,
0.3392949224315137,
0.3331527675548358,
0.32987604186610175,
0.34547068568262207,
0.3416311797106411,
0.3301424742817373
],
"Level 1": [
0.3645265027939012,
0.34739983570204375,
0.33704618502701184,
0.3551549934940932,
0.3305621012508423,
0.34890269151079495,
0.35106274701661966,
0.3702643489228558,
0.3593753066096102,
0.35462700572324524
]
}
}
]
},
"task_name": "VGHierarchicalClusteringS2S"
}
@@ -0,0 +1,45 @@
{
"dataset_revision": "d4c5a8ba10ae71224752c727094ac4c46947fa29",
"evaluation_time": 80.27384757995605,
"kg_co2_emissions": null,
"mteb_version": "1.11.6",
"scores": {
"test": [
{
"hf_subset": "default",
"languages": [
"nob-Latn"
],
"main_score": 0.3544100886422448,
"v_measure": 0.3544100886422448,
"v_measures": {
"Level 0": [
0.3432523142155497,
0.36317673321215704,
0.3706145253800888,
0.35766818218679197,
0.36533142188274287,
0.3469079530784989,
0.34793190945020136,
0.37450460241437256,
0.361451598798914,
0.3468109403627138
],
"Level 1": [
0.3546038236048674,
0.34579585642750266,
0.3459826646868124,
0.3484356826859091,
0.35659378058747193,
0.35447381402923767,
0.3361421350567975,
0.3430716898552756,
0.357995481445959,
0.36745666348303196
]
}
}
]
},
"task_name": "VGHierarchicalClusteringP2P"
}
@@ -0,0 +1,45 @@
{
"dataset_revision": "d4c5a8ba10ae71224752c727094ac4c46947fa29",
"evaluation_time": 64.87495875358582,
"kg_co2_emissions": null,
"mteb_version": "1.11.6",
"scores": {
"test": [
{
"hf_subset": "default",
"languages": [
"nob-Latn"
],
"main_score": 0.2886935384193351,
"v_measure": 0.2886935384193351,
"v_measures": {
"Level 0": [
0.25927593083469574,
0.26491058719044713,
0.19853402718054483,
0.25007657074147577,
0.30451066421726614,
0.2759300676280902,
0.25660890312269036,
0.3055592803074229,
0.30644353051055306,
0.271323953135058
],
"Level 1": [
0.31352802114590117,
0.29961281553155933,
0.2955337891514432,
0.3222434800301185,
0.3101948959625172,
0.29249873108622443,
0.30737376823457874,
0.31481592521969615,
0.29827241585365416,
0.3266234113027661
]
}
}
]
},
"task_name": "VGHierarchicalClusteringS2S"
}
