Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: add Xstance and ensure valid dataset paths #795

Merged
merged 2 commits into from
May 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions docs/mmteb/points/737.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{"GitHub": "malteos", "New dataset": 6}
{"GitHub": "KranthiGV", "Review PR": 2}
{"GitHub": "wissam-sib", "Review PR": 2}
{"GitHub": "Ruqyai", "Review PR": 2}
{"GitHub": "KennethEnevoldsen", "Review PR": 2, "Bug Fixes": 2}
2 changes: 1 addition & 1 deletion mteb/tasks/Classification/ara/AJGT.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ class AJGT(AbsTaskClassification):
metadata = TaskMetadata(
name="AJGT",
dataset={
"path": "ajgt_twitter_ar",
"path": "komari6/ajgt_twitter_ar",
"revision": "af3f2fa5462ac461b696cb300d66e07ad366057f",
},
description="Arabic Jordanian General Tweets (AJGT) Corpus consisted of 1,800 tweets annotated as positive and negative. Modern Standard Arabic (MSA) or Jordanian dialect.",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ class RestaurantReviewSentimentClassification(AbsTaskClassification):
metadata = TaskMetadata(
name="RestaurantReviewSentimentClassification",
dataset={
"path": "ar_res_reviews",
"path": "hadyelsahar/ar_res_reviews",
"revision": "d51bf2435d030e0041344f576c5e8d7154828977",
},
description="Dataset of 8364 restaurant reviews from qaym.com in Arabic for sentiment analysis",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ class TweetSarcasmClassification(AbsTaskClassification):
metadata = TaskMetadata(
name="TweetSarcasmClassification",
dataset={
"path": "ar_sarcasm",
"path": "iabufarha/ar_sarcasm",
"revision": "557bf94ac6177cc442f42d0b09b6e4b76e8f47c9",
},
description="Arabic sarcasm detection dataset, which was created through the reannotation of available Arabic sentiment analysis datasets.",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ class BengaliHateSpeechClassification(AbsTaskClassification):
description="The Bengali Hate Speech Dataset is a Bengali-language dataset of news articles collected from various Bengali media sources and categorized based on the type of hate in the text.",
reference="https://huggingface.co/datasets/bn_hate_speech",
dataset={
"path": "bn_hate_speech",
"path": "rezacsedu/bn_hate_speech",
"revision": "99612296bc093f0720cac7d7cbfcb67eecf1ca2f",
},
type="Classification",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ class FinancialPhrasebankClassification(AbsTaskClassification):
description="Polar sentiment dataset of sentences from financial news, categorized by sentiment into positive, negative, or neutral.",
reference="https://arxiv.org/abs/1307.5336",
dataset={
"path": "financial_phrasebank",
"path": "takala/financial_phrasebank",
"revision": "1484d06fe7af23030c7c977b12556108d1f67039",
"name": "sentences_allagree",
},
Expand Down
2 changes: 1 addition & 1 deletion mteb/tasks/Classification/eng/NewsClassification.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ class NewsClassification(AbsTaskClassification):
name="NewsClassification",
description="Large News Classification Dataset",
dataset={
"path": "ag_news",
"path": "fancyzhx/ag_news",
"revision": "eb185aade064a813bc0b7f42de02595523103ca4",
},
reference="https://arxiv.org/abs/1509.01626",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ class MovieReviewSentimentClassification(AbsTaskClassification):
metadata = TaskMetadata(
name="MovieReviewSentimentClassification",
dataset={
"path": "allocine",
"path": "tblard/allocine",
"revision": "a4654f4896408912913a62ace89614879a549287",
},
description="The Allociné dataset is a French-language dataset for sentiment analysis that contains movie reviews produced by the online community of the Allociné.fr website.",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ class DutchBookReviewSentimentClassification(AbsTaskClassification):
description="A Dutch book review for sentiment classification.",
reference="https://github.com/benjaminvdb/DBRD",
dataset={
"path": "dbrd",
"path": "benjaminvdb/dbrd",
"revision": "3f756ab4572e071eb53e887ab629f19fa747d39e",
},
type="Classification",
Expand Down
1 change: 1 addition & 0 deletions mteb/tasks/PairClassification/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from .multilingual.PawsX import *
from .multilingual.RTE3 import *
from .multilingual.XNLI import *
from .multilingual.XStance import *
from .pol.PolishPC import *
from .por.Assin2RTE import *
from .por.SickBrPC import *
Expand Down
109 changes: 109 additions & 0 deletions mteb/tasks/PairClassification/multilingual/XStance.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
from __future__ import annotations

from datasets import load_dataset

from mteb.abstasks.TaskMetadata import TaskMetadata

from ....abstasks import MultilingualTask
from ....abstasks.AbsTaskPairClassification import AbsTaskPairClassification


class XStance(MultilingualTask, AbsTaskPairClassification):
    """Pair-classification task over the x-stance corpus.

    Each example pairs a political question (``sent1``) with a commenter's
    answer (``sent2``); the label is 1 when the comment is in favor of the
    question's proposition and 0 otherwise. Subsets are built per language
    (German, French, Italian) from the multilingual source dataset.
    """

    metadata = TaskMetadata(
        name="XStance",
        dataset={
            "path": "ZurichNLP/x_stance",
            "revision": "810604b9ad3aafdc6144597fdaa40f21a6f5f3de",
        },
        description="A Multilingual Multi-Target Dataset for Stance Detection in French, German, and Italian.",
        reference="https://github.com/ZurichNLP/xstance",
        category="s2s",
        type="PairClassification",
        eval_splits=["test"],
        eval_langs={
            "de": ["deu-Latn"],
            "fr": ["fra-Latn"],
            "it": ["ita-Latn"],
        },
        main_score="ap",
        date=("2011-01-01", "2020-12-31"),
        form=["written"],
        domains=["Social"],
        task_subtypes=["Political classification"],
        license="cc by-nc 4.0",
        socioeconomic_status="medium",
        annotations_creators="human-annotated",
        dialect=[],
        text_creation="created",
        bibtex_citation="""
@inproceedings{vamvas2020xstance,
    author = "Vamvas, Jannis and Sennrich, Rico",
    title = "{X-Stance}: A Multilingual Multi-Target Dataset for Stance Detection",
    booktitle = "Proceedings of the 5th Swiss Text Analytics Conference (SwissText)  16th Conference on Natural Language Processing (KONVENS)",
    address = "Zurich, Switzerland",
    year = "2020",
    month = "jun",
    url = "http://ceur-ws.org/Vol-2624/paper9.pdf"
}
""",
        n_samples={"test": 2048},
        avg_character_length={"test": 152.41},  # length of `sent1` + `sent2`
    )

    def load_data(self, **kwargs):
        """Download the corpus and build capped per-language, per-split subsets."""
        if self.data_loaded:
            return

        sample_cap = 2048  # upper bound on examples kept per language/split
        dataset_info = self.metadata_dict["dataset"]
        raw = load_dataset(dataset_info["path"], revision=dataset_info["revision"])

        def to_pair(example):
            # Project an x-stance row onto the sentence-pair schema;
            # stance "FAVOR" becomes the positive label.
            return {
                "sent1": example["question"],
                "sent2": example["comment"],
                "labels": 1 if example["label"] == "FAVOR" else 0,
            }

        self.dataset = {}
        for lang in self.metadata.eval_langs:
            per_split = {}
            for split in self.metadata_dict["eval_splits"]:
                # Keep only rows written in this language.
                subset = raw[split].filter(lambda row: row["language"] == lang)

                # Truncate oversized subsets (only de + fr exceed the cap).
                if len(subset) > sample_cap:
                    subset = subset.select(range(sample_cap))

                # Replace the original columns with the pair-classification ones.
                per_split[split] = subset.map(
                    to_pair, remove_columns=subset.column_names
                )
            self.dataset[lang] = per_split

        self.dataset_transform()
        self.data_loaded = True

    def dataset_transform(self):
        """Repack each split into the single-record column-list layout
        expected by AbsTaskPairClassification."""
        self.dataset = {
            lang: {
                split: [
                    {
                        "sent1": self.dataset[lang][split]["sent1"],
                        "sent2": self.dataset[lang][split]["sent2"],
                        "labels": self.dataset[lang][split]["labels"],
                    }
                ]
                for split in self.metadata.eval_splits
            }
            for lang in self.metadata.eval_langs
        }
2 changes: 1 addition & 1 deletion mteb/tasks/PairClassification/por/Assin2RTE.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ class Assin2RTE(AbsTaskPairClassification):
metadata = TaskMetadata(
name="Assin2RTE",
dataset={
"path": "assin2",
"path": "nilc-nlp/assin2",
"revision": "0ff9c86779e06855536d8775ce5550550e1e5a2d",
},
description="Recognizing Textual Entailment part of the ASSIN 2, an evaluation shared task collocated with STIL 2019.",
Expand Down
2 changes: 1 addition & 1 deletion mteb/tasks/Retrieval/multilingual/XQuADRetrieval.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ class XQuADRetrieval(MultilingualTask, AbsTaskRetrieval):
metadata = TaskMetadata(
name="XQuADRetrieval",
dataset={
"path": "xquad",
"path": "google/xquad",
"revision": "51adfef1c1287aab1d2d91b5bead9bcfb9c68583",
},
description="XQuAD is a benchmark dataset for evaluating cross-lingual question answering performance. It is repurposed retrieving relevant context for each question.",
Expand Down
2 changes: 1 addition & 1 deletion mteb/tasks/STS/por/Assin2STS.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ class Assin2STS(AbsTaskSTS):
metadata = TaskMetadata(
name="Assin2STS",
dataset={
"path": "assin2",
"path": "nilc-nlp/assin2",
"revision": "0ff9c86779e06855536d8775ce5550550e1e5a2d",
},
description="Semantic Textual Similarity part of the ASSIN 2, an evaluation shared task collocated with STIL 2019.",
Expand Down
155 changes: 155 additions & 0 deletions results/intfloat__multilingual-e5-small/XStance.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
{
"dataset_revision": "810604b9ad3aafdc6144597fdaa40f21a6f5f3de",
"evaluation_time": 3.5373082160949707,
"kg_co2_emissions": null,
"mteb_version": "1.6.38",
"scores": {
"test": [
{
"cos_sim": {
"accuracy": 0.51806640625,
"accuracy_threshold": 0.8497587442398071,
"ap": 0.5038443561739071,
"f1": 0.6712195121951219,
"f1_threshold": 0.7853705883026123,
"precision": 0.5053868756121449,
"recall": 0.9990319457889641
},
"dot": {
"accuracy": 0.51806640625,
"accuracy_threshold": 0.8497587442398071,
"ap": 0.5038443561739071,
"f1": 0.6712195121951219,
"f1_threshold": 0.7853705286979675,
"precision": 0.5053868756121449,
"recall": 0.9990319457889641
},
"euclidean": {
"accuracy": 0.51806640625,
"accuracy_threshold": 0.5481629371643066,
"ap": 0.5038443561739071,
"f1": 0.6712195121951219,
"f1_threshold": 0.655178427696228,
"precision": 0.5053868756121449,
"recall": 0.9990319457889641
},
"hf_subset": "de",
"languages": [
"deu-Latn"
],
"main_score": 0.5040793387249269,
"manhattan": {
"accuracy": 0.51416015625,
"accuracy_threshold": 8.64465045928955,
"ap": 0.5040793387249269,
"f1": 0.6714332141696457,
"f1_threshold": 10.558012962341309,
"precision": 0.5053816046966731,
"recall": 1.0
},
"max": {
"accuracy": 0.51806640625,
"ap": 0.5040793387249269,
"f1": 0.6714332141696457
}
},
{
"cos_sim": {
"accuracy": 0.5712890625,
"accuracy_threshold": 0.7847779989242554,
"ap": 0.5942397553753017,
"f1": 0.7265940902021772,
"f1_threshold": 0.761985719203949,
"precision": 0.570591108939912,
"recall": 1.0
},
"dot": {
"accuracy": 0.5712890625,
"accuracy_threshold": 0.7847781181335449,
"ap": 0.5942413209080255,
"f1": 0.7265940902021772,
"f1_threshold": 0.7619856595993042,
"precision": 0.570591108939912,
"recall": 1.0
},
"euclidean": {
"accuracy": 0.5712890625,
"accuracy_threshold": 0.6560750603675842,
"ap": 0.5942397553753018,
"f1": 0.7265940902021772,
"f1_threshold": 0.6898986101150513,
"precision": 0.570591108939912,
"recall": 1.0
},
"hf_subset": "fr",
"languages": [
"fra-Latn"
],
"main_score": 0.5942413209080255,
"manhattan": {
"accuracy": 0.57177734375,
"accuracy_threshold": 9.83655834197998,
"ap": 0.5932090773271775,
"f1": 0.7265940902021772,
"f1_threshold": 10.73774242401123,
"precision": 0.570591108939912,
"recall": 1.0
},
"max": {
"accuracy": 0.57177734375,
"ap": 0.5942413209080255,
"f1": 0.7265940902021772
}
},
{
"cos_sim": {
"accuracy": 0.5408022130013831,
"accuracy_threshold": 0.7930425405502319,
"ap": 0.5182917988582176,
"f1": 0.701123595505618,
"f1_threshold": 0.7833303213119507,
"precision": 0.5397923875432526,
"recall": 1.0
},
"dot": {
"accuracy": 0.5408022130013831,
"accuracy_threshold": 0.7930425405502319,
"ap": 0.5182917988582176,
"f1": 0.701123595505618,
"f1_threshold": 0.7833303213119507,
"precision": 0.5397923875432526,
"recall": 1.0
},
"euclidean": {
"accuracy": 0.5408022130013831,
"accuracy_threshold": 0.6433612108230591,
"ap": 0.5182917988582176,
"f1": 0.701123595505618,
"f1_threshold": 0.6582847833633423,
"precision": 0.5397923875432526,
"recall": 1.0
},
"hf_subset": "it",
"languages": [
"ita-Latn"
],
"main_score": 0.5182917988582176,
"manhattan": {
"accuracy": 0.5401106500691563,
"accuracy_threshold": 10.210434913635254,
"ap": 0.5178128531012762,
"f1": 0.7008547008547008,
"f1_threshold": 10.210434913635254,
"precision": 0.5398475398475399,
"recall": 0.9987179487179487
},
"max": {
"accuracy": 0.5408022130013831,
"ap": 0.5182917988582176,
"f1": 0.701123595505618
}
}
]
},
"task_name": "XStance"
}
Loading
Loading