fix: Add JaGovFaqs and NLPJournal datasets #808

Merged
2 changes: 2 additions & 0 deletions docs/mmteb/points/808.jsonl
@@ -0,0 +1,2 @@
{"GitHub": "awinml", "New dataset": 8}
{"GitHub": "KennethEnevoldsen", "Review PR": 2}
4 changes: 4 additions & 0 deletions mteb/tasks/Retrieval/__init__.py
@@ -80,7 +80,11 @@
from .fra.FQuADRetrieval import *
from .fra.SyntecRetrieval import *
from .hun.HunSum2 import *
from .jpn.JaGovFaqsRetrieval import *
from .jpn.JaQuADRetrieval import *
from .jpn.NLPJournalAbsIntroRetrieval import *
from .jpn.NLPJournalTitleAbsRetrieval import *
from .jpn.NLPJournalTitleIntroRetrieval import *
from .kat.GeorgianFAQRetrieval import *
from .kor.KoMiracl import *
from .kor.KoStrategyQA import *
71 changes: 71 additions & 0 deletions mteb/tasks/Retrieval/jpn/JaGovFaqsRetrieval.py
@@ -0,0 +1,71 @@
from __future__ import annotations

import datasets

from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval
from mteb.abstasks.TaskMetadata import TaskMetadata

_EVAL_SPLIT = "test"
_MAX_EVAL_SIZE = 2048


class JaGovFaqsRetrieval(AbsTaskRetrieval):
metadata = TaskMetadata(
name="JaGovFaqsRetrieval",
description="JaGovFaqs is a dataset consisting of FAQs manully extracted from the website of Japanese bureaus. The dataset consists of 22k FAQs, where the queries (questions) and corpus (answers) have been shuffled, and the goal is to match the answer with the question.",
reference="https://github.com/sbintuitions/JMTEB",
dataset={
"path": "sbintuitions/JMTEB",
"revision": "e4af6c73182bebb41d94cb336846e5a452454ea7",
},
type="Retrieval",
category="s2s",
eval_splits=[_EVAL_SPLIT],
eval_langs=["jpn-Jpan"],
main_score="ndcg_at_10",
date=("2000-01-01", "2023-12-31"),
form=["written"],
domains=["Web"],
task_subtypes=[],
license="cc-by-4.0",
socioeconomic_status="high",
annotations_creators="derived",
dialect=[],
text_creation="found",
bibtex_citation="",
n_samples={_EVAL_SPLIT: _MAX_EVAL_SIZE},
avg_character_length={_EVAL_SPLIT: 210.02},
)

def load_data(self, **kwargs):
if self.data_loaded:
return

query_list = datasets.load_dataset(
name="jagovfaqs_22k-query",
split=_EVAL_SPLIT,
**self.metadata_dict["dataset"],
)

# Limit the evaluation size so the task does not take too long to run: sample at most _MAX_EVAL_SIZE (2048) queries.
query_list = query_list.shuffle(seed=self.seed)
max_samples = min(_MAX_EVAL_SIZE, len(query_list))
query_list = query_list.select(range(max_samples))

queries = {}
qrels = {}
for row_id, row in enumerate(query_list):
queries[str(row_id)] = row["query"]
qrels[str(row_id)] = {str(row["relevant_docs"][0]): 1}

corpus_list = datasets.load_dataset(
name="jagovfaqs_22k-corpus", split="corpus", **self.metadata_dict["dataset"]
)

corpus = {str(row["docid"]): {"text": row["text"]} for row in corpus_list}

self.corpus = {_EVAL_SPLIT: corpus}
self.queries = {_EVAL_SPLIT: queries}
self.relevant_docs = {_EVAL_SPLIT: qrels}

self.data_loaded = True
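
For context, a minimal sketch of running the new task once merged, assuming the standard MTEB entry point and a sentence-transformers model; the model name and output folder are illustrative choices that simply mirror the results file added further down in this PR.

from mteb import MTEB
from sentence_transformers import SentenceTransformer

# Illustrative model; any MTEB-compatible embedding model should work.
model = SentenceTransformer("intfloat/multilingual-e5-small")

# Evaluate only the newly added task; output_folder is where the result JSON is written.
evaluation = MTEB(tasks=["JaGovFaqsRetrieval"])
evaluation.run(model, output_folder="results/intfloat__multilingual-e5-small")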
67 changes: 67 additions & 0 deletions mteb/tasks/Retrieval/jpn/NLPJournalAbsIntroRetrieval.py
@@ -0,0 +1,67 @@
from __future__ import annotations

import datasets

from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval
from mteb.abstasks.TaskMetadata import TaskMetadata

_EVAL_SPLIT = "test"


class NLPJournalAbsIntroRetrieval(AbsTaskRetrieval):
metadata = TaskMetadata(
name="NLPJournalAbsIntroRetrieval",
description="This dataset was created from the Japanese NLP Journal LaTeX Corpus. The titles, abstracts and introductions of the academic papers were shuffled. The goal is to find the corresponding introduction with the given abstract.",
reference="https://github.com/sbintuitions/JMTEB",
dataset={
"path": "sbintuitions/JMTEB",
"revision": "e4af6c73182bebb41d94cb336846e5a452454ea7",
},
type="Retrieval",
category="s2s",
eval_splits=[_EVAL_SPLIT],
eval_langs=["jpn-Jpan"],
main_score="ndcg_at_10",
date=("2000-01-01", "2023-12-31"),
form=["written"],
domains=["Academic"],
task_subtypes=[],
license="cc-by-4.0",
socioeconomic_status="high",
annotations_creators="derived",
dialect=[],
text_creation="found",
bibtex_citation="",
n_samples={_EVAL_SPLIT: 404},
avg_character_length={_EVAL_SPLIT: 1246.49},
)

def load_data(self, **kwargs):
if self.data_loaded:
return

query_list = datasets.load_dataset(
name="nlp_journal_abs_intro-query",
split=_EVAL_SPLIT,
**self.metadata_dict["dataset"],
)

queries = {}
qrels = {}
for row_id, row in enumerate(query_list):
queries[str(row_id)] = row["query"]
qrels[str(row_id)] = {str(row["relevant_docs"]): 1}

corpus_list = datasets.load_dataset(
name="nlp_journal_abs_intro-corpus",
split="corpus",
**self.metadata_dict["dataset"],
)

corpus = {str(row["docid"]): {"text": row["text"]} for row in corpus_list}

self.corpus = {_EVAL_SPLIT: corpus}
self.queries = {_EVAL_SPLIT: queries}
self.relevant_docs = {_EVAL_SPLIT: qrels}

self.data_loaded = True
67 changes: 67 additions & 0 deletions mteb/tasks/Retrieval/jpn/NLPJournalTitleAbsRetrieval.py
@@ -0,0 +1,67 @@
from __future__ import annotations

import datasets

from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval
from mteb.abstasks.TaskMetadata import TaskMetadata

_EVAL_SPLIT = "test"


class NLPJournalTitleAbsRetrieval(AbsTaskRetrieval):
metadata = TaskMetadata(
name="NLPJournalTitleAbsRetrieval",
description="This dataset was created from the Japanese NLP Journal LaTeX Corpus. The titles, abstracts and introductions of the academic papers were shuffled. The goal is to find the corresponding abstract with the given title.",
reference="https://github.com/sbintuitions/JMTEB",
dataset={
"path": "sbintuitions/JMTEB",
"revision": "e4af6c73182bebb41d94cb336846e5a452454ea7",
},
type="Retrieval",
category="s2s",
eval_splits=[_EVAL_SPLIT],
eval_langs=["jpn-Jpan"],
main_score="ndcg_at_10",
date=("2000-01-01", "2023-12-31"),
form=["written"],
domains=["Academic"],
task_subtypes=[],
license="cc-by-4.0",
socioeconomic_status="high",
annotations_creators="derived",
dialect=[],
text_creation="found",
bibtex_citation="",
n_samples={_EVAL_SPLIT: 404},
avg_character_length={_EVAL_SPLIT: 234.59},
)

def load_data(self, **kwargs):
if self.data_loaded:
return

query_list = datasets.load_dataset(
name="nlp_journal_title_abs-query",
split=_EVAL_SPLIT,
**self.metadata_dict["dataset"],
)

queries = {}
qrels = {}
for row_id, row in enumerate(query_list):
queries[str(row_id)] = row["query"]
qrels[str(row_id)] = {str(row["relevant_docs"]): 1}

corpus_list = datasets.load_dataset(
name="nlp_journal_title_abs-corpus",
split="corpus",
**self.metadata_dict["dataset"],
)

corpus = {str(row["docid"]): {"text": row["text"]} for row in corpus_list}

self.corpus = {_EVAL_SPLIT: corpus}
self.queries = {_EVAL_SPLIT: queries}
self.relevant_docs = {_EVAL_SPLIT: qrels}

self.data_loaded = True
67 changes: 67 additions & 0 deletions mteb/tasks/Retrieval/jpn/NLPJournalTitleIntroRetrieval.py
@@ -0,0 +1,67 @@
from __future__ import annotations

import datasets

from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval
from mteb.abstasks.TaskMetadata import TaskMetadata

_EVAL_SPLIT = "test"


class NLPJournalTitleIntroRetrieval(AbsTaskRetrieval):
metadata = TaskMetadata(
name="NLPJournalTitleIntroRetrieval",
description="This dataset was created from the Japanese NLP Journal LaTeX Corpus. The titles, abstracts and introductions of the academic papers were shuffled. The goal is to find the corresponding introduction with the given title.",
reference="https://github.com/sbintuitions/JMTEB",
dataset={
"path": "sbintuitions/JMTEB",
"revision": "e4af6c73182bebb41d94cb336846e5a452454ea7",
},
type="Retrieval",
category="s2s",
eval_splits=[_EVAL_SPLIT],
eval_langs=["jpn-Jpan"],
main_score="ndcg_at_10",
date=("2000-01-01", "2023-12-31"),
form=["written"],
domains=["Academic"],
task_subtypes=[],
license="cc-by-4.0",
socioeconomic_status="high",
annotations_creators="derived",
dialect=[],
text_creation="found",
bibtex_citation="",
n_samples={_EVAL_SPLIT: 404},
avg_character_length={_EVAL_SPLIT: 1040.19},
)

def load_data(self, **kwargs):
if self.data_loaded:
return

query_list = datasets.load_dataset(
name="nlp_journal_title_intro-query",
split=_EVAL_SPLIT,
**self.metadata_dict["dataset"],
)

queries = {}
qrels = {}
for row_id, row in enumerate(query_list):
queries[str(row_id)] = row["query"]
qrels[str(row_id)] = {str(row["relevant_docs"]): 1}

corpus_list = datasets.load_dataset(
name="nlp_journal_title_intro-corpus",
split="corpus",
**self.metadata_dict["dataset"],
)

corpus = {str(row["docid"]): {"text": row["text"]} for row in corpus_list}

self.corpus = {_EVAL_SPLIT: corpus}
self.queries = {_EVAL_SPLIT: queries}
self.relevant_docs = {_EVAL_SPLIT: qrels}

self.data_loaded = True
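
For reference, the query and corpus configurations these load_data methods pull from can be inspected directly with datasets; a minimal sketch below reuses the same path, config names, and revision as the metadata above and touches only the fields the tasks actually read (query, relevant_docs, docid, text). Depending on the datasets version, trust_remote_code=True may additionally be required.

import datasets

REVISION = "e4af6c73182bebb41d94cb336846e5a452454ea7"

# Query side: one row per query with the id(s) of its relevant document(s).
queries = datasets.load_dataset(
    "sbintuitions/JMTEB",
    name="nlp_journal_title_abs-query",
    split="test",
    revision=REVISION,
)
print(queries[0]["query"], queries[0]["relevant_docs"])

# Corpus side: one row per document, keyed by "docid".
corpus = datasets.load_dataset(
    "sbintuitions/JMTEB",
    name="nlp_journal_title_abs-corpus",
    split="corpus",
    revision=REVISION,
)
print(corpus[0]["docid"], corpus[0]["text"][:80])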
43 changes: 43 additions & 0 deletions results/intfloat__multilingual-e5-small/JaGovFaqsRetrieval.json
@@ -0,0 +1,43 @@
{
"dataset_revision": "e4af6c73182bebb41d94cb336846e5a452454ea7",
"mteb_dataset_name": "JaGovFaqsRetrieval",
"mteb_version": "1.10.15",
"test": {
"evaluation_time": 44.36,
"map_at_1": 0.50195,
"map_at_10": 0.6005,
"map_at_100": 0.60564,
"map_at_1000": 0.60589,
"map_at_20": 0.60358,
"map_at_3": 0.57829,
"map_at_5": 0.59252,
"mrr_at_1": 0.50293,
"mrr_at_10": 0.60095,
"mrr_at_100": 0.60604,
"mrr_at_1000": 0.60629,
"mrr_at_20": 0.60398,
"mrr_at_3": 0.57878,
"mrr_at_5": 0.59291,
"ndcg_at_1": 0.50195,
"ndcg_at_10": 0.64773,
"ndcg_at_100": 0.6725,
"ndcg_at_1000": 0.67931,
"ndcg_at_20": 0.65864,
"ndcg_at_3": 0.60285,
"ndcg_at_5": 0.62844,
"precision_at_1": 0.50195,
"precision_at_10": 0.07954,
"precision_at_100": 0.00911,
"precision_at_1000": 0.00096,
"precision_at_20": 0.04189,
"precision_at_3": 0.22461,
"precision_at_5": 0.14717,
"recall_at_1": 0.50195,
"recall_at_10": 0.79541,
"recall_at_100": 0.91064,
"recall_at_1000": 0.96436,
"recall_at_20": 0.83789,
"recall_at_3": 0.67383,
"recall_at_5": 0.73584
}
}
43 changes: 43 additions & 0 deletions results/intfloat__multilingual-e5-small/NLPJournalAbsIntroRetrieval.json
@@ -0,0 +1,43 @@
{
"dataset_revision": "e4af6c73182bebb41d94cb336846e5a452454ea7",
"mteb_dataset_name": "NLPJournalAbsIntroRetrieval",
"mteb_version": "1.10.15",
"test": {
"evaluation_time": 475.45,
"map_at_1": 0.7698,
"map_at_10": 0.82718,
"map_at_100": 0.8307,
"map_at_1000": 0.83074,
"map_at_20": 0.82955,
"map_at_3": 0.82054,
"map_at_5": 0.82537,
"mrr_at_1": 0.7698,
"mrr_at_10": 0.82718,
"mrr_at_100": 0.8307,
"mrr_at_1000": 0.83074,
"mrr_at_20": 0.82955,
"mrr_at_3": 0.82054,
"mrr_at_5": 0.82537,
"ndcg_at_1": 0.7698,
"ndcg_at_10": 0.84924,
"ndcg_at_100": 0.86592,
"ndcg_at_1000": 0.86693,
"ndcg_at_20": 0.85837,
"ndcg_at_3": 0.83619,
"ndcg_at_5": 0.84461,
"precision_at_1": 0.7698,
"precision_at_10": 0.09158,
"precision_at_100": 0.00993,
"precision_at_1000": 0.001,
"precision_at_20": 0.04765,
"precision_at_3": 0.29373,
"precision_at_5": 0.1802,
"recall_at_1": 0.7698,
"recall_at_10": 0.91584,
"recall_at_100": 0.99257,
"recall_at_1000": 1.0,
"recall_at_20": 0.95297,
"recall_at_3": 0.88119,
"recall_at_5": 0.90099
}
}