-
Notifications
You must be signed in to change notification settings - Fork 204
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
fix: Add JaGovFaqs and NLPJournal datasets (#808)
* Add JaGovFaqs dataset * Add NLPJournal datasets * Add JAQKET dataset * Add points * Fix metadata * Remove title from corpus for JAQKET dataset * Update JAQKET scores (without title) * Exclude JAQKET dataset * Add points for review
- Loading branch information
Showing
14 changed files
with
622 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
{"GitHub": "awinml", "New dataset": 8} | ||
{"GitHub": "KennethEnevoldsen", "Review PR": 2} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
from __future__ import annotations | ||
|
||
import datasets | ||
|
||
from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval | ||
from mteb.abstasks.TaskMetadata import TaskMetadata | ||
|
||
_EVAL_SPLIT = "test" | ||
_MAX_EVAL_SIZE = 2048 | ||
|
||
|
||
class JaGovFaqsRetrieval(AbsTaskRetrieval):
    """Japanese FAQ retrieval task from the JMTEB benchmark.

    Queries are FAQ questions and the corpus holds the corresponding answers;
    the goal is to retrieve the matching answer for each question.
    """

    metadata = TaskMetadata(
        name="JaGovFaqsRetrieval",
        # Typo fixed: "manully" -> "manually".
        description="JaGovFaqs is a dataset consisting of FAQs manually extracted from the website of Japanese bureaus. The dataset consists of 22k FAQs, where the queries (questions) and corpus (answers) have been shuffled, and the goal is to match the answer with the question.",
        reference="https://github.com/sbintuitions/JMTEB",
        dataset={
            "path": "sbintuitions/JMTEB",
            "revision": "e4af6c73182bebb41d94cb336846e5a452454ea7",
        },
        type="Retrieval",
        category="s2s",
        eval_splits=[_EVAL_SPLIT],
        eval_langs=["jpn-Jpan"],
        main_score="ndcg_at_10",
        date=("2000-01-01", "2023-12-31"),
        form=["written"],
        domains=["Web"],
        task_subtypes=[],
        license="cc-by-4.0",
        socioeconomic_status="high",
        annotations_creators="derived",
        dialect=[],
        text_creation="found",
        bibtex_citation="",
        n_samples={_EVAL_SPLIT: _MAX_EVAL_SIZE},
        avg_character_length={_EVAL_SPLIT: 210.02},
    )

    def load_data(self, **kwargs):
        """Download the JMTEB JaGovFaqs subsets and convert them to MTEB format.

        Populates ``self.queries``, ``self.corpus`` and ``self.relevant_docs``
        keyed by the evaluation split, then sets ``self.data_loaded``.
        """
        if self.data_loaded:
            return

        query_ds = datasets.load_dataset(
            name="jagovfaqs_22k-query",
            split=_EVAL_SPLIT,
            **self.metadata_dict["dataset"],
        )

        # Cap the evaluation set at _MAX_EVAL_SIZE queries so the task does
        # not take too long to run; shuffle first so the subsample is unbiased.
        query_ds = query_ds.shuffle(seed=self.seed)
        query_ds = query_ds.select(range(min(_MAX_EVAL_SIZE, len(query_ds))))

        queries = {}
        qrels = {}
        for idx, record in enumerate(query_ds):
            qid = str(idx)
            queries[qid] = record["query"]
            # Only the first entry of "relevant_docs" is kept (one positive
            # per query). NOTE(review): the sibling NLPJournal tasks stringify
            # the whole field instead — presumably each query here can list
            # several relevant docs; confirm against the dataset schema.
            qrels[qid] = {str(record["relevant_docs"][0]): 1}

        corpus_ds = datasets.load_dataset(
            name="jagovfaqs_22k-corpus", split="corpus", **self.metadata_dict["dataset"]
        )

        corpus = {str(row["docid"]): {"text": row["text"]} for row in corpus_ds}

        self.corpus = {_EVAL_SPLIT: corpus}
        self.queries = {_EVAL_SPLIT: queries}
        self.relevant_docs = {_EVAL_SPLIT: qrels}

        self.data_loaded = True
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
from __future__ import annotations | ||
|
||
import datasets | ||
|
||
from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval | ||
from mteb.abstasks.TaskMetadata import TaskMetadata | ||
|
||
_EVAL_SPLIT = "test" | ||
|
||
|
||
class NLPJournalAbsIntroRetrieval(AbsTaskRetrieval):
    """Retrieve a paper's introduction given its abstract (JMTEB / NLP Journal)."""

    metadata = TaskMetadata(
        name="NLPJournalAbsIntroRetrieval",
        description="This dataset was created from the Japanese NLP Journal LaTeX Corpus. The titles, abstracts and introductions of the academic papers were shuffled. The goal is to find the corresponding introduction with the given abstract.",
        reference="https://github.com/sbintuitions/JMTEB",
        dataset={
            "path": "sbintuitions/JMTEB",
            "revision": "e4af6c73182bebb41d94cb336846e5a452454ea7",
        },
        type="Retrieval",
        category="s2s",
        eval_splits=[_EVAL_SPLIT],
        eval_langs=["jpn-Jpan"],
        main_score="ndcg_at_10",
        date=("2000-01-01", "2023-12-31"),
        form=["written"],
        domains=["Academic"],
        task_subtypes=[],
        license="cc-by-4.0",
        socioeconomic_status="high",
        annotations_creators="derived",
        dialect=[],
        text_creation="found",
        bibtex_citation="",
        n_samples={_EVAL_SPLIT: 404},
        avg_character_length={_EVAL_SPLIT: 1246.49},
    )

    def load_data(self, **kwargs):
        """Fetch the query and corpus subsets and store them in MTEB format."""
        if self.data_loaded:
            return

        query_rows = datasets.load_dataset(
            name="nlp_journal_abs_intro-query",
            split=_EVAL_SPLIT,
            **self.metadata_dict["dataset"],
        )

        # Query ids are the enumeration index; each query has one relevant doc.
        self.queries = {
            _EVAL_SPLIT: {str(i): row["query"] for i, row in enumerate(query_rows)}
        }
        self.relevant_docs = {
            _EVAL_SPLIT: {
                str(i): {str(row["relevant_docs"]): 1}
                for i, row in enumerate(query_rows)
            }
        }

        corpus_rows = datasets.load_dataset(
            name="nlp_journal_abs_intro-corpus",
            split="corpus",
            **self.metadata_dict["dataset"],
        )

        documents = {}
        for doc in corpus_rows:
            documents[str(doc["docid"])] = {"text": doc["text"]}
        self.corpus = {_EVAL_SPLIT: documents}

        self.data_loaded = True
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
from __future__ import annotations | ||
|
||
import datasets | ||
|
||
from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval | ||
from mteb.abstasks.TaskMetadata import TaskMetadata | ||
|
||
_EVAL_SPLIT = "test" | ||
|
||
|
||
class NLPJournalTitleAbsRetrieval(AbsTaskRetrieval):
    """Retrieve a paper's abstract given its title (JMTEB / NLP Journal)."""

    metadata = TaskMetadata(
        name="NLPJournalTitleAbsRetrieval",
        description="This dataset was created from the Japanese NLP Journal LaTeX Corpus. The titles, abstracts and introductions of the academic papers were shuffled. The goal is to find the corresponding abstract with the given title.",
        reference="https://github.com/sbintuitions/JMTEB",
        dataset={
            "path": "sbintuitions/JMTEB",
            "revision": "e4af6c73182bebb41d94cb336846e5a452454ea7",
        },
        type="Retrieval",
        category="s2s",
        eval_splits=[_EVAL_SPLIT],
        eval_langs=["jpn-Jpan"],
        main_score="ndcg_at_10",
        date=("2000-01-01", "2023-12-31"),
        form=["written"],
        domains=["Academic"],
        task_subtypes=[],
        license="cc-by-4.0",
        socioeconomic_status="high",
        annotations_creators="derived",
        dialect=[],
        text_creation="found",
        bibtex_citation="",
        n_samples={_EVAL_SPLIT: 404},
        avg_character_length={_EVAL_SPLIT: 234.59},
    )

    def load_data(self, **kwargs):
        """Load queries, qrels and corpus into the MTEB retrieval attributes."""
        if self.data_loaded:
            return

        dataset_args = self.metadata_dict["dataset"]

        query_rows = datasets.load_dataset(
            name="nlp_journal_title_abs-query",
            split=_EVAL_SPLIT,
            **dataset_args,
        )

        question_map = {}
        judgement_map = {}
        for index, entry in enumerate(query_rows):
            key = str(index)
            question_map[key] = entry["query"]
            # One relevant document per query.
            judgement_map[key] = {str(entry["relevant_docs"]): 1}

        corpus_rows = datasets.load_dataset(
            name="nlp_journal_title_abs-corpus",
            split="corpus",
            **dataset_args,
        )
        document_map = {
            str(entry["docid"]): {"text": entry["text"]} for entry in corpus_rows
        }

        self.queries = {_EVAL_SPLIT: question_map}
        self.relevant_docs = {_EVAL_SPLIT: judgement_map}
        self.corpus = {_EVAL_SPLIT: document_map}

        self.data_loaded = True
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
from __future__ import annotations | ||
|
||
import datasets | ||
|
||
from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval | ||
from mteb.abstasks.TaskMetadata import TaskMetadata | ||
|
||
_EVAL_SPLIT = "test" | ||
|
||
|
||
class NLPJournalTitleIntroRetrieval(AbsTaskRetrieval):
    """Retrieve a paper's introduction given its title (JMTEB / NLP Journal)."""

    metadata = TaskMetadata(
        name="NLPJournalTitleIntroRetrieval",
        description="This dataset was created from the Japanese NLP Journal LaTeX Corpus. The titles, abstracts and introductions of the academic papers were shuffled. The goal is to find the corresponding introduction with the given title.",
        reference="https://github.com/sbintuitions/JMTEB",
        dataset={
            "path": "sbintuitions/JMTEB",
            "revision": "e4af6c73182bebb41d94cb336846e5a452454ea7",
        },
        type="Retrieval",
        category="s2s",
        eval_splits=[_EVAL_SPLIT],
        eval_langs=["jpn-Jpan"],
        main_score="ndcg_at_10",
        date=("2000-01-01", "2023-12-31"),
        form=["written"],
        domains=["Academic"],
        task_subtypes=[],
        license="cc-by-4.0",
        socioeconomic_status="high",
        annotations_creators="derived",
        dialect=[],
        text_creation="found",
        bibtex_citation="",
        n_samples={_EVAL_SPLIT: 404},
        avg_character_length={_EVAL_SPLIT: 1040.19},
    )

    def load_data(self, **kwargs):
        """Populate queries, corpus and relevance judgements for evaluation."""
        if self.data_loaded:
            return

        raw_queries = datasets.load_dataset(
            name="nlp_journal_title_intro-query",
            split=_EVAL_SPLIT,
            **self.metadata_dict["dataset"],
        )

        query_dict, qrel_dict = {}, {}
        for position, sample in enumerate(raw_queries):
            qid = str(position)
            query_dict[qid] = sample["query"]
            # Single graded judgement: the one relevant doc gets relevance 1.
            qrel_dict[qid] = {str(sample["relevant_docs"]): 1}

        raw_corpus = datasets.load_dataset(
            name="nlp_journal_title_intro-corpus",
            split="corpus",
            **self.metadata_dict["dataset"],
        )

        corpus_dict = {}
        for sample in raw_corpus:
            corpus_dict[str(sample["docid"])] = {"text": sample["text"]}

        self.corpus = {_EVAL_SPLIT: corpus_dict}
        self.queries = {_EVAL_SPLIT: query_dict}
        self.relevant_docs = {_EVAL_SPLIT: qrel_dict}

        self.data_loaded = True
43 changes: 43 additions & 0 deletions
43
results/intfloat__multilingual-e5-small/JaGovFaqsRetrieval.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
{ | ||
"dataset_revision": "e4af6c73182bebb41d94cb336846e5a452454ea7", | ||
"mteb_dataset_name": "JaGovFaqsRetrieval", | ||
"mteb_version": "1.10.15", | ||
"test": { | ||
"evaluation_time": 44.36, | ||
"map_at_1": 0.50195, | ||
"map_at_10": 0.6005, | ||
"map_at_100": 0.60564, | ||
"map_at_1000": 0.60589, | ||
"map_at_20": 0.60358, | ||
"map_at_3": 0.57829, | ||
"map_at_5": 0.59252, | ||
"mrr_at_1": 0.50293, | ||
"mrr_at_10": 0.60095, | ||
"mrr_at_100": 0.60604, | ||
"mrr_at_1000": 0.60629, | ||
"mrr_at_20": 0.60398, | ||
"mrr_at_3": 0.57878, | ||
"mrr_at_5": 0.59291, | ||
"ndcg_at_1": 0.50195, | ||
"ndcg_at_10": 0.64773, | ||
"ndcg_at_100": 0.6725, | ||
"ndcg_at_1000": 0.67931, | ||
"ndcg_at_20": 0.65864, | ||
"ndcg_at_3": 0.60285, | ||
"ndcg_at_5": 0.62844, | ||
"precision_at_1": 0.50195, | ||
"precision_at_10": 0.07954, | ||
"precision_at_100": 0.00911, | ||
"precision_at_1000": 0.00096, | ||
"precision_at_20": 0.04189, | ||
"precision_at_3": 0.22461, | ||
"precision_at_5": 0.14717, | ||
"recall_at_1": 0.50195, | ||
"recall_at_10": 0.79541, | ||
"recall_at_100": 0.91064, | ||
"recall_at_1000": 0.96436, | ||
"recall_at_20": 0.83789, | ||
"recall_at_3": 0.67383, | ||
"recall_at_5": 0.73584 | ||
} | ||
} |
43 changes: 43 additions & 0 deletions
43
results/intfloat__multilingual-e5-small/NLPJournalAbsIntroRetrieval.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
{ | ||
"dataset_revision": "e4af6c73182bebb41d94cb336846e5a452454ea7", | ||
"mteb_dataset_name": "NLPJournalAbsIntroRetrieval", | ||
"mteb_version": "1.10.15", | ||
"test": { | ||
"evaluation_time": 475.45, | ||
"map_at_1": 0.7698, | ||
"map_at_10": 0.82718, | ||
"map_at_100": 0.8307, | ||
"map_at_1000": 0.83074, | ||
"map_at_20": 0.82955, | ||
"map_at_3": 0.82054, | ||
"map_at_5": 0.82537, | ||
"mrr_at_1": 0.7698, | ||
"mrr_at_10": 0.82718, | ||
"mrr_at_100": 0.8307, | ||
"mrr_at_1000": 0.83074, | ||
"mrr_at_20": 0.82955, | ||
"mrr_at_3": 0.82054, | ||
"mrr_at_5": 0.82537, | ||
"ndcg_at_1": 0.7698, | ||
"ndcg_at_10": 0.84924, | ||
"ndcg_at_100": 0.86592, | ||
"ndcg_at_1000": 0.86693, | ||
"ndcg_at_20": 0.85837, | ||
"ndcg_at_3": 0.83619, | ||
"ndcg_at_5": 0.84461, | ||
"precision_at_1": 0.7698, | ||
"precision_at_10": 0.09158, | ||
"precision_at_100": 0.00993, | ||
"precision_at_1000": 0.001, | ||
"precision_at_20": 0.04765, | ||
"precision_at_3": 0.29373, | ||
"precision_at_5": 0.1802, | ||
"recall_at_1": 0.7698, | ||
"recall_at_10": 0.91584, | ||
"recall_at_100": 0.99257, | ||
"recall_at_1000": 1.0, | ||
"recall_at_20": 0.95297, | ||
"recall_at_3": 0.88119, | ||
"recall_at_5": 0.90099 | ||
} | ||
} |
Oops, something went wrong.