Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

MalteseNewsClassification added #546

Merged
merged 11 commits into from
May 18, 2024
2 changes: 2 additions & 0 deletions docs/mmteb/points/546.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
{"GitHub": "dokato", "New dataset": 2}
{"GitHub": "isaac-chung", "Review PR": 2}
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
from __future__ import annotations

from mteb.abstasks import AbsTaskMultilabelClassification
from mteb.abstasks.TaskMetadata import TaskMetadata


class MalteseNewsClassification(AbsTaskMultilabelClassification):
    """Multi-label topic classification task over Maltese news articles.

    The metadata below points at the ``MLRS/maltese_news_categories`` dataset,
    pinned to a fixed revision, and evaluates on its ``test`` split with
    ``accuracy`` as the main score.
    """

    # NOTE: the continuation lines of the two triple-quoted strings below are
    # deliberately flush-left so that the string values stay exactly as they
    # are in the source this block replaces.
    metadata = TaskMetadata(
        name="MalteseNewsClassification",
        description="""A multi-label topic classification dataset for Maltese News
Articles. The data was collected from the press_mt subset from Korpus
Malti v4.0. Article contents were cleaned to filter out JavaScript, CSS,
& repeated non-Maltese sub-headings. The labels are based on the category
field from this corpus.
""",
        reference="https://huggingface.co/datasets/MLRS/maltese_news_categories",
        dataset={
            "path": "MLRS/maltese_news_categories",
            "revision": "6bb0321659c4f07c4c2176c30c98c971be6571b4",
        },
        type="Classification",
        category="s2s",
        eval_splits=["test"],
        eval_langs=["mlt-Latn"],
        main_score="accuracy",
        date=("2023-10-21", "2024-04-24"),
        form=["written"],
        domains=["Constructed"],
        task_subtypes=["Topic classification"],
        license="cc-by-nc-sa-4.0",
        socioeconomic_status="high",
        annotations_creators="expert-annotated",
        dialect=[],
        text_creation="found",
        bibtex_citation="""@inproceedings{maltese-news-datasets,
title = "Topic Classification and Headline Generation for {M}altese using a Public News Corpus",
author = "Chaudhary, Amit Kumar and
Micallef, Kurt and
Borg, Claudia",
booktitle = "Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation",
month = may,
year = "2024",
publisher = "Association for Computational Linguistics",
}""",
        n_samples={"train": 10784, "test": 2297},
        avg_character_length={"train": 1595.63, "test": 1752.1},
    )

    def dataset_transform(self):
        """Rename the label column and drop every column the task does not use.

        The ``labels`` column is renamed to ``label``; afterwards only the
        ``text`` and ``label`` columns are kept. The label values themselves
        are passed through untouched -- no single label is picked out here.
        """
        wanted = ("text", "label")

        self.dataset = self.dataset.rename_columns({"labels": "label"})

        # The list of columns to drop is derived from the "test" split only
        # and then applied to the dataset as a whole.
        test_columns = self.dataset["test"].column_names
        unwanted = [name for name in test_columns if name not in wanted]

        self.dataset = self.dataset.remove_columns(unwanted)
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
{
"dataset_revision": "6bb0321659c4f07c4c2176c30c98c971be6571b4",
"mteb_dataset_name": "MalteseNewsClassification",
"mteb_version": "1.6.37",
"test": {
"accuracy": 0.2266869830213322,
"accuracy_stderr": 0.02263575385876514,
"evaluation_time": 161.74,
"f1": 0.2281811500943983,
"f1_stderr": 0.019025670776051662,
"lrap": 0.3279256333210852,
"lrap_stderr": 0.03015123586760265,
"main_score": 0.2266869830213322
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
{
"dataset_revision": "6bb0321659c4f07c4c2176c30c98c971be6571b4",
"mteb_dataset_name": "MalteseNewsClassification",
"mteb_version": "1.6.37",
"test": {
"accuracy": 0.07885117493472585,
"accuracy_stderr": 0.017705011126129034,
"evaluation_time": 20.53,
"f1": 0.07009299109901666,
"f1_stderr": 0.014374488952030922,
"lrap": 0.15509671666752245,
"lrap_stderr": 0.01950650468106085,
"main_score": 0.07885117493472585
}
}
Loading