Skip to content

Commit

Permalink
fix: Add MalteseNewsClassification (#546)
Browse files Browse the repository at this point in the history
* MalteseNewsClassification added

* lint fixes

* MalteseNewsClassification as multilabel + WIP stratification added

* results for multilabel classification updated

* Maltese MultiLabelClassification added

---------

Co-authored-by: Isaac Chung <chungisaac1217@gmail.com>
  • Loading branch information
dokato and isaac-chung authored May 18, 2024
1 parent 76758f8 commit 5314bf5
Show file tree
Hide file tree
Showing 4 changed files with 89 additions and 0 deletions.
2 changes: 2 additions & 0 deletions docs/mmteb/points/546.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
{"GitHub": "dokato", "New dataset": 2}
{"GitHub": "isaac-chung", "Review PR": 2}
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
from __future__ import annotations

from mteb.abstasks import AbsTaskMultilabelClassification
from mteb.abstasks.TaskMetadata import TaskMetadata


class MalteseNewsClassification(AbsTaskMultilabelClassification):
    """Multi-label topic classification task over Maltese news articles.

    The task is declared through ``metadata``; ``dataset_transform`` reshapes
    the loaded dataset so that only the ``text`` and ``label`` columns remain.
    """

    # Static task description: dataset location/revision, evaluated split and
    # language, main score, and provenance details for the benchmark.
    metadata = TaskMetadata(
        name="MalteseNewsClassification",
        description="""A multi-label topic classification dataset for Maltese News
Articles. The data was collected from the press_mt subset from Korpus
Malti v4.0. Article contents were cleaned to filter out JavaScript, CSS,
& repeated non-Maltese sub-headings. The labels are based on the category
field from this corpus.
""",
        reference="https://huggingface.co/datasets/MLRS/maltese_news_categories",
        dataset={
            "path": "MLRS/maltese_news_categories",
            "revision": "6bb0321659c4f07c4c2176c30c98c971be6571b4",
        },
        type="Classification",
        category="s2s",
        eval_splits=["test"],
        eval_langs=["mlt-Latn"],
        main_score="accuracy",
        date=("2023-10-21", "2024-04-24"),
        form=["written"],
        domains=["Constructed"],
        task_subtypes=["Topic classification"],
        license="cc-by-nc-sa-4.0",
        socioeconomic_status="high",
        annotations_creators="expert-annotated",
        dialect=[],
        text_creation="found",
        bibtex_citation="""@inproceedings{maltese-news-datasets,
title = "Topic Classification and Headline Generation for {M}altese using a Public News Corpus",
author = "Chaudhary, Amit Kumar and
Micallef, Kurt and
Borg, Claudia",
booktitle = "Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation",
month = may,
year = "2024",
publisher = "Association for Computational Linguistics",
}""",
        n_samples={"train": 10784, "test": 2297},
        avg_character_length={"train": 1595.63, "test": 1752.1},
    )

    def dataset_transform(self):
        """Rename ``labels`` to ``label`` and drop every other non-text column.

        The label column is only renamed; its values are passed through
        unchanged (no reduction to a single label happens here). The set of
        columns to drop is derived from the ``test`` split and then removed
        from the whole dataset object.
        """
        # presumably a Hugging Face DatasetDict; verify against the base class
        self.dataset = self.dataset.rename_columns({"labels": "label"})

        wanted = ["text", "label"]
        test_columns = self.dataset["test"].column_names
        surplus = [name for name in test_columns if name not in wanted]

        self.dataset = self.dataset.remove_columns(surplus)
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
{
"dataset_revision": "6bb0321659c4f07c4c2176c30c98c971be6571b4",
"mteb_dataset_name": "MalteseNewsClassification",
"mteb_version": "1.6.37",
"test": {
"accuracy": 0.2266869830213322,
"accuracy_stderr": 0.02263575385876514,
"evaluation_time": 161.74,
"f1": 0.2281811500943983,
"f1_stderr": 0.019025670776051662,
"lrap": 0.3279256333210852,
"lrap_stderr": 0.03015123586760265,
"main_score": 0.2266869830213322
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
{
"dataset_revision": "6bb0321659c4f07c4c2176c30c98c971be6571b4",
"mteb_dataset_name": "MalteseNewsClassification",
"mteb_version": "1.6.37",
"test": {
"accuracy": 0.07885117493472585,
"accuracy_stderr": 0.017705011126129034,
"evaluation_time": 20.53,
"f1": 0.07009299109901666,
"f1_stderr": 0.014374488952030922,
"lrap": 0.15509671666752245,
"lrap_stderr": 0.01950650468106085,
"main_score": 0.07885117493472585
}
}

0 comments on commit 5314bf5

Please sign in to comment.