-
Notifications
You must be signed in to change notification settings - Fork 260
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
fix: Add MalteseNewsClassification (#546)
* MalteseNewsClassification added * lint fixes * MalteseNewsClassification as multilabel + WIP stratification added * results for multilabel classification updated * Maltese MultiLabelClassification added --------- Co-authored-by: Isaac Chung <chungisaac1217@gmail.com>
- Loading branch information
1 parent
76758f8
commit 5314bf5
Showing
4 changed files
with
89 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
{"GitHub": "dokato", "New dataset": 2} | ||
{"GitHub": "isaac-chung", "Review PR": 2} |
57 changes: 57 additions & 0 deletions
57
mteb/tasks/MultiLabelClassification/mlt/MalteseNewsClassification.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
from __future__ import annotations | ||
|
||
from mteb.abstasks import AbsTaskMultilabelClassification | ||
from mteb.abstasks.TaskMetadata import TaskMetadata | ||
|
||
|
||
class MalteseNewsClassification(AbsTaskMultilabelClassification):
    """Multi-label topic classification task over Maltese news articles.

    The task is evaluated on the ``test`` split of the
    ``MLRS/maltese_news_categories`` dataset, pinned to a fixed revision,
    and reports ``accuracy`` as its main score.
    """

    # NOTE(review): the class derives from the multilabel base class but the
    # metadata declares type="Classification" — confirm whether TaskMetadata
    # offers a dedicated multilabel task type that should be used instead.
    metadata = TaskMetadata(
        name="MalteseNewsClassification",
        description="""A multi-label topic classification dataset for Maltese News
        Articles. The data was collected from the press_mt subset from Korpus
        Malti v4.0. Article contents were cleaned to filter out JavaScript, CSS,
        & repeated non-Maltese sub-headings. The labels are based on the category
        field from this corpus.
        """,
        reference="https://huggingface.co/datasets/MLRS/maltese_news_categories",
        dataset={
            "path": "MLRS/maltese_news_categories",
            "revision": "6bb0321659c4f07c4c2176c30c98c971be6571b4",
        },
        type="Classification",
        category="s2s",
        eval_splits=["test"],
        eval_langs=["mlt-Latn"],
        main_score="accuracy",
        date=("2023-10-21", "2024-04-24"),
        form=["written"],
        domains=["Constructed"],
        task_subtypes=["Topic classification"],
        license="cc-by-nc-sa-4.0",
        socioeconomic_status="high",
        annotations_creators="expert-annotated",
        dialect=[],
        text_creation="found",
        bibtex_citation="""@inproceedings{maltese-news-datasets,
            title = "Topic Classification and Headline Generation for {M}altese using a Public News Corpus",
            author = "Chaudhary, Amit Kumar and
            Micallef, Kurt and
            Borg, Claudia",
            booktitle = "Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation",
            month = may,
            year = "2024",
            publisher = "Association for Computational Linguistics",
        }""",
        n_samples={"train": 10784, "test": 2297},
        avg_character_length={"train": 1595.63, "test": 1752.1},
    )

    def dataset_transform(self):
        """Rename ``labels`` to ``label`` and keep only ``text`` and ``label``.

        The label values are passed through untouched; this method only
        renames one column and drops the columns that are not needed.
        """
        self.dataset = self.dataset.rename_columns({"labels": "label"})

        # The column listing of the "test" split decides what gets dropped;
        # the removal itself is applied to the dataset object as a whole.
        kept_columns = ("text", "label")
        surplus_columns = [
            name
            for name in self.dataset["test"].column_names
            if name not in kept_columns
        ]
        self.dataset = self.dataset.remove_columns(surplus_columns)
15 changes: 15 additions & 0 deletions
15
results/intfloat__multilingual-e5-small/MalteseNewsClassification.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
{ | ||
"dataset_revision": "6bb0321659c4f07c4c2176c30c98c971be6571b4", | ||
"mteb_dataset_name": "MalteseNewsClassification", | ||
"mteb_version": "1.6.37", | ||
"test": { | ||
"accuracy": 0.2266869830213322, | ||
"accuracy_stderr": 0.02263575385876514, | ||
"evaluation_time": 161.74, | ||
"f1": 0.2281811500943983, | ||
"f1_stderr": 0.019025670776051662, | ||
"lrap": 0.3279256333210852, | ||
"lrap_stderr": 0.03015123586760265, | ||
"main_score": 0.2266869830213322 | ||
} | ||
} |
15 changes: 15 additions & 0 deletions
15
...ntence-transformers__paraphrase-multilingual-MiniLM-L12-v2/MalteseNewsClassification.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
{ | ||
"dataset_revision": "6bb0321659c4f07c4c2176c30c98c971be6571b4", | ||
"mteb_dataset_name": "MalteseNewsClassification", | ||
"mteb_version": "1.6.37", | ||
"test": { | ||
"accuracy": 0.07885117493472585, | ||
"accuracy_stderr": 0.017705011126129034, | ||
"evaluation_time": 20.53, | ||
"f1": 0.07009299109901666, | ||
"f1_stderr": 0.014374488952030922, | ||
"lrap": 0.15509671666752245, | ||
"lrap_stderr": 0.01950650468106085, | ||
"main_score": 0.07885117493472585 | ||
} | ||
} |