Skip to content

Commit

Permalink
fix: Add Hindi dialect classification (#616)
Browse files Browse the repository at this point in the history
* first commit for Telugu News Classification

* revert to original main

* complete languages part

* add results

* fix import

* ruffen the code

* change acc->f1

* add points

* fix licence

---------

Co-authored-by: Isaac Chung <chungisaac1217@gmail.com>
  • Loading branch information
SaitejaUtpala and isaac-chung committed May 2, 2024
1 parent 42c0d65 commit 9939715
Show file tree
Hide file tree
Showing 5 changed files with 100 additions and 0 deletions.
2 changes: 2 additions & 0 deletions docs/mmteb/points/616.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
{"GitHub": "SaitejaUtpala", "New dataset": 54}
{"GitHub": "isaac-chung", "Review PR": 2}
1 change: 1 addition & 0 deletions mteb/tasks/Classification/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@
from .mkd.MacedonianTweetSentimentClassification import *
from .multilingual.AmazonCounterfactualClassification import *
from .multilingual.AmazonReviewsClassification import *
from .multilingual.HinDialectClassification import *
from .multilingual.IndicLangClassification import *
from .multilingual.IndicSentimentClassification import *
from .multilingual.MasakhaNEWSClassification import *
Expand Down
71 changes: 71 additions & 0 deletions mteb/tasks/Classification/multilingual/HinDialectClassification.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
from __future__ import annotations

from mteb.abstasks.TaskMetadata import TaskMetadata

from ....abstasks import AbsTaskClassification

# Evaluation languages for the task, keyed by language code. Each value is a
# list of "<language code>-<script code>" tags; the language part of every tag
# must match its key (the "kfq" entry previously carried the mismatched tag
# "kfg-Deva").
_LANGUAGES = {
    "pan": ["pan-Guru"],
    "bgc": ["bgc-Deva"],
    "mag": ["mag-Deva"],
    "bns": ["bns-Deva"],
    "kfq": ["kfq-Deva"],
    "noe": ["noe-Deva"],
    "bhb": ["bhb-Deva"],
    "bho": ["bho-Deva"],
    "gbm": ["gbm-Deva"],
    "mup": ["mup-Deva"],
    "anp": ["anp-Deva"],
    "hne": ["hne-Deva"],
    "bra": ["bra-Deva"],
    "raj": ["raj-Deva"],
    "awa": ["awa-Deva"],
    "guj": ["guj-Gujr"],
    "ben": ["ben-Beng"],
    "bhd": ["bhd-Deva"],
    "kfy": ["kfy-Deva"],
    "mar": ["mar-Deva"],
    "bjj": ["bjj-Deva"],
}


class HinDialectClassification(AbsTaskClassification):
    """Language-identification classification task over the HinDialect dataset.

    The metadata below declares the dataset location, the evaluated split and
    languages, and the scoring metric; ``dataset_transform`` renames the raw
    dataset columns to ``text`` / ``label``.
    """

    metadata = TaskMetadata(
        name="HinDialectClassification",
        dataset={
            "path": "mlexplorer008/hin_dialect_classification",
            "revision": "944a44cf93932ce62b51e7c07d44d8cc03d6bcae",
        },
        # NOTE(review): the description (the upstream dataset title) mentions
        # 26 languages, while _LANGUAGES lists 21 — confirm which subset the
        # hosted dataset actually contains.
        description="HinDialect: 26 Hindi-related languages and dialects of the Indic Continuum in North India",
        reference="https://lindat.mff.cuni.cz/repository/xmlui/handle/11234/1-4839",
        category="s2s",
        type="Classification",
        eval_splits=["test"],
        eval_langs=_LANGUAGES,
        main_score="f1",
        date=("2010-01-01", "2023-01-01"),
        form=["written"],
        domains=["Social", "Spoken"],
        task_subtypes=["Language identification"],
        # Kept in agreement with the copyright field of the citation below,
        # which states CC BY-NC-SA 4.0 (non-commercial).
        license="CC-BY-NC-SA-4.0",
        socioeconomic_status="mixed",
        annotations_creators="expert-annotated",
        dialect=[],
        text_creation="found",
        # Raw string: the BibTeX accent macros (\v, \', \~) must reach the
        # consumer verbatim. In a normal string literal "\v" is a vertical
        # tab, "\'" collapses to "'", and "\~" is an invalid escape sequence.
        bibtex_citation=r"""
@misc{11234/1-4839,
title = {{HinDialect} 1.1: 26 Hindi-related languages and dialects of the Indic Continuum in North India},
author = {Bafna, Niyati and {\v Z}abokrtsk{\'y}, Zden{\v e}k and Espa{\~n}a-Bonet, Cristina and van Genabith, Josef and Kumar, Lalit "Samyak Lalit" and Suman, Sharda and Shivay, Rahul},
url = {http://hdl.handle.net/11234/1-4839},
note = {{LINDAT}/{CLARIAH}-{CZ} digital library at the Institute of Formal and Applied Linguistics ({{\'U}FAL}), Faculty of Mathematics and Physics, Charles University},
copyright = {Creative Commons - Attribution-{NonCommercial}-{ShareAlike} 4.0 International ({CC} {BY}-{NC}-{SA} 4.0)},
year = {2022} }
""",
        n_samples={"test": 1152},
        avg_character_length={"test": 583.82},
    )

    def dataset_transform(self) -> None:
        """Rename the raw columns ``folksong`` -> ``text`` and ``language`` -> ``label``.

        Replaces ``self.dataset`` with the renamed dataset; presumably these
        are the column names the classification base class expects (confirm
        against ``AbsTaskClassification``).
        """
        self.dataset = self.dataset.rename_columns(
            {"folksong": "text", "language": "label"}
        )
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
{
"dataset_revision": "944a44cf93932ce62b51e7c07d44d8cc03d6bcae",
"mteb_dataset_name": "HinDialectClassification",
"mteb_version": "1.7.5",
"test": {
"accuracy": 0.5381944444444444,
"accuracy_stderr": 0.04503527102809611,
"evaluation_time": 920.79,
"f1": 0.34483787104216207,
"f1_stderr": 0.01786957063101452,
"main_score": 0.5381944444444444
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
{
"dataset_revision": "944a44cf93932ce62b51e7c07d44d8cc03d6bcae",
"mteb_dataset_name": "HinDialectClassification",
"mteb_version": "1.7.5",
"test": {
"accuracy": 0.4080729166666667,
"accuracy_stderr": 0.022261690482453803,
"evaluation_time": 281.28,
"f1": 0.23680883800470895,
"f1_stderr": 0.0167555265168248,
"main_score": 0.4080729166666667
}
}

0 comments on commit 9939715

Please sign in to comment.