forked from embeddings-benchmark/mteb
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
fix: Converted VG to hierarchical (embeddings-benchmark#694)
* Added hierarchical VG clustering tasks * Added stratified subsampling for multilabel tasks to AbsTask * Added stratified subsampling to VG clustering * Fixed stratified subsampling for multilabel tasks * fix: Converted VG to AbsTaskClusteringFast * Added results for paraphrase model * Removed debugging print statements * Added 'not specified' license to VGHierarchical * Added proper license from Norsk Aviskorpus * Ran linting * Replaced stratification with just regular subsampling * fix: fixed subsampling * Added results for VG * Added points * fix: Fixed JSON in 694.jsonl --------- Co-authored-by: Kenneth Enevoldsen <kennethcenevoldsen@gmail.com>
- Loading branch information
Showing
9 changed files
with
321 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
{"GitHub": "x-tabdeveloping", "Bug fixes": 4} | ||
{"GitHub": "KennethEnevoldsen", "Review PR": 2} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,99 @@ | ||
from __future__ import annotations | ||
|
||
from mteb.abstasks import TaskMetadata | ||
from mteb.abstasks.AbsTaskClusteringFast import AbsTaskClusteringFast | ||
|
||
N_SAMPLES = 2048 | ||
|
||
|
||
def split_labels(record: dict) -> dict:
    """Replace the comma-separated ``labels`` string of *record* with a list.

    Only the first two comma-separated entries are kept; any further
    entries are dropped. The record is updated in place and the same
    object is returned.

    Args:
        record: Mapping whose ``"labels"`` entry is a comma-separated string.

    Returns:
        The same ``record`` object, with ``"labels"`` now a list of at most
        two strings.
    """
    record["labels"] = record["labels"].split(",")[:2]
    return record
|
||
|
||
class VGHierarchicalClusteringP2P(AbsTaskClusteringFast):
    """Clustering task over the ``article`` column of the VG dataset.

    The ``classes`` column is used as the label source; ``split_labels``
    turns it into a list holding its first two comma-separated entries.
    """

    metadata = TaskMetadata(
        name="VGHierarchicalClusteringP2P",
        dataset={
            "path": "navjordj/VG_summarization",
            "revision": "d4c5a8ba10ae71224752c727094ac4c46947fa29",
        },
        description="Articles and their classes (e.g. sports) from VG news articles extracted from Norsk Aviskorpus.",
        reference="https://huggingface.co/datasets/navjordj/VG_summarization",
        type="Clustering",
        category="p2p",
        eval_splits=["test"],
        eval_langs=["nob-Latn"],
        main_score="v_measure",
        date=("2020-01-01", "2024-12-31"),  # best guess
        form=["written"],
        domains=["News", "Non-fiction"],
        license="CC-BY-NC 4.0",
        socioeconomic_status="mixed",
        annotations_creators="derived",
        dialect=[],
        task_subtypes=["Thematic clustering"],
        text_creation="found",
        # Backslashes are doubled so the LaTeX macros (\o, \AA) are not
        # parsed as invalid Python escape sequences; the resulting string
        # value is unchanged.
        bibtex_citation="""@mastersthesis{navjord2023beyond,
  title={Beyond extractive: advancing abstractive automatic text summarization in Norwegian with transformers},
  author={Navjord, J{\\o}rgen Johnsen and Korsvik, Jon-Mikkel Ryen},
  year={2023},
  school={Norwegian University of Life Sciences, {\\AA}s}
}""",
        n_samples={"test": N_SAMPLES},
        avg_character_length={"test": 2670.3243084794544},
    )

    def dataset_transform(self) -> None:
        """Rename columns, split the labels and subsample the test split.

        ``article`` becomes ``sentences`` and ``classes`` becomes ``labels``.
        Every record's ``labels`` is then passed through ``split_labels``,
        and the ``test`` split is reduced to ``N_SAMPLES`` examples drawn
        with ``self.seed``.
        """
        self.dataset = self.dataset.rename_columns(
            {"article": "sentences", "classes": "labels"}
        )
        self.dataset = self.dataset.map(split_labels)
        # Subsampling the dataset: an integer ``test_size`` asks for that
        # many examples, and only the sampled part is kept.
        self.dataset["test"] = self.dataset["test"].train_test_split(
            test_size=N_SAMPLES, seed=self.seed
        )["test"]
|
||
|
||
class VGHierarchicalClusteringS2S(AbsTaskClusteringFast):
    """Clustering task over the ``ingress`` column of the VG dataset.

    The ``classes`` column is used as the label source; ``split_labels``
    turns it into a list holding its first two comma-separated entries.
    """

    metadata = TaskMetadata(
        name="VGHierarchicalClusteringS2S",
        dataset={
            "path": "navjordj/VG_summarization",
            "revision": "d4c5a8ba10ae71224752c727094ac4c46947fa29",
        },
        description="Articles and their classes (e.g. sports) from VG news articles extracted from Norsk Aviskorpus.",
        reference="https://huggingface.co/datasets/navjordj/VG_summarization",
        type="Clustering",
        # This task embeds the short ``ingress`` text, so it is tagged as
        # sentence-to-sentence, in line with the task name.
        category="s2s",
        eval_splits=["test"],
        eval_langs=["nob-Latn"],
        main_score="v_measure",
        date=("2020-01-01", "2024-12-31"),  # best guess
        form=["written"],
        domains=["News", "Non-fiction"],
        license="CC-BY-NC 4.0",
        socioeconomic_status="mixed",
        annotations_creators="derived",
        dialect=[],
        task_subtypes=["Thematic clustering"],
        text_creation="found",
        # Backslashes are doubled so the LaTeX macros (\o, \AA) are not
        # parsed as invalid Python escape sequences; the resulting string
        # value is unchanged.
        bibtex_citation="""@mastersthesis{navjord2023beyond,
  title={Beyond extractive: advancing abstractive automatic text summarization in Norwegian with transformers},
  author={Navjord, J{\\o}rgen Johnsen and Korsvik, Jon-Mikkel Ryen},
  year={2023},
  school={Norwegian University of Life Sciences, {\\AA}s}
}""",
        n_samples={"test": N_SAMPLES},
        avg_character_length={"test": 139.31247668283325},
    )

    def dataset_transform(self) -> None:
        """Rename columns, split the labels and subsample the test split.

        ``ingress`` becomes ``sentences`` and ``classes`` becomes ``labels``.
        Every record's ``labels`` is then passed through ``split_labels``,
        and the ``test`` split is reduced to ``N_SAMPLES`` examples drawn
        with ``self.seed``.
        """
        self.dataset = self.dataset.rename_columns(
            {"ingress": "sentences", "classes": "labels"}
        )
        self.dataset = self.dataset.map(split_labels)
        # Subsampling the dataset: an integer ``test_size`` asks for that
        # many examples, and only the sampled part is kept.
        self.dataset["test"] = self.dataset["test"].train_test_split(
            test_size=N_SAMPLES, seed=self.seed
        )["test"]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
45 changes: 45 additions & 0 deletions
45
results/intfloat__multilingual-e5-small/VGHierarchicalClusteringP2P.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
{ | ||
"dataset_revision": "d4c5a8ba10ae71224752c727094ac4c46947fa29", | ||
"evaluation_time": 135.50683665275574, | ||
"kg_co2_emissions": null, | ||
"mteb_version": "1.11.6", | ||
"scores": { | ||
"test": [ | ||
{ | ||
"hf_subset": "default", | ||
"languages": [ | ||
"nob-Latn" | ||
], | ||
"main_score": 0.42051163852735396, | ||
"v_measure": 0.42051163852735396, | ||
"v_measures": { | ||
"Level 0": [ | ||
0.38323314310337875, | ||
0.3989320487473865, | ||
0.4105931986190897, | ||
0.38677810276773084, | ||
0.42705826820548526, | ||
0.4407980498265106, | ||
0.3785528238444032, | ||
0.41625337873137075, | ||
0.3989633525705215, | ||
0.4140992664333277 | ||
], | ||
"Level 1": [ | ||
0.43844921921634217, | ||
0.42630343948954647, | ||
0.40904538846014304, | ||
0.43711022680518274, | ||
0.4419996131929952, | ||
0.4401999738405541, | ||
0.4323128799875848, | ||
0.43653274562159944, | ||
0.4367445987697985, | ||
0.4562730523141273 | ||
] | ||
} | ||
} | ||
] | ||
}, | ||
"task_name": "VGHierarchicalClusteringP2P" | ||
} |
45 changes: 45 additions & 0 deletions
45
results/intfloat__multilingual-e5-small/VGHierarchicalClusteringS2S.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
{ | ||
"dataset_revision": "d4c5a8ba10ae71224752c727094ac4c46947fa29", | ||
"evaluation_time": 23.324329137802124, | ||
"kg_co2_emissions": null, | ||
"mteb_version": "1.11.6", | ||
"scores": { | ||
"test": [ | ||
{ | ||
"hf_subset": "default", | ||
"languages": [ | ||
"nob-Latn" | ||
], | ||
"main_score": 0.34103019423671366, | ||
"v_measure": 0.34103019423671366, | ||
"v_measures": { | ||
"Level 0": [ | ||
0.3121116915411674, | ||
0.3074277020644131, | ||
0.34489975581262233, | ||
0.3176749457376003, | ||
0.3392949224315137, | ||
0.3331527675548358, | ||
0.32987604186610175, | ||
0.34547068568262207, | ||
0.3416311797106411, | ||
0.3301424742817373 | ||
], | ||
"Level 1": [ | ||
0.3645265027939012, | ||
0.34739983570204375, | ||
0.33704618502701184, | ||
0.3551549934940932, | ||
0.3305621012508423, | ||
0.34890269151079495, | ||
0.35106274701661966, | ||
0.3702643489228558, | ||
0.3593753066096102, | ||
0.35462700572324524 | ||
] | ||
} | ||
} | ||
] | ||
}, | ||
"task_name": "VGHierarchicalClusteringS2S" | ||
} |
45 changes: 45 additions & 0 deletions
45
...ence-transformers__paraphrase-multilingual-MiniLM-L12-v2/VGHierarchicalClusteringP2P.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
{ | ||
"dataset_revision": "d4c5a8ba10ae71224752c727094ac4c46947fa29", | ||
"evaluation_time": 80.27384757995605, | ||
"kg_co2_emissions": null, | ||
"mteb_version": "1.11.6", | ||
"scores": { | ||
"test": [ | ||
{ | ||
"hf_subset": "default", | ||
"languages": [ | ||
"nob-Latn" | ||
], | ||
"main_score": 0.3544100886422448, | ||
"v_measure": 0.3544100886422448, | ||
"v_measures": { | ||
"Level 0": [ | ||
0.3432523142155497, | ||
0.36317673321215704, | ||
0.3706145253800888, | ||
0.35766818218679197, | ||
0.36533142188274287, | ||
0.3469079530784989, | ||
0.34793190945020136, | ||
0.37450460241437256, | ||
0.361451598798914, | ||
0.3468109403627138 | ||
], | ||
"Level 1": [ | ||
0.3546038236048674, | ||
0.34579585642750266, | ||
0.3459826646868124, | ||
0.3484356826859091, | ||
0.35659378058747193, | ||
0.35447381402923767, | ||
0.3361421350567975, | ||
0.3430716898552756, | ||
0.357995481445959, | ||
0.36745666348303196 | ||
] | ||
} | ||
} | ||
] | ||
}, | ||
"task_name": "VGHierarchicalClusteringP2P" | ||
} |
45 changes: 45 additions & 0 deletions
45
...ence-transformers__paraphrase-multilingual-MiniLM-L12-v2/VGHierarchicalClusteringS2S.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
{ | ||
"dataset_revision": "d4c5a8ba10ae71224752c727094ac4c46947fa29", | ||
"evaluation_time": 64.87495875358582, | ||
"kg_co2_emissions": null, | ||
"mteb_version": "1.11.6", | ||
"scores": { | ||
"test": [ | ||
{ | ||
"hf_subset": "default", | ||
"languages": [ | ||
"nob-Latn" | ||
], | ||
"main_score": 0.2886935384193351, | ||
"v_measure": 0.2886935384193351, | ||
"v_measures": { | ||
"Level 0": [ | ||
0.25927593083469574, | ||
0.26491058719044713, | ||
0.19853402718054483, | ||
0.25007657074147577, | ||
0.30451066421726614, | ||
0.2759300676280902, | ||
0.25660890312269036, | ||
0.3055592803074229, | ||
0.30644353051055306, | ||
0.271323953135058 | ||
], | ||
"Level 1": [ | ||
0.31352802114590117, | ||
0.29961281553155933, | ||
0.2955337891514432, | ||
0.3222434800301185, | ||
0.3101948959625172, | ||
0.29249873108622443, | ||
0.30737376823457874, | ||
0.31481592521969615, | ||
0.29827241585365416, | ||
0.3266234113027661 | ||
] | ||
} | ||
} | ||
] | ||
}, | ||
"task_name": "VGHierarchicalClusteringS2S" | ||
} |