fix: Converted VG to hierarchical (embeddings-benchmark#694)
* Added hierarchical VG clustering tasks

* Added stratified subsampling for multilabel tasks to AbsTask

* Added stratified subsampling to VG clustering

* Fixed stratified subsampling for multilabel tasks

* fix: Converted VG to AbsTaskClusteringFast

* Added results for paraphrase model

* Removed debugging print statements

* Added 'not specified' license to VGHierarchical

* Added proper license from Norsk Aviskorpus

* Ran linting

* Replaced stratification with just regular subsampling

* fix: fixed subsampling

* Added results for VG

* Added points

* fix: Fixed JSON in 694.jsonl

---------

Co-authored-by: Kenneth Enevoldsen <kennethcenevoldsen@gmail.com>
2 people authored and dokato committed May 24, 2024
1 parent fc576aa commit b5b3823
Showing 9 changed files with 321 additions and 2 deletions.
2 changes: 2 additions & 0 deletions docs/mmteb/points/694.jsonl
@@ -0,0 +1,2 @@
{"GitHub": "x-tabdeveloping", "Bug fixes": 4}
{"GitHub": "KennethEnevoldsen", "Review PR": 2}
40 changes: 38 additions & 2 deletions mteb/abstasks/AbsTask.py
@@ -3,12 +3,14 @@
import logging
import random
from abc import ABC, abstractmethod
from typing import Any
from typing import Any, Sequence

import datasets
import numpy as np
import torch
from datasets import DatasetDict
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

from mteb.abstasks.TaskMetadata import TaskMetadata
from mteb.encoder_interface import Encoder, EncoderWithQueryCorpusEncode
@@ -19,6 +21,32 @@
logger = logging.getLogger(__name__)


def _multilabel_subsampling(
dataset_dict: datasets.DatasetDict,
seed: int,
splits: list[str] = ["test"],
label: str = "label",
n_samples: int = 2048,
) -> datasets.DatasetDict:
"""Startified subsampling for multilabel problems."""
for split in splits:
labels = dataset_dict[split][label]
encoded_labels = MultiLabelBinarizer().fit_transform(labels)
idxs = np.arange(len(labels))
try:
idxs, *_ = train_test_split(
idxs,
encoded_labels,
stratify=encoded_labels,
random_state=seed,
train_size=n_samples,
)
except ValueError:
logger.warning("Couldn't subsample, continuing with full split.")
dataset_dict.update({split: dataset_dict[split].select(idxs)})
return dataset_dict


class AbsTask(ABC):
metadata: TaskMetadata
superseeded_by: None | str = None
@@ -113,7 +141,15 @@ def stratified_subsampling(
"""
## Can only do this if the label column is of ClassLabel.
if not isinstance(dataset_dict[splits[0]].features[label], datasets.ClassLabel):
dataset_dict = dataset_dict.class_encode_column(label)
try:
dataset_dict = dataset_dict.class_encode_column(label)
except ValueError as e:
if isinstance(dataset_dict[splits[0]][label][0], Sequence):
return _multilabel_subsampling(
dataset_dict, seed, splits, label, n_samples
)
else:
raise e

for split in splits:
dataset_dict.update(
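
In effect, stratified_subsampling now falls back to a multilabel-aware path whenever the label column holds lists of labels rather than a single class. Below is a minimal, self-contained sketch of that fallback behaviour; the toy dataset and the helper name multilabel_subsample are illustrative only and not part of this commit.

import datasets
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

# Toy multilabel dataset: every row carries a list of labels.
toy = datasets.DatasetDict(
    {
        "test": datasets.Dataset.from_dict(
            {
                "sentences": [f"document {i}" for i in range(100)],
                "label": [["sports"] if i % 2 else ["sports", "politics"] for i in range(100)],
            }
        )
    }
)

def multilabel_subsample(dataset_dict, seed=42, split="test", label="label", n_samples=32):
    """Stratify on the binarized label matrix; keep the full split if stratification fails."""
    labels = dataset_dict[split][label]
    encoded = MultiLabelBinarizer().fit_transform(labels)
    idxs = np.arange(len(labels))
    try:
        idxs, *_ = train_test_split(
            idxs, encoded, stratify=encoded, random_state=seed, train_size=n_samples
        )
    except ValueError:
        pass  # e.g. a label combination occurs only once and cannot be stratified
    dataset_dict[split] = dataset_dict[split].select(idxs)
    return dataset_dict

print(len(multilabel_subsample(toy)["test"]))  # 32 rows, label combinations roughly preserved
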
1 change: 1 addition & 0 deletions mteb/tasks/Clustering/__init__.py
@@ -38,6 +38,7 @@
from .nob.snl_clustering import *
from .nob.SNLHierarchicalClustering import *
from .nob.vg_clustering import *
from .nob.VGHierarchicalClustering import *
from .pol.PolishClustering import *
from .rom.RomaniBibleClustering import *
from .spa.SpanishNewsClusteringP2P import *
99 changes: 99 additions & 0 deletions mteb/tasks/Clustering/nob/VGHierarchicalClustering.py
@@ -0,0 +1,99 @@
from __future__ import annotations

from mteb.abstasks import TaskMetadata
from mteb.abstasks.AbsTaskClusteringFast import AbsTaskClusteringFast

N_SAMPLES = 2048


def split_labels(record: dict) -> dict:
record["labels"] = record["labels"].split(",")[:2]
return record


class VGHierarchicalClusteringP2P(AbsTaskClusteringFast):
metadata = TaskMetadata(
name="VGHierarchicalClusteringP2P",
dataset={
"path": "navjordj/VG_summarization",
"revision": "d4c5a8ba10ae71224752c727094ac4c46947fa29",
},
description="Articles and their classes (e.g. sports) from VG news articles extracted from Norsk Aviskorpus.",
reference="https://huggingface.co/datasets/navjordj/VG_summarization",
type="Clustering",
category="p2p",
eval_splits=["test"],
eval_langs=["nob-Latn"],
main_score="v_measure",
date=("2020-01-01", "2024-12-31"), # best guess
form=["written"],
domains=["News", "Non-fiction"],
license="CC-BY-NC 4.0",
socioeconomic_status="mixed",
annotations_creators="derived",
dialect=[],
task_subtypes=["Thematic clustering"],
text_creation="found",
bibtex_citation="""@mastersthesis{navjord2023beyond,
title={Beyond extractive: advancing abstractive automatic text summarization in Norwegian with transformers},
author={Navjord, J{\o}rgen Johnsen and Korsvik, Jon-Mikkel Ryen},
year={2023},
school={Norwegian University of Life Sciences, {\AA}s}
}""",
n_samples={"test": N_SAMPLES},
avg_character_length={"test": 2670.3243084794544},
)

def dataset_transform(self) -> None:
self.dataset = self.dataset.rename_columns(
{"article": "sentences", "classes": "labels"}
)
self.dataset = self.dataset.map(split_labels)
# Subsampling the dataset
self.dataset["test"] = self.dataset["test"].train_test_split(
test_size=N_SAMPLES, seed=self.seed
)["test"]


class VGHierarchicalClusteringS2S(AbsTaskClusteringFast):
metadata = TaskMetadata(
name="VGHierarchicalClusteringS2S",
dataset={
"path": "navjordj/VG_summarization",
"revision": "d4c5a8ba10ae71224752c727094ac4c46947fa29",
},
description="Articles and their classes (e.g. sports) from VG news articles extracted from Norsk Aviskorpus.",
reference="https://huggingface.co/datasets/navjordj/VG_summarization",
type="Clustering",
category="p2p",
eval_splits=["test"],
eval_langs=["nob-Latn"],
main_score="v_measure",
date=("2020-01-01", "2024-12-31"), # best guess
form=["written"],
domains=["News", "Non-fiction"],
license="CC-BY-NC 4.0",
socioeconomic_status="mixed",
annotations_creators="derived",
dialect=[],
task_subtypes=["Thematic clustering"],
text_creation="found",
bibtex_citation="""@mastersthesis{navjord2023beyond,
title={Beyond extractive: advancing abstractive automatic text summarization in Norwegian with transformers},
author={Navjord, J{\o}rgen Johnsen and Korsvik, Jon-Mikkel Ryen},
year={2023},
school={Norwegian University of Life Sciences, {\AA}s}
}""",
n_samples={"test": N_SAMPLES},
avg_character_length={"test": 139.31247668283325},
)

def dataset_transform(self) -> None:
self.dataset = self.dataset.rename_columns(
{"ingress": "sentences", "classes": "labels"}
)
self.dataset = self.dataset.map(split_labels)
# Subsampling the dataset
self.dataset["test"] = self.dataset["test"].train_test_split(
test_size=N_SAMPLES, seed=self.seed
)["test"]
1 change: 1 addition & 0 deletions mteb/tasks/Clustering/nob/vg_clustering.py
@@ -21,6 +21,7 @@ def batched(iterable: Iterable[T], n: int) -> Iterable[tuple[T, ...]]:


class VGClustering(AbsTaskClustering):
superseeded_by = "VGHierarchicalClusteringP2P"
metadata = TaskMetadata(
name="VGClustering",
dataset={
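
With VGClustering now pointing at its hierarchical replacement, the new tasks can be run through the usual MTEB entry point. A sketch, assuming sentence-transformers is installed; the model name is only an example, since the commit mentions results for a paraphrase model without pinning the exact checkpoint here:

from mteb import MTEB
from sentence_transformers import SentenceTransformer

# Illustrative model choice; any SentenceTransformer-compatible encoder works.
model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")

evaluation = MTEB(tasks=["VGHierarchicalClusteringP2P", "VGHierarchicalClusteringS2S"])
evaluation.run(model, output_folder="results")
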
@@ -0,0 +1,45 @@
{
"dataset_revision": "d4c5a8ba10ae71224752c727094ac4c46947fa29",
"evaluation_time": 135.50683665275574,
"kg_co2_emissions": null,
"mteb_version": "1.11.6",
"scores": {
"test": [
{
"hf_subset": "default",
"languages": [
"nob-Latn"
],
"main_score": 0.42051163852735396,
"v_measure": 0.42051163852735396,
"v_measures": {
"Level 0": [
0.38323314310337875,
0.3989320487473865,
0.4105931986190897,
0.38677810276773084,
0.42705826820548526,
0.4407980498265106,
0.3785528238444032,
0.41625337873137075,
0.3989633525705215,
0.4140992664333277
],
"Level 1": [
0.43844921921634217,
0.42630343948954647,
0.40904538846014304,
0.43711022680518274,
0.4419996131929952,
0.4401999738405541,
0.4323128799875848,
0.43653274562159944,
0.4367445987697985,
0.4562730523141273
]
}
}
]
},
"task_name": "VGHierarchicalClusteringP2P"
}
@@ -0,0 +1,45 @@
{
"dataset_revision": "d4c5a8ba10ae71224752c727094ac4c46947fa29",
"evaluation_time": 23.324329137802124,
"kg_co2_emissions": null,
"mteb_version": "1.11.6",
"scores": {
"test": [
{
"hf_subset": "default",
"languages": [
"nob-Latn"
],
"main_score": 0.34103019423671366,
"v_measure": 0.34103019423671366,
"v_measures": {
"Level 0": [
0.3121116915411674,
0.3074277020644131,
0.34489975581262233,
0.3176749457376003,
0.3392949224315137,
0.3331527675548358,
0.32987604186610175,
0.34547068568262207,
0.3416311797106411,
0.3301424742817373
],
"Level 1": [
0.3645265027939012,
0.34739983570204375,
0.33704618502701184,
0.3551549934940932,
0.3305621012508423,
0.34890269151079495,
0.35106274701661966,
0.3702643489228558,
0.3593753066096102,
0.35462700572324524
]
}
}
]
},
"task_name": "VGHierarchicalClusteringS2S"
}
@@ -0,0 +1,45 @@
{
"dataset_revision": "d4c5a8ba10ae71224752c727094ac4c46947fa29",
"evaluation_time": 80.27384757995605,
"kg_co2_emissions": null,
"mteb_version": "1.11.6",
"scores": {
"test": [
{
"hf_subset": "default",
"languages": [
"nob-Latn"
],
"main_score": 0.3544100886422448,
"v_measure": 0.3544100886422448,
"v_measures": {
"Level 0": [
0.3432523142155497,
0.36317673321215704,
0.3706145253800888,
0.35766818218679197,
0.36533142188274287,
0.3469079530784989,
0.34793190945020136,
0.37450460241437256,
0.361451598798914,
0.3468109403627138
],
"Level 1": [
0.3546038236048674,
0.34579585642750266,
0.3459826646868124,
0.3484356826859091,
0.35659378058747193,
0.35447381402923767,
0.3361421350567975,
0.3430716898552756,
0.357995481445959,
0.36745666348303196
]
}
}
]
},
"task_name": "VGHierarchicalClusteringP2P"
}
@@ -0,0 +1,45 @@
{
"dataset_revision": "d4c5a8ba10ae71224752c727094ac4c46947fa29",
"evaluation_time": 64.87495875358582,
"kg_co2_emissions": null,
"mteb_version": "1.11.6",
"scores": {
"test": [
{
"hf_subset": "default",
"languages": [
"nob-Latn"
],
"main_score": 0.2886935384193351,
"v_measure": 0.2886935384193351,
"v_measures": {
"Level 0": [
0.25927593083469574,
0.26491058719044713,
0.19853402718054483,
0.25007657074147577,
0.30451066421726614,
0.2759300676280902,
0.25660890312269036,
0.3055592803074229,
0.30644353051055306,
0.271323953135058
],
"Level 1": [
0.31352802114590117,
0.29961281553155933,
0.2955337891514432,
0.3222434800301185,
0.3101948959625172,
0.29249873108622443,
0.30737376823457874,
0.31481592521969615,
0.29827241585365416,
0.3266234113027661
]
}
}
]
},
"task_name": "VGHierarchicalClusteringS2S"
}
