In [None]:
import numpy as np
from datasets import load_dataset

In [5]:
# Fetch the training split of the HoC dataset through the `datasets` library.
# NOTE(review): trust_remote_code=True lets the dataset repository's own
# loading script run locally — keep it only for repositories you trust.
dataset = load_dataset(
    "qanastek/HoC",
    split="train",
    trust_remote_code=True,
)
# Raw label column of the split; it is binarized further down the notebook.
y = dataset["label"]

In [6]:
from sklearn.preprocessing import MultiLabelBinarizer

# Encode the label column as a binary indicator matrix (rows = examples,
# columns = classes).
# The labels are read from `dataset` instead of from `y` so that re-running
# this cell never feeds an already-binarized matrix back into the binarizer.
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(dataset["label"])
y.shape

(12119, 11)

In [16]:
# Per-label imbalance ratio: the count of the most frequent label divided by
# each label's own count. The printed MEAN_IR is the average of those ratios.
freq = y.sum(axis=0)
irl_bl = freq.max() / freq
print(f"MEAN_IR: {np.mean(irl_bl):.2f}")

MEAN_IR: 28.99


In [None]:
import sys

# Make the parent directory importable so that `src` can be found. The guard
# keeps sys.path free of duplicate entries when this cell is executed again.
if ".." not in sys.path:
    sys.path.append("..")

# Source: https://github.com/dagamu/textaug_dashboard/blob/main/src/genetic_sampler.py
from src.genetic_sampler import GeneticSampler, uniform_crossover

# NOTE(review): `pob_size` is presumably the population size of the genetic
# algorithm — confirm against the GeneticSampler source linked above.
sampler = GeneticSampler(pob_size=50, crossover=uniform_crossover)

In [27]:
def calc_pob_irlbl(pob, labels=None):
    """Compute the per-label imbalance ratio of one or more candidate selections.

    Parameters
    ----------
    pob : np.ndarray
        A single selection vector (1-D) or a population of selection vectors
        (2-D, one individual per row). Each vector is multiplied with the
        label matrix to obtain the label counts of the selected rows.
    labels : np.ndarray, optional
        Label indicator matrix with one row per example. When omitted, the
        notebook-level ``y`` is used.

    Returns
    -------
    np.ma.MaskedArray
        ``max_count / count`` for every label, with the same leading shape as
        ``pob @ labels``. No entry is masked.

    Notes
    -----
    A label that does not occur in a selection has a count of zero. Its count
    is replaced by 0.01 so that the ratio is finite and very large, which
    penalises selections that lose a label instead of ignoring that label.
    """
    if labels is None:
        labels = y

    freq = pob @ labels
    # Substitute the small constant explicitly: the `fill_value` of a masked
    # array is only applied by `.filled()`, so masking alone would drop
    # zero-count labels from every later reduction.
    freq = np.where(freq == 0, 0.01, freq)

    # A 1-D individual is reduced to a scalar maximum; a 2-D population is
    # reduced row by row, keeping the axis so the division broadcasts.
    multi_args = {} if pob.ndim == 1 else {"axis": 1, "keepdims": True}
    max_freq = np.max(freq, **multi_args)

    # Wrap in a masked array (with nothing masked) to keep the return type
    # that existing callers receive.
    return np.ma.masked_array(max_freq / freq)

def loss(pob):
    """Return the mean per-label imbalance ratio of each candidate selection.

    Parameters
    ----------
    pob : np.ndarray
        A single selection vector (1-D) or a population with one individual
        per row (2-D); it is forwarded unchanged to ``calc_pob_irlbl``.

    Returns
    -------
    One mean ratio per individual for a 2-D population, or a single value for
    a 1-D individual.
    """
    irl_bl = calc_pob_irlbl(pob)
    # Reduce over the last (label) axis. For a 2-D population this is the same
    # as axis=1, and it also works for a single 1-D individual, where axis=1
    # would raise an AxisError.
    return np.mean(irl_bl, axis=-1)

# Run the genetic search over the label matrix using `loss` as the objective.
# NOTE(review): the meaning of `target_actives`, `keep_labels` and `verbose`
# is defined by GeneticSampler (external module) — presumably the desired
# number of selected rows, "do not drop any label", and the logging interval;
# confirm against its source.
optimal_sample = sampler.sample(
    y,
    loss=loss,
    max_iterations=300,
    target_actives=2000,
    keep_labels=True,
    verbose=50,
)
optimal_sample

0 - Min Loss: 25.5110, Mean Loss: 29.7101
50 - Min Loss: 13.2589, Mean Loss: 13.6085
100 - Min Loss: 11.3510, Mean Loss: 11.3994
150 - Min Loss: 11.2067, Mean Loss: 11.2067
200 - Min Loss: 11.1882, Mean Loss: 11.1952
250 - Min Loss: 11.1681, Mean Loss: 11.1681


array([0., 0., 0., ..., 0., 1., 0.])

In [28]:
# Recompute the imbalance statistics on the rows picked by the sampler, i.e.
# the non-zero entries of `optimal_sample`.
freq = y[optimal_sample.astype(bool)].sum(axis=0)
irl_bl = freq.max() / freq

print(f"RESAMPLING MEAN_IR: {np.mean(irl_bl):.2f}")
print(f"NEW DATASET SIZE: { int(np.sum(optimal_sample)) }")

RESAMPLING MEAN_IR: 11.17
NEW DATASET SIZE: 1853
