In [2]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.naive_bayes import MultinomialNB
from skmultilearn.problem_transform import BinaryRelevance
from datasets import load_dataset

In [5]:

# Load the dataset
dataset = load_dataset("qanastek/HoC", trust_remote_code=True)
# Extract features and labels
X = dataset['train']['text']
y = dataset['train']['label']

In [9]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(y)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

classifier = BinaryRelevance(MultinomialNB())
classifier.fit(X_train_vectorized, y_train);

In [13]:
# Make predictions
y_pred = classifier.predict(X_test_vectorized)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='micro')

print(f'Accuracy: {accuracy:.4f}')
print(f'F1 Score: {f1:.4f}')

Accuracy: 0.6469
F1 Score: 0.7188


In [32]:
test = np.array([
    [0, 1, 0, 1, 1],
    [0, 0, 0, 1, 0],
    [1, 0 ,1, 0 ,0],
])

freq = np.sum(test, axis=0)
iblr = np.max(freq) / freq
iblr

array([2., 2., 2., 1., 2.])

In [43]:
def uniform_crossover( parentA, parentB ):
    mask = np.random.randint(0, 2, parentA.shape[0] )
    new_genA =  parentA * mask + parentB * (1-mask)
    new_genB =  parentB * mask + parentA * (1-mask)
    return [new_genA, new_genB]

class GeneticSampler:
  
  def __init__(self, pob_size, crossover):
    self.pob = []
    self.pob_size = pob_size
    self.crossover = crossover
    self.assert_time = 0

  def initialize_population(self, n_samples, target_actives=-1 ):

    n_actives = [target_actives] * self.pob_size
    if target_actives == -1:
        n_actives = np.random.binomial( n=n_samples, p=0.1, size=self.pob_size )
        
    self.pob = np.zeros( (self.pob_size, n_samples) )
    for i in range(self.pob_size):
        replace = n_samples < n_actives[i]
        ones_indices = np.random.choice( n_samples, n_actives[i], replace=replace )
        self.pob[i, ones_indices] = 1

  def assert_masks(self, y_):
    zero_cases = np.where( (self.pob @ y_) == 0 )
    zero_cases = ( zero_cases[0], self.pivot_samples[zero_cases[1]] )
    self.pob[zero_cases] = 1

  def calc_pivot_samples(self, y_):
    sorted_indices = np.argsort( np.sum(y_, axis=1) )
    self.pivot_samples = np.zeros(y_.shape[1], dtype=np.uint64)
    for i in range(y_.shape[1]):
        self.pivot_samples[i] = sorted_indices[ np.argmax( y_[sorted_indices, i] == 1 ) ]

  def mutate(self, new_genes, mutation_prob):
      for gen in new_genes:
          if np.random.rand() < mutation_prob:
              mutation_index = np.random.randint(0, gen.shape[0])
              gen[mutation_index] = np.random.randint(0, 2)
      return new_genes

  def update_pob(self, mutation_prob):
      newGen = self.pob_size // 2 + 1
      while newGen < self.pob_size - 1:
            parentA, parentB = np.random.choice( range(self.pob_size // 2), 2 )
            new_genes = self.crossover(self.pob[parentA], self.pob[parentB] )
            new_genes = self.mutate(new_genes, mutation_prob)
            self.pob[ newGen:newGen + len(new_genes) ] = new_genes
            newGen += len(new_genes)
    
  def sample(self, y_, loss, max_iterations=50, target_actives=-1, keep_labels=True, mutation_prob=0.01, verbose=0):
    # Y: MultiLabelBinarizer Product(Samples x Labels)
    y_ = np.array(y_, dtype=np.uint8 )
    n_samples = y_.shape[0]
      
    self.initialize_population(n_samples, target_actives)

    if keep_labels:
        self.calc_pivot_samples(y_)
      
    for i in range(max_iterations):

      if keep_labels:
          self.assert_masks(y_)
      pob_loss = loss(self.pob)

      sorted_indices = np.argsort(pob_loss)
      self.pob = self.pob[sorted_indices]

      if np.any(pob_loss == 0):
        break

      self.update_pob(mutation_prob)

      if verbose:
        if i % verbose == 0:
          print(f"{i} - Min Loss: {np.min(pob_loss):.4f}, Mean Loss: {np.mean(pob_loss):.4f}")   

    return self.pob[0]

In [40]:
def calc_irlbl(pob):
      freq = pob @ test
      zero_mask = freq == 0
      freq = np.ma.masked_array( freq, zero_mask, fill_value=0.01 )
      
      multi_args = {} if pob.ndim == 1 else { "axis": 1, "keepdims": True }
      max_freq = np.max(freq, **multi_args )
      irl_bl = max_freq / freq
      return irl_bl

def loss(pob):
      irl_bl = calc_irlbl(pob)
      return np.mean(irl_bl, axis=1)

In [45]:
sampler = GeneticSampler(10, uniform_crossover)
sampler.sample(test, loss, keep_labels=True)

array([1., 0., 1.])

In [None]:
import numpy as np

target = 100

a = np.random.randint(1, 20, 10)
a = np.ceil( target * a / np.sum(a) )

print(a, a.sum(),  int(np.sum(a) - target) )
selection = np.random.choice( np.where( a > 1 )[0], int(np.sum(a) - target), replace=True ) 
selection = np.bincount(selection)
print(selection.astype(float))
a[:len(selection)] -= selection

print(a)
a.sum()

[7. 3. 8. 7. 3. 5. 9. 4. 3. 7.] 56.0 6
[0. 0. 1. 0. 0. 1. 1. 0. 1. 2.]
[7. 3. 7. 7. 3. 4. 8. 4. 2. 5.]


np.float64(50.0)

In [144]:
np.zeros(10) + np.bincount( [1,2,3,2,2,2,5] )

ValueError: operands could not be broadcast together with shapes (10,) (6,) 