In [None]:
import pandas as pd
import numpy as np
import math
from scipy.spatial import distance_matrix
import random
from sklearn.neighbors import NearestNeighbors


In [None]:
# Location of the CSV, given relative to the current working directory.
data_path = 'content/abalone9-18.csv'
data = pd.read_csv(data_path)

In [None]:
# Seed NumPy's global RNG for reproducibility. Only np.random is seeded here;
# the stdlib `random` module imported above is not.
np.random.seed(42)

In [None]:
data.shape

(731, 9)

In [None]:
data.head()

Unnamed: 0,1,2,3,4,5,6,7,8,label
0,1,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,0
1,2,0.475,0.37,0.125,0.5095,0.2165,0.1125,0.165,0
2,2,0.45,0.32,0.1,0.381,0.1705,0.075,0.115,0
3,1,0.55,0.415,0.135,0.7635,0.318,0.21,0.2,0
4,2,0.665,0.525,0.165,1.338,0.5515,0.3575,0.35,1


In [None]:
# Number of rows per class label (value_counts orders them most frequent first).
label_counts = data['label'].value_counts()
label_counts

label
0    689
1     42
Name: count, dtype: int64

In [None]:
# Count of null entries in each column.
missing_values = data.isnull().sum()
missing_values

1        0
2        0
3        0
4        0
5        0
6        0
7        0
8        0
label    0
dtype: int64

In [None]:
# The most frequent label is treated as the majority class and the least
# frequent one as the minority class.
majority_label = label_counts.idxmax()
minority_label = label_counts.idxmin()

In [None]:
majority_label

0

In [None]:
minority_label

1

In [None]:
# Majority class: feature matrix (every column but the last) and its labels.
majority_class = data.loc[data['label'] == majority_label].iloc[:, :-1].to_numpy()
labels_majority = data.loc[data['label'] == majority_label, 'label'].to_numpy()

# Minority class: same two views.
minority_class = data.loc[data['label'] == minority_label].iloc[:, :-1].to_numpy()
labels_minority = data.loc[data['label'] == minority_label, 'label'].to_numpy()

In [None]:
majority_class[:25]

array([[1.    , 0.53  , 0.42  , 0.135 , 0.677 , 0.2565, 0.1415, 0.21  ],
       [2.    , 0.475 , 0.37  , 0.125 , 0.5095, 0.2165, 0.1125, 0.165 ],
       [2.    , 0.45  , 0.32  , 0.1   , 0.381 , 0.1705, 0.075 , 0.115 ],
       [1.    , 0.55  , 0.415 , 0.135 , 0.7635, 0.318 , 0.21  , 0.2   ],
       [2.    , 0.355 , 0.29  , 0.09  , 0.3275, 0.134 , 0.086 , 0.09  ],
       [1.    , 0.45  , 0.335 , 0.105 , 0.425 , 0.1865, 0.091 , 0.115 ],
       [2.    , 0.47  , 0.37  , 0.12  , 0.5795, 0.293 , 0.227 , 0.14  ],
       [1.    , 0.525 , 0.425 , 0.16  , 0.8355, 0.3545, 0.2135, 0.245 ],
       [2.    , 0.505 , 0.405 , 0.11  , 0.625 , 0.305 , 0.16  , 0.175 ],
       [2.    , 0.595 , 0.475 , 0.14  , 0.944 , 0.3625, 0.189 , 0.315 ],
       [1.    , 0.575 , 0.445 , 0.14  , 0.941 , 0.3845, 0.252 , 0.285 ],
       [1.    , 0.46  , 0.355 , 0.13  , 0.517 , 0.2205, 0.114 , 0.165 ],
       [2.    , 0.535 , 0.435 , 0.15  , 0.725 , 0.269 , 0.1385, 0.25  ],
       [1.    , 0.51  , 0.39  , 0.135 , 0.6335, 0.2

In [None]:
minority_class

array([[2.    , 0.665 , 0.525 , 0.165 , 1.338 , 0.5515, 0.3575, 0.35  ],
       [2.    , 0.71  , 0.54  , 0.165 , 1.959 , 0.7665, 0.261 , 0.78  ],
       [1.    , 0.725 , 0.56  , 0.21  , 2.141 , 0.65  , 0.398 , 1.005 ],
       [2.    , 0.61  , 0.5   , 0.24  , 1.642 , 0.532 , 0.3345, 0.69  ],
       [1.    , 0.58  , 0.455 , 0.155 , 0.8365, 0.315 , 0.1385, 0.32  ],
       [1.    , 0.645 , 0.525 , 0.19  , 1.8085, 0.7035, 0.3885, 0.395 ],
       [2.    , 0.605 , 0.49  , 0.18  , 1.227 , 0.48  , 0.287 , 0.35  ],
       [1.    , 0.61  , 0.485 , 0.165 , 1.0915, 0.3935, 0.2435, 0.33  ],
       [1.    , 0.575 , 0.45  , 0.17  , 1.0475, 0.3775, 0.1705, 0.385 ],
       [1.    , 0.57  , 0.45  , 0.175 , 0.9555, 0.38  , 0.1665, 0.295 ],
       [2.    , 0.565 , 0.455 , 0.17  , 0.9065, 0.342 , 0.156 , 0.32  ],
       [1.    , 0.565 , 0.455 , 0.15  , 0.8205, 0.365 , 0.159 , 0.26  ],
       [3.    , 0.525 , 0.41  , 0.175 , 0.874 , 0.3585, 0.207 , 0.205 ],
       [3.    , 0.495 , 0.37  , 0.125 , 0.4775, 0.1

In [None]:
# Full feature matrix (every column except 'label') and the label vector,
# both as NumPy arrays.
features = data.drop('label', axis=1).values
labels = data['label'].values

In [None]:
# Turn the label vector into a column vector of shape (len(data), 1).
# The shape is passed positionally: the `newshape=` keyword is deprecated
# in recent NumPy releases.
labels = np.reshape(labels, (len(data), 1))

In [None]:
features.shape

(731, 8)

In [None]:
# Class sizes, taken from the per-class label arrays.
majority_count = len(labels_majority)
minority_count = len(labels_minority)

In [None]:
majority_count

689

In [None]:
# Imbalance ratio = majority instances per minority instance.
if minority_count > 0:
    imbalance_ratio = majority_count / minority_count
else:
    imbalance_ratio = 0  # Sentinel for the case with no minority class instances (avoids dividing by zero)
print(f"Imbalance Ratio: {imbalance_ratio:.4f}")

Imbalance Ratio: 16.4048


In [None]:
def sample_size(N, num_min, Z=1.64, epsilon=0.05):
    """Return the number of instances to sample from a population of size N.

    Parameters
    ----------
    N : int
        Population size (the notebook passes the majority-class count).
    num_min : int
        Count used to form the proportion ``p = num_min / N`` (the notebook
        passes the minority-class count).
    Z : float, default 1.64
        Z-score multiplier; presumably chosen for a ~90% confidence level.
    epsilon : float, default 0.05
        Base margin of error; ``log(N) / N`` is added to it.

    Returns
    -------
    int
        Sample size rounded up. 0 for an empty population (``N <= 0``) and
        for ``p == 0`` with ``N >= 9``; 1 for the other degenerate
        proportions (``p == 0`` with ``N < 9``, or ``p == 1``).

    Notes
    -----
    The inputs are not validated beyond ``N <= 0``: ``num_min > N`` gives
    ``p > 1``, for which the formula is not meaningful.
    """
    # An empty population has nothing to sample and would otherwise divide by zero.
    if N <= 0:
        return 0

    p = num_min / N
    if p == 0 and N >= 9:
        size1 = 0
    elif p == 0 or p == 1:
        size1 = 1
    else:
        e = epsilon + np.log(N) / N
        x = (Z**2 * p * (1 - p)) / (e**2)
        # Shrink the unbounded estimate x to the finite population of size N.
        size1 = (N * x) / (x + N - 1)
    return math.ceil(size1)

In [None]:
# Note the argument order: the majority count is passed as N and the
# minority count as num_min.
calculated_sample_size = sample_size(majority_count, minority_count)
calculated_sample_size

41

In [None]:

def calculate_weights(data, labels, k=5):
    """Score every sample by how strongly its nearest neighbours share its label.

    For each row of ``data`` the k nearest other rows are found; each
    neighbour contributes ``1 / (distance + 1e-5)`` and the score is the share
    of that total contributed by neighbours carrying the same label.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
    labels : array-like with n_samples entries (flattened internally)
    k : int, default 5
        Number of neighbours per sample. Clamped to ``n_samples - 1`` when
        the data set is smaller than ``k + 1`` rows.

    Returns
    -------
    numpy.ndarray of shape (n_samples,)
        Scores in [0, 1]; 1 means all neighbour weight is same-class. Rows
        with no usable neighbours score 0.
    """
    data = np.asarray(data)
    labels = np.array(labels).flatten()
    n_samples = data.shape[0]

    # A point has at most n_samples - 1 neighbours other than itself.
    k_eff = min(k, n_samples - 1)
    if k_eff < 1:
        return np.zeros(n_samples)

    nbrs = NearestNeighbors(n_neighbors=k_eff, algorithm='ball_tree').fit(data)
    # Calling kneighbors() without a query makes scikit-learn exclude each
    # point from its own neighbour list. Dropping column 0 of
    # kneighbors(data) is only equivalent when there are no duplicate rows:
    # with duplicates, column 0 may be a twin rather than the point itself.
    distances, indices = nbrs.kneighbors()

    # The small constant keeps zero distances (duplicate rows) finite.
    inverse_distances = 1 / (distances + 1e-5)
    same_class_mask = labels[indices] == labels[:, np.newaxis]

    total_same_class_weight = np.sum(inverse_distances * same_class_mask, axis=1)
    total_different_class_weight = np.sum(inverse_distances * ~same_class_mask, axis=1)
    total_weight = total_same_class_weight + total_different_class_weight

    # Rows whose total weight is not positive keep the default score of 0.
    weights = np.zeros(n_samples)
    np.divide(total_same_class_weight, total_weight, out=weights, where=total_weight > 0)
    return weights


In [None]:
# Per-sample same-class neighbour score over the whole data set.
weights = calculate_weights(features, labels, k=5)

In [None]:
weights.shape

(731,)

In [None]:
# Split the per-sample weights by class. labels is flattened because it was
# reshaped into a column vector earlier.
majority_weights = weights[labels.flatten() == majority_label]
minority_weights = weights[labels.flatten() == minority_label]

In [None]:
minority_weights

array([0.        , 0.38575041, 0.24992685, 0.63869948, 0.19180826,
       0.        , 0.        , 0.        , 0.17828388, 0.204649  ,
       0.19191533, 0.        , 0.        , 0.        , 0.        ,
       0.10555741, 0.2451499 , 0.48755938, 0.16867578, 0.21849434,
       0.61722834, 0.        , 0.17785492, 0.18408574, 0.39801757,
       0.        , 0.        , 0.        , 0.37936736, 0.28115806,
       0.        , 0.19131505, 0.42076677, 0.        , 0.2111699 ,
       0.        , 0.66808429, 0.48648307, 0.44073223, 0.        ,
       0.60966803, 0.        ])

In [None]:
majority_weights

array([0.80798585, 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 0.80393099,
       1.        , 0.73915717, 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 0.75767657,
       1.        , 1.        , 1.        , 1.        , 0.83537961,
       1.        , 1.        , 0.73113932, 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 0.79299798,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       0.81851137, 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 0.79895233,
       1.        , 1.        , 1.        , 1.        , 1.     

In [None]:
# Class-level score = mean of the per-sample scores. np.mean replaces the
# builtin sum()/len() on ndarrays and matches how the post-concatenation
# scores are computed later in the notebook.
minority_complexity = np.mean(minority_weights)
majority_complexity = np.mean(majority_weights)
print(f"Minority Class Complexity: {minority_complexity:.4f}")
print(f"Majority Class Complexity: {majority_complexity:.4f}")

Minority Class Complexity: 0.1984
Majority Class Complexity: 0.9804


In [None]:
# Sanity check: shapes of the main arrays built so far.
print("Data Shape:", data.shape)
print("Labels Shape:", labels.shape)
print("Weights Shape:", weights.shape)
print("Majority weights shape: ", majority_weights.shape)
# This line reports the minority weights; its label used to read "Majority".
print("Minority weights shape: ", minority_weights.shape)
print("Adjusted Labels Shape:", labels.shape)
print("Minority class shape: ", minority_class.shape)
print("Majority class shape: ", majority_class.shape)

Data Shape: (731, 9)
Labels Shape: (731, 1)
Weights Shape: (731,)
Majority weights shape:  (689,)
Majority weights shape:  (42,)
Adjusted Labels Shape: (731, 1)
Minority class shape:  (42, 8)
Majority class shape:  (689, 8)


In [None]:
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import pairwise_distances

def initialize_population(valid_indices, population_size, individual_size):
    """Build the starting population for the genetic algorithm.

    Each individual is an array of ``individual_size`` distinct entries drawn
    from ``valid_indices`` with NumPy's global RNG; ``population_size`` such
    individuals are returned as a list.
    """
    population = []
    for _ in range(population_size):
        # replace=False keeps the entries of a single individual distinct.
        member = np.random.choice(valid_indices, size=individual_size, replace=False)
        population.append(member)
    return population

def concatenate_samples(P, N, SetN):
    """Build row-wise Cartesian products of the input sample sets.

    Parameters
    ----------
    P, N, SetN : array-like
        Sample matrices (one row per sample); 1-D inputs are treated as a
        single row.

    Returns
    -------
    (P_cartesian, N_cartesian) : tuple of numpy.ndarray
        ``P_cartesian`` holds every ordered pair of rows of ``P`` joined end
        to end (``len(P) ** 2`` rows). ``N_cartesian`` holds every row of
        ``N`` joined with every row of ``SetN`` (``len(N) * len(SetN)``
        rows). The left-hand row varies slowest. An input with zero rows
        gives an array with zero rows and the combined column count.
    """
    P = np.atleast_2d(P)
    N = np.atleast_2d(N)
    SetN = np.atleast_2d(SetN)

    def _pair_rows(left, right):
        # Repeat each left row len(right) times and cycle right len(left)
        # times. This gives the same row order as nested
        # "for l in left for r in right" loops, without a Python-level loop.
        return np.hstack([
            np.repeat(left, len(right), axis=0),
            np.tile(right, (len(left), 1)),
        ])

    return _pair_rows(P, P), _pair_rows(N, SetN)

def calculate_weights(data, labels, k=5):
    """Score every sample by how strongly its nearest neighbours share its label.

    For each row of ``data`` the k nearest other rows are found; each
    neighbour contributes ``1 / (distance + 1e-5)`` and the score is the share
    of that total contributed by neighbours carrying the same label.

    Note: a function with this name is also defined in an earlier cell of the
    notebook; once this cell runs, this definition is the one in effect.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
    labels : array-like with n_samples entries (flattened internally)
    k : int, default 5
        Number of neighbours per sample. Clamped to ``n_samples - 1`` when
        the data set is smaller than ``k + 1`` rows.

    Returns
    -------
    numpy.ndarray of shape (n_samples,)
        Scores in [0, 1]; 1 means all neighbour weight is same-class. Rows
        with no usable neighbours score 0.
    """
    data = np.asarray(data)
    labels = np.array(labels).flatten()
    n_samples = data.shape[0]

    # A point has at most n_samples - 1 neighbours other than itself.
    k_eff = min(k, n_samples - 1)
    if k_eff < 1:
        return np.zeros(n_samples)

    nbrs = NearestNeighbors(n_neighbors=k_eff, algorithm='ball_tree').fit(data)
    # Calling kneighbors() without a query makes scikit-learn exclude each
    # point from its own neighbour list. Dropping column 0 of
    # kneighbors(data) is only equivalent when there are no duplicate rows,
    # and the Cartesian-product data scored by the GA can contain duplicates
    # (crossover and mutation may repeat an index inside an individual).
    distances, indices = nbrs.kneighbors()

    # The small constant keeps zero distances (duplicate rows) finite.
    inverse_distances = 1 / (distances + 1e-5)
    same_class_mask = labels[indices] == labels[:, np.newaxis]

    total_same_class_weight = np.sum(inverse_distances * same_class_mask, axis=1)
    total_different_class_weight = np.sum(inverse_distances * ~same_class_mask, axis=1)
    total_weight = total_same_class_weight + total_different_class_weight

    # Rows whose total weight is not positive keep the default score of 0.
    weights = np.zeros(n_samples)
    np.divide(total_same_class_weight, total_weight, out=weights, where=total_weight > 0)
    return weights

def calculate_class_complexity(P_cartesian, N_cartesian, weights):
    """Average the per-row weights separately for the two concatenated classes.

    ``weights`` is expected to list the ``P_cartesian`` rows first, followed
    by the ``N_cartesian`` rows. Returns ``(minority_complexity,
    majority_complexity)``; a class with no rows, or with no weights in its
    slice, gets 0.
    """
    n_minority = len(P_cartesian)

    def _average(group_size, section):
        # An absent class never touches `weights` and scores 0.
        if group_size == 0:
            return 0
        chunk = weights[section]
        return np.mean(chunk) if len(chunk) > 0 else 0

    return (
        _average(n_minority, slice(None, n_minority)),
        _average(len(N_cartesian), slice(n_minority, None)),
    )

def calculate_fitness(individual, features, minority_class, majority_class, k=5):
    """Score a candidate subset by the class complexities it induces.

    ``individual`` indexes rows of ``features``. Those rows are paired with
    every majority row, the minority rows are paired with each other, and the
    same-class neighbour score is averaged per group.

    Returns ``(fitness_score, minority_complexity, majority_complexity)``
    where ``fitness_score`` is the sum of the two complexities.
    """
    candidate_rows = features[individual]

    # Minority x minority pairs, and majority x candidate pairs.
    P_cartesian, N_cartesian = concatenate_samples(minority_class, majority_class, candidate_rows)

    # Stack both groups; label the minority pairs 0 and the majority pairs 1.
    stacked = np.vstack((P_cartesian, N_cartesian))
    group_labels = np.concatenate([np.zeros(len(P_cartesian)), np.ones(len(N_cartesian))])
    pair_weights = calculate_weights(stacked, group_labels, k=k)

    minority_complexity, majority_complexity = calculate_class_complexity(
        P_cartesian, N_cartesian, pair_weights
    )

    return minority_complexity + majority_complexity, minority_complexity, majority_complexity

def select_parents(population, fitnesses, num_parents, tournament_size=3):
    """Pick parents by tournament selection.

    Parameters
    ----------
    population : sequence
        Candidate individuals.
    fitnesses : array-like
        Fitness of each individual, aligned with ``population``. Lists are
        accepted as well as arrays.
    num_parents : int
        Number of tournaments to run (one parent per tournament).
    tournament_size : int, default 3
        Individuals drawn, without replacement, for each tournament. Capped
        at ``len(population)`` so small populations still work.

    Returns
    -------
    list
        The winner (highest fitness) of each tournament.

    Raises
    ------
    ValueError
        If parents are requested from an empty population.
    """
    # Fancy indexing below needs an ndarray; a plain list would raise TypeError.
    fitnesses = np.asarray(fitnesses)
    if num_parents > 0 and len(population) == 0:
        raise ValueError("cannot select parents from an empty population")

    contenders = min(tournament_size, len(population))
    parents = []
    for _ in range(num_parents):
        participants = np.random.choice(len(population), size=contenders, replace=False)
        best = participants[np.argmax(fitnesses[participants])]
        parents.append(population[best])
    return parents

def crossover(parent1, parent2):
    """Perform one-point crossover between two parents.

    A cut position is drawn uniformly from ``1 .. len(parent1) - 1``; child1
    takes ``parent1`` before the cut and ``parent2`` after it, and child2 the
    reverse. Parents with fewer than two genes have no interior cut point, so
    copies of them are returned unchanged.

    Note that the children may contain repeated values even when each parent
    holds distinct ones.
    """
    # randint(1, len) needs len >= 2; shorter parents cannot be split.
    if len(parent1) < 2:
        return np.array(parent1, copy=True), np.array(parent2, copy=True)

    crossover_point = np.random.randint(1, len(parent1))
    child1 = np.concatenate([parent1[:crossover_point], parent2[crossover_point:]])
    child2 = np.concatenate([parent2[:crossover_point], parent1[crossover_point:]])
    return child1, child2

def mutate(individual, mutation_rate, valid_indices):
    """Return a mutated copy of ``individual``.

    Each position is independently replaced, with probability
    ``mutation_rate``, by a value drawn uniformly from ``valid_indices``.
    The rate is used exactly as passed in; any schedule (for example a
    decreasing rate over generations) is the caller's responsibility.

    The input is left untouched; the mutated copy is returned. A replacement
    value may duplicate one already present in the individual.
    """
    # Work on a copy so the caller's array or list is not modified in place.
    mutated = individual.copy()
    for i in range(len(mutated)):
        if np.random.rand() < mutation_rate:
            mutated[i] = np.random.choice(valid_indices)
    return mutated

def genetic_algorithm(features, population_size, individual_size, generations, initial_mutation_rate, optimal_fitness, minority_class, majority_class, k=5):
    """Search for the subset of rows of ``features`` with the highest fitness.

    Individuals are arrays of ``individual_size`` row indices into
    ``features``. Each generation evaluates ``calculate_fitness`` for every
    individual, keeps the best individual seen so far (elitism) and fills the
    rest of the population with mutated one-point-crossover children of
    tournament-selected parents. One progress line is printed per generation.

    Parameters
    ----------
    features : numpy.ndarray
        Sample matrix that individuals index into.
    population_size : int
        Number of individuals kept in every generation.
    individual_size : int
        Number of indices per individual.
    generations : int
        Maximum number of generations; must be at least 1.
    initial_mutation_rate : float
        Per-gene mutation probability in the first generation; it decreases
        linearly towards 0 over the run.
    optimal_fitness : float
        The search stops early once the best fitness reaches this value.
    minority_class, majority_class : numpy.ndarray
        Class sample matrices forwarded to ``calculate_fitness``.
    k : int, default 5
        Neighbour count forwarded to ``calculate_fitness``.

    Returns
    -------
    (best_samples, best_fitness)
        The rows of ``features`` selected by the best individual, and that
        individual's fitness (NaN fitness values are treated as 0).

    Raises
    ------
    ValueError
        If ``generations`` is smaller than 1.
    """
    if generations < 1:
        raise ValueError("generations must be at least 1")

    valid_indices = np.arange(features.shape[0])
    population = initialize_population(valid_indices, population_size, individual_size)

    best_overall_fitness = -np.inf
    best_overall_individual = None
    best_minority_complexity = 0
    best_majority_complexity = 0

    # One slot is reserved for the elite. Children come in pairs, so round the
    # parent count up to an even number; the surplus child (if any) is trimmed
    # below. Without the rounding an even population_size loses a member after
    # the first generation.
    num_children = max(population_size - 1, 0)
    num_parents = num_children + (num_children % 2)

    for generation in range(generations):
        fitness_results = [calculate_fitness(ind, features, minority_class, majority_class, k=k) for ind in population]
        minority_complexities = [result[1] for result in fitness_results]
        majority_complexities = [result[2] for result in fitness_results]

        # NaN fitness values are replaced by 0 so that argmax stays meaningful.
        fitnesses = np.nan_to_num([result[0] for result in fitness_results], nan=0.0)

        best_index = np.argmax(fitnesses)
        if fitnesses[best_index] > best_overall_fitness:
            best_overall_fitness = fitnesses[best_index]
            best_overall_individual = population[best_index]
            best_minority_complexity = minority_complexities[best_index]
            best_majority_complexity = majority_complexities[best_index]

        if best_overall_fitness >= optimal_fitness:
            break

        mutation_rate = initial_mutation_rate * (1 - generation / generations)  # Decreasing mutation rate
        parents = select_parents(population, fitnesses, num_parents)
        next_population = [best_overall_individual]  # Elitism: keep the best individual

        for i in range(0, len(parents) - 1, 2):
            child1, child2 = crossover(parents[i], parents[i + 1])
            child1 = mutate(child1, mutation_rate, valid_indices)
            child2 = mutate(child2, mutation_rate, valid_indices)
            next_population.extend([child1, child2])
        population = next_population[:population_size]

        # Verbose output to track progress
        print(f"Generation {generation + 1}, Best Fitness: {best_overall_fitness:.4f}, Minority Complexity: {best_minority_complexity:.4f}, Majority Complexity: {best_majority_complexity:.4f}")

    best_samples = features[best_overall_individual]
    # The best fitness was already computed when the individual was recorded,
    # so it is returned directly instead of running one more evaluation.
    return best_samples, best_overall_fitness


# GA settings
population_size = 10
individual_size = calculated_sample_size  # must not exceed the number of rows in features
generations = 200
initial_mutation_rate = 0.1
# Fitness is the sum of two class scores, each at most 1; stop early only
# when both are essentially perfect.
optimal_fitness = 2 * 0.999

# Run the search and report the winning subset.
best_samples, best_fitness = genetic_algorithm(
    features=features,
    population_size=population_size,
    individual_size=individual_size,
    generations=generations,
    initial_mutation_rate=initial_mutation_rate,
    optimal_fitness=optimal_fitness,
    minority_class=minority_class,
    majority_class=majority_class,
)
print("Selected Samples (Best Solution):\n", best_samples)
print("Sum of Complexity for the Best Solution:", best_fitness)


Generation 1, Best Fitness: 1.7719, Minority Complexity: 0.7727, Majority Complexity: 0.9992
Generation 2, Best Fitness: 1.7719, Minority Complexity: 0.7727, Majority Complexity: 0.9992
Generation 3, Best Fitness: 1.7719, Minority Complexity: 0.7727, Majority Complexity: 0.9992
Generation 4, Best Fitness: 1.7961, Minority Complexity: 0.7973, Majority Complexity: 0.9989
Generation 5, Best Fitness: 1.7961, Minority Complexity: 0.7973, Majority Complexity: 0.9989
Generation 6, Best Fitness: 1.7970, Minority Complexity: 0.7982, Majority Complexity: 0.9987
Generation 7, Best Fitness: 1.7981, Minority Complexity: 0.7988, Majority Complexity: 0.9992
Generation 8, Best Fitness: 1.8102, Minority Complexity: 0.8111, Majority Complexity: 0.9991
Generation 9, Best Fitness: 1.8169, Minority Complexity: 0.8178, Majority Complexity: 0.9991
Generation 10, Best Fitness: 1.8175, Minority Complexity: 0.8183, Majority Complexity: 0.9991
Generation 11, Best Fitness: 1.8217, Minority Complexity: 0.8225, Maj

In [None]:
def concatenate_samples(P, N, SetN):
    """Build row-wise Cartesian products of the input sample sets.

    Note: a function with this name is also defined in an earlier cell of the
    notebook; once this cell runs, this definition is the one in effect.

    Parameters
    ----------
    P, N, SetN : array-like
        Sample matrices (one row per sample); 1-D inputs are treated as a
        single row.

    Returns
    -------
    (P_cartesian, N_cartesian) : tuple of numpy.ndarray
        ``P_cartesian`` holds every ordered pair of rows of ``P`` joined end
        to end (``len(P) ** 2`` rows). ``N_cartesian`` holds every row of
        ``N`` joined with every row of ``SetN`` (``len(N) * len(SetN)``
        rows). The left-hand row varies slowest. An input with zero rows
        gives an array with zero rows and the combined column count.
    """
    # Ensure all inputs are at least 2D
    P = np.atleast_2d(P)
    N = np.atleast_2d(N)
    SetN = np.atleast_2d(SetN)

    def _pair_rows(left, right):
        # Repeat each left row len(right) times and cycle right len(left)
        # times. This gives the same row order as nested
        # "for l in left for r in right" loops, without a Python-level loop.
        return np.hstack([
            np.repeat(left, len(right), axis=0),
            np.tile(right, (len(left), 1)),
        ])

    return _pair_rows(P, P), _pair_rows(N, SetN)

In [None]:
# Rebuild the pairings for the subset returned by the GA run:
# minority x minority, and majority x best_samples.
P_cartesian, N_cartesian = concatenate_samples(minority_class,majority_class, best_samples)


In [None]:
P_cartesian.shape

(1764, 16)

In [None]:
N_cartesian.shape

(28249, 16)

In [None]:
# Score the stacked pairings; rows from P_cartesian are labelled 0 and rows
# from N_cartesian are labelled 1.
new_weights = calculate_weights(
    np.vstack((P_cartesian, N_cartesian)),
    np.repeat([0.0, 1.0], [len(P_cartesian), len(N_cartesian)]),
    k=5,
)

In [None]:
# The P_cartesian rows were stacked first, so the leading len(P_cartesian)
# weights belong to them and the remainder to N_cartesian.
minority_weights_new = new_weights[:len(P_cartesian)]
majority_weights_new = new_weights[len(P_cartesian):]

In [None]:
minority_weights_new.shape

(1764,)

In [None]:
majority_weights_new.shape

(28249,)

In [None]:
# Calculate complexities based on the weights calculated
minority_complexity = np.mean(minority_weights_new)  # Complexity for minority class after concatenation
majority_complexity = np.mean(majority_weights_new)  # Complexity for majority class after concatenation

print(f"Minority Class Complexity after Concatenation: {minority_complexity:.4f}")
print(f"Majority Class Complexity after Concatenation: {majority_complexity:.4f}")


Minority Class Complexity after Concatenation: 0.9174
Majority Class Complexity after Concatenation: 0.9999
