In [199]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import math
import random
from scipy.spatial import distance_matrix
from sklearn.neighbors import NearestNeighbors
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset, SubsetRandomSampler
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, recall_score, roc_curve, precision_recall_curve, roc_auc_score, confusion_matrix
from imblearn.metrics import geometric_mean_score
import matplotlib.pyplot as plt
from sklearn.model_selection import RepeatedStratifiedKFold


In [200]:
data_path = '/content/haberman.csv'
data = pd.read_csv(data_path)

In [201]:
data.shape

(306, 4)

In [202]:
data.head()

Unnamed: 0,1,2,3,label
0,30,64,1,0
1,30,62,3,0
2,30,65,0,0
3,31,59,2,0
4,31,65,4,0


In [203]:
label_counts = data['label'].value_counts()
label_counts

label
0    225
1     81
Name: count, dtype: int64

In [204]:
missing_values = data.isnull().sum()
missing_values

1        0
2        0
3        0
label    0
dtype: int64

In [205]:
majority_label = label_counts.idxmax()
minority_label = label_counts.idxmin()

In [206]:
majority_label

0

In [207]:
minority_label

1

In [208]:
majority_class = data[data['label'] == majority_label].iloc[:, :-1].values
minority_class = data[data['label'] == minority_label].iloc[:, :-1].values
labels_majority = data[data['label'] == majority_label]['label'].values
labels_minority = data[data['label'] == minority_label]['label'].values

In [209]:
majority_class[:25]

array([[30, 64,  1],
       [30, 62,  3],
       [30, 65,  0],
       [31, 59,  2],
       [31, 65,  4],
       [33, 58, 10],
       [33, 60,  0],
       [34, 58, 30],
       [34, 60,  1],
       [34, 61, 10],
       [34, 67,  7],
       [34, 60,  0],
       [35, 64, 13],
       [35, 63,  0],
       [36, 60,  1],
       [36, 69,  0],
       [37, 60,  0],
       [37, 63,  0],
       [37, 58,  0],
       [37, 59,  6],
       [37, 60, 15],
       [37, 63,  0],
       [38, 59,  2],
       [38, 60,  0],
       [38, 60,  0]])

In [210]:
minority_class

array([[34, 59,  0],
       [34, 66,  9],
       [38, 69, 21],
       [39, 66,  0],
       [41, 60, 23],
       [41, 64,  0],
       [41, 67,  0],
       [42, 69,  1],
       [42, 59,  0],
       [43, 58, 52],
       [43, 59,  2],
       [43, 64,  0],
       [43, 64,  0],
       [44, 64,  6],
       [44, 58,  9],
       [44, 63, 19],
       [45, 65,  6],
       [45, 66,  0],
       [45, 67,  1],
       [46, 58,  2],
       [46, 69,  3],
       [46, 62,  5],
       [46, 65, 20],
       [47, 63, 23],
       [47, 62,  0],
       [47, 65,  0],
       [48, 58, 11],
       [48, 58, 11],
       [48, 67,  7],
       [49, 63,  0],
       [49, 64, 10],
       [50, 63, 13],
       [50, 64,  0],
       [51, 59, 13],
       [51, 59,  3],
       [52, 69,  3],
       [52, 59,  2],
       [52, 62,  3],
       [52, 66,  4],
       [53, 58,  4],
       [53, 65,  1],
       [53, 59,  3],
       [53, 60,  9],
       [53, 63, 24],
       [53, 65, 12],
       [54, 60, 11],
       [54, 65, 23],
       [54, 6

In [211]:
features = data.drop('label', axis=1).values
labels = data['label'].values

In [212]:
labels=np.reshape(labels, newshape=(len(data),1))

In [213]:
features.shape

(306, 3)

In [214]:
majority_count = len(labels_majority)
minority_count = len(labels_minority)

In [215]:
majority_count

225

In [216]:
if minority_count > 0:
    imbalance_ratio = majority_count / minority_count
else:
    imbalance_ratio = 0  # To handle cases with no majority class instances
print(f"Imbalance Ratio: {imbalance_ratio:.4f}")

Imbalance Ratio: 2.7778


In [217]:
def sample_size(A, num_min):
    p = num_min / A
    if p == 0 and A >= 9:
        size1 = 0
    elif p == 0 or p == 1:
        size1 = 1
    else:
        Z = 1.64
        epsilon = 0.05
        e = epsilon + np.log(A) / A
        x = (Z**2 * p * (1-p)) / (e**2)
        size1 = (A * x) / (x + A - 1)
    return math.ceil(size1)

In [218]:
calculated_sample_size = sample_size(majority_count, minority_count)
calculated_sample_size

76

In [219]:

def calculate_weights(data, labels, k=5):
    labels = np.array(labels).flatten()
    nbrs = NearestNeighbors(n_neighbors=k+1, algorithm='ball_tree').fit(data)
    distances, indices = nbrs.kneighbors(data)

    weights = np.zeros(data.shape[0])
    for i in range(data.shape[0]):
        distances_i = distances[i, 1:]  # ignore the first one because it's the distance to itself
        indices_i = indices[i, 1:]
        inverse_distances = 1 / (distances_i + 1e-5)

        same_class_mask = (labels[indices_i] == labels[i])
        different_class_mask = ~same_class_mask

        same_class_weights = inverse_distances * same_class_mask
        different_class_weights = inverse_distances * different_class_mask

        total_same_class_weight = np.sum(same_class_weights)
        total_different_class_weight = np.sum(different_class_weights)
        total_weight = total_same_class_weight + total_different_class_weight

        if total_weight > 0:
            complexity_score = total_same_class_weight / total_weight
        else:
            complexity_score = 0  # Handle cases with no effective neighbors

        weights[i] = complexity_score

    return weights


In [220]:
weights = calculate_weights(features, labels, k=5)

In [221]:
weights.shape

(306,)

In [222]:
majority_weights = weights[labels.flatten() == majority_label]
minority_weights = weights[labels.flatten() == minority_label]

In [223]:
minority_weights

array([0.        , 0.        , 0.57102648, 0.12417353, 0.54319856,
       0.99996586, 0.        , 0.44759984, 0.        , 0.50764853,
       0.15528144, 0.99997138, 0.99997138, 0.51860421, 0.39322704,
       0.398695  , 0.51387521, 0.20710683, 0.99997846, 0.11755716,
       0.18387879, 0.3345176 , 0.52335907, 0.8220282 , 0.12237986,
       0.13304315, 0.99999535, 0.99999535, 0.20835625, 0.18469917,
       0.38918181, 0.81293854, 0.99998086, 1.        , 0.40350357,
       0.20188401, 0.41523949, 0.        , 0.39779041, 0.64042073,
       0.        , 0.41523949, 0.43052398, 0.53902451, 0.82652233,
       0.82201316, 0.45567319, 0.81066124, 0.38157007, 0.64941895,
       0.54489507, 0.43346403, 0.25890901, 0.20933658, 0.62505379,
       0.        , 0.34101377, 0.83978555, 0.25052796, 0.13368864,
       0.24264032, 0.        , 0.63808689, 0.        , 0.56373159,
       0.        , 0.99997293, 0.197932  , 0.33466314, 0.60327036,
       0.999975  , 0.61882992, 0.45176892, 0.        , 0.45767

In [224]:
majority_weights

array([1.00000000e+00, 1.00000000e+00, 1.00000000e+00, 8.12723655e-01,
       8.62329051e-01, 8.64036053e-01, 7.69204017e-01, 3.76384181e-01,
       7.81111355e-01, 8.21141139e-01, 6.29536107e-01, 7.35488336e-01,
       7.90524647e-01, 1.00000000e+00, 1.00000000e+00, 6.36309746e-01,
       1.00000000e+00, 1.00000000e+00, 1.00000000e+00, 1.00000000e+00,
       1.00000000e+00, 1.00000000e+00, 1.00000000e+00, 1.00000000e+00,
       1.00000000e+00, 1.00000000e+00, 1.00000000e+00, 6.57272614e-01,
       7.75735964e-01, 1.00000000e+00, 1.00000000e+00, 8.19071970e-01,
       5.01309575e-01, 1.00000000e+00, 1.00000000e+00, 1.00000000e+00,
       1.00000000e+00, 1.00000000e+00, 6.56853762e-01, 8.31925330e-01,
       7.19368350e-01, 7.49472038e-01, 3.41406700e-05, 6.42603865e-01,
       9.99990000e-01, 9.99990000e-01, 7.26351833e-01, 6.07921543e-01,
       5.16369025e-01, 7.96494733e-01, 1.34427196e-01, 8.39811061e-01,
       4.25809109e-01, 8.01070514e-01, 8.63189477e-01, 1.00000000e+00,
      

In [225]:
minority_complexity = sum(minority_weights)/len(minority_weights)
majority_complexity = sum(majority_weights)/len(majority_weights)
print(f"Minority Class Complexity: {minority_complexity:.4f}")
print(f"Majority Class Complexity: {majority_complexity:.4f}")

Minority Class Complexity: 0.4227
Majority Class Complexity: 0.7805


In [226]:
print("Data Shape:", data.shape)
print("Labels Shape:", labels.shape)
print("Weights Shape:", weights.shape)
print("Majority weights shape: ", majority_weights.shape)
print("Majority weights shape: ", minority_weights.shape)
print("Adjusted Labels Shape:", labels.shape)
print("Minority class shape: ", minority_class.shape)
print("Majority class shape: ", majority_class.shape)

Data Shape: (306, 4)
Labels Shape: (306, 1)
Weights Shape: (306,)
Majority weights shape:  (225,)
Majority weights shape:  (81,)
Adjusted Labels Shape: (306, 1)
Minority class shape:  (81, 3)
Majority class shape:  (225, 3)


In [227]:
# Setting random seeds for reproducibility
random_seed = 42
np.random.seed(random_seed)
torch.manual_seed(random_seed)
random.seed(random_seed)

# For CUDA
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(random_seed)


In [228]:
# Define the MLP model class
class MLP(nn.Module):
    def __init__(self, input_size, output_size, device):
        super(MLP, self).__init__()
        h = 2 * (input_size + output_size) // 3
        self.device = device
        self.hidden = nn.Linear(input_size, h).to(device)
        self.output = nn.Linear(h, output_size).to(device)

    def forward(self, x):
        x = F.relu(self.hidden(x))
        x = self.output(x)
        return x

In [229]:
# Function to load data into DataLoaders
def load_data(features, labels, batch_size):
    dataset = TensorDataset(torch.tensor(features, dtype=torch.float32), torch.tensor(labels, dtype=torch.int64))
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    return loader

In [230]:
# Function to train the model
def train_model(model, train_loader, criterion, optimizer, device, num_epochs):
    best_recall_score = float('-inf')
    best_model_path = "best_fitnes.pth"

    for epoch in range(num_epochs):
        model.train()
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

        # Evaluate on the training set for early stopping (optional)
        all_predictions, all_labels = [], []
        with torch.no_grad():
            for inputs, labels in train_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                _, predicted = torch.max(outputs.data, 1)
                all_predictions.extend(predicted.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())
        fold_recall = recall_score(all_labels, all_predictions)
        if fold_recall > best_recall_score:
            best_recall_score = fold_recall
            torch.save(model.state_dict(), best_model_path)

    model.load_state_dict(torch.load(best_model_path))
    model.to(device).eval()
    return model


In [231]:
# Function to evaluate the model
def evaluate_model(model, test_loader, criterion, device):
    test_loss = 0
    all_labels, all_predictions, all_probabilities = [], [], []

    model.eval()
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            test_loss += loss.item() * inputs.size(0)
            probabilities = torch.softmax(outputs, dim=1)
            _, predicted = torch.max(outputs, 1)
            all_labels.extend(labels.cpu().numpy())
            all_predictions.extend(predicted.cpu().numpy())
            all_probabilities.extend(probabilities[:, 1].cpu().numpy())  # Assuming binary classification

    test_loss /= len(test_loader.dataset)
    accuracy = accuracy_score(all_labels, all_predictions)
    recall = recall_score(all_labels, all_predictions)
    f1 = f1_score(all_labels, all_predictions)
    gmean = geometric_mean_score(all_labels, all_predictions)
    auc_roc = roc_auc_score(all_labels, all_probabilities)

    print(f'Test Loss: {test_loss:.4f},'
          f' Accuracy: {accuracy:.4f},'
          f' Recall: {recall:.4f},'
          f' F1 Score: {f1:.4f},'
          f' G-Mean: {gmean:.4f},'
          f' ROC AUC: {auc_roc:.4f}')

    return auc_roc, recall, gmean, f1

In [232]:
# Define GA and related functions
def initialize_population(valid_indices, population_size, individual_size):
    return [np.random.choice(valid_indices, size=individual_size, replace=False) for _ in range(population_size)]

def concatenate_samples(B, A, SetA):
    P = np.atleast_2d(B)
    N = np.atleast_2d(A)
    SetA = np.atleast_2d(SetA)
    P_cartesian = np.array([np.concatenate([p1, p2]) for p1 in P for p2 in B])
    N_cartesian = np.array([np.concatenate([n, sn]) for n in N for sn in SetA])
    return P_cartesian, N_cartesian

def calculate_weights(data, labels, k=5):
    labels = np.array(labels).flatten()
    nbrs = NearestNeighbors(n_neighbors=k+1, algorithm='ball_tree').fit(data)
    distances, indices = nbrs.kneighbors(data)
    weights = np.zeros(data.shape[0])
    for i in range(data.shape[0]):
        distances_i = distances[i, 1:]
        indices_i = indices[i, 1:]
        inverse_distances = 1 / (distances_i + 1e-5)
        same_class_mask = (labels[indices_i] == labels[i])
        different_class_mask = ~same_class_mask
        same_class_weights = inverse_distances * same_class_mask
        different_class_weights = inverse_distances * different_class_mask
        total_same_class_weight = np.sum(same_class_weights)
        total_different_class_weight = np.sum(different_class_weights)
        total_weight = total_same_class_weight + total_different_class_weight
        if total_weight > 0:
            complexity_score = total_same_class_weight / total_weight
        else:
            complexity_score = 0
        weights[i] = complexity_score
    return weights

def calculate_class_complexity(B_cartesian, A_cartesian, weights):
    if len(B_cartesian) > 0:
        minority_weights_new = weights[:len(B_cartesian)]
        minority_complexity = np.mean(minority_weights_new) if len(minority_weights_new) > 0 else 0
    else:
        minority_complexity = 0
    if len(A_cartesian) > 0:
        majority_weights_new = weights[len(A_cartesian):]
        majority_complexity = np.mean(majority_weights_new) if len(majority_weights_new) > 0 else 0
    else:
        majority_complexity = 0
    return minority_complexity, majority_complexity

def calculate_fitness(individual, features, minority_class, majority_class, k=5):
    best_samples = features[individual]
    B_cartesian, A_cartesian = concatenate_samples(minority_class, majority_class, best_samples)
    new_data = np.vstack((B_cartesian, A_cartesian))
    new_labels = np.concatenate([np.zeros(len(B_cartesian)), np.ones(len(A_cartesian))])
    new_weights = calculate_weights(new_data, new_labels, k=k)
    minority_complexity, majority_complexity = calculate_class_complexity(B_cartesian, A_cartesian, new_weights)
    fitness_score = minority_complexity + majority_complexity
    return fitness_score, minority_complexity, majority_complexity

def select_parents(population, fitnesses, num_parents):
    parents = []
    for _ in range(num_parents):
        participants = np.random.choice(len(population), size=3, replace=False)
        best = participants[np.argmax(fitnesses[participants])]
        parents.append(population[best])
    return parents

def crossover(parent1, parent2):
    crossover_point = np.random.randint(1, len(parent1))
    child1 = np.concatenate([parent1[:crossover_point], parent2[crossover_point:]])
    child2 = np.concatenate([parent2[:crossover_point], parent1[crossover_point:]])
    return child1, child2

def mutate(individual, mutation_rate, valid_indices):
    for i in range(len(individual)):
        if np.random.rand() < mutation_rate:
            individual[i] = np.random.choice(valid_indices)
    return individual

def genetic_algorithm(features, population_size, individual_size, generations, initial_mutation_rate, optimal_fitness, minority_class, majority_class, k=5):
    valid_indices = np.arange(features.shape[0])
    population = initialize_population(valid_indices, population_size, individual_size)
    best_overall_fitness = -np.inf
    best_overall_individual = None
    best_minority_complexity = 0
    best_majority_complexity = 0
    for generation in range(generations):
        fitness_results = [calculate_fitness(ind, features, minority_class, majority_class, k=k) for ind in population]
        fitnesses = [result[0] for result in fitness_results]
        minority_complexities = [result[1] for result in fitness_results]
        majority_complexities = [result[2] for result in fitness_results]
        fitnesses = np.nan_to_num(fitnesses, nan=0.0)
        best_index = np.argmax(fitnesses)
        if fitnesses[best_index] > best_overall_fitness:
            best_overall_fitness = fitnesses[best_index]
            best_overall_individual = population[best_index]
            best_minority_complexity = minority_complexities[best_index]
            best_majority_complexity = majority_complexities[best_index]
        if best_overall_fitness >= optimal_fitness:
            break
        mutation_rate = initial_mutation_rate * (1 - generation / generations)
        parents = select_parents(population, fitnesses, len(population) - 1)
        next_population = [best_overall_individual]
        for i in range(0, len(parents), 2):
            if i + 1 < len(parents):
                child1, child2 = crossover(parents[i], parents[i + 1])
                child1 = mutate(child1, mutation_rate, valid_indices)
                child2 = mutate(child2, mutation_rate, valid_indices)
                next_population.extend([child1, child2])
        population = next_population[:population_size]
        print(f"Generation {generation + 1}, Best Fitness: {best_overall_fitness:.4f}, Minority Complexity: {best_minority_complexity:.4f}, Majority Complexity: {best_majority_complexity:.4f}")
    best_samples = features[best_overall_individual]
    best_fitness = calculate_fitness(best_overall_individual, features, minority_class, majority_class, k=k)[0]
    return best_samples, best_fitness

def concatenate_with_itself(data):
    """Concatenate each sample with itself."""
    return np.hstack([data, data])

In [233]:
def main(data_path, config):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    criterion = nn.CrossEntropyLoss()
    data = pd.read_csv(data_path)
    train_data, test_data = train_test_split(data, test_size=0.2, random_state=42, stratify=data['label'])
    train_features = train_data.drop('label', axis=1).values
    train_labels = train_data['label'].values
    test_features = test_data.drop('label', axis=1).values
    test_labels = test_data['label'].values
    X_train, X_test, y_train, y_test = train_test_split(train_features, train_labels, test_size=0.2, random_state=42, stratify=train_labels)
    minority_class_train = X_train[y_train == 1]
    majority_class_train = X_train[y_train == 0]
    best_samples, _ = genetic_algorithm(
        majority_class_train, config['population_size'], config['individual_size'], config['generations'],
        config['initial_mutation_rate'], config['optimal_fitness'], minority_class_train, majority_class_train
    )
    X_train_majority_selected = np.array([np.concatenate([x, y]) for x in best_samples for y in best_samples])
    X_train_minority_selected = concatenate_with_itself(minority_class_train)
    X_train_selected = np.vstack((X_train_minority_selected, X_train_majority_selected))
    y_train_selected = np.hstack((np.ones(len(X_train_minority_selected)), np.zeros(len(X_train_majority_selected))))
    X_test_majority = X_test[y_test == 0]
    X_test_minority = X_test[y_test == 1]
    X_test_majority_concatenated = np.array([np.concatenate([x, y]) for x in X_test_majority for y in X_test_majority])
    X_test_minority_concatenated = np.array([np.concatenate([x, y]) for x in X_test_minority for y in X_test_minority])
    X_test_concatenated = np.vstack((X_test_minority_concatenated, X_test_majority_concatenated))
    y_test_concatenated = np.hstack((np.ones(len(X_test_minority_concatenated)), np.zeros(len(X_test_majority_concatenated))))
    train_dataset = TensorDataset(torch.tensor(X_train_selected, dtype=torch.float32), torch.tensor(y_train_selected, dtype=torch.int64))
    test_dataset = TensorDataset(torch.tensor(X_test_concatenated, dtype=torch.float32), torch.tensor(y_test_concatenated, dtype=torch.int64))
    train_loader = DataLoader(train_dataset, batch_size=config['batch_size'], shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=config['batch_size'], shuffle=False)
    print(f"X_train_selected shape: {X_train_selected.shape}")
    print(f"y_train_selected shape: {y_train_selected.shape}")
    print(f"X_test_concatenated shape: {X_test_concatenated.shape}")
    print(f"y_test_concatenated shape: {y_test_concatenated.shape}")
    model = MLP(X_train_selected.shape[1], 2, device)
    optimizer = torch.optim.Adam(model.parameters(), lr=config['learning_rate'])
    model = train_model(model, train_loader, criterion, optimizer, device, config['num_epochs'])
    auc_roc, recall, gmean, f1 = evaluate_model(model, test_loader, criterion, device)
    return auc_roc, recall, gmean, f1

config = {
    'population_size': 10,
    'individual_size': calculated_sample_size,
    'generations': 500,
    'initial_mutation_rate': 0.01,
    'optimal_fitness': 2 * 0.999,
    'k_folds': 5,
    'batch_size': 32,
    'num_epochs': 50,
    'learning_rate': 0.001,
    'random_seed': 42
}

auc_roc, recall, gmean, f1 = main(data_path, config)
print(f"AUC: {auc_roc:.4f}, Recall: {recall:.4f}, G-mean: {gmean:.4f}, F1 Score: {f1:.4f}")


Generation 1, Best Fitness: 1.8388, Minority Complexity: 0.8547, Majority Complexity: 0.9841
Generation 2, Best Fitness: 1.8557, Minority Complexity: 0.8660, Majority Complexity: 0.9896
Generation 3, Best Fitness: 1.8579, Minority Complexity: 0.8686, Majority Complexity: 0.9893
Generation 4, Best Fitness: 1.8666, Minority Complexity: 0.8762, Majority Complexity: 0.9904
Generation 5, Best Fitness: 1.8666, Minority Complexity: 0.8762, Majority Complexity: 0.9904
Generation 6, Best Fitness: 1.8668, Minority Complexity: 0.8765, Majority Complexity: 0.9904
Generation 7, Best Fitness: 1.8701, Minority Complexity: 0.8792, Majority Complexity: 0.9909
Generation 8, Best Fitness: 1.8701, Minority Complexity: 0.8792, Majority Complexity: 0.9909
Generation 9, Best Fitness: 1.8701, Minority Complexity: 0.8792, Majority Complexity: 0.9909
Generation 10, Best Fitness: 1.8701, Minority Complexity: 0.8799, Majority Complexity: 0.9903
Generation 11, Best Fitness: 1.8708, Minority Complexity: 0.8799, Maj