In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import math
import random
from scipy.spatial import distance_matrix
from sklearn.neighbors import NearestNeighbors
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset, SubsetRandomSampler
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, recall_score, roc_curve, precision_recall_curve, roc_auc_score, confusion_matrix
from imblearn.metrics import geometric_mean_score
import matplotlib.pyplot as plt
from sklearn.model_selection import RepeatedStratifiedKFold


In [None]:
# Location of the dataset CSV; later cells expect it to contain a `label` column.
# NOTE(review): '/content/...' is an absolute path — presumably a Colab session;
# confirm, or make the data directory configurable.
data_path = '/content/ecoli-0-6-7_vs_5.csv'
data = pd.read_csv(data_path)

In [None]:
# Seed NumPy's global RNG. The same seed is applied again (together with torch and
# `random`) in the dedicated seeding cell further down.
np.random.seed(42)

In [None]:
data.shape

(220, 7)

In [None]:
data.head()

Unnamed: 0,1,2,3,4,5,6,label
0,49.0,29.0,48.0,56.0,24.0,35.0,0
1,7.0,4.0,48.0,54.0,35.0,44.0,0
2,56.0,4.0,48.0,49.0,37.0,46.0,0
3,59.0,49.0,48.0,52.0,45.0,36.0,0
4,23.0,32.0,48.0,55.0,25.0,35.0,0


In [None]:
# Number of rows per class label; the trailing expression displays the counts.
label_counts = data['label'].value_counts()
label_counts

label
0    200
1     20
Name: count, dtype: int64

In [None]:
# Per-column count of null entries; the trailing expression displays it.
missing_values = data.isnull().sum()
missing_values

1        0
2        0
3        0
4        0
5        0
6        0
label    0
dtype: int64

In [None]:
# The most frequent label is treated as the majority class, the least frequent as
# the minority class.
majority_label = label_counts.idxmax()
minority_label = label_counts.idxmin()

In [None]:
majority_label

0

In [None]:
minority_label

1

In [None]:
# Split the rows by class. `iloc[:, :-1]` drops the last column, so the feature
# arrays are only correct while `label` is the final column of `data`
# (it is in the frame shown by data.head()).
majority_class = data[data['label'] == majority_label].iloc[:, :-1].values
minority_class = data[data['label'] == minority_label].iloc[:, :-1].values
# Label vectors of each class (constant arrays, used below only for their lengths).
labels_majority = data[data['label'] == majority_label]['label'].values
labels_minority = data[data['label'] == minority_label]['label'].values

In [None]:
majority_class[:25]

array([[49., 29., 48., 56., 24., 35.],
       [ 7.,  4., 48., 54., 35., 44.],
       [56.,  4., 48., 49., 37., 46.],
       [59., 49., 48., 52., 45., 36.],
       [23., 32., 48., 55., 25., 35.],
       [67., 39., 48., 36., 38., 46.],
       [29., 28., 48., 44., 23., 34.],
       [21., 34., 48., 51., 28., 39.],
       [ 2., 44., 48., 46., 51., 57.],
       [42.,  4., 48., 56., 18.,  3.],
       [42., 24., 48., 57., 27., 37.],
       [25., 48., 48., 44., 17., 29.],
       [39., 32., 48., 46., 24., 35.],
       [51.,  5., 48., 46., 32., 35.],
       [22., 43., 48., 48., 16., 28.],
       [25.,  4., 48., 46., 44., 52.],
       [34., 45., 48., 38., 24., 35.],
       [44., 27., 48., 55., 52., 58.],
       [23.,  4., 48., 39., 28., 38.],
       [41., 57., 48., 39., 21., 32.],
       [ 4., 45., 48., 38., 22.,  0.],
       [31., 23., 48., 73.,  5., 14.],
       [51., 54., 48., 41., 34., 43.],
       [ 3., 16., 48., 56., 11., 23.],
       [36., 39., 48., 48., 22., 23.]])

In [None]:
minority_class

array([[78., 68., 48., 83.,  4., 29.],
       [63., 69., 48., 65., 41., 28.],
       [67., 88., 48., 73.,  5., 25.],
       [61., 75., 48., 51., 33., 33.],
       [67., 84., 48., 74., 54., 37.],
       [74.,  9., 48., 57., 53., 29.],
       [73., 84., 48., 86., 58., 29.],
       [75., 76., 48., 83., 57.,  3.],
       [77., 57., 48., 88., 53.,  2.],
       [74., 78., 48., 75., 54., 15.],
       [68., 76., 48., 84., 45., 27.],
       [56., 68., 48., 77., 36., 45.],
       [65., 51., 48., 66., 54., 33.],
       [52., 81., 48., 72., 38., 38.],
       [64., 57., 48.,  7., 33., 26.],
       [ 6., 76.,  1., 77., 59., 52.],
       [69., 59., 48., 77., 39., 21.],
       [63., 49., 48., 79., 45., 28.],
       [71., 71., 48., 68., 43., 36.],
       [68., 63., 48., 73.,  4.,  3.]])

In [None]:
# Full feature matrix (every column except `label`) and the matching label vector.
features = data.drop('label', axis=1).values
labels = data['label'].values

In [None]:
# Turn the label vector into a column of shape (n_samples, 1). The target shape is
# passed positionally because the `newshape=` keyword is deprecated in recent NumPy
# releases; the positional form works on old and new versions alike.
labels = np.reshape(labels, (len(data), 1))

In [None]:
features.shape

(220, 6)

In [None]:
# Class sizes, taken from the per-class label vectors built earlier.
majority_count = len(labels_majority)
minority_count = len(labels_minority)

In [None]:
majority_count

200

In [None]:
# Imbalance ratio = majority class size / minority class size.
if minority_count > 0:
    imbalance_ratio = majority_count / minority_count
else:
    imbalance_ratio = 0  # Fallback when there are no minority class instances (avoids dividing by zero)
print(f"Imbalance Ratio: {imbalance_ratio:.4f}")

Imbalance Ratio: 10.0000


In [None]:
def sample_size(N, num_min, Z=1.64, epsilon=0.05):
    """Compute how many samples to draw from a pool of size ``N``.

    The proportion ``p = num_min / N`` is plugged into a finite-population
    sample-size formula: ``x = Z**2 * p * (1 - p) / e**2`` with
    ``e = epsilon + ln(N) / N``, then ``size = N * x / (x + N - 1)``,
    rounded up.

    Args:
        N: Size of the pool being sampled (the notebook passes the majority count).
        num_min: Count used for the proportion (the notebook passes the minority count).
        Z: Critical value used in the formula. Defaults to the previously
            hard-coded 1.64 (presumably a ~90% confidence z-score — confirm).
        epsilon: Base margin of error; ``ln(N) / N`` is added to it.

    Returns:
        int: The sample size. Degenerate proportions are special-cased:
        0 when ``p == 0`` and ``N >= 9``; 1 when ``p == 0`` with ``N < 9`` or
        when ``p == 1``.

    Raises:
        ValueError: If ``N`` is not positive (the proportion is undefined).
    """
    if N <= 0:
        # Previously this surfaced as an unexplained ZeroDivisionError.
        raise ValueError(f"N must be a positive count, got {N}")

    p = num_min / N
    if p == 0 and N >= 9:
        size1 = 0
    elif p == 0 or p == 1:
        size1 = 1
    else:
        # The margin of error widens slightly for small pools via ln(N) / N.
        e = epsilon + np.log(N) / N
        x = (Z**2 * p * (1 - p)) / (e**2)
        # Finite-population correction.
        size1 = (N * x) / (x + N - 1)
    return math.ceil(size1)

In [None]:
# N is the majority count and num_min the minority count, so the proportion used
# inside sample_size is minority_count / majority_count.
calculated_sample_size = sample_size(majority_count, minority_count)
calculated_sample_size

35

In [None]:

def calculate_weights(data, labels, k=5):
    """Score every sample by how strongly its neighbourhood agrees with its label.

    For each row of ``data`` the ``k`` nearest other rows are found. Each
    neighbour contributes ``1 / (distance + 1e-5)``; the score is the share of
    that inverse-distance mass belonging to neighbours with the same label.
    A score of 1 means all neighbours share the sample's label, 0 means none do.

    Args:
        data: 2-D NumPy array, one sample per row.
        labels: Array-like with one label per row (flattened internally).
        k: Number of neighbours to consider per sample.

    Returns:
        np.ndarray of shape ``(n_samples,)`` with values in ``[0, 1]``.
    """
    labels = np.array(labels).flatten()

    # Calling kneighbors() without a query makes scikit-learn return the k nearest
    # *other* fitted points for each sample. The previous approach (ask for k + 1
    # and drop column 0) assumed column 0 is always the sample itself, which is not
    # guaranteed when the data contains duplicated points.
    nbrs = NearestNeighbors(n_neighbors=k, algorithm='ball_tree').fit(data)
    distances, indices = nbrs.kneighbors()

    # The small constant keeps zero-distance neighbours from dividing by zero.
    inverse_distances = 1 / (distances + 1e-5)
    same_class_mask = labels[indices] == labels[:, np.newaxis]

    total_same_class_weight = np.sum(inverse_distances * same_class_mask, axis=1)
    total_different_class_weight = np.sum(inverse_distances * ~same_class_mask, axis=1)
    total_weight = total_same_class_weight + total_different_class_weight

    # Rows with no effective neighbours keep the default score of 0.
    weights = np.zeros(data.shape[0])
    np.divide(total_same_class_weight, total_weight, out=weights, where=total_weight > 0)
    return weights


In [None]:
# Neighbourhood-agreement score for every row of the full dataset (5 neighbours).
weights = calculate_weights(features, labels, k=5)

In [None]:
weights.shape

(220,)

In [None]:
# Split the per-sample scores by class; `labels` is a column vector, hence flatten().
majority_weights = weights[labels.flatten() == majority_label]
minority_weights = weights[labels.flatten() == minority_label]

In [None]:
minority_weights

array([0.81422376, 0.62756528, 0.45767952, 0.16085269, 0.80570897,
       0.        , 1.        , 1.        , 1.        , 0.83496027,
       1.        , 0.81630406, 0.56606049, 1.        , 0.        ,
       0.        , 1.        , 0.84567045, 0.54498283, 0.35887012])

In [None]:
majority_weights

array([1.        , 1.        , 1.        , 0.82481837, 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 0.79731293, 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.     

In [None]:
# Average score per class. Scores come from calculate_weights, where a value near 1
# means a sample's neighbours mostly share its label.
minority_complexity = sum(minority_weights)/len(minority_weights)
majority_complexity = sum(majority_weights)/len(majority_weights)
print(f"Minority Class Complexity: {minority_complexity:.4f}")
print(f"Majority Class Complexity: {majority_complexity:.4f}")

Minority Class Complexity: 0.6416
Majority Class Complexity: 0.9782


In [None]:
# Sanity-check the shapes of the arrays built so far.
print("Data Shape:", data.shape)
print("Labels Shape:", labels.shape)
print("Weights Shape:", weights.shape)
print("Majority weights shape: ", majority_weights.shape)
# This line reports the minority weights; it was previously mislabelled "Majority".
print("Minority weights shape: ", minority_weights.shape)
print("Adjusted Labels Shape:", labels.shape)
print("Minority class shape: ", minority_class.shape)
print("Majority class shape: ", majority_class.shape)

Data Shape: (220, 7)
Labels Shape: (220, 1)
Weights Shape: (220,)
Majority weights shape:  (200,)
Majority weights shape:  (20,)
Adjusted Labels Shape: (220, 1)
Minority class shape:  (20, 6)
Majority class shape:  (200, 6)


In [None]:
import random

# One seed shared by every random number generator the notebook relies on.
random_seed = 42

# Seed NumPy, PyTorch (CPU) and the standard library, in that order.
for seed_rng in (np.random.seed, torch.manual_seed, random.seed):
    seed_rng(random_seed)

# Seed every CUDA device as well when a GPU is present.
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(random_seed)


In [None]:
# Define the MLP model class
class MLP(nn.Module):
    """Perceptron with a single hidden layer that outputs raw class scores.

    The hidden layer has ``2 * (input_size + output_size) // 3`` units and is
    followed by a ReLU; no activation is applied to the output layer.

    Args:
        input_size: Number of input features.
        output_size: Number of output scores (classes).
        device: Device on which both linear layers are placed.
    """

    def __init__(self, input_size, output_size, device):
        super().__init__()
        self.device = device
        hidden_units = 2 * (input_size + output_size) // 3
        # Both layers are moved to the requested device as soon as they are created.
        self.hidden = nn.Linear(input_size, hidden_units).to(device)
        self.output = nn.Linear(hidden_units, output_size).to(device)

    def forward(self, x):
        """Return the output-layer scores for the batch ``x``."""
        activated = F.relu(self.hidden(x))
        return self.output(activated)

# Function to load data into DataLoaders
def load_data(features, labels, batch_size):
    """Wrap feature and label arrays in a shuffling ``DataLoader``.

    Features are converted to float32 tensors and labels to int64 tensors.

    Args:
        features: Array-like of input rows.
        labels: Array-like of integer class labels, one per row.
        batch_size: Number of samples per batch.

    Returns:
        DataLoader over a ``TensorDataset`` of (features, labels), shuffled.
    """
    feature_tensor = torch.tensor(features, dtype=torch.float32)
    label_tensor = torch.tensor(labels, dtype=torch.int64)
    return DataLoader(
        TensorDataset(feature_tensor, label_tensor),
        batch_size=batch_size,
        shuffle=True,
    )

# Function to train the model
def train_model(model, train_loader, criterion, optimizer, device, num_epochs):
    """Train ``model`` and return it restored to its best-scoring epoch.

    After every epoch the model is scored by recall on the *training* loader
    itself (no separate validation data is used here). The parameters of the
    epoch with the highest recall are kept and loaded back before returning.

    Args:
        model: The network to train (modified in place).
        train_loader: Iterable of ``(inputs, labels)`` batches.
        criterion: Loss called as ``criterion(outputs, labels)``.
        optimizer: Optimizer stepping the model parameters.
        device: Device the batches and the model are moved to.
        num_epochs: Number of passes over ``train_loader``.

    Returns:
        The same ``model`` object, on ``device`` and in eval mode.

    Side effects:
        Writes the best parameters to ``best_model.pth`` in the working
        directory whenever an epoch improves on the best recall so far.
    """
    best_recall_score = float('-inf')
    best_model_path = "best_model.pth"
    # The best parameters are kept in memory so the final restore never depends on
    # the checkpoint file. Previously a run with num_epochs == 0 would load a stale
    # file left by an earlier run, or crash if none existed.
    best_state = None

    for epoch in range(num_epochs):
        model.train()
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

        # Score the epoch in eval mode so that train-only layers (dropout,
        # batch norm), if a model has any, do not distort the predictions.
        model.eval()
        all_predictions, all_labels = [], []
        with torch.no_grad():
            for inputs, labels in train_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                _, predicted = torch.max(outputs.data, 1)
                all_predictions.extend(predicted.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())
        fold_recall = recall_score(all_labels, all_predictions)
        if fold_recall > best_recall_score:
            best_recall_score = fold_recall
            # Clone the tensors: state_dict() returns references that later
            # optimizer steps would otherwise overwrite.
            best_state = {name: tensor.detach().clone() for name, tensor in model.state_dict().items()}
            torch.save(best_state, best_model_path)

    if best_state is not None:
        model.load_state_dict(best_state)
    model.to(device).eval()
    return model

# Function to evaluate the model
def evaluate_model(model, test_loader, criterion, device):
    """Evaluate ``model`` on ``test_loader``, print a metric summary and return metrics.

    Predictions are the arg-max over the model outputs; the probability used for
    ROC AUC is the softmax value of column 1, so the model is expected to emit
    two scores per sample.

    Args:
        model: Trained network.
        test_loader: Loader yielding ``(inputs, labels)`` batches; its
            ``dataset`` length is used to average the loss.
        criterion: Loss called as ``criterion(outputs, labels)``.
        device: Device the batches are moved to.

    Returns:
        Tuple ``(auc_roc, recall, gmean, f1)``.
    """
    model.eval()
    accumulated_loss = 0
    all_labels, all_predictions, all_probabilities = [], [], []

    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            outputs = model(inputs)

            # Weight the batch loss by the batch size so the division by the
            # dataset length below yields a per-sample figure (assumes the
            # criterion returns a batch mean — TODO confirm).
            batch_loss = criterion(outputs, labels)
            accumulated_loss += batch_loss.item() * inputs.size(0)

            class_probabilities = torch.softmax(outputs, dim=1)
            predicted = torch.max(outputs, 1)[1]

            all_labels.extend(labels.cpu().numpy())
            all_predictions.extend(predicted.cpu().numpy())
            # Column 1 is taken as the positive-class probability.
            all_probabilities.extend(class_probabilities[:, 1].cpu().numpy())

    test_loss = accumulated_loss / len(test_loader.dataset)

    accuracy = accuracy_score(all_labels, all_predictions)
    recall = recall_score(all_labels, all_predictions)
    f1 = f1_score(all_labels, all_predictions)
    gmean = geometric_mean_score(all_labels, all_predictions)
    auc_roc = roc_auc_score(all_labels, all_probabilities)

    summary = (f'Test Loss: {test_loss:.4f},'
               f' Accuracy: {accuracy:.4f},'
               f' Recall: {recall:.4f},'
               f' F1 Score: {f1:.4f},'
               f' G-Mean: {gmean:.4f},'
               f' ROC AUC: {auc_roc:.4f}')
    print(summary)

    return auc_roc, recall, gmean, f1

# Define GA and related functions
def initialize_population(valid_indices, population_size, individual_size):
    """Create the initial GA population.

    Each individual is an array of ``individual_size`` distinct entries drawn
    from ``valid_indices`` with NumPy's global RNG.

    Returns:
        list of ``population_size`` index arrays.
    """
    population = []
    for _ in range(population_size):
        individual = np.random.choice(valid_indices, size=individual_size, replace=False)
        population.append(individual)
    return population

def _pair_rows(left, right):
    """Return every (left row, right row) pair joined into one row, left-major order."""
    repeated_left = np.repeat(left, len(right), axis=0)
    tiled_right = np.tile(right, (len(left), 1))
    return np.hstack([repeated_left, tiled_right])


def concatenate_samples(P, N, SetN):
    """Build pairwise-concatenated sample matrices.

    Args:
        P: Samples paired with themselves (every row with every row of ``P``).
        N: Samples paired with every row of ``SetN``.
        SetN: Second operand of the ``N`` pairing.

    All inputs are promoted to 2-D with ``np.atleast_2d``, so a single 1-D sample
    is treated as one row.

    Returns:
        Tuple ``(P_cartesian, N_cartesian)``:
        ``P_cartesian`` has ``len(P) ** 2`` rows, ordered so that the first row of
        ``P`` is paired with every row of ``P`` before moving to the second;
        ``N_cartesian`` has ``len(N) * len(SetN)`` rows, in the same left-major
        order. Each result row is the first sample followed by the second.
        Empty inputs yield a ``(0, width)`` array (a list-comprehension build
        would collapse to shape ``(0,)``, which cannot be stacked with other rows).
    """
    P = np.atleast_2d(P)
    N = np.atleast_2d(N)
    SetN = np.atleast_2d(SetN)
    # Vectorised pairing replaces a Python-level double loop; this function runs
    # once per individual per GA generation.
    P_cartesian = _pair_rows(P, P)
    N_cartesian = _pair_rows(N, SetN)
    return P_cartesian, N_cartesian

def calculate_weights(data, labels, k=5):
    """Score every sample by how strongly its neighbourhood agrees with its label.

    NOTE: calculate_weights is also defined in an earlier cell of this notebook;
    this later definition shadows it once this cell runs. Consider keeping a
    single definition.

    For each row of ``data`` the ``k`` nearest other rows are found. Each
    neighbour contributes ``1 / (distance + 1e-5)``; the score is the share of
    that inverse-distance mass belonging to neighbours with the same label.
    A score of 1 means all neighbours share the sample's label, 0 means none do.

    Args:
        data: 2-D NumPy array, one sample per row.
        labels: Array-like with one label per row (flattened internally).
        k: Number of neighbours to consider per sample.

    Returns:
        np.ndarray of shape ``(n_samples,)`` with values in ``[0, 1]``.
    """
    labels = np.array(labels).flatten()

    # Calling kneighbors() without a query makes scikit-learn return the k nearest
    # *other* fitted points for each sample. The previous approach (ask for k + 1
    # and drop column 0) assumed column 0 is always the sample itself, which is not
    # guaranteed when the data contains duplicated points.
    nbrs = NearestNeighbors(n_neighbors=k, algorithm='ball_tree').fit(data)
    distances, indices = nbrs.kneighbors()

    # The small constant keeps zero-distance neighbours from dividing by zero.
    inverse_distances = 1 / (distances + 1e-5)
    same_class_mask = labels[indices] == labels[:, np.newaxis]

    total_same_class_weight = np.sum(inverse_distances * same_class_mask, axis=1)
    total_different_class_weight = np.sum(inverse_distances * ~same_class_mask, axis=1)
    total_weight = total_same_class_weight + total_different_class_weight

    # Rows with no effective neighbours keep the default score of 0.
    weights = np.zeros(data.shape[0])
    np.divide(total_same_class_weight, total_weight, out=weights, where=total_weight > 0)
    return weights

def calculate_class_complexity(P_cartesian, N_cartesian, weights):
    """Average the per-sample weights separately for the two stacked groups.

    ``weights`` is expected to be ordered with the ``len(P_cartesian)`` minority
    entries first and the majority entries after them.

    Returns:
        Tuple ``(minority_complexity, majority_complexity)``; a group that is
        empty (or has no weights) contributes 0.
    """
    split_at = len(P_cartesian)

    def _mean_or_zero(values):
        # Mean of the slice, or 0 when the slice holds nothing.
        return np.mean(values) if len(values) > 0 else 0

    minority_complexity = _mean_or_zero(weights[:split_at]) if split_at > 0 else 0
    majority_complexity = _mean_or_zero(weights[split_at:]) if len(N_cartesian) > 0 else 0
    return minority_complexity, majority_complexity

def calculate_fitness(individual, features, minority_class, majority_class, k=5):
    """Score one GA individual.

    The individual indexes rows of ``features``. Minority samples are paired with
    each other and majority samples with the selected rows (see
    ``concatenate_samples``); the neighbourhood weights of the stacked pairs are
    then averaged per group.

    Returns:
        Tuple ``(fitness_score, minority_complexity, majority_complexity)`` where
        the fitness is the sum of the two group averages.
    """
    selected_rows = features[individual]
    P_cartesian, N_cartesian = concatenate_samples(minority_class, majority_class, selected_rows)

    stacked_pairs = np.vstack((P_cartesian, N_cartesian))
    # The labels only have to tell the two groups apart: calculate_weights compares
    # them for equality and never interprets the values themselves.
    group_labels = np.concatenate([np.zeros(len(P_cartesian)), np.ones(len(N_cartesian))])

    pair_weights = calculate_weights(stacked_pairs, group_labels, k=k)
    minority_complexity, majority_complexity = calculate_class_complexity(
        P_cartesian, N_cartesian, pair_weights
    )
    return minority_complexity + majority_complexity, minority_complexity, majority_complexity

def select_parents(population, fitnesses, num_parents):
    """Pick parents by tournament selection.

    For every parent, up to three distinct individuals are drawn at random
    (NumPy global RNG) and the one with the highest fitness wins.

    Args:
        population: Sequence of individuals.
        fitnesses: Fitness per individual, aligned with ``population``; any
            array-like (list or ndarray) is accepted.
        num_parents: Number of tournaments to run.

    Returns:
        list of ``num_parents`` individuals (references into ``population``;
        the same individual may appear more than once).
    """
    # Fancy indexing below needs an ndarray; a plain list used to raise here.
    fitnesses = np.asarray(fitnesses)
    # A tournament cannot draw more distinct entrants than the population holds.
    tournament_size = min(3, len(population))

    parents = []
    for _ in range(num_parents):
        participants = np.random.choice(len(population), size=tournament_size, replace=False)
        best = participants[np.argmax(fitnesses[participants])]
        parents.append(population[best])
    return parents

def crossover(parent1, parent2):
    """Single-point crossover of two equally long index arrays.

    A cut position between 1 and ``len(parent1) - 1`` is drawn with NumPy's
    global RNG; each child takes the head of one parent and the tail of the other.

    Individuals shorter than two genes cannot be cut, so copies of the parents
    are returned unchanged in that case (``np.random.randint(1, 1)`` would raise).

    Returns:
        Tuple ``(child1, child2)`` of new arrays; the parents are never modified.
    """
    if len(parent1) < 2:
        # Copies, because callers mutate the children in place.
        return np.array(parent1, copy=True), np.array(parent2, copy=True)

    crossover_point = np.random.randint(1, len(parent1))
    child1 = np.concatenate([parent1[:crossover_point], parent2[crossover_point:]])
    child2 = np.concatenate([parent2[:crossover_point], parent1[crossover_point:]])
    return child1, child2

def mutate(individual, mutation_rate, valid_indices):
    """Randomly replace genes of ``individual`` in place.

    Every position is independently replaced, with probability
    ``mutation_rate``, by a random entry of ``valid_indices`` (NumPy global RNG).

    Returns:
        The same ``individual`` object that was passed in.
    """
    for position in range(len(individual)):
        gene_is_hit = np.random.rand() < mutation_rate
        if gene_is_hit:
            individual[position] = np.random.choice(valid_indices)
    return individual

def genetic_algorithm(features, population_size, individual_size, generations, initial_mutation_rate, optimal_fitness, minority_class, majority_class, k=5):
    """Search for the subset of rows of ``features`` with the highest fitness.

    Individuals are arrays of row indices into ``features``. Each generation every
    individual is scored with ``calculate_fitness``, the best individual seen so
    far is carried over unchanged, and the rest of the next generation is produced
    by tournament selection, single-point crossover and mutation.

    Args:
        features: 2-D array of candidate rows (indexed by the individuals).
        population_size: Number of individuals in the initial population.
        individual_size: Number of indices per individual.
        generations: Maximum number of generations.
        initial_mutation_rate: Per-gene mutation probability at generation 0;
            it decays linearly towards 0 over the run.
        optimal_fitness: The search stops early once the best fitness reaches it.
        minority_class, majority_class: Forwarded to ``calculate_fitness``.
        k: Neighbour count forwarded to ``calculate_fitness``.

    Returns:
        Tuple ``(best_samples, best_fitness)``: the rows of ``features`` selected
        by the best individual and that individual's fitness.
    """
    valid_indices = np.arange(features.shape[0])
    population = initialize_population(valid_indices, population_size, individual_size)
    best_overall_fitness = -np.inf
    best_overall_individual = None
    best_minority_complexity = 0
    best_majority_complexity = 0
    for generation in range(generations):
        # Score the whole population; each result is (fitness, minority, majority).
        fitness_results = [calculate_fitness(ind, features, minority_class, majority_class, k=k) for ind in population]
        fitnesses = [result[0] for result in fitness_results]
        minority_complexities = [result[1] for result in fitness_results]
        majority_complexities = [result[2] for result in fitness_results]
        # NaN fitness values are treated as 0 so argmax and comparisons stay defined.
        fitnesses = np.nan_to_num(fitnesses, nan=0.0)
        best_index = np.argmax(fitnesses)
        if fitnesses[best_index] > best_overall_fitness:
            best_overall_fitness = fitnesses[best_index]
            best_overall_individual = population[best_index]
            best_minority_complexity = minority_complexities[best_index]
            best_majority_complexity = majority_complexities[best_index]
        # Early exit: note that this skips the progress print for the final generation.
        if best_overall_fitness >= optimal_fitness:
            break
        # Linear decay of the mutation rate with the generation number.
        mutation_rate = initial_mutation_rate * (1 - generation / generations)
        parents = select_parents(population, fitnesses, len(population) - 1)
        # Elitism: the best individual so far enters the next generation untouched.
        next_population = [best_overall_individual]
        # Parents are consumed in consecutive pairs; an unpaired last parent is dropped.
        # NOTE(review): with an even population size this yields one individual fewer
        # than before (e.g. 16 -> 15), after which the size stays constant.
        # NOTE(review): crossover and mutation can place the same index twice in a
        # child, so later individuals are not guaranteed to hold distinct rows.
        for i in range(0, len(parents), 2):
            if i + 1 < len(parents):
                child1, child2 = crossover(parents[i], parents[i + 1])
                child1 = mutate(child1, mutation_rate, valid_indices)
                child2 = mutate(child2, mutation_rate, valid_indices)
                next_population.extend([child1, child2])
        population = next_population[:population_size]
        print(f"Generation {generation + 1}, Best Fitness: {best_overall_fitness:.4f}, Minority Complexity: {best_minority_complexity:.4f}, Majority Complexity: {best_majority_complexity:.4f}")
    # NOTE(review): if `generations` is 0 the best individual is still None here.
    best_samples = features[best_overall_individual]
    # Re-scores the best individual rather than reusing best_overall_fitness.
    best_fitness = calculate_fitness(best_overall_individual, features, minority_class, majority_class, k=k)[0]
    return best_samples, best_fitness

def concatenate_with_itself(data):
    """Return ``data`` joined horizontally with a second copy of itself.

    For a 2-D input of shape ``(n, d)`` the result has shape ``(n, 2 * d)``:
    every row is followed by a repeat of the same row.
    """
    both_copies = (data, data)
    return np.hstack(both_copies)

def main(data_path, config):
    """Run the full pipeline on one CSV: split, GA undersampling, MLP training, evaluation.

    Args:
        data_path: Path of a CSV with a `label` column (1 is treated as the
            minority class, 0 as the majority class).
        config: Dict read for the keys 'population_size', 'individual_size',
            'generations', 'initial_mutation_rate', 'optimal_fitness',
            'batch_size', 'learning_rate' and 'num_epochs'.

    Returns:
        Tuple ``(auc_roc, recall, gmean, f1)`` as returned by ``evaluate_model``.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    criterion = nn.CrossEntropyLoss()
    data = pd.read_csv(data_path)
    # First stratified split: 80% train / 20% hold-out.
    train_data, test_data = train_test_split(data, test_size=0.2, random_state=42, stratify=data['label'])
    train_features = train_data.drop('label', axis=1).values
    train_labels = train_data['label'].values
    # NOTE(review): test_features / test_labels are never used below — the metrics
    # are computed on X_test from the second split, not on this hold-out set.
    test_features = test_data.drop('label', axis=1).values
    test_labels = test_data['label'].values
    # Second stratified split of the training part; X_test is what gets evaluated.
    X_train, X_test, y_train, y_test = train_test_split(train_features, train_labels, test_size=0.2, random_state=42, stratify=train_labels)
    minority_class_train = X_train[y_train == 1]
    majority_class_train = X_train[y_train == 0]
    # The GA picks rows of the majority training samples; k keeps its default.
    best_samples, _ = genetic_algorithm(
        majority_class_train, config['population_size'], config['individual_size'], config['generations'],
        config['initial_mutation_rate'], config['optimal_fitness'], minority_class_train, majority_class_train
    )
    # Majority training rows: every selected sample paired with every selected sample.
    # NOTE(review): the GA fitness pairs *all* majority samples with the selected
    # ones, and minority samples with each other, whereas training uses
    # selected x selected and minority-with-itself — confirm this is intended.
    X_train_majority_selected = np.array([np.concatenate([x, y]) for x in best_samples for y in best_samples])
    X_train_minority_selected = concatenate_with_itself(minority_class_train)
    X_train_selected = np.vstack((X_train_minority_selected, X_train_majority_selected))
    y_train_selected = np.hstack((np.ones(len(X_train_minority_selected)), np.zeros(len(X_train_majority_selected))))
    # Evaluation rows are all same-class pairs, built separately per true class.
    # NOTE(review): the pairing itself uses the true labels of the evaluation data.
    X_test_majority = X_test[y_test == 0]
    X_test_minority = X_test[y_test == 1]
    X_test_majority_concatenated = np.array([np.concatenate([x, y]) for x in X_test_majority for y in X_test_majority])
    X_test_minority_concatenated = np.array([np.concatenate([x, y]) for x in X_test_minority for y in X_test_minority])
    X_test_concatenated = np.vstack((X_test_minority_concatenated, X_test_majority_concatenated))
    y_test_concatenated = np.hstack((np.ones(len(X_test_minority_concatenated)), np.zeros(len(X_test_majority_concatenated))))
    train_dataset = TensorDataset(torch.tensor(X_train_selected, dtype=torch.float32), torch.tensor(y_train_selected, dtype=torch.int64))
    test_dataset = TensorDataset(torch.tensor(X_test_concatenated, dtype=torch.float32), torch.tensor(y_test_concatenated, dtype=torch.int64))
    train_loader = DataLoader(train_dataset, batch_size=config['batch_size'], shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=config['batch_size'], shuffle=False)
    print(f"X_train_selected shape: {X_train_selected.shape}")
    print(f"y_train_selected shape: {y_train_selected.shape}")
    print(f"X_test_concatenated shape: {X_test_concatenated.shape}")
    print(f"y_test_concatenated shape: {y_test_concatenated.shape}")
    # Input width is twice the feature count because every row is a pair; 2 output scores.
    model = MLP(X_train_selected.shape[1], 2, device)
    optimizer = torch.optim.Adam(model.parameters(), lr=config['learning_rate'])
    model = train_model(model, train_loader, criterion, optimizer, device, config['num_epochs'])
    auc_roc, recall, gmean, f1 = evaluate_model(model, test_loader, criterion, device)
    return auc_roc, recall, gmean, f1

# Hyper-parameters for the GA and the MLP training run.
# NOTE(review): 'individual_size' reuses calculated_sample_size, which was derived
# from the class counts of the ecoli CSV loaded at the top of the notebook, while
# the run below uses a different file (glass0123vs456.csv) — confirm which dataset
# is intended and recompute the sample size for it.
# NOTE(review): 'k_folds' and 'random_seed' are not read by main().
config = {
    'population_size': 15,
    'individual_size': calculated_sample_size,
    'generations': 20,
    'initial_mutation_rate': 0.01,
    'optimal_fitness': 2 * 0.999,  # fitness is the sum of two averages, each at most 1
    'k_folds': 5,
    'batch_size': 32,
    'num_epochs': 50,
    'learning_rate': 0.001,
    'random_seed': 42
}

# Rebinds the module-level data_path set in the first data-loading cell.
data_path = '/content/glass0123vs456.csv'

auc_roc, recall, gmean, f1 = main(data_path, config)
print(f"AUC: {auc_roc:.4f}, Recall: {recall:.4f}, G-mean: {gmean:.4f}, F1 Score: {f1:.4f}")


Generation 1, Best Fitness: 1.9951, Minority Complexity: 0.9957, Majority Complexity: 0.9994
Generation 2, Best Fitness: 1.9961, Minority Complexity: 0.9961, Majority Complexity: 1.0000
Generation 3, Best Fitness: 1.9961, Minority Complexity: 0.9961, Majority Complexity: 1.0000
Generation 4, Best Fitness: 1.9962, Minority Complexity: 0.9962, Majority Complexity: 1.0000
Generation 5, Best Fitness: 1.9962, Minority Complexity: 0.9962, Majority Complexity: 1.0000
Generation 6, Best Fitness: 1.9962, Minority Complexity: 0.9962, Majority Complexity: 1.0000
Generation 7, Best Fitness: 1.9962, Minority Complexity: 0.9962, Majority Complexity: 1.0000
Generation 8, Best Fitness: 1.9962, Minority Complexity: 0.9962, Majority Complexity: 1.0000
Generation 9, Best Fitness: 1.9962, Minority Complexity: 0.9962, Majority Complexity: 1.0000
Generation 10, Best Fitness: 1.9962, Minority Complexity: 0.9962, Majority Complexity: 1.0000
Generation 11, Best Fitness: 1.9962, Minority Complexity: 0.9962, Maj