In [None]:
import pandas as pd
import numpy as np
import math
from scipy.spatial import distance_matrix
import random
from sklearn.neighbors import NearestNeighbors


In [None]:
# Read the CSV from a path relative to the notebook's working directory.
data_path = 'content/parkinsons.csv'
data = pd.read_csv(data_path)


In [None]:
data.shape

(195, 23)

In [None]:
data.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,14,15,16,17,18,19,20,21,22,label
0,119.992,157.302,74.997,0.00784,7e-05,0.0037,0.00554,0.01109,0.04374,0.426,...,0.06545,0.02211,21.033,0.414783,0.815285,-4.813031,0.266482,2.301442,0.284654,0
1,122.4,148.65,113.819,0.00968,8e-05,0.00465,0.00696,0.01394,0.06134,0.626,...,0.09403,0.01929,19.085,0.458359,0.819521,-4.075192,0.33559,2.486855,0.368674,0
2,116.682,131.111,111.555,0.0105,9e-05,0.00544,0.00781,0.01633,0.05233,0.482,...,0.0827,0.01309,20.651,0.429895,0.825288,-4.443179,0.311173,2.342259,0.332634,0
3,116.676,137.871,111.366,0.00997,9e-05,0.00502,0.00698,0.01505,0.05492,0.517,...,0.08771,0.01353,20.644,0.434969,0.819235,-4.117501,0.334147,2.405554,0.368975,0
4,116.014,141.781,110.655,0.01284,0.00011,0.00655,0.00908,0.01966,0.06425,0.584,...,0.1047,0.01767,19.649,0.417356,0.823484,-3.747787,0.234513,2.33218,0.410335,0


In [None]:
# Number of rows per value of the 'label' column; used below to decide which
# label is the majority and which is the minority class.
label_counts = data['label'].value_counts()
label_counts

label
0    147
1     48
Name: count, dtype: int64

In [None]:
# Per-column count of null entries.
missing_values = data.isnull().sum()
missing_values

1        0
2        0
3        0
4        0
5        0
6        0
7        0
8        0
9        0
10       0
11       0
12       0
13       0
14       0
15       0
16       0
17       0
18       0
19       0
20       0
21       0
22       0
label    0
dtype: int64

In [None]:
# idxmax / idxmin give the label values with the most / fewest rows.
majority_label = label_counts.idxmax()
minority_label = label_counts.idxmin()

In [None]:
majority_label

0

In [None]:
minority_label

1

In [None]:
# Filter the frame once per class, then take features and labels from each part.
# Features are every column except the last one.
majority_rows = data.loc[data['label'] == majority_label]
minority_rows = data.loc[data['label'] == minority_label]

majority_class = majority_rows.iloc[:, :-1].values
minority_class = minority_rows.iloc[:, :-1].values
labels_majority = majority_rows['label'].values
labels_minority = minority_rows['label'].values

In [None]:
majority_class[:25]

array([[ 1.199920e+02,  1.573020e+02,  7.499700e+01,  7.840000e-03,
         7.000000e-05,  3.700000e-03,  5.540000e-03,  1.109000e-02,
         4.374000e-02,  4.260000e-01,  2.182000e-02,  3.130000e-02,
         2.971000e-02,  6.545000e-02,  2.211000e-02,  2.103300e+01,
         4.147830e-01,  8.152850e-01, -4.813031e+00,  2.664820e-01,
         2.301442e+00,  2.846540e-01],
       [ 1.224000e+02,  1.486500e+02,  1.138190e+02,  9.680000e-03,
         8.000000e-05,  4.650000e-03,  6.960000e-03,  1.394000e-02,
         6.134000e-02,  6.260000e-01,  3.134000e-02,  4.518000e-02,
         4.368000e-02,  9.403000e-02,  1.929000e-02,  1.908500e+01,
         4.583590e-01,  8.195210e-01, -4.075192e+00,  3.355900e-01,
         2.486855e+00,  3.686740e-01],
       [ 1.166820e+02,  1.311110e+02,  1.115550e+02,  1.050000e-02,
         9.000000e-05,  5.440000e-03,  7.810000e-03,  1.633000e-02,
         5.233000e-02,  4.820000e-01,  2.757000e-02,  3.858000e-02,
         3.590000e-02,  8.270000e-02, 

In [None]:
minority_class

array([[1.970760e+02, 2.068960e+02, 1.920550e+02, ..., 1.775510e-01,
        1.743867e+00, 8.556900e-02],
       [1.992280e+02, 2.095120e+02, 1.920910e+02, ..., 1.733190e-01,
        2.103106e+00, 6.850100e-02],
       [1.983830e+02, 2.152030e+02, 1.931040e+02, ..., 1.751810e-01,
        1.512275e+00, 9.632000e-02],
       ...,
       [1.746880e+02, 2.400050e+02, 7.428700e+01, ..., 1.584530e-01,
        2.679772e+00, 1.317280e-01],
       [1.987640e+02, 3.969610e+02, 7.490400e+01, ..., 2.074540e-01,
        2.138608e+00, 1.233060e-01],
       [2.142890e+02, 2.602770e+02, 7.797300e+01, ..., 1.906670e-01,
        2.555477e+00, 1.485690e-01]])

In [None]:
# Feature matrix (all columns but 'label') and label vector for the whole dataset.
features = data.drop('label', axis=1).values  # Assuming 'label' is your class label column
labels = data['label'].values

In [None]:
# Turn the labels into a column vector of shape (len(data), 1). The shape is passed
# positionally because the `newshape=` keyword is deprecated in recent NumPy.
labels = np.reshape(labels, (len(data), 1))

In [None]:
features.shape

(195, 22)

In [None]:
# Class sizes, taken from the per-class label arrays built earlier.
majority_count = len(labels_majority)
minority_count = len(labels_minority)

In [None]:
majority_count

147

In [None]:
# Imbalance ratio = majority count / minority count.
if minority_count > 0:
    imbalance_ratio = majority_count / minority_count
else:
    imbalance_ratio = 0  # Avoids dividing by zero when there are no minority class instances
print(f"Imbalance Ratio: {imbalance_ratio:.4f}")

Imbalance Ratio: 3.0625


In [None]:
def sample_size(N, num_min):
    """Return the number of samples to draw from a population of size N.

    Args:
        N: population size (must be non-zero, it is used as a divisor).
        num_min: count used to form the proportion ``p = num_min / N``.

    Returns:
        int: 0 when p is 0 and N >= 9; 1 when p is 0 with a smaller N or when
        p is 1; otherwise ``N * x / (x + N - 1)`` rounded up, where
        ``x = Z^2 * p * (1 - p) / e^2`` with ``Z = 1.64`` and
        ``e = 0.05 + ln(N) / N``.
    """
    proportion = num_min / N

    # Degenerate proportions bypass the formula entirely.
    if proportion == 0:
        return 0 if N >= 9 else 1
    if proportion == 1:
        return 1

    z_value = 1.64
    base_margin = 0.05
    # The margin widens by ln(N)/N, which matters mostly for small N.
    margin = base_margin + np.log(N) / N
    unbounded = (z_value**2 * proportion * (1 - proportion)) / (margin**2)
    return math.ceil((N * unbounded) / (unbounded + N - 1))

In [None]:
# Here N is the majority-class count and num_min the minority-class count.
calculated_sample_size = sample_size(majority_count, minority_count)
calculated_sample_size

54

In [None]:

def calculate_weights(data, labels, k=5):
    """Score each sample by how much of its neighbourhood shares its label.

    For every row, the k nearest neighbours (after dropping the first returned
    neighbour) are weighted by inverse distance; the score is the share of that
    weight contributed by neighbours with the same label.

    Args:
        data: 2-D array of samples (``data.shape[0]`` rows are scored).
        labels: one label per row; flattened to 1-D before use.
        k: number of neighbours considered per sample.

    Returns:
        1-D array with one score per row: 1 when every considered neighbour has
        the same label, 0 when none does.
    """
    labels = np.array(labels).flatten()
    # Request k + 1 neighbours; column 0 of each result row is discarded below
    # as the query point itself.
    knn = NearestNeighbors(n_neighbors=k+1, algorithm='ball_tree').fit(data)
    distances, indices = knn.kneighbors(data)

    weights = np.zeros(data.shape[0])
    for row, (row_distances, row_neighbours) in enumerate(zip(distances, indices)):
        # The small epsilon keeps the inverse finite for zero distances.
        closeness = 1 / (row_distances[1:] + 1e-5)
        agrees = (labels[row_neighbours[1:]] == labels[row])

        same_total = np.sum(closeness * agrees)
        other_total = np.sum(closeness * ~agrees)
        combined = same_total + other_total

        # Falls back to 0 if there is no neighbour weight at all.
        weights[row] = same_total / combined if combined > 0 else 0

    return weights


In [None]:
# Neighbourhood-agreement score for every row of the full dataset.
weights = calculate_weights(features, labels, k=5)

In [None]:
weights.shape

(195,)

In [None]:
# Split the per-sample scores by class; labels is (n, 1), so flatten it for masking.
majority_weights = weights[labels.flatten() == majority_label]
minority_weights = weights[labels.flatten() == minority_label]

In [None]:
minority_weights

array([1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 0.64728821, 0.68444206, 0.78374225,
       0.64248574, 0.72807015, 0.7032221 , 0.42478093, 0.56396843,
       0.39874132, 1.        , 1.        , 0.71764557, 0.68814714,
       1.        , 1.        , 0.        , 1.        , 1.        ,
       0.19989978, 0.30374441, 0.51121327, 0.        , 0.23784556,
       0.        , 0.23809991, 0.        , 0.17973657, 0.44129808,
       0.40272143, 0.18938806, 0.51416573, 0.37084344, 0.28632871,
       0.18425742, 0.        , 0.57724861])

In [None]:
majority_weights

array([1.        , 1.        , 1.        , 1.        , 1.        ,
       0.57800943, 0.61352969, 0.33966985, 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 0.7624337 , 0.51601672, 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       0.56342666, 1.        , 1.        , 0.83724026, 1.        ,
       1.        , 1.        , 1.        , 1.        , 0.90144088,
       1.        , 0.83112739, 1.        , 1.        , 0.81648107,
       0.82926183, 0.76841388, 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 0.68710032, 0.25237315,
       0.67352781, 0.67309447, 0.76444988, 0.72291313, 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 0.82577893, 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 0.85661289, 0.88330

In [None]:
# Per-class complexity = mean of the per-sample scores of that class.
minority_complexity = sum(minority_weights)/len(minority_weights)
majority_complexity = sum(majority_weights)/len(majority_weights)
print(f"Minority Class Complexity: {minority_complexity:.4f}")
print(f"Majority Class Complexity: {majority_complexity:.4f}")

Minority Class Complexity: 0.6171
Majority Class Complexity: 0.8862


In [None]:
# Sanity-check the shapes of the arrays built so far.
print("Data Shape:", data.shape)
print("Labels Shape:", labels.shape)
print("Weights Shape:", weights.shape)
print("Majority weights shape: ", majority_weights.shape)
# This line reports minority_weights, so it is labelled as such.
print("Minority weights shape: ", minority_weights.shape)
print("Adjusted Labels Shape:", labels.shape)
print("Minority class shape: ", minority_class.shape)
print("Majority class shape: ", majority_class.shape)

Data Shape: (195, 23)
Labels Shape: (195, 1)
Weights Shape: (195,)
Majority weights shape:  (147,)
Majority weights shape:  (48,)
Adjusted Labels Shape: (195, 1)
Minority class shape:  (48, 22)
Majority class shape:  (147, 22)


In [None]:
def initialize_population(valid_indices, population_size, individual_size):
    """Create the starting population for the genetic algorithm.

    Returns a list of ``population_size`` arrays, each holding
    ``individual_size`` entries drawn from ``valid_indices`` without
    replacement (so entries within one individual are distinct).
    """
    population = []
    for _ in range(population_size):
        individual = np.random.choice(valid_indices, size=individual_size, replace=False)
        population.append(individual)
    return population

def calculate_fitness(individual, weights):
    """Return the total weight of the samples an individual selects.

    ``individual`` holds positions into ``weights``; repeated positions are
    counted once per occurrence.
    """
    selected_weights = weights[individual]
    return selected_weights.sum()

def select_parents(population, fitnesses, num_parents):
    """Pick parents by roulette-wheel selection.

    Each individual is drawn (with replacement) with probability proportional
    to its fitness.

    Args:
        population: sequence of individuals.
        fitnesses: one fitness value per individual, in the same order.
        num_parents: how many parents to draw.

    Returns:
        list of ``num_parents`` individuals taken from ``population`` (the same
        objects, not copies).
    """
    fitnesses = np.asarray(fitnesses, dtype=float)
    total_fitness = fitnesses.sum()

    if total_fitness == 0:
        # Dividing by a zero total would give NaN probabilities and make
        # np.random.choice raise; with no fitness signal, draw uniformly instead.
        probabilities = None
    else:
        probabilities = fitnesses / total_fitness

    parents_indices = np.random.choice(len(population), size=num_parents, replace=True, p=probabilities)
    return [population[idx] for idx in parents_indices]

def crossover(parent1, parent2):
    """One-point crossover: swap the tails of two parents at a random cut.

    The cut position is drawn from ``1 .. len(parent1) - 1``, so each child
    keeps at least the first gene of one parent. Returns the two children as
    new arrays; the parents are not modified.
    """
    cut = np.random.randint(1, len(parent1))
    head1, tail1 = parent1[:cut], parent1[cut:]
    head2, tail2 = parent2[:cut], parent2[cut:]
    first_child = np.concatenate((head1, tail2))
    second_child = np.concatenate((head2, tail1))
    return first_child, second_child

def mutate(individual, mutation_rate, valid_indices):
    """Randomly replace genes of an individual, in place.

    Each position is independently overwritten, with probability
    ``mutation_rate``, by a random entry of ``valid_indices``. The same
    (modified) object is returned.
    """
    for position in range(len(individual)):
        roll = np.random.rand()
        if not roll < mutation_rate:
            continue
        individual[position] = np.random.choice(valid_indices)
    return individual

def genetic_algorithm(features, weights, population_size, individual_size, generations, mutation_rate):
    """Search for a fixed-size set of high-weight samples with a simple GA.

    Only samples whose weight is strictly above the mean weight are candidates.
    Each individual is an array of ``individual_size`` candidate positions; its
    fitness is the sum of the corresponding weights. Every generation the whole
    population is replaced by mutated one-point-crossover children of
    roulette-selected parents. The fittest individual of the final population
    is returned.

    Note: crossover and mutation can introduce repeated positions, so the
    returned samples are not guaranteed to be distinct.

    Args:
        features: 2-D array; row ``i`` is the sample scored by ``weights[i]``.
        weights: 1-D array of per-sample weights.
        population_size: number of individuals kept per generation.
        individual_size: number of positions per individual.
        generations: number of generations to run.
        mutation_rate: per-gene mutation probability.

    Returns:
        (best_samples, best_fitness): the rows of ``features`` picked by the
        best final individual, and that individual's fitness.

    Raises:
        ValueError: if fewer than ``individual_size`` samples have above-mean weight.
    """
    threshold = np.mean(weights)  # Mean weight as threshold to avoid overlap
    valid_indices = np.where(weights > threshold)[0]

    # Fail early with a clear message: the initial population is sampled without
    # replacement, which is impossible when there are too few candidates.
    if individual_size > len(valid_indices):
        raise ValueError(
            f"individual_size ({individual_size}) exceeds the number of samples "
            f"with above-mean weight ({len(valid_indices)})"
        )

    population = initialize_population(valid_indices, population_size, individual_size)

    for _ in range(generations):
        fitnesses = [calculate_fitness(ind, weights) for ind in population]
        parents = select_parents(population, fitnesses, len(population))

        next_population = []
        for i in range(0, len(parents), 2):
            # The modulo wraps around so an odd-sized parent list still pairs its last member.
            child1, child2 = crossover(parents[i], parents[(i + 1) % len(parents)])
            child1 = mutate(child1, mutation_rate, valid_indices)
            child2 = mutate(child2, mutation_rate, valid_indices)
            next_population.extend([child1, child2])
        # Each pair adds two children; trim the surplus when population_size is odd.
        population = next_population[:population_size]

    final_fitnesses = [calculate_fitness(ind, weights) for ind in population]
    best_index = np.argmax(final_fitnesses)
    best_individual = population[best_index]
    best_individual_fitness = final_fitnesses[best_index]

    # Fetch the actual samples corresponding to the best indices
    best_samples = features[best_individual]
    return best_samples, best_individual_fitness

# Parameters for the GA
population_size = 50
individual_size = calculated_sample_size  # Must not exceed the number of above-mean-weight majority samples
generations = 100
mutation_rate = 0.05

# Seed NumPy's global RNG so the selected subset is the same on every run.
np.random.seed(42)

# Run the genetic algorithm on the majority class only. The selected rows are later
# paired with majority_class to build N_cartesian, whose rows are all treated as one
# class, so the selection must not draw from minority rows.
best_samples, best_fitness = genetic_algorithm(majority_class, majority_weights, population_size, individual_size, generations, mutation_rate)

print("Selected Samples (Best Solution):\n", best_samples)
print("Sum of Weights for the Best Solution:", best_fitness)


Selected Samples (Best Solution):
 [[1.739170e+02 1.927350e+02 8.618000e+01 ... 2.102790e-01 2.547508e+00
  2.535560e-01]
 [1.009600e+02 1.100190e+02 9.562800e+01 ... 1.469480e-01 2.428306e+00
  2.646660e-01]
 [2.524550e+02 2.614870e+02 1.827860e+02 ... 2.008730e-01 2.028612e+00
  8.639800e-02]
 ...
 [1.284510e+02 1.504490e+02 7.563200e+01 ... 3.101630e-01 2.638279e+00
  3.568810e-01]
 [1.504400e+02 1.634410e+02 1.447360e+02 ... 1.832180e-01 2.264226e+00
  1.441050e-01]
 [1.697740e+02 1.917590e+02 1.514510e+02 ... 4.147580e-01 3.413649e+00
  4.575330e-01]]
Sum of Weights for the Best Solution: 54.0


In [None]:
def _pair_rows(left, right):
    """Return every (left row, right row) combination, concatenated feature-wise.

    Rows are ordered with ``left`` as the outer loop and ``right`` as the inner loop.
    """
    repeated_left = np.repeat(left, len(right), axis=0)   # l0, l0, ..., l1, l1, ...
    tiled_right = np.tile(right, (len(left), 1))          # r0, r1, ..., r0, r1, ...
    return np.hstack([repeated_left, tiled_right])


def concatenate_samples(P, N, SetN):
    """Build pairwise-concatenated samples for both classes.

    Args:
        P: samples paired with themselves (every ordered pair of rows of P).
        N: samples paired with every row of SetN.
        SetN: the rows appended to each row of N.
        1-D inputs are treated as a single sample.

    Returns:
        (P_cartesian, N_cartesian): arrays of shape ``(len(P)**2, 2*d)`` and
        ``(len(N)*len(SetN), 2*d)`` for inputs with ``d`` columns. Empty inputs
        give arrays with zero rows and the same column count.
    """
    # Ensure all inputs are at least 2D
    P = np.atleast_2d(P)
    N = np.atleast_2d(N)
    SetN = np.atleast_2d(SetN)

    # Vectorised cartesian products; same row order as nested Python loops.
    P_cartesian = _pair_rows(P, P)
    N_cartesian = _pair_rows(N, SetN)

    return P_cartesian, N_cartesian

In [None]:
# Pair minority rows with each other, and majority rows with the GA-selected rows.
P_cartesian, N_cartesian = concatenate_samples(minority_class,majority_class, best_samples)


In [None]:
P_cartesian

array([[1.970760e+02, 2.068960e+02, 1.920550e+02, ..., 1.775510e-01,
        1.743867e+00, 8.556900e-02],
       [1.970760e+02, 2.068960e+02, 1.920550e+02, ..., 1.733190e-01,
        2.103106e+00, 6.850100e-02],
       [1.970760e+02, 2.068960e+02, 1.920550e+02, ..., 1.751810e-01,
        1.512275e+00, 9.632000e-02],
       ...,
       [2.142890e+02, 2.602770e+02, 7.797300e+01, ..., 1.584530e-01,
        2.679772e+00, 1.317280e-01],
       [2.142890e+02, 2.602770e+02, 7.797300e+01, ..., 2.074540e-01,
        2.138608e+00, 1.233060e-01],
       [2.142890e+02, 2.602770e+02, 7.797300e+01, ..., 1.906670e-01,
        2.555477e+00, 1.485690e-01]])

In [None]:
N_cartesian.shape

(7938, 44)

In [None]:
# Recompute the neighbourhood scores on the concatenated pairs, labelling
# P_cartesian rows 0 and N_cartesian rows 1.
# NOTE(review): this is the reverse of the original labels, where the minority
# label printed earlier is 1 - confirm the swap is intended.
new_weights = calculate_weights(np.vstack((P_cartesian, N_cartesian)), np.concatenate([np.zeros(len(P_cartesian)), np.ones(len(N_cartesian))]), k=5)

In [None]:
# P_cartesian rows were stacked first, so they occupy the leading slice of the scores.
minority_weights_new = new_weights[:len(P_cartesian)]
majority_weights_new = new_weights[len(P_cartesian):]

In [None]:
minority_weights_new.shape

(2304,)

In [None]:
majority_weights_new.shape

(7938,)

In [None]:
# Calculate complexities based on the weights calculated
# These assignments overwrite the minority_complexity / majority_complexity values
# computed on the original (unconcatenated) data in an earlier cell.
minority_complexity = np.mean(minority_weights_new)  # Complexity for minority class after concatenation
majority_complexity = np.mean(majority_weights_new)  # Complexity for majority class after concatenation

print(f"Minority Class Complexity after Concatenation: {minority_complexity:.4f}")
print(f"Majority Class Complexity after Concatenation: {majority_complexity:.4f}")


Minority Class Complexity after Concatenation: 0.9541
Majority Class Complexity after Concatenation: 0.9912


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset, SubsetRandomSampler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, recall_score, roc_curve, precision_recall_curve, roc_auc_score
from imblearn.metrics import geometric_mean_score
from sklearn.metrics import confusion_matrix


class MLP(nn.Module):
    """
    Perceptron with one ReLU hidden layer and a linear output layer.

    The hidden layer has ``2 * (input_size + output_size) // 3`` units. Both
    layers are moved to ``device`` on construction. ``forward`` returns the raw
    output-layer activations (no softmax is applied).
    """
    def __init__(self, input_size, output_size, device):
        super().__init__()
        hidden_units = (2 * (input_size + output_size)) // 3
        self.device = device
        self.hidden = nn.Linear(input_size, hidden_units).to(device)
        self.output = nn.Linear(hidden_units, output_size).to(device)

    def forward(self, x):
        hidden_activation = F.relu(self.hidden(x))
        return self.output(hidden_activation)


def load_data(train_features, train_labels, train_ids, test_ids, batch_size, device):
    """
    Build training and validation loaders over one shared tensor dataset.

    Features become float32 and labels int64. Each loader draws, in random
    order, only from its own index list (``train_ids`` / ``test_ids``).
    ``device`` is accepted but not used here.

    Returns:
        (train_loader, val_loader)
    """
    dataset = TensorDataset(
        torch.tensor(train_features, dtype=torch.float32),
        torch.tensor(train_labels, dtype=torch.int64),
    )

    def loader_for(indices):
        # Restrict the shared dataset to the given indices, shuffled each pass.
        return DataLoader(dataset, batch_size=batch_size, sampler=SubsetRandomSampler(indices))

    train_loader = loader_for(train_ids)
    val_loader = loader_for(test_ids)
    return train_loader, val_loader


def train_model(model, train_loader, val_loader, criterion, optimizer, device, num_epochs, Set_N_selection, fold):
    """
    Train the model and restore the weights of its best validation epoch.

    After every epoch the model is scored on ``val_loader`` with recall; each
    time the recall improves, the weights are written to
    ``model_baseline_fold_<fold+1>.pth`` (when ``Set_N_selection == 'baseline'``)
    or ``model_GA_fold_<fold+1>.pth`` (otherwise) in the working directory.
    The best checkpoint is reloaded before returning.

    NOTE(review): a later cell defines another ``train_model`` with a different
    signature; once that cell runs, it replaces this function and ``main`` can
    no longer call it with these arguments.

    Returns:
        The model, in eval mode on ``device``. With ``num_epochs == 0`` no
        checkpoint exists, so the model is returned untrained.
    """
    best_recall_score = float('-inf')
    best_model_path = ""
    # The checkpoint name depends only on the selection strategy and the fold.
    checkpoint_tag = "baseline" if Set_N_selection == 'baseline' else "GA"
    checkpoint_path = f"model_{checkpoint_tag}_fold_{fold+1}.pth"

    for epoch in range(num_epochs):
        model.train()
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

        # Validation pass: collect predictions for the recall computation.
        model.eval()
        all_predictions, all_labels = [], []
        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                _, predicted = torch.max(outputs.data, 1)
                all_predictions.extend(predicted.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())
        fold_recall = recall_score(all_labels, all_predictions)
        if fold_recall > best_recall_score:
            best_recall_score = fold_recall
            best_model_path = checkpoint_path
            torch.save(model.state_dict(), best_model_path)

    # Nothing was saved when no epoch ran; skip the reload instead of failing on an empty path.
    if best_model_path:
        model.load_state_dict(torch.load(best_model_path))
    model.to(device).eval()
    return model


def evaluate_model(model, test_loader, criterion, device):
    """
    Score the model on a test loader, print the metrics and plot the curves.

    Prints the mean loss, accuracy, recall, F1, G-mean and ROC AUC, then the
    (label, prediction) pairs, the sample count and the confusion matrix, and
    finally draws the ROC and precision-recall curves. Column 1 of the softmax
    output is used as the positive-class probability. Returns nothing.
    """
    test_loss = 0
    all_labels, all_predictions, all_probabilities = [], [], []

    model.eval()
    with torch.no_grad():
        for batch_inputs, batch_labels in test_loader:
            batch_inputs = batch_inputs.to(device)
            batch_labels = batch_labels.to(device)
            logits = model(batch_inputs)
            # Weight the batch loss by batch size so the final figure is a per-sample mean.
            test_loss += criterion(logits, batch_labels).item() * batch_inputs.size(0)
            positive_probabilities = torch.softmax(logits, dim=1)[:, 1]  # Assuming binary classification
            predicted = torch.max(logits, 1).indices
            all_labels.extend(batch_labels.cpu().numpy())
            all_predictions.extend(predicted.cpu().numpy())
            all_probabilities.extend(positive_probabilities.cpu().numpy())

    test_loss /= len(test_loader.dataset)
    accuracy = accuracy_score(all_labels, all_predictions)
    recall = recall_score(all_labels, all_predictions)
    f1 = f1_score(all_labels, all_predictions)
    gmean = geometric_mean_score(all_labels, all_predictions)
    auc_roc = roc_auc_score(all_labels, all_probabilities)

    print(f'Test Loss: {test_loss:.4f},'
          f' Accuracy: {accuracy:.4f},'
          f' Recall: {recall:.4f},'
          f' F1 Score: {f1:.4f},'
          f' G-Mean: {gmean:.4f},'
          f' ROC AUC: {auc_roc:.4f}')

    # Per-sample (true label, prediction) pairs, then the summary confusion matrix.
    conf_matrix = confusion_matrix(all_labels, all_predictions)
    label_prediction_pairs = list(zip(all_labels, all_predictions))
    print(label_prediction_pairs)
    print(len(all_labels))
    print("Confusion Matrix:\n", conf_matrix)
    plot_performance_curves(all_labels, all_probabilities, auc_roc)


def plot_performance_curves(labels, probabilities, auc_roc):
    """
    Draw the ROC curve (left) and the precision-recall curve (right).

    ``labels`` are the true classes, ``probabilities`` the positive-class
    scores, and ``auc_roc`` the value shown in the ROC legend.
    """
    fpr, tpr, _ = roc_curve(labels, probabilities)
    precision, recall, _ = precision_recall_curve(labels, probabilities)

    _, (roc_ax, pr_ax) = plt.subplots(1, 2, figsize=(12, 5))

    roc_ax.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % auc_roc)
    # Diagonal reference line from (0, 0) to (1, 1).
    roc_ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    roc_ax.set_xlabel('False Positive Rate')
    roc_ax.set_ylabel('True Positive Rate')
    roc_ax.set_title('Receiver Operating Characteristic')
    roc_ax.legend(loc="lower right")

    pr_ax.plot(recall, precision, color='blue', lw=2, label='Precision-Recall curve')
    pr_ax.set_xlabel('Recall')
    pr_ax.set_ylabel('Precision')
    pr_ax.set_title('Precision-Recall Curve')
    pr_ax.legend(loc="lower left")

    plt.show()


def main(train_data, test_data, config, Set_N_selection):
    """
    Cross-validate an MLP on the training frame, then evaluate on the test frame.

    Both frames must contain a 'class_label' column; all other columns are
    features. ``config`` supplies 'k_folds', 'batch_size', 'num_epochs',
    'learning_rate' and 'random_seed'. A fresh model is trained for every
    stratified fold; only the model from the last fold is evaluated on the
    test data.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    criterion = nn.CrossEntropyLoss()

    def split_frame(frame):
        # Separate the feature matrix from the label vector.
        return frame.drop('class_label', axis=1).values, frame['class_label'].values

    train_features, train_labels = split_frame(train_data)
    test_features, test_labels = split_frame(test_data)

    # Training settings
    k_folds, batch_size, num_epochs, learning_rate = (
        config[key] for key in ('k_folds', 'batch_size', 'num_epochs', 'learning_rate')
    )

    n_inputs = train_features.shape[1]
    n_classes = len(np.unique(train_labels))

    splitter = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=config['random_seed'])
    for fold, (train_ids, test_ids) in enumerate(splitter.split(train_features, train_labels)):
        print(f"Starting fold {fold+1} of {k_folds}")
        train_loader, val_loader = load_data(train_features, train_labels, train_ids, test_ids, batch_size, device)
        model = MLP(n_inputs, n_classes, device)
        optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
        model = train_model(model, train_loader, val_loader, criterion,
                            optimizer, device, num_epochs, Set_N_selection, fold)

    # Test the model left over from the final fold
    test_dataset = TensorDataset(
        torch.tensor(test_features, dtype=torch.float32),
        torch.tensor(test_labels, dtype=torch.int64),
    )
    test_loader = DataLoader(test_dataset, batch_size=16, shuffle=True)
    evaluate_model(model, test_loader, criterion, device)

# Run the main function with proper data and configuration settings
# main(train_data, test_data, config, Set_N_selection)

In [None]:
# NOTE(review): in the visible notebook P_features is first assigned in the next
# cell, so on a fresh kernel this cell fails unless that cell has been run first.
P_features.shape

torch.Size([2304, 44])

In [None]:
from torch.utils.data import DataLoader, TensorDataset, SubsetRandomSampler
from sklearn.model_selection import train_test_split
import torch

# Convert data to PyTorch tensors
P_features = torch.tensor(P_cartesian, dtype=torch.float32)
N_features = torch.tensor(N_cartesian, dtype=torch.float32)
features_combined = torch.vstack((P_features, N_features))
# Class indices: 0 for P pairs, 1 for N pairs. They are created as int64 because
# CrossEntropyLoss takes integer class indices as targets; float labels would need
# a .long() cast at every call site.
labels_combined = torch.cat((
    torch.zeros(len(P_cartesian), dtype=torch.long),
    torch.ones(len(N_cartesian), dtype=torch.long),
))

# Split data into train and test sets
train_features, test_features, train_labels, test_labels = train_test_split(features_combined, labels_combined, test_size=0.2, random_state=42)

# Create TensorDatasets
train_dataset = TensorDataset(train_features, train_labels)
test_dataset = TensorDataset(test_features, test_labels)

# Create DataLoaders
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


In [None]:
def train_model(model, train_loader, criterion, optimizer, device, num_epochs):
    """Run a plain training loop for ``num_epochs`` passes over ``train_loader``.

    The model is put in training mode and updated in place; nothing is returned.
    Labels are cast to int64 before the loss is computed.

    NOTE(review): this definition replaces the earlier ``train_model`` (which
    also takes a validation loader, selection name and fold), so code written
    against that signature stops working once this cell has run.
    """
    model.train()
    for _ in range(num_epochs):
        for batch_inputs, batch_labels in train_loader:
            batch_inputs = batch_inputs.to(device)
            # Ensure labels are long type for CE Loss
            batch_targets = batch_labels.to(device).long()
            optimizer.zero_grad()
            batch_loss = criterion(model(batch_inputs), batch_targets)
            batch_loss.backward()
            optimizer.step()

# Seed torch so the initial weights and the shuffle order of train_loader are the
# same on every run.
torch.manual_seed(42)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MLP(features_combined.shape[1], 2, device)  # 2 outputs assuming binary classification
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
train_model(model, train_loader, criterion, optimizer, device, num_epochs=50)


In [None]:
def evaluate_model(model, test_loader, criterion, device):
    """Print loss, accuracy, recall, F1, G-mean and ROC AUC for a test loader.

    Column 1 of the softmax output is taken as the class-1 probability, and a
    sample is predicted as class 1 when that probability exceeds 0.5. Labels
    are cast to int64 for the loss. Returns nothing.

    NOTE(review): this definition replaces the earlier ``evaluate_model``,
    which additionally printed the confusion matrix and plotted the curves.
    """
    model.eval()
    test_loss = 0
    all_labels, all_probabilities = [], []

    with torch.no_grad():
        for batch_inputs, batch_labels in test_loader:
            batch_inputs = batch_inputs.to(device)
            batch_labels = batch_labels.to(device)
            logits = model(batch_inputs)
            # Weight by batch size so the reported loss is a per-sample mean.
            test_loss += criterion(logits, batch_labels.long()).item() * batch_inputs.size(0)

            class_one_probabilities = torch.softmax(logits, dim=1)[:, 1]  # Get probabilities for class 1
            all_labels.extend(batch_labels.cpu().numpy())
            all_probabilities.extend(class_one_probabilities.cpu().numpy())

    # Convert lists to numpy arrays for metrics calculation
    all_labels = np.array(all_labels)
    all_probabilities = np.array(all_probabilities)

    # Threshold the probabilities into hard 0/1 predictions.
    predictions = (all_probabilities > 0.5).astype(int)
    accuracy = accuracy_score(all_labels, predictions)
    recall = recall_score(all_labels, predictions)
    f1 = f1_score(all_labels, predictions)
    gmean = geometric_mean_score(all_labels, predictions)
    auc_roc = roc_auc_score(all_labels, all_probabilities)

    print(f'Test Loss: {test_loss/len(test_loader.dataset):.4f},'
          f' Accuracy: {accuracy:.4f},'
          f' Recall: {recall:.4f},'
          f' F1 Score: {f1:.4f},'
          f' G-Mean: {gmean:.4f},'
          f' ROC AUC: {auc_roc:.4f}')

# Score the trained model on the held-out 20% split.
evaluate_model(model, test_loader, criterion, device)


Test Loss: 0.1518, Accuracy: 0.9356, Recall: 0.9510, F1 Score: 0.9577, G-Mean: 0.9173, ROC AUC: 0.9836


In [None]:
N_cartesian

array([[1.199920e+02, 1.573020e+02, 7.499700e+01, ..., 2.102790e-01,
        2.547508e+00, 2.535560e-01],
       [1.199920e+02, 1.573020e+02, 7.499700e+01, ..., 1.469480e-01,
        2.428306e+00, 2.646660e-01],
       [1.199920e+02, 1.573020e+02, 7.499700e+01, ..., 2.008730e-01,
        2.028612e+00, 8.639800e-02],
       ...,
       [1.498180e+02, 1.634170e+02, 1.447860e+02, ..., 3.101630e-01,
        2.638279e+00, 3.568810e-01],
       [1.498180e+02, 1.634170e+02, 1.447860e+02, ..., 1.832180e-01,
        2.264226e+00, 1.441050e-01],
       [1.498180e+02, 1.634170e+02, 1.447860e+02, ..., 4.147580e-01,
        3.413649e+00, 4.575330e-01]])

In [None]:
from torch.utils.data import DataLoader, TensorDataset, SubsetRandomSampler
from sklearn.model_selection import train_test_split
import torch

# NOTE(review): this cell repeats the earlier tensor-preparation cell; re-running it
# rebuilds the same tensors, split and loaders.
# Convert data to PyTorch tensors
P_features = torch.tensor(P_cartesian, dtype=torch.float32)
N_features = torch.tensor(N_cartesian, dtype=torch.float32)
features_combined = torch.vstack((P_features, N_features))
# Class indices: 0 for P pairs, 1 for N pairs. They are created as int64 because
# CrossEntropyLoss takes integer class indices as targets; float labels would need
# a .long() cast at every call site.
labels_combined = torch.cat((
    torch.zeros(len(P_cartesian), dtype=torch.long),
    torch.ones(len(N_cartesian), dtype=torch.long),
))

# Split data into train and test sets
train_features, test_features, train_labels, test_labels = train_test_split(features_combined, labels_combined, test_size=0.2, random_state=42)

# Create TensorDatasets
train_dataset = TensorDataset(train_features, train_labels)
test_dataset = TensorDataset(test_features, test_labels)

# Create DataLoaders
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)