# Data Augmentation - Conditional Wasserstein GANs - GP

# Synthetic Dataset Testing - Sensitivity Analysis

This notebook presents a sensitivity analysis of the effect, on supervised analysis, of balancing imbalanced datasets by supplementing them with minority-class samples generated by trained CWGAN-GP models. The tests are run on synthetic datasets created with a class separation of 1.2, while varying the number of samples, the number of features, the number of redundant features and the number of clusters per class. **To use a class_sep of 0.6 or 2.0, change this parameter when creating the datasets.**

**Takes a long time.**

Contains info related to Supplementary Tables 5 to 7. '200_600_5_2' combination of parameters is the same as the one used in the main study.

Notebook Organization:
- Create all variations of the synthetic dataset with class separation 1.2 (repeat for 0.6 and 2.0).
- Unsupervised and Supervised statistical analysis and univariate analysis of the synthetic datasets.
- Creation of the imbalanced datasets for each synthetic datasets, creating 5 folds for each one.
- Setup the CWGAN-GP model and train all models, with the corresponding training data.
- Generate GAN samples and add them to the corresponding imbalanced training sets in small increments.
- Build and evaluate performance of RF and PLS-DA models from the imbalanced datasets and the imbalanced datasets supplemented with minority class samples for each synthetic dataset and each fold.

#### Due to stochasticity, re-running the notebook will get slightly different results. Thus, figures in the paper can be slightly different.

In [None]:
# json for persistence
from time import perf_counter

import numpy as np
import pandas as pd

import scipy.spatial.distance as dist
import scipy.cluster.hierarchy as hier
import scipy.stats as stats

import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from matplotlib import ticker

import seaborn as sns
from collections import namedtuple, Counter

from tqdm import tqdm
from IPython import display as ipythondisplay

from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.cross_decomposition import PLSRegression
import sklearn.cluster as skclust
from sklearn.metrics import (adjusted_rand_score, precision_recall_fscore_support, r2_score, roc_auc_score,
                             roc_curve, auc, f1_score, precision_score, recall_score)
from sklearn.datasets import make_classification
import sklearn.ensemble as skensemble
import sklearn.model_selection
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, cross_validate

import pickle
import tensorflow as tf
from keras import backend

# Metabolinks package
import metabolinks as mtl
import metabolinks.transformations as transf

# Python files in the repository
import multianalysis as ma
from elips import plot_confidence_ellipse
import gan_evaluation_metrics as gem
import linear_augmentation_functions as laf

In [None]:
# Import needed functions from GAN_functions
from GAN_functions import gradient_penalty_cwgan
from GAN_functions import critic_loss_wgan
from GAN_functions import generator_loss_wgan

### Functions for unsupervised analysis

In [None]:
# Functions to plot PCA
def plot_PCA(principaldf, label_colors, components=(1,2), title="PCA", ax=None):
    """Scatter the samples on two components of a PCA projection, one colour per class.

    principaldf: DataFrame whose columns hold the component scores (addressed by position)
        plus a 'Label' column with the class of each sample.
    label_colors: mapping from each label to the colour used for its points.
    components: pair of 1-based component numbers; the first goes on the x axis, the second on the y axis.
    title: title given to the axes.
    ax: axes to draw on; when None the current matplotlib axes are used.
    """
    target_ax = plt.gca() if ax is None else ax

    # Components are numbered from 1, column positions from 0
    x_pos, y_pos = (number - 1 for number in components)
    x_name = principaldf.columns[x_pos]
    y_name = principaldf.columns[y_pos]

    target_ax.set_xlabel(f'{x_name}')
    target_ax.set_ylabel(f'{y_name}')

    # One scatter call per class, in order of first appearance, so each class gets its own legend entry
    sample_labels = principaldf['Label']
    for lbl in sample_labels.unique():
        class_rows = principaldf[sample_labels == lbl]
        target_ax.scatter(class_rows[x_name], class_rows[y_name],
                          s=50, color=label_colors[lbl], label=lbl)

    target_ax.set_title(title, fontsize=15)

def plot_ellipses_PCA(principaldf, label_colors, components=(1,2),ax=None, q=None, nstd=2):
    """Draw one confidence ellipse per class over two components of a PCA projection.

    principaldf: DataFrame whose columns hold the component scores (addressed by position)
        plus a 'Label' column with the class of each sample.
    label_colors: mapping from each label to the edge colour of its ellipse.
    components: pair of 1-based component numbers to use.
    ax: axes to draw on; when None the current matplotlib axes are used.
    q, nstd: forwarded unchanged to `plot_confidence_ellipse`.
    """
    target_ax = plt.gca() if ax is None else ax

    # Components are numbered from 1, column positions from 0
    x_pos, y_pos = (number - 1 for number in components)
    coords = principaldf.iloc[:, [x_pos, y_pos]]

    sample_labels = principaldf['Label']
    for lbl in sample_labels.unique():
        class_coords = coords[sample_labels == lbl]
        # Unfilled ellipse outlined in the colour of the class
        plot_confidence_ellipse(class_coords, q, nstd, ax=target_ax, ec=label_colors[lbl], fc='none')


#### Hierarchical Clustering Analysis (HCA)

In [None]:
def perform_HCA(df, metric='euclidean', method='average'):
    """Run a Hierarchical Clustering Analysis with the chosen distance metric and linkage method.

    df: data to cluster, one sample per row.
    metric: distance metric passed to `scipy.spatial.distance.pdist`.
    method: linkage method passed to `scipy.cluster.hierarchy.linkage`
        (ward, average, centroid, single, complete, weighted or median).

    Returns a dict with the linkage matrix ('Z'), the condensed distances ('distances'),
    the output of `hier.cophenet(Z, distances)` ('coph'), the output of `ma.mergerank(Z)`
    ('merge_rank') and the non-zero entries of that merge-rank output ("Baker's Gamma").
    """
    pairwise = dist.pdist(df, metric=metric)
    linkage_matrix = hier.linkage(pairwise, method=method)

    # Cophenetic correlation: how well the dendrogram preserves the original pairwise distances
    cophenetic = hier.cophenet(linkage_matrix, pairwise)

    # Merge ranks; the non-zero ones are stored under the "Baker's Gamma" key
    # NOTE(review): presumably these are the inputs for a later Baker's gamma comparison - confirm in `ma`
    merge_ranks = ma.mergerank(linkage_matrix)
    nonzero_ranks = merge_ranks[merge_ranks!=0]

    return {'Z': linkage_matrix,
            'distances': pairwise,
            'coph': cophenetic,
            'merge_rank': merge_ranks,
            "Baker's Gamma": nonzero_ranks}

In [None]:
def compute_clustering_metrics(res_dict, labels):
    """Add clustering performance metrics to an HCA result dict (modified in place).

    res_dict: dict holding at least the linkage matrix under 'Z'; it gains the keys
        'Average discrim dist', '% correct clustering' and '% correct 1st clustering'.
    labels: class label of each sample, in the same order as the clustered samples.
    """
    linkage_matrix = res_dict['Z']

    discrim_result = ma.dist_discrim(linkage_matrix, labels, method = 'average')
    res_dict['Average discrim dist'] = discrim_result[0]

    # A class counts as correctly clustered when its discrimination distance is positive
    class_distances = np.array(list(discrim_result[1].values()))
    n_separated = len(class_distances[class_distances>0])
    n_classes = len(pd.unique(labels))
    res_dict['% correct clustering'] = (100/n_classes) * n_separated

    # Correct first cluster percentage
    first_cluster_fraction = ma.correct_1stcluster_fraction(linkage_matrix, labels)
    res_dict['% correct 1st clustering'] = 100 * first_cluster_fraction

#### K-Means Clustering

In [None]:
def perform_KMeans(dataset, target, iter_num=150, best_fraction=0.1):
    """Run K-means clustering through `ma.Kmeans_discrim` and summarise its discrimination metrics.

    dataset: data to cluster, one sample per row.
    target: class label of each sample.
    iter_num, best_fraction: forwarded to `ma.Kmeans_discrim`
        (presumably the number of K-means runs and the fraction of best runs kept - confirm in `ma`).

    Returns a dict with the input dataset and the medians, over the returned clusterings, of the
    discrimination distance, the percentage of correctly clustered classes and the Rand index.
    """
    n_classes = len(pd.unique(target))

    kept_runs = ma.Kmeans_discrim(dataset, target,
                                  method='average',
                                  iter_num=iter_num,
                                  best_fraction=best_fraction)

    discrim_distances = []
    correct_counts = []
    rand_indices = []

    for run_key in kept_runs:
        mean_distance, class_distances, rand_index, _ = kept_runs[run_key]

        discrim_distances.append(mean_distance)

        # Number of classes with a positive discrimination distance in this run
        distances_arr = np.array(list(class_distances.values()))
        correct_counts.append(len(distances_arr[distances_arr>0]))

        rand_indices.append(rand_index)

    summary = {'dataset': dataset,
               'Discrimination Distance': np.median(discrim_distances),
               '% correct clusters':np.median(correct_counts)*100/n_classes,
               'Rand Index': np.median(rand_indices)}
    return summary

### Synthetic Dataset Creation

**Base dataset Characteristics:** 

- 2 classes
- Class Separation: 1.2 (or 0.6 or 2.0, change in the appropriate parameter)
- 20 informative features
- No random flipping of class labels (flip_y)

**Characteristics that are changed and permutated:**

- Number of samples: 100, 200 and 400
- Number of features: 300, 600 and 1200
- Ratio of redundant to informative features: 2.5, 5, 10
- Nº of clusters per class: 2, 3

Synthetic datasets are Pareto scaled so their values are mainly between -1 and 1.

In [None]:
# Nested storage, indexed as [n_samp][n_feat][ratio_red_inf][n_cluster]
dfs = {}       # Pareto scaled synthetic datasets
lbls = {}      # class labels of each dataset, as strings
dfs_no_t = {}  # non-treated synthetic datasets

# CLASS SEP - change between 0.6, 1.2 and 2.0
class_sep = 1.2

for n_samp in [100, 200, 400]: # Number of samples
    dfs[n_samp], lbls[n_samp], dfs_no_t[n_samp] = {}, {}, {}

    for n_feat in [300, 600, 1200]: # Number of features
        dfs[n_samp][n_feat], lbls[n_samp][n_feat], dfs_no_t[n_samp][n_feat] = {}, {}, {}

        for ratio_red_inf in [2.5, 5, 10]: # Ratio of redundant to informative features
            dfs[n_samp][n_feat][ratio_red_inf] = {}
            lbls[n_samp][n_feat][ratio_red_inf] = {}
            dfs_no_t[n_samp][n_feat][ratio_red_inf] = {}

            for n_cluster in [2, 3]: # Number of clusters per class
                # Same random_state for every combination; 20 informative features, no label flipping
                df_f, lbl_f = make_classification(
                    n_samples=n_samp, n_features=n_feat, n_informative=20,
                    n_redundant=int(20*ratio_red_inf),
                    n_classes=2, n_clusters_per_class=n_cluster, weights=None,
                    flip_y=0, class_sep=class_sep, random_state=52683)

                dfs_no_t[n_samp][n_feat][ratio_red_inf][n_cluster] = pd.DataFrame(df_f)
                dfs[n_samp][n_feat][ratio_red_inf][n_cluster] = transf.pareto_scale(pd.DataFrame(df_f))
                lbls[n_samp][n_feat][ratio_red_inf][n_cluster] = [str(i) for i in lbl_f]

# Second name for the label storage (same object, not a copy)
lbls_orig = lbls

In [None]:
# Colour of each class, taken from the 'Set1' palette
colours = sns.color_palette('Set1', 3)

# Labels of the last dataset created above (the loop variables keep their final values)
ordered_labels = pd.unique(lbls[n_samp][n_feat][ratio_red_inf][n_cluster])

label_colors = dict(zip(ordered_labels, colours))
sample_colors = [label_colors[sample_lbl] for sample_lbl in lbls[n_samp][n_feat][ratio_red_inf][n_cluster]]

# Show the palette with the class names as tick labels
sns.palplot(label_colors.values())
new_ticks = plt.xticks(range(len(ordered_labels)), ordered_labels)

#### Random Forest

In [None]:
# RF_model_CV - RF application and result extraction.
def RF_model_CV(df, y, iter_num=1, n_fold=5, n_trees=200):
    """Fit and evaluate Random Forest classifiers with repeated stratified cross-validation.

    df: DataFrame with one sample per row and one feature per column.
    y: class labels, indexable by position; precision, recall and F1 are computed for the label '1'.
    iter_num: number of times the cross-validation is repeated with newly shuffled folds.
    n_fold: number of folds of each stratified cross-validation.
    n_trees: number of trees of each forest.

    Returns a dict with keys 'accuracy', 'F1-Score', 'Precision', 'Recall' (fold averages; one value when
    iter_num is 1, otherwise a list with one value per iteration) and 'important_features', a list of
    (feature position, mean importance over all fitted forests) sorted by decreasing importance.
    """
    n_models = iter_num * n_fold
    importances = np.zeros((n_models, df.shape[1]))
    per_iteration = {'accuracy': [], 'F1-Score': [], 'Precision': [], 'Recall': []}
    model_idx = 0

    for _ in range(iter_num):
        # New shuffled stratified folds in every repetition
        splitter = StratifiedKFold(n_fold, shuffle=True)
        fold_scores = {metric: [] for metric in per_iteration}

        for train_index, test_index in splitter.split(df, y):
            forest = skensemble.RandomForestClassifier(n_estimators=n_trees)
            X_train, X_test = df.iloc[train_index, :], df.iloc[test_index, :]
            y_train = [y[i] for i in train_index]
            y_test = [y[i] for i in test_index]
            forest.fit(X_train, y_train)

            # Predictive accuracy plus precision/recall/F1 of class '1' on the held-out fold
            fold_scores['accuracy'].append(forest.score(X_test, y_test))
            predicted = forest.predict(X_test)
            prec, rec, f1, _ = precision_recall_fscore_support(y_test, predicted,
                                                               pos_label='1', average='binary',
                                                               zero_division=0)
            fold_scores['F1-Score'].append(f1)
            fold_scores['Precision'].append(prec)
            fold_scores['Recall'].append(rec)

            # Importance of each feature in this forest
            importances[model_idx, :] = forest.feature_importances_
            model_idx += 1

        # Average over the folds of this repetition
        for metric, values in fold_scores.items():
            per_iteration[metric].append(np.mean(values))

    # Mean importance over all forests; positions are sufficient as a reference to features
    mean_importance = importances.sum(axis=0) / n_models
    ranked_features = sorted(enumerate(mean_importance), key=lambda pair: pair[1], reverse=True)

    if iter_num == 1:
        results = {metric: values[0] for metric, values in per_iteration.items()}
    else:
        results = dict(per_iteration)
    results['important_features'] = ranked_features
    return results

In [None]:
iter_num=20

RF_all = {}

# Random Forest cross-validation on every (Pareto scaled) synthetic dataset
for n_samp in dfs:
    for n_feat in dfs[n_samp]:
        for ratio_red_inf in dfs[n_samp][n_feat]:
            for n_cluster in dfs[n_samp][n_feat][ratio_red_inf]:
                # Datasets are identified as samples_features_ratio_clusters
                rfname = f'{n_samp}_{n_feat}_{ratio_red_inf}_{n_cluster}'
                print(f'Fitting random forest ({rfname})', end=' ...')
                RF_all[rfname] = {'treatment': rfname}
                n_fold = 5

                fit = RF_model_CV(dfs[n_samp][n_feat][ratio_red_inf][n_cluster],
                                  lbls_orig[n_samp][n_feat][ratio_red_inf][n_cluster],
                                  iter_num=iter_num, n_fold=n_fold, n_trees=200)
                RF_all[rfname].update(fit)

                print('done')

In [None]:
# F1-scores across the iterations
# F1-scores across the iterations: one column per dataset, one row per iteration
F1s = pd.DataFrame({name: results['F1-Score'] for name, results in RF_all.items()})

# Mean and standard deviation of the F1-scores of each dataset
# (the 'Average accuracy' column holds the mean of the F1-scores)
F1s_stats_RF = pd.DataFrame({'Average accuracy': F1s.mean(axis=0),
                             'STD': F1s.std(axis=0)})
F1s_stats_RF['treatment'] = [results['treatment'] for results in RF_all.values()]
F1s_stats_RF

#### PLS-DA

In [None]:
def PLSDA_model_CV(df, labels, n_comp=10,
                   kf = None, n_fold=5,
                   iter_num=1,
                   encode2as1vector=True,
                   scale=False,
                   feat_type='VIP'):
    
    """Perform PLS-DA with n-fold cross-validation.

       df: pandas DataFrame; includes X equivalent in PLS-DA (training vectors).
       labels: target labels.
       n_comp: integer; number of components to use in PLS-DA.
       kf: default None; pass a specific cross validation method from 
        https://scikit-learn.org/stable/modules/cross_validation.html#cross-validation-iterators (3.1.2);
        when None, a shuffled StratifiedKFold with `n_fold` folds is created.
       n_fold: int (default: 5); number of groups to divide dataset in for cross-validation
        (NOTE: max n_fold can not exceed minimum number of samples per class). It also sizes the
        feature importance storage, so it should match the number of splits of a custom `kf`.
       iter_num: int (default: 1); number of iterations that cross validation is repeated.
       encode2as1vector: bool (default: True); with exactly two classes, use a single target vector
        (built by `ma._generate_y_PLSDA`) instead of a target matrix with one column per class.
       scale: bool (default: False); if data is scaled when inputted to PLS model (only true if scaling was not done earlier)
       feat_type: string (default: 'VIP'); types of feature importance metrics to use; accepted: {'VIP', 'Coef', 'Weights'}.

    Returns: dict with keys 'accuracy', 'F1-scores', 'precision', 'recall', 'Q2' and 'imp_feat';
        accuracy: fraction of correctly predicted samples in each iteration
        F1-scores, precision, recall: computed with average='binary' over the pooled test
            predictions of each iteration
        Q2: average over the folds of the R2 score on the test groups, for each iteration
        imp_feat: list of tuples (index number of feature, feature importance)
            ordered by decreasing feature importance.
        When iter_num is 1 the metrics are single values, otherwise lists with one value per iteration.

    Raises: ValueError if feat_type is not one of the accepted values.
    """
    # Setting up lists and matrices to store results
    CVR2 = []
    accuracies = []
    f1_scores = []
    precision = []
    recall = []
    # One row of feature importances per fitted model (iter_num * n_fold models)
    Imp_Feat = np.zeros((iter_num * n_fold, df.shape[1]))
    f = 0

    unique_labels = list(pd.unique(labels))

    # Single target vector only for two-class problems when requested
    is1vector = len(unique_labels) == 2 and encode2as1vector

    matrix = ma._generate_y_PLSDA(labels, unique_labels, is1vector)

    if is1vector:
        # keep a copy to use later
        target1D = matrix.copy()

    # Number of iterations equal to iter_num
    for i in range(iter_num):
        # Created once: after the first iteration kf is no longer None, so the same splitter object is reused
        if kf is None:
            kf = sklearn.model_selection.StratifiedKFold(n_fold, shuffle=True)
        
        # Setting up storing variables for cross-validation
        nright = 0 # For accuracy
        cvr2 = [] # For R2 score
        # To store real and predicted classes to calculate F1-score, precision and recall
        if not is1vector:
            all_preds = pd.DataFrame(columns=matrix.columns, index=matrix.index)
            all_tests = pd.DataFrame(columns=matrix.columns, index=matrix.index)
            a = 0 # Next free row of all_tests / all_preds
        else:
            all_preds = []
            all_tests = []

        # Iterate through cross-validation procedure
        for train_index, test_index in kf.split(df, labels):
            plsda = PLSRegression(n_components=n_comp, scale=scale)
            X_train, X_test = df.iloc[train_index, :], df.iloc[test_index, :]
            if not is1vector:
                y_train = matrix.iloc[train_index, :].copy()
                y_test = matrix.iloc[test_index, :].copy()

            else:
                y_train, y_test = target1D[train_index], target1D[test_index]
                correct = target1D[test_index]

            # Fit PLS model
            plsda.fit(X=X_train, Y=y_train)

            # Obtain results with the test group
            y_pred = plsda.predict(X_test)
            cvr2.append(r2_score(y_test, y_pred))

            # Decision rule for classification
            # Decision rule chosen: sample belongs to group where it has max y_pred (closer to 1)
            # In case of 1,0 encoding for two groups, round to nearest integer to compare
            if not is1vector:
                rounded_pred = y_pred.copy()
                # NOTE: this `i` shadows the iteration counter of the outer loop (which is not read afterwards)
                for i in range(len(y_pred)):
                    if list(y_test.iloc[i, :]).index(max(y_test.iloc[i, :])) == np.argmax(
                        y_pred[i]
                    ):
                        nright += 1  # Correct prediction
                    
                    # One-hot encode the prediction: 1 in the column with the highest value, 0 elsewhere
                    for l in range(len(y_pred[i])):
                        if l == np.argmax(y_pred[i]):
                            rounded_pred[i, l] = 1
                        else:
                            rounded_pred[i, l] = 0
            
                # Save y-test and predictions to calculate F1-score, precision and recall
                all_tests.iloc[a:a+len(y_test)] = y_test
                all_preds.iloc[a:a+len(y_test)] = rounded_pred
                a = a + len(y_test)

            else:
                rounded = np.round(y_pred)
                # Clamp the rounded predictions to the two valid classes: >= 1 becomes 1, the rest 0
                for p in range(len(y_pred)):
                    if rounded[p] >= 1:
                        rounded[p] = 1
                    else:
                        rounded[p] = 0
                    if rounded[p] == correct[p]:
                        nright += 1  # Correct prediction
                
                # Save y-test and predictions to calculate F1-score, precision and recall
                all_preds.extend(list(rounded[:,0]))
                all_tests.extend(y_test)
            
            # Calculate important features (3 different methods to choose from)
            if feat_type == 'VIP':
                Imp_Feat[f, :] = ma._calculate_vips(plsda)
            elif feat_type == 'Coef':
                Imp_Feat[f, :] = abs(plsda.coef_).sum(axis=1)
            elif feat_type == 'Weights':
                Imp_Feat[f, :] = abs(plsda.x_weights_).sum(axis=1)
            else:
                raise ValueError(
                    'Type not Recognized. Types accepted: "VIP", "Coef", "Weights"'
                )

            f += 1

        # Calculate the accuracy of the group predicted and storing score results
        accuracies.append(nright / len(labels))
        CVR2.append(np.mean(cvr2))
        # Calculate F1-score, precision and recall for this iteration (pooled over its folds) and storing results
        # NOTE(review): `unique_labels` is a Python list, so `unique_labels != '1'` is a single bool rather
        # than an element-wise mask; pos_label therefore does not depend on the position of '1' (it appears
        # to always be 0 on NumPy versions that accept a 0-d condition in np.where). Confirm which class
        # ma._generate_y_PLSDA encodes as 1 and whether this is the intended positive class.
        if not is1vector:
            pos_label = np.where(unique_labels != '1')[0][0]
            # Debug aid: print(unique_labels, pos_label)
            f1_scores.append(f1_score(all_tests.astype(int), all_preds.astype(int), average='binary', pos_label=pos_label))
            precision.append(precision_score(all_tests.astype(int), all_preds.astype(int), average='binary', pos_label=pos_label))
            recall.append(recall_score(all_tests.astype(int), all_preds.astype(int), average='binary', pos_label=pos_label))
        else:
            pos_label = np.where(unique_labels != '1')[0][0]
            # Debug aid: print(unique_labels, pos_label)
            f1_scores.append(f1_score(all_tests, all_preds, average='binary', pos_label=pos_label))
            precision.append(precision_score(all_tests, all_preds, average='binary', pos_label=pos_label))
            recall.append(recall_score(all_tests, all_preds, average='binary', pos_label=pos_label))


    # Join and sort all important features values from each cross validation group and iteration.
    Imp_sum = Imp_Feat.sum(axis=0) / (iter_num * n_fold)
    imp_features = sorted(enumerate(Imp_sum), key=lambda x: x[1], reverse=True)
    # A single iteration returns plain values instead of one-element lists
    if iter_num == 1:
        return {'accuracy': accuracies[0], 'F1-scores':f1_scores[0], 'precision': precision[0], 'recall':recall[0],
                'Q2': CVR2[0], 'imp_feat': imp_features}
    else:
        return {'accuracy': accuracies, 'F1-scores':f1_scores, 'precision': precision, 'recall':recall,
                'Q2': CVR2, 'imp_feat': imp_features}

In [None]:
%%capture --no-stdout

PLSDA_all = {}

iter_num=20

# PLS-DA cross-validation on every (Pareto scaled) synthetic dataset
for n_samp in dfs:
    for n_feat in dfs[n_samp]:
        for ratio_red_inf in dfs[n_samp][n_feat]:
            for n_cluster in dfs[n_samp][n_feat][ratio_red_inf]:
                # Datasets are identified as samples_features_ratio_clusters
                plsdaname = f'{n_samp}_{n_feat}_{ratio_red_inf}_{n_cluster}'
                treatment = plsdaname
                print(f'Fitting a PLS-DA model ({treatment})', end=' ...')
                PLSDA_all[plsdaname] = {'treatment':treatment}

                n_comp = 4
                n_fold = 5
                fit = PLSDA_model_CV(dfs[n_samp][n_feat][ratio_red_inf][n_cluster],
                                     lbls_orig[n_samp][n_feat][ratio_red_inf][n_cluster],
                                     n_comp=n_comp, n_fold=n_fold,
                                     iter_num=iter_num,
                                     feat_type='VIP',
                                     encode2as1vector=True)
                PLSDA_all[plsdaname].update(fit)
                print('done')

In [None]:
# F1-scores across the iterations
# F1-scores across the iterations: one column per dataset, one row per iteration
F1s = pd.DataFrame({name: results['F1-scores'] for name, results in PLSDA_all.items()})

# Mean and standard deviation of the F1-scores of each dataset
# (the 'Average accuracy' column holds the mean of the F1-scores)
F1s_stats_PLSDA = pd.DataFrame({'Average accuracy': F1s.mean(axis=0),
                                'STD': F1s.std(axis=0)})
F1s_stats_PLSDA['treatment'] = [results['treatment'] for results in PLSDA_all.values()]
F1s_stats_PLSDA

# Data Augmentation

## Creating Imbalanced Datasets

The synthetic datasets (54 parameter combinations in this notebook) each have two balanced classes. Since both classes are similar in terms of heterogeneity, we will only use one of them as the minority class - the class '1'.

Then, for each synthetic dataset:

We split it in 5 different ways/folds. 

Using the datasets with 200 samples as example, each fold had 80 samples of the majority class in that case and 20 samples of the minority class in the training set. Thus, this left 20 samples of the majority and 80 of the minority class to be the test sets. This was made by putting the set of 100 samples of a class into 5 folds of 20, combining 4 for the majority class for the training set. Training set was Pareto scaled and on the test set we performed a 'faux' Pareto scaling using the features standard deviation and mean of the training set since the training and test sets have a vastly different balance of class samples. Thus, feature averages and standard deviations can be quite different between them, especially in key features for discrimination. To compensate for this, the ‘faux’ Pareto scaling was applied.

For 100 and 400 sample datasets, the procedure was similar where each training set had 40/10 and 160/40 majority/minority class samples, respectively (and vice-versa for the test set).

The untreated training sets were linearly augmented, which was then treated (using a normal Pareto scaling, in this case), to generate samples to train the CWGAN-GP models.

In [None]:
rng = np.random.default_rng(7519)

# These permutations are not used afterwards (the dict is rebuilt in the next cell). They are still drawn
# because the results were produced with the generator in the state it has after these draws.
permutations = {}
last_dataset_labels = np.array(lbls[n_samp][n_feat][ratio_red_inf][n_cluster])
for cl in ordered_labels:
    class_positions = np.where(last_dataset_labels == cl)[0]
    permutations[cl] = list(rng.permutation(class_positions))

In [None]:
# Storage indexed as [dataset name][fold number (1 to 5)]
df_storage_train = {}   # untreated imbalanced training sets
df_storage_test = {}    # test sets, scaled with the training set statistics
lbl_storage_train = {}
lbl_storage_test = {}
real_samples = {}       # Pareto scaled training sets
permutations = {}       # shuffled sample positions of each class, per dataset

for n_samp in [100, 200, 400]:
    # Each class has n_samp/2 samples, split in 5 folds
    fold_len = int(n_samp/2//5)

    for n_feat in [300, 600, 1200]:
        for ratio_red_inf in [2.5, 5, 10]:
            for n_cluster in [2, 3]:

                sep = f'{n_samp}_{n_feat}_{ratio_red_inf}_{n_cluster}'
                for storage in (df_storage_train, df_storage_test, lbl_storage_train,
                                lbl_storage_test, real_samples, permutations):
                    storage[sep] = {}

                untreated_df = dfs_no_t[n_samp][n_feat][ratio_red_inf][n_cluster]
                dataset_labels = np.array(lbls[n_samp][n_feat][ratio_red_inf][n_cluster])

                # Shuffle the sample positions of each class once per dataset
                for cl in ordered_labels:
                    permutations[sep][cl] = list(rng.permutation(np.where(dataset_labels == cl)[0]))

                for i in range(5):
                    fold_start, fold_end = i*fold_len, (i+1)*fold_len
                    train_idxs = {'1':[], '0':[]}
                    test_idxs = {'1':[], '0':[]}

                    for cl in ordered_labels:
                        shuffled = np.array(permutations[sep][cl])
                        single_fold = list(shuffled[fold_start: fold_end])
                        other_folds = list(shuffled[: fold_start]) + list(shuffled[fold_end:])
                        if cl == '1':
                            # Minority class: one fold for training, the remaining four for testing
                            train_idxs[cl], test_idxs[cl] = single_fold, other_folds
                        else:
                            # Majority class: four folds for training, the remaining one for testing
                            train_idxs[cl], test_idxs[cl] = other_folds, single_fold

                    print('Synthetic Dataset:', sep, 'Fold nº:', i+1)
                    print('Train 1/0:', len(train_idxs['1']), len(train_idxs['0']))
                    print('Test 1/0: ', len(test_idxs['1']), len(test_idxs['0']))
                    train_idxs = train_idxs['1'] + train_idxs['0']
                    test_idxs = test_idxs['1'] + test_idxs['0']

                    # Create the imbalanced and test set
                    df_storage_train[sep][i+1] = untreated_df.iloc[train_idxs]
                    lbl_storage_train[sep][i+1] = list(dataset_labels[train_idxs])

                    df_storage_test[sep][i+1] = untreated_df.iloc[test_idxs]
                    lbl_storage_test[sep][i+1] = list(dataset_labels[test_idxs])

                    # Data pretreatment of the imbalanced dataset
                    fold_train = df_storage_train[sep][i+1]
                    real_samples[sep][i+1] = transf.pareto_scale(fold_train)

                    # 'Faux' Pareto scale of the test set: centre with the training set means and
                    # divide by the square root of the training set standard deviations
                    df_storage_test[sep][i+1] = (
                        df_storage_test[sep][i+1] - fold_train.mean())/np.sqrt(fold_train.std())

Linear Augmentation of the Training Sets and Pareto Scale.

In [None]:
# Linearly augmented and Pareto scaled training sets, indexed as [dataset name][fold number]
aug_df_storage_train = {}
aug_lbl_storage_train = {}

# Only generation of samples based on the imbalanced dataset
for sep in df_storage_train:
    aug_df_storage_train[sep] = {}
    aug_lbl_storage_train[sep] = {}

    for i in range(1,6):
        # Settings depend on the number of samples (prefix of the dataset name):
        # 100-sample datasets get fewer new samples per label and a finer grid of `rnd` values
        max_samples = 256 if sep.startswith('100') else 512
        if sep.startswith('100'):
            rnd = list(np.linspace(0.1,0.9,9))
        elif sep.startswith('400'):
            rnd = 0.5
        else:
            rnd = list(np.linspace(0.2,0.8,3))

        start = perf_counter()
        data, lbls_m = laf.artificial_dataset_generator(
            df_storage_train[sep][i], labels=lbl_storage_train[sep][i],
            max_new_samples_per_label=max_samples, binary=False,
            rnd=rnd,
            binary_rnd_state=None, rnd_state=42345)

        # Normal Pareto scaling of the augmented data
        data_treated = transf.pareto_scale(data)

        aug_df_storage_train[sep][i] = data_treated.copy()
        aug_lbl_storage_train[sep][i] = lbls_m
        end = perf_counter()
        print(f'Simple augmentation of data done! Dataset: {sep}, Fold: {i}. Took {(end - start):.3f} s')

Set up colours for each of the classes. Generated samples will have the corresponding label with '- GAN' after.

In [None]:
# Colors to use in plots
# Colors to use in plots: one for each class and one for its GAN-generated samples
colours2 = sns.color_palette('tab20', 4)

# Each class is followed by its '- GAN' counterpart so the two get consecutive palette colours
ordered_labels_test = []
for i in ['1', '0']:
    ordered_labels_test += [i, i + ' - GAN']
label_colors_test = dict(zip(ordered_labels_test, colours2))

# Show the palette with the label names as tick labels
sns.palplot(label_colors_test.values())
new_ticks_test = plt.xticks(range(len(ordered_labels_test)), ordered_labels_test)

## Conditional Wasserstein GAN - GP model

This model was built by combining WGAN-GP models with Conditional GAN models. The WGAN-GP part was originally based on https://keras.io/examples/generative/wgan_gp/#wasserstein-gan-wgan-with-gradient-penalty-gp, and the Conditional GAN part on https://machinelearningmastery.com/how-to-develop-a-conditional-generative-adversarial-network-from-scratch/ (generator and discriminator models) and on https://keras.io/examples/generative/conditional_gan/, without using OOP (loss functions and training/training steps).

Functions for the generator and critic (discriminator) models

In [None]:
def generator_model(len_input, len_output, n_hidden_nodes, n_labels): 
    """Build the generator of the CWGAN-GP as a Keras functional model.

    len_input: length of the data vector given to the generator.
    len_output: number of features of the samples to generate.
    n_hidden_nodes: number of nodes of the first hidden layer.
    n_labels: number of distinct labels accepted by the label embedding.

    Returns a `tf.keras.Model` taking [data, label] and returning a vector with `len_output` values.
    """
    layers = tf.keras.layers

    data_input = tf.keras.Input(shape=(len_input,), name='data')  # data vector
    label_input = tf.keras.Input(shape=(1,), name='label')  # one label per sample

    # Label branch: embedding (30 values) -> linear projection to 256 values -> flat vector
    label_branch = layers.Embedding(n_labels, 30, input_length=1)(label_input)
    label_branch = layers.Dense(256, activation='linear', use_bias=True)(label_branch)
    label_vector = layers.Reshape((256,))(label_branch)

    # Join data and label information, then two hidden layers followed by batch normalization
    hidden = layers.Concatenate()([data_input, label_vector])
    hidden = layers.Dense(n_hidden_nodes, activation=tf.nn.leaky_relu, use_bias=True)(hidden)
    hidden = layers.Dense(256, activation=tf.nn.leaky_relu, use_bias=True)(hidden)
    hidden = layers.BatchNormalization()(hidden)

    # Linear output with one value per feature of the sample to make
    generated = layers.Dense(len_output, activation='linear', use_bias=True)(hidden)

    return tf.keras.Model(inputs=[data_input, label_input], outputs=generated)


def critic_model(len_input, n_hidden_nodes, n_labels):
    """Build the critic (discriminator) network of the CWGAN-GP.

       The critic takes a sample and its class label. The label goes through an embedding followed by
        a linear dense layer, is reshaped to a flat vector of 256 values and is concatenated to the
        sample before the hidden layers. No normalization layer is used in the critic.

       len_input: int with the number of features of the samples to evaluate;
       n_hidden_nodes: int with the number of nodes of the first hidden dense layer;
       n_labels: int with the number of different labels (size of the embedding vocabulary).

       returns: tensorflow keras Model with inputs [data, label] and a single linear output node.
    """
    layers = tf.keras.layers

    # The label input is declared before the data input, as in the original construction
    label_in = tf.keras.Input(shape=(1,))  # Class label
    sample_in = tf.keras.Input(shape=(len_input,))  # Intensity data of the sample

    # Label branch: embedding -> linear projection -> flat vector of 256 values
    label_branch = layers.Embedding(n_labels, 30, input_length=1)(label_in)
    label_branch = layers.Dense(256, activation='linear', use_bias=True)(label_branch)
    label_branch = layers.Reshape((256,))(label_branch)

    # Join sample and label data, then apply the three hidden dense layers
    hidden = layers.Concatenate()([sample_in, label_branch])
    for width in (n_hidden_nodes, 128, 256):
        hidden = layers.Dense(width, activation=tf.nn.leaky_relu, use_bias=True)(hidden)

    # Output layer - 1 linear node holding the critic score
    score = layers.Dense(1, activation='linear', use_bias=True)(hidden)

    return tf.keras.Model(inputs=[sample_in, label_in], outputs=score)

In [None]:
def generate_predictions(model, num_examples_to_generate, len_input, input_dist, uni_lbls):
    """Generate sample predictions based on a Generator model.

       Builds a (num_examples_to_generate, len_input) input sampled from input_dist, builds one label
        per input row and calls the generator in inference mode.

       With fewer than 3 unique labels, the labels are 1.0 for the first half of the rows and 0.0 for
        the rest. Otherwise the rows are split between the classes in class order and the labels are
        one-hot encoded. When num_examples_to_generate is not a multiple of the number of classes, the
        remaining rows are given to the first classes so that there is always exactly one label per
        input row (previously the labels came up short in that case).

       model: generator model, called as model([inputs, labels], training=False);
       num_examples_to_generate: int with the number of samples to generate;
       len_input: int with the number of values of each generator input vector;
       input_dist: distribution object with an rvs(size=...) method to sample the input values from;
       uni_lbls: sequence with the unique class labels (only its length is used).

       returns: the output of the generator model for the generated inputs and labels.
    """
    n_classes = len(uni_lbls)

    test_input = tf.constant(input_dist.rvs(size=len_input*num_examples_to_generate),
                             shape=[num_examples_to_generate, len_input])

    if n_classes < 3:
        # Binary case: any odd remainder goes to the first (1.0) label
        n_zeros = num_examples_to_generate//2
        n_ones = num_examples_to_generate - n_zeros
        test_labels = tf.constant([1.0]*n_ones + [0.0]*n_zeros,
                                  shape=(num_examples_to_generate, 1))
    else:
        # Multi-class case: the first `extra` classes receive one more sample each
        per_class, extra = divmod(num_examples_to_generate, n_classes)
        test_labels = []
        for i in range(n_classes):
            test_labels.extend([i]*(per_class + (1 if i < extra else 0)))
        test_labels = np.array(pd.get_dummies(test_labels))

    predictions = model([test_input, test_labels], training=False)  # `training` is set to False.
    return predictions

In [None]:
def training_montage(train_data_o, train_lbls, test_data, test_lbls,
                     epochs, generator, critic, generator_optimizer, critic_optimizer, input_dist,
                     batch_size, grad_pen_weight=10, k_cov_den=50, k_crossLID=15, random_seed=145,
                     n_generated_samples=96):
    """Train a generator and critic of CWGAN-GP.

       Receives training data and respective class labels (train_data_o and train_lbls) and trains a generator and a critic
        model (generator, critic) over a number of epochs (epochs) with a set batch size (batch_size) with the respective
        optimizers and learning rate (generator_optimizer, critic_optimizer). Gradient Penalty is calculated with
        grad_pen_weight as the weight of the penalty.
       The function draws, at set intervals, three graphs to evaluate the progression of the models (Loss plots,
        coverage, density, crossLID and correct first cluster plots and PCA plot with generated and training data). To
        this end, samples need to be generated, requiring the distribution to sample the initial input values from
        (input_dist) and test data (test_data). Finally the number of neighbors to consider for coverage/density and
        crossLID calculation is also needed (k_cov_den, k_crossLID).

       The function returns nothing. Results are appended to lists that must already exist at module (notebook) level:
        gen_loss_all, crit_loss_all, saved_predictions, density, coverage, crossLID and corr1stcluster. It also reads
        the module-level names pca, label_colors_test, lbl and fold, and uses gem, ipythondisplay, critic_loss_wgan,
        generator_loss_wgan and gradient_penalty_cwgan.

       train_data_o: Pandas DataFrame with training data;
       train_lbls: List with training data class labels;
       test_data: Pandas DataFrame with test data to evaluate the model;
       test_lbls: List with test data class labels (kept for interface compatibility, currently not used);
       epochs: Int value with the number of epochs to train the model;
       generator: tensorflow keras.engine.functional.Functional model for the generator;
       critic: tensorflow keras.engine.functional.Functional model for the critic;
       generator_optimizer: tensorflow keras optimizer (with learning rate) for generator;
       critic_optimizer: tensorflow keras optimizer (with learning rate) for critic;
       input_dist: scipy.stats._continuous_distns.rv_histogram object - distribution to sample input values for generator;
       batch_size: int value with size of batch for model training;
       grad_pen_weight: int value (default 10) for penalty weight in gradient penalty calculation;
       k_cov_den: int value (default 50) for number of neighbors to consider for coverage and density calculation in
        generated samples evaluation;
       k_crossLID: int value (default 15) for number of neighbors to consider for crossLID calculation in generated samples
        evaluation.
       random_seed: int value (default 145) for numpy random seeding when randomly organizing samples in the data that
        will be split into batches.
       n_generated_samples: int value (default 96) for number of samples generated to test the model during training.
    """
    n_classes = len(set(train_lbls))

    # A single seeded permutation is shared by the data and the labels so that they stay aligned
    shuffled_order = np.random.RandomState(seed=random_seed).permutation(len(train_data_o))

    # Obtaining the train data, randomize its order and divide it by twice the standard deviation of its values
    all_data = train_data_o.iloc[shuffled_order]/(2*train_data_o.values.std())

    # Same treatment for the test data
    test_data = (test_data/(2*test_data.values.std())).values
    train_data = all_data.values

    # Change class labels to numerical values while following the randomized order of samples
    label_dummies = pd.get_dummies(np.array(train_lbls)[shuffled_order])
    if n_classes < 3:  # 1 and 0 for when there are only two classes
        train_labels = label_dummies.values[:,0]
    else:  # One hot encoding for when there are more than two classes
        train_labels = label_dummies.values
    # Save the order of the labels
    ordered_labels = label_dummies.columns

    batch_divisions = int(batch_size / n_classes)  # See how many samples of each class will be in each batch
    n_steps = epochs * int(all_data.shape[0] / batch_size)  # Number of steps: nº of batches per epoch * nº of epochs
    n_critic = 5

    # Set up the evaluating images printed during training and the intervals they will be updated
    # (with both intervals equal to n_steps, the figures are only drawn once, at the last step)
    f, (axl, axc, axr) = plt.subplots(1, 3, figsize = (16,5))
    update1 = n_steps
    update2 = n_steps

    if hasattr(tqdm, '_instances'):
        tqdm._instances.clear()  # clear if it exists

    i = 0  # Index of the current batch inside the epoch

    for step in tqdm(range(n_steps)):

        # Critic Training
        crit_loss_temp = []

        # Select real samples for this batch on training and order samples to put samples of the same class together
        real_samp = train_data[i*batch_size:(i+1)*batch_size]
        real_lbls = train_labels[i*batch_size:(i+1)*batch_size]

        real_samples = np.empty(real_samp.shape)
        real_labels = np.empty(real_lbls.shape)
        a = 0
        if n_classes < 3:
            for l,s in sorted(zip(real_lbls, real_samp), key=lambda pair: pair[0], reverse=True):
                real_samples[a] = s
                real_labels[a] = l
                a = a+1
        else:
            for l,s in sorted(zip(real_lbls, real_samp), key=lambda pair: np.argmax(pair[0]), reverse=False):
                real_samples[a] = s
                real_labels[a] = l
                a = a+1

        for _ in range(n_critic):  # For each step, train critic n_critic times

            # Generate input for generator
            artificial_samples = tf.constant(input_dist.rvs(size=all_data.shape[1]*batch_size), shape=[
                batch_size,all_data.shape[1]])
            artificial_labels = real_labels.copy()

            # Generate artificial samples from the latent vector
            artificial_samples = generator([artificial_samples, artificial_labels], training=True)

            with tf.GradientTape() as crit_tape:  # See the gradient for the critic

                # Get the logits for the generated samples
                X_artificial = critic([artificial_samples, artificial_labels], training=True)
                # Get the logits for the real samples
                X_true = critic([real_samples, real_labels], training=True)

                # Calculate the critic loss using the generated and real sample results
                c_cost = critic_loss_wgan(X_true, X_artificial)

                # Calculate the gradient penalty
                grad_pen = gradient_penalty_cwgan(batch_size, real_samples, artificial_samples,
                                                  real_labels, artificial_labels, critic)
                # Add the gradient penalty to the original discriminator loss
                crit_loss = c_cost + grad_pen * grad_pen_weight

            crit_loss_temp.append(crit_loss)

            # Calculate and apply the gradients obtained from the loss on the trainable variables
            gradients_of_critic = crit_tape.gradient(crit_loss, critic.trainable_variables)
            critic_optimizer.apply_gradients(zip(gradients_of_critic, critic.trainable_variables))

        # Move to the next batch, going back to the first one at the end of each epoch
        i = i + 1
        if (step+1) % (n_steps//epochs) == 0:
            i = 0

        crit_loss_all.append(np.mean(crit_loss_temp))

        # Generator Training
        # Generate inputs for generator, values and labels
        artificial_samples = tf.constant(input_dist.rvs(size=all_data.shape[1]*batch_size), shape=[
                batch_size,all_data.shape[1]])

        if n_classes < 3:
            artificial_labels = tf.constant([1.0]*(batch_size//2) + [0.0]*(batch_size//2), shape=(batch_size,1))
        else:
            artificial_labels = np.array(pd.get_dummies([c for c in range(n_classes)]*batch_divisions))

        with tf.GradientTape() as gen_tape:  # See the gradient for the generator
            # Generate artificial samples
            artificial_samples = generator([artificial_samples, artificial_labels], training=True)

            # Get the critic results for generated samples
            X_artificial = critic([artificial_samples, artificial_labels], training=True)
            # Calculate the generator loss
            gen_loss = generator_loss_wgan(X_artificial)

        # Calculate and apply the gradients obtained from the loss on the trainable variables
        gradients_of_generator = gen_tape.gradient(gen_loss, generator.trainable_variables)
        generator_optimizer.apply_gradients(zip(gradients_of_generator, generator.trainable_variables))
        gen_loss_all.append(gen_loss)

        # Update the progress bar and evaluation graphs every update1 steps for loss plots and update2 for the others.
        if (step + 1) % update1 == 0:

            # Update the evaluating figures at the set intervals
            axl.clear()  # Always clear the corresponding ax before redrawing it

            # Loss Plot
            axl.plot(gen_loss_all, color = 'blue', label='Generator Loss')
            axl.plot(crit_loss_all,color = 'red', label='Critic Loss')
            axl.set_xlabel('Number of Steps')
            axl.set_ylabel('Loss')
            axl.legend()

            ipythondisplay.clear_output(wait=True)
            ipythondisplay.display(plt.gcf())

        if (step + 1) % update2 == 0:

            # Sample from the distribution received as argument (input_dist) instead of relying on a
            # notebook-level variable that happens to hold the same object
            saved_predictions.append(generate_predictions(generator, n_generated_samples, all_data.shape[1],
                                                          input_dist, ordered_labels))
            # See density and coverage and crossLID (divided by 25 to be in the same order as the rest)
            # of latest predictions
            den, cov = gem.evaluation_coverage_density(test_data, saved_predictions[-1], k= k_cov_den, metric='euclidean')
            clid = gem.cross_LID_estimator_byMLE(test_data, saved_predictions[-1], k=k_crossLID, metric='euclidean')/25
            density.append(den)
            coverage.append(cov)
            crossLID.append(clid)

            # PCA of the latest predictions and training data
            # Divide by twice the standard deviation to be the same as the generated data
            dfs_temp = pd.concat((train_data_o/(2*train_data_o.values.std()),pd.DataFrame(
                saved_predictions[-1].numpy(), columns=train_data_o.columns)))
            temp_lbls = train_lbls.copy()
            for l in ordered_labels:
                temp_lbls.extend([l+' - GAN']*(n_generated_samples//len(ordered_labels)))
            principaldf = gem.pca_sample_projection(dfs_temp, temp_lbls, pca, whiten=True,
                                                samp_number=len(train_data_o.index))
            lcolors = label_colors_test

            # Hierarchical clustering of the latest predictions and testing data,
            # saving the correct 1st cluster fraction results
            dfs_temp = np.concatenate((test_data, saved_predictions[-1].numpy()))
            temp_lbls = ['real']*len(test_data) + ['gen']*len(saved_predictions[-1])
            hca_results = gem.perform_HCA(dfs_temp, temp_lbls, metric='euclidean', method='ward')
            corr1stcluster.append(hca_results['correct 1st clustering'])

            # Plots
            axc.clear()
            axc.plot(range(update2, step+2, update2), coverage, label='coverage')
            axc.plot(range(update2, step+2, update2), density, label='density')
            axc.plot(range(update2, step+2, update2), crossLID, color='red', label='crossLID')
            axc.plot(range(update2, step+2, update2), corr1stcluster, color='purple', label='corr_cluster')
            axc.legend()

            axr.clear()
            gem.plot_PCA(principaldf, lcolors, components=(1,2), title='', ax=axr)
            axr.legend(loc='upper right', ncol=1, framealpha=1)

            ipythondisplay.clear_output(wait=True)
            ipythondisplay.display(plt.gcf())
            # lbl and fold are the loop variables of the notebook cell that calls this function
            print(lbl, fold)

### Training the GAN

In [None]:
# Set GENERATE to True to train the CWGAN-GP models and save them to disk; set it to False to skip
# the training (the following cell then loads previously saved models instead).
GENERATE=True
epochs = 500
batch_size = 32
k_cov_den = 20
k_crossLID = 15
random_seed = 145
# NOTE(review): `sep` is not defined in this cell - presumably a variable left over from an earlier
# cell; confirm that it still holds a valid key of aug_lbl_storage_train when this cell is run.
n_generated_samples = 48*len(pd.unique(aug_lbl_storage_train[sep][1]))

if GENERATE:
    # All three dictionaries are indexed as [dataset][fold]
    generator_train = {}
    critic_train = {}

    results_train = {}

    for lbl in real_samples.keys():
        generator_train[lbl] = {}
        critic_train[lbl] = {}

        results_train[lbl] = {}
        for fold in real_samples[lbl].keys():

            print(lbl, fold)
            # Store results
            # These lists are module-level on purpose: training_montage appends to them by name
            gen_loss_all = []
            crit_loss_all = []
            saved_predictions = []
            coverage = []
            density = []
            crossLID = []
            corr1stcluster = []
            
            # Get distribution of intensity values of the dataset
            hist = np.histogram(real_samples[lbl][fold].values.flatten(), bins=100)
            input_realdata_dist = stats.rv_histogram(hist)

            # PCA fitted on the real samples of this fold; `pca` is read by name inside training_montage
            # (pc_coords is not used afterwards in this cell)
            df = real_samples[lbl][fold]
            pca = PCA(n_components=2, svd_solver='full', whiten=True)
            pc_coords = pca.fit_transform(df)

            # New optimizers for every model pair
            generator_optimizer = tf.keras.optimizers.RMSprop(1e-4)
            critic_optimizer = tf.keras.optimizers.RMSprop(1e-4)

            # Generator input and output both have the number of features of the dataset; 2 labels
            generator_train[lbl][fold] = generator_model(aug_df_storage_train[lbl][fold].shape[1],
                                                 aug_df_storage_train[lbl][fold].shape[1], 128, 2)
            critic_train[lbl][fold] = critic_model(aug_df_storage_train[lbl][fold].shape[1], 512, 2)

            # Train on the aug_* training set; the real samples of the fold are passed as evaluation data
            training_montage(aug_df_storage_train[lbl][fold], aug_lbl_storage_train[lbl][fold],
                             real_samples[lbl][fold], lbl_storage_train[lbl][fold],
                             epochs, generator_train[lbl][fold], critic_train[lbl][fold],
                             generator_optimizer, critic_optimizer, input_realdata_dist, batch_size,
                             grad_pen_weight=10, k_cov_den=k_cov_den, k_crossLID=k_crossLID,
                             random_seed=random_seed, n_generated_samples=n_generated_samples)

            # Gather the lists filled during training
            results_train[lbl][fold]={'gen_loss': gen_loss_all, 'crit_loss': crit_loss_all, 'saved_pred': saved_predictions,
                     'coverage': coverage, 'density': density, 'crossLID': crossLID, 'corr1st_cluster': corr1stcluster}
            
            # Save the weights of both models (file names join dataset and fold without a separator)
            generator_train[lbl][fold].save_weights(
                'gan_models/Sens_Test_'+str(class_sep)+'_Synthetic_gen_imb_'+str(lbl)+str(fold))
            critic_train[lbl][fold].save_weights(
                'gan_models/Sens_Test_'+str(class_sep)+'_Synthetic_crit_imb_'+str(lbl)+str(fold))
    
            # Save the results from GAN training
            with open('gan_models/Sens_Test_'+str(class_sep)+'_Synthetic_results_imb_'+str(lbl)+str(fold)+'.pickle', 'wb') as handle:
                pickle.dump(results_train[lbl][fold], handle)

In [None]:
if not GENERATE:

    # Rebuild the [dataset][fold] dictionaries from the models and results saved on disk
    generator_train, critic_train, results_train = {}, {}, {}

    for lbl in df_storage_train:
        generator_train[lbl], critic_train[lbl], results_train[lbl] = {}, {}, {}

        for fold in df_storage_train[lbl]:
            generator_optimizer = tf.keras.optimizers.RMSprop(1e-4)
            critic_optimizer = tf.keras.optimizers.RMSprop(1e-4)

            # Models with the same architecture as the ones that were trained
            generator_train[lbl][fold] = generator_model(
                df_storage_train[lbl][fold].shape[1], df_storage_train[lbl][fold].shape[1], 128, 2)
            critic_train[lbl][fold] = critic_model(df_storage_train[lbl][fold].shape[1], 512, 2)

            # Restore the weights saved after training
            generator_train[lbl][fold].load_weights(
                './gan_models/Sens_Test_'+str(class_sep)+'_Synthetic_gen_imb_'+str(lbl)+str(fold))
            critic_train[lbl][fold].load_weights(
                './gan_models/Sens_Test_'+str(class_sep)+'_Synthetic_crit_imb_'+str(lbl)+str(fold))

            # Restore the training results (only load pickle files written by this notebook)
            with open('gan_models/Sens_Test_'+str(class_sep)+'_Synthetic_results_imb_'+str(lbl)+str(fold)+'.pickle', 'rb') as results_file:
                results_train[lbl][fold] = pickle.load(results_file)

### Comparison of Classification Accuracy

A GAN model is built and trained from each train set. Classification models are then built both with the train set alone and with the train set supplemented with samples generated by the GAN model, and their performance in discriminating the test set is compared.

#### Generate a lot of samples and make Random Forests and PLS-DA models

In [None]:
np.random.seed(5402)
# Number of samples generated per fold (half for each label given to the generator)
num_examples_to_generate = 2048
# Generated samples and their labels, indexed as [dataset][fold]
generated_samples = {}

for i in generator_train:
    generated_samples[i] = {}
    for fold in generator_train[i]:
        feature_names = df_storage_train[i][fold].columns
        n_per_label = num_examples_to_generate//2

        # Input values are sampled from the distribution of intensity values of the real samples
        hist = np.histogram(real_samples[i][fold].values.flatten(), bins=100)
        input_realdata_dist = stats.rv_histogram(hist)
        test_input = tf.constant(
            input_realdata_dist.rvs(size=len(feature_names)*num_examples_to_generate),
            shape=[num_examples_to_generate, len(feature_names)])

        # First half of the inputs with label 0, second half with label 1
        test_labels = tf.constant([0]*n_per_label + [1]*n_per_label,
                                  shape=[num_examples_to_generate, 1])

        # Generate the GAN samples and undo the division applied to the training data
        predictions = generator_train[i][fold]([test_input, test_labels], training=False)
        predictions = predictions * 2* aug_df_storage_train[i][fold].values.std()

        # Class names in the column order of the dummy encoding of the labels
        shuffled_lbls = np.array(lbl_storage_train[i][fold])[
            np.random.RandomState(seed=random_seed).permutation(len(lbl_storage_train[i][fold]))]
        ordered_labels_fold = pd.get_dummies(shuffled_lbls).columns

        # Label 0 given to the generator maps to the second class name and label 1 to the first one
        generated_samples[i][fold] = [
            pd.DataFrame(np.array(predictions), columns=feature_names),
            [ordered_labels_fold[1],]*n_per_label + [ordered_labels_fold[0],]*n_per_label]

In [None]:
# Build, for each dataset and fold, the training sets supplemented with GAN samples.
# Stored as bal_datasets[dataset][fold][number of GAN samples added] = [DataFrame, list of labels]
bal_datasets = {}
np.random.seed(325)
rng = np.random.default_rng(7519)
for i in generated_samples.keys():
    bal_datasets[i] = {}
    for fold in generator_train[i]:
        print(i, fold)
        bal_datasets[i][fold] = {}
        # Real samples of the '1' class in this fold
        df = real_samples[i][fold].loc[np.array(lbl_storage_train[i][fold]) == '1']
        # Calculate all correlations between all samples of experimental and GAN data and store them in a dataframe
        # (rows: generated samples, columns: real '1' samples)
        correlations = pd.DataFrame(index=generated_samples[i][fold][0].index, columns=df.index).astype('float')

        # Pearson correlation for every (real sample, generated sample) pair, filled one cell at a time
        for a in df.index:
            for j in generated_samples[i][fold][0].index:
                correlations.loc[j,a] = stats.pearsonr(df.loc[a],
                                                       generated_samples[i][fold][0].loc[j])[0]

        # For each real sample (column), the generated sample indices from most to least correlated
        correlated_samples = pd.DataFrame(columns=df.index)
        for a in correlations:
            correlated_samples[a] = correlations[a].sort_values(ascending=False).index
            
        # Shuffle each row (same rank position across the real samples) with the seeded generator
        permutated = correlated_samples.copy()
        for l in correlated_samples.index:
            permutated.loc[l] = rng.permutation(correlated_samples.loc[l])

        # Row by row flattening: generated sample indices ordered by rank position, without repetitions
        corr_idxs = pd.unique(permutated.values.flatten())
        
        dataset_len = real_samples[i][fold].shape[0]
        
        # Number of '1' samples and average number of samples of the other class(es)
        n_min_class = (np.array(lbl_storage_train[i][fold]) == '1').sum()
        n_max_class = (dataset_len - n_min_class)//(len(pd.unique(lbl_storage_train[i][fold]))-1)
        
        # Add samples - Half of the necessary to balance the dataset at a time
        # NOTE(review): the step is 0 (and range raises ValueError) when n_max_class - n_min_class + 1 < 2
        for num in range(0, n_max_class - n_min_class+1, (n_max_class - n_min_class+1)//2):
            idx_to_keep = corr_idxs[:num]

            corr_preds = generated_samples[i][fold][0].loc[list(pd.unique(idx_to_keep))]
            # NOTE(review): corr_lbls is computed but not used below - confirm that is intended
            corr_lbls  = np.array(generated_samples[i][fold][1])[list(pd.unique(idx_to_keep))]

            # Slowly add the GAN correlated GAN samples to the the imbalanced dataset, making it a balanced dataset
            concat_df = pd.concat((corr_preds, real_samples[i][fold]))
            # All GAN samples added are labelled as the '1' minority class
            # NOTE(review): the candidates come from all generated samples, whatever label they were
            # generated with; the selection relies on the correlation ranking - confirm
            concat_lbls = ['1',]*len(set(idx_to_keep)) + lbl_storage_train[i][fold]
            bal_datasets[i][fold][num] = [concat_df.copy(), concat_lbls.copy()]

### Fitting RF and PLS-DA models to Imbalance Datasets and Evaluating them

RF and PLS-DA models are built for each minority class, each fold and each number of GAN samples added.

#### RF

In [None]:
# Random Forest models, stored as RF_models_bal[dataset][number of GAN samples added][fold]
RF_models_bal = {}

for min_class in bal_datasets:
    RF_models_bal[min_class] = {}
    # The numbers of GAN samples added are read from the fold with key 1
    for size in bal_datasets[min_class][1].keys():
        RF_models_bal[min_class][size] = {}
        for fold in bal_datasets[min_class]:
            # Each stored entry is [training DataFrame, training labels]
            rf_mod = ma.RF_model(*bal_datasets[min_class][fold][size], return_cv=False, n_trees=200)
            RF_models_bal[min_class][size][fold] = rf_mod

In [None]:
# Evaluate every fitted RF model on the test data of its fold.
# Stored as RF_results_bal[metric][dataset][number of GAN samples added][fold]
RF_results_bal = {'Accuracy':{}, 'F1-Score':{}, 'Precision':{}, 'Recall':{}}

for min_class in RF_models_bal:
    for metric_store in RF_results_bal.values():
        metric_store[min_class] = {}
    for size in RF_models_bal[min_class].keys():
        for metric_store in RF_results_bal.values():
            metric_store[min_class][size] = {}
        for fold in RF_models_bal[min_class][size]:
            rf_fitted = RF_models_bal[min_class][size][fold]

            # Accuracy comes from the model's own score method
            RF_results_bal['Accuracy'][min_class][size][fold] = rf_fitted.score(
                df_storage_test[min_class][fold], lbl_storage_test[min_class][fold])

            # Precision, recall and F1-score are computed with '1' as the positive class
            preds = rf_fitted.predict(df_storage_test[min_class][fold])
            prec, rec, f1, sup = precision_recall_fscore_support(
                lbl_storage_test[min_class][fold], preds, pos_label='1', average='binary', zero_division=0)
            RF_results_bal['F1-Score'][min_class][size][fold] = f1
            RF_results_bal['Precision'][min_class][size][fold] = prec
            RF_results_bal['Recall'][min_class][size][fold] = rec

In [None]:
# Mean F1-score over the folds without GAN samples ('Imb.') and with the selected number of
# GAN samples added ('Aug.'), one row per dataset
results_df = pd.DataFrame(columns=['Imb.', 'Aug.'])
for l in RF_results_bal['F1-Score']:
    # The number of added GAN samples to report depends on the prefix of the dataset name
    aug_size = 30 if l.startswith('100') else (60 if l.startswith('200') else 120)
    mean_f1 = pd.DataFrame.from_dict(RF_results_bal['F1-Score'][l]).mean()
    results_df.loc[l] = mean_f1[[0, aug_size]].values

results_df

#### PLS-DA

In [None]:
def decision_rule(y_pred, y_true, pos_label, average='binary'):
    """Turn PLS-DA predictions into 0/1 class calls and score them.

       Each prediction is rounded to the nearest integer: a rounded value of 1 or more is called
        as 1 and anything else is called as 0. The calls are compared with y_true to obtain the
        accuracy, and precision, recall and F1-score are obtained with
        precision_recall_fscore_support.

       y_pred: array-like with the predicted values of each sample;
       y_true: sequence with the true class (1 or 0) of each sample;
       pos_label: label of the positive class, passed to precision_recall_fscore_support;
       average: str (default 'binary'), averaging passed to precision_recall_fscore_support.

       returns: (accuracy, F1-score, precision, recall).
    """
    calls = np.round(y_pred)
    n_samples = len(y_pred)

    n_correct = 0
    for idx in range(n_samples):
        # Binarize the rounded prediction in place
        call = 1 if calls[idx] >= 1 else 0
        calls[idx] = call
        if call == y_true[idx]:
            n_correct += 1

    # Fraction of correct calls
    accuracy = (n_correct / n_samples)
    prec, rec, f1, sup = precision_recall_fscore_support(y_true, calls, pos_label=pos_label,
                                                         average=average, zero_division=0)
    return accuracy, f1, prec, rec

In [None]:
# PLS-DA models and their results on the test data.
# Models: PLSDA_models_bal[dataset][number of GAN samples added][fold]
# Results: PLSDA_results_bal[metric][dataset][number of GAN samples added][fold]
PLSDA_models_bal = {}
PLSDA_results_bal = {'Accuracy':{}, 'F1-Score':{}, 'Precision':{}, 'Recall':{}}

np.random.seed(325)

for min_class in bal_datasets:
    PLSDA_models_bal[min_class] = {}
    for metric_store in PLSDA_results_bal.values():
        metric_store[min_class] = {}
    # The numbers of GAN samples added are read from the fold with key 1
    for size in bal_datasets[min_class][1].keys():
        PLSDA_models_bal[min_class][size] = {}
        for metric_store in PLSDA_results_bal.values():
            metric_store[min_class][size] = {}
        for fold in bal_datasets[min_class]:

            # Fit the model with the (supplemented) training set of this fold
            plsda = ma.fit_PLSDA_model(bal_datasets[min_class][fold][size][0],
                                       bal_datasets[min_class][fold][size][1],
                                       n_comp=4, return_scores=False, scale=False, encode2as1vector=True)
            PLSDA_models_bal[min_class][size][fold] = plsda

            # Predictions and true classes (as 1/0) for the test group
            y_pred = plsda.predict(df_storage_test[min_class][fold])
            y_true = ma._generate_y_PLSDA(lbl_storage_test[min_class][fold],
                                          pd.unique(bal_datasets[min_class][fold][size][1]),
                                          True)
            y_true = [int(flag == True) for flag in y_true]
            # Position of the first training class that is not '1', used as the positive label
            pos_label = np.where(pd.unique(bal_datasets[min_class][fold][size][1]) != '1')[0][0]

            # Apply the decision rule and store every metric
            accuracy, f1, prec, rec = decision_rule(y_pred, y_true, pos_label=pos_label, average='binary')
            PLSDA_results_bal['Accuracy'][min_class][size][fold] = accuracy
            PLSDA_results_bal['F1-Score'][min_class][size][fold] = f1
            PLSDA_results_bal['Precision'][min_class][size][fold] = prec
            PLSDA_results_bal['Recall'][min_class][size][fold] = rec

In [None]:
# Mean F1-score over the folds without GAN samples ('Imb.') and with the selected number of
# GAN samples added ('Aug.'), one row per dataset
results_df = pd.DataFrame(columns=['Imb.', 'Aug.'])
for l in PLSDA_results_bal['F1-Score']:
    # The number of added GAN samples to report depends on the prefix of the dataset name
    aug_size = 30 if l.startswith('100') else (60 if l.startswith('200') else 120)
    mean_f1 = pd.DataFrame.from_dict(PLSDA_results_bal['F1-Score'][l]).mean()
    results_df.loc[l] = mean_f1[[0, aug_size]].values

results_df