# Data Augmentation - Conditional Wasserstein GANs - GP

# Synthetic Dataset Testing

This notebook presents the tests made on the effect of supplementing imbalanced datasets with samples of the minority class generated by trained CWGAN-GP models, in order to balance the datasets before supervised analysis. The tests are made on six synthetic datasets that differ only in the separation between the 2 classes of the dataset.

Notebook Organization:
- Create and treat 6 synthetic datasets.
- Unsupervised and Supervised statistical analysis and univariate analysis of the synthetic datasets.
- Creation of the imbalanced datasets for each synthetic dataset, creating 5 folds for each one.
- Setup the CWGAN-GP model and train all 30 models (6 datasets times 5 folds), with the corresponding training data.
- Generate GAN samples and add them to the corresponding imbalanced training sets in small increments.
- Build and evaluate performance of RF and PLS-DA models from the imbalanced datasets, the imbalanced datasets supplemented with minority class samples and purely GAN samples dataset for each synthetic dataset and each fold.
- Compare important features in the RF and PLS-DA models against important features of models built with the complete dataset.

#### Due to stochasticity, re-running the notebook will get slightly different results. Thus, figures in the paper can be slightly different.

In [None]:
# json for persistence
from time import perf_counter

import numpy as np
import pandas as pd

import scipy.spatial.distance as dist
import scipy.cluster.hierarchy as hier
import scipy.stats as stats

import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from matplotlib import ticker

import seaborn as sns
from collections import namedtuple, Counter

from tqdm import tqdm
from IPython import display as ipythondisplay

from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.cross_decomposition import PLSRegression
import sklearn.cluster as skclust
from sklearn.metrics import (adjusted_rand_score, precision_recall_fscore_support, r2_score, roc_auc_score,
                             roc_curve, auc, f1_score, precision_score, recall_score)
from sklearn.datasets import make_classification
import sklearn.ensemble as skensemble
import sklearn.model_selection
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, cross_validate

import pickle
import tensorflow as tf
from keras import backend

# Metabolinks package
import metabolinks as mtl
import metabolinks.transformations as transf

# Python files in the repository
import multianalysis as ma
from elips import plot_confidence_ellipse
import gan_evaluation_metrics as gem
import linear_augmentation_functions as laf

In [None]:
# Import needed functions from GAN_functions
from GAN_functions import gradient_penalty_cwgan
from GAN_functions import critic_loss_wgan
from GAN_functions import generator_loss_wgan

### Functions for unsupervised analysis

In [None]:
# Functions to plot PCA
def plot_PCA(principaldf, label_colors, components=(1,2), title="PCA", ax=None):
    """Scatter the samples of a PCA score table on two of its components.

    principaldf: DataFrame with one column per component and a 'Label' column.
    label_colors: mapping from each label to the colour used for its samples.
    components: 1-based positions of the two columns used as x and y axes.
    title: title set on the axes.
    ax: axes to draw on; the current matplotlib axes are used when None.
    """
    target_ax = plt.gca() if ax is None else ax

    # Components are given 1-based, columns are looked up 0-based
    x_col, y_col = (principaldf.columns[c - 1] for c in components)

    target_ax.set_xlabel(f'{x_col}')
    target_ax.set_ylabel(f'{y_col}')

    # One scatter call per class so that each class gets its own colour and legend entry
    for lbl in principaldf['Label'].unique():
        class_rows = principaldf.loc[principaldf['Label'] == lbl]
        target_ax.scatter(class_rows[x_col], class_rows[y_col],
                          s=50, color=label_colors[lbl], label=lbl)

    target_ax.set_title(title, fontsize=15)

def plot_ellipses_PCA(principaldf, label_colors, components=(1,2),ax=None, q=None, nstd=2):
    """Draw one confidence ellipse per class over the projection on two PCA components.

    principaldf: DataFrame with one column per component and a 'Label' column.
    label_colors: mapping from each label to the edge colour of its ellipse.
    components: 1-based positions of the two columns holding the projection.
    ax: axes to draw on; the current matplotlib axes are used when None.
    q, nstd: forwarded unchanged to plot_confidence_ellipse
        (presumably confidence level and number of standard deviations - confirm in elips).
    """
    target_ax = plt.gca() if ax is None else ax

    # Components are given 1-based, columns are selected 0-based
    first, second = (c - 1 for c in components)
    scores = principaldf.iloc[:, [first, second]]

    for lbl in principaldf['Label'].unique():
        class_scores = scores[principaldf['Label'] == lbl]
        plot_confidence_ellipse(class_scores, q, nstd, ax=target_ax,
                                ec=label_colors[lbl], fc='none')


#### Hierarchical Clustering Analysis (HCA)

In [None]:
def perform_HCA(df, metric='euclidean', method='average'):
    """Run a Hierarchical Clustering Analysis with the chosen distance metric and linkage method.

    df: data set with samples in rows.
    metric: distance metric passed to scipy's pdist.
    method: linkage method passed to scipy's linkage
        (ward, average, centroid, single, complete, weighted or median).

    Returns a dict with the linkage matrix ('Z'), the condensed distances ('distances'),
    the output of scipy's cophenet ('coph'), the output of ma.mergerank ('merge_rank')
    and the non-zero entries of that merge rank ("Baker's Gamma").
    """
    results = {}

    pairwise = dist.pdist(df, metric=metric)
    linkage_matrix = hier.linkage(pairwise, method=method)
    results['Z'] = linkage_matrix
    results['distances'] = pairwise

    # Cophenetic correlation: how well the clustering preserves the original distances
    results['coph'] = hier.cophenet(linkage_matrix, pairwise)

    # Merge ranks, kept whole and with the zero entries removed
    merge_rank = ma.mergerank(linkage_matrix)
    results['merge_rank'] = merge_rank
    results["Baker's Gamma"] = merge_rank[merge_rank != 0]

    return results

In [None]:
def compute_clustering_metrics(res_dict, labels):
    """Add clustering performance metrics to an HCA result dict (modified in place).

    res_dict: dict holding the linkage matrix under 'Z' (as returned by perform_HCA).
    labels: sample labels, in the same order as the samples used to build 'Z'.

    Keys added: 'Average discrim dist', '% correct clustering' and '% correct 1st clustering'.
    """
    linkage_matrix = res_dict['Z']

    discrim = ma.dist_discrim(linkage_matrix, labels, method='average')
    res_dict['Average discrim dist'] = discrim[0]

    # A class counts as correctly clustered when its discrimination distance is positive
    class_dists = np.array(list(discrim[1].values()))
    n_correct = len(class_dists[class_dists > 0])
    n_classes = len(pd.unique(labels))
    res_dict['% correct clustering'] = (100/n_classes) * n_correct

    # Correct first cluster percentage
    first_cluster_fraction = ma.correct_1stcluster_fraction(linkage_matrix, labels)
    res_dict['% correct 1st clustering'] = 100 * first_cluster_fraction

#### K-Means Clustering

In [None]:
def perform_KMeans(dataset, target, iter_num=150, best_fraction=0.1):
    """Run K-means clustering and summarise its discrimination metrics.

    dataset: data set to cluster.
    target: sample labels.
    iter_num, best_fraction: forwarded to ma.Kmeans_discrim
        (presumably number of K-means runs and fraction of best runs kept - confirm in multianalysis).

    Returns a dict with the dataset itself and the medians, over the clusterings returned by
    ma.Kmeans_discrim, of the discrimination distance, the percentage of correct clusters
    and the Rand index.
    """
    n_classes = len(pd.unique(target))

    runs = ma.Kmeans_discrim(dataset, target,
                             method='average',
                             iter_num=iter_num,
                             best_fraction=best_fraction)

    # One entry per returned clustering
    disc_distances, correct_counts, rand_indices = [], [], []
    for run_key in runs:
        global_disc_dist, disc_dists, rand_index, _sse = runs[run_key]

        disc_distances.append(global_disc_dist)

        # Classes with a positive discrimination distance are counted as correct
        per_class = np.array(list(disc_dists.values()))
        correct_counts.append(len(per_class[per_class > 0]))

        rand_indices.append(rand_index)

    summary = {'dataset': dataset}
    summary['Discrimination Distance'] = np.median(disc_distances)
    summary['% correct clusters'] = np.median(correct_counts)*100/n_classes
    summary['Rand Index'] = np.median(rand_indices)
    return summary

### Synthetic Dataset Creation

**Characteristics:**

- 2 classes
- 200 samples (100 for each class)
- 600 features (20 informative, 100 redundant - linear combination of informative ones - and 480 noisy features)
- 2 clusters per class
- No random flipping of class labels (flip_y)

Synthetic datasets are Pareto scaled so their values are mainly between -1 and 1.

**class_sep** (separation between classes) - six values are used: 0.6, 1.0, 1.2, 1.4, 1.6 and 2.0.

In [None]:
# All three dicts are keyed by the class separation value
dfs = {}       # Pareto-scaled synthetic datasets
lbls = {}      # class labels of each dataset, stored as strings
dfs_no_t = {}  # untreated synthetic datasets

sep = [0.6, 1.0, 1.2, 1.4, 1.6, 2.0]
for i in sep:
    print(i)
    # Same generator settings and seed for every dataset: only class_sep changes
    df_f, lbl_f = make_classification(
        n_samples=200, n_features=600, n_informative=20, n_redundant=100,
        n_classes=2, n_clusters_per_class=2, weights=None,
        flip_y=0, class_sep=i, random_state=52683,
    )
    dfs_no_t[i] = pd.DataFrame(df_f)
    dfs[i] = transf.pareto_scale(pd.DataFrame(df_f))
    lbls[i] = list(map(str, lbl_f))

# Second name for the same dict of labels; it keeps the labels reachable
# if the name `lbls` is rebound later in the notebook
lbls_orig = lbls

In [None]:
# Colour assigned to each class, following the order in which the labels first appear
colours = sns.color_palette('Set1', 3)
ordered_labels = pd.unique(lbls[0.6])

label_colors = dict(zip(ordered_labels, colours))
sample_colors = [label_colors[sample_lbl] for sample_lbl in lbls[0.6]]

# Show the palette with the class names as tick labels
sns.palplot(label_colors.values())
new_ticks = plt.xticks(range(len(ordered_labels)), ordered_labels)

#### PCA Projection

In [None]:
f, axs = plt.subplots(2, 3, figsize=(16,10))

# One panel per synthetic dataset, showing the projection on the first 2 principal components.
# NOTE(review): dfs holds the Pareto-scaled data; confirm whether compute_df_with_PCs rescales it.
for (i, df), ax in zip(dfs.items(), axs.ravel()):
    principaldf, var = ma.compute_df_with_PCs(df, n_components=2, whiten=True,
                                              labels=lbls[i], return_var_ratios=True)

    ax.axis('equal')
    lcolors = label_colors

    # Samples and the ellipse of each class
    gem.plot_PCA(principaldf, lcolors, components=(1,2), title='', ax=ax)
    plot_ellipses_PCA(principaldf, lcolors, components=(1,2),ax=ax, q=0.95)

    # Axis labels carry the explained variance ratio of each component
    ax.set_xlabel(f'PC 1 ({var[0] * 100:.1f} %)')
    ax.set_ylabel(f'PC 2 ({var[1] * 100:.1f} %)')

    ax.legend()

plt.show()

#### Hierarchical Clustering Analysis (HCA)

In [None]:
# HCA (euclidean distance, ward linkage) of every synthetic dataset
HCA_all = {}
for treat, treated_df in dfs.items():
    print(f'Performing HCA with treatment {treat}', end=' ...')
    metric = 'euclidean'
    HCA_all[treat] = perform_HCA(treated_df, metric=metric, method='ward')
    print('done!')

In [None]:
# One dendrogram per synthetic dataset, leaves coloured by class
with sns.axes_style("white"):
    f, axs = plt.subplots(2, 3, figsize=(12, 10), constrained_layout=True)

    dendrogram_options = dict(label_colors=label_colors, title='', color_threshold=0)
    for i, ax in zip(dfs, axs.ravel()):
        linkage_matrix = HCA_all[i]['Z']
        gem.plot_dendogram(linkage_matrix, lbls[i], ax=ax, **dendrogram_options)

    plt.show()

In [None]:
# Add the clustering metrics to each HCA result dict (in place)
for name, res_dict in HCA_all.items():
    compute_clustering_metrics(res_dict, lbls[name])

# Summary table: one row per dataset, one column per metric
summary_metrics = ('Average discrim dist', '% correct clustering', '% correct 1st clustering')
clust_performance = {}
for metric in summary_metrics:
    per_dataset = {d: HCA_all[d][metric] for d in HCA_all}
    clust_performance[metric] = per_dataset

clust_performance = pd.DataFrame(clust_performance, index=HCA_all)
clust_performance

#### K-Means Clustering

In [None]:
%%capture --no-stdout
# Seed numpy's global generator before the K-means runs
np.random.seed(63780)
iter_num=15

# One summary dict per synthetic dataset
KMeans_all = []
for treatment in dfs:
    print(f'performing KMeans with treatment {treatment}' , end=' ...')
    kmeans_summary = perform_KMeans(dfs[treatment], lbls[treatment], iter_num=iter_num)
    KMeans_all.append(kmeans_summary)
    print('done!')

In [None]:
# Build the summary table and drop its first column (the 'dataset' entry returned by
# perform_KMeans), keeping only the metric columns.
# NOTE: the drop is positional - running this cell twice also removes a metric column.
KMeans_all = pd.DataFrame(KMeans_all).iloc[:,1:]
KMeans_all

#### Random Forest

In [None]:
# RF_model_CV - RF application and result extraction.
def RF_model_CV(df, y, iter_num=1, n_fold=5, n_trees=200):
    """Repeated stratified cross-validation of Random Forest classifiers.

    df: pandas DataFrame with samples in rows and features in columns.
    y: sequence of class labels, positionally aligned with the rows of df.
    iter_num: number of times the whole cross-validation is repeated.
    n_fold: number of stratified (shuffled) folds in each repetition.
    n_trees: number of trees of each forest.

    Returns a dict with keys 'accuracy', 'F1-Score', 'Precision' and 'Recall'
    (one fold-averaged value per repetition; plain values instead of lists when
    iter_num == 1) and 'important_features', a list of (feature position, mean
    importance over all fitted forests) tuples sorted by decreasing importance.
    Precision, recall and F1-score take '1' as the positive class.
    """
    total_fits = iter_num * n_fold
    importances = np.zeros((total_fits, df.shape[1]))
    fit_no = 0

    # Fold-averaged scores, one entry per repetition
    per_iter = {'accuracy': [], 'F1-Score': [], 'Precision': [], 'Recall': []}

    for _ in range(iter_num):
        # New random stratified folds in every repetition
        splitter = StratifiedKFold(n_fold, shuffle=True)
        fold_acc, fold_f1, fold_prec, fold_rec = [], [], [], []

        for train_index, test_index in splitter.split(df, y):
            forest = skensemble.RandomForestClassifier(n_estimators=n_trees)
            X_train = df.iloc[train_index, :]
            X_test = df.iloc[test_index, :]
            y_train = [y[pos] for pos in train_index]
            y_test = [y[pos] for pos in test_index]
            forest.fit(X_train, y_train)

            # Performance on the held-out fold
            fold_acc.append(forest.score(X_test, y_test))
            predicted = forest.predict(X_test)
            prec, rec, f1, _support = precision_recall_fscore_support(
                y_test, predicted, pos_label='1', average='binary', zero_division=1)
            fold_f1.append(f1)
            fold_prec.append(prec)
            fold_rec.append(rec)

            # Importance of each feature in this forest
            importances[fit_no, :] = forest.feature_importances_
            fit_no += 1

        per_iter['accuracy'].append(np.mean(fold_acc))
        per_iter['F1-Score'].append(np.mean(fold_f1))
        per_iter['Precision'].append(np.mean(fold_prec))
        per_iter['Recall'].append(np.mean(fold_rec))

    # Mean importance over every fitted forest; positions are enough to identify the features
    mean_importance = importances.sum(axis=0) / total_fits
    ranked_features = sorted(enumerate(mean_importance), key=lambda pair: pair[1], reverse=True)

    if iter_num == 1:
        results = {score_name: values[0] for score_name, values in per_iter.items()}
    else:
        results = dict(per_iter)
    results['important_features'] = ranked_features
    return results

In [None]:
iter_num=20

RF_all = {}

# Repeated cross-validated Random Forests for each synthetic dataset
for treatment in dfs:
    print(f'Fitting random forest with treatment {treatment}', end=' ...')
    rfname = treatment
    RF_all[rfname] = {'treatment':treatment}
    n_fold = 5

    fit = RF_model_CV(dfs[treatment], lbls_orig[treatment],
                      iter_num=iter_num, n_fold=n_fold, n_trees=200)
    # Merge the scores and important features into the entry of this dataset
    for result_key, result_value in fit.items():
        RF_all[rfname][result_key] = result_value

    print('done')

In [None]:
# Accuracy across the iterations: one column per dataset, one row per iteration
accuracies = pd.DataFrame({name: rf_result['accuracy'] for name, rf_result in RF_all.items()})

# Mean and standard deviation per dataset, plus the treatment identifier
accuracy_stats_RF = pd.DataFrame(
    {'Average accuracy': accuracies.mean(axis=0), 'STD': accuracies.std(axis=0)}
).assign(treatment=[rf_result['treatment'] for rf_result in RF_all.values()])
accuracy_stats_RF

In [None]:
# F1-scores across the iterations: one column per dataset, one row per iteration
F1s = pd.DataFrame({name: rf_result['F1-Score'] for name, rf_result in RF_all.items()})

# NOTE: the 'Average accuracy' column holds the mean F1-score here (column name kept as is)
F1s_stats_RF = pd.DataFrame(
    {'Average accuracy': F1s.mean(axis=0), 'STD': F1s.std(axis=0)}
).assign(treatment=[rf_result['treatment'] for rf_result in RF_all.values()])
F1s_stats_RF

#### PLS-DA

In [None]:
%%capture --no-stdout
# above is to suppress PLS warnings

max_comp=10

# Cross-validation scores of PLS-DA models built with 1 to max_comp components, per dataset
PLS_optim = {}

for treatment in dfs:
    print(f'Fitting PLS-DA model with treatment {treatment}', end=' ...')
    plsdaname = treatment
    PLS_optim[plsdaname] = {'treatment':treatment}
    n_fold = 5
    optimisation = ma.optim_PLSDA_n_components(dfs[treatment], lbls[treatment],
                                               max_comp=max_comp, n_fold=n_fold)
    optim = optimisation.CVscores
    PLS_optim[plsdaname]['CV_scores'] = optim
    print('done')

In [None]:
# Plot the cross-validation score against the number of components, one curve per dataset
with sns.axes_style("whitegrid"):
    with sns.plotting_context("notebook", font_scale=1.2):
        f, ax = plt.subplots(1, 1, figsize = (5,5))
        for name, data in PLS_optim.items():
            ax.plot(range(1, len(data['CV_scores']) + 1), data['CV_scores'],
                     label=data['treatment'])

        # Axis decoration is done once, after all curves are drawn.
        # The title names the data actually plotted (the synthetic datasets).
        ax.set(xlabel='Number of Components',
                ylabel='PLS Score (1 - PRESS/SS)',
                title='Synthetic Datasets')
        ax.legend()
        ax.set_ylim([0, 1])

        plt.tight_layout()
        plt.show()

In [None]:
def PLSDA_model_CV(df, labels, n_comp=10,
                   kf = None, n_fold=5,
                   iter_num=1,
                   encode2as1vector=True,
                   scale=False,
                   feat_type='VIP'):
    """Perform PLS-DA with n-fold cross-validation.

       df: pandas DataFrame; includes X equivalent in PLS-DA (training vectors).
       labels: target labels.
       n_comp: integer; number of components to use in PLS-DA.
       kf: default None; pass a specific cross validation method from 
        https://scikit-learn.org/stable/modules/cross_validation.html#cross-validation-iterators (3.1.2)
        When None, a shuffled StratifiedKFold with n_fold folds is created and reused in every iteration.
       n_fold: int (default: 5); number of groups to divide dataset in for cross-validation
        (NOTE: max n_fold can not exceed minimum number of samples per class).
       iter_num: int (default: 1); number of iterations that cross validation is repeated.
       encode2as1vector: bool (default: True); when there are exactly 2 classes, encode the target
        as a single vector instead of one column per class.
       scale: bool (default: False); if data is scaled when inputted to PLS model (only true if scaling was not done earlier)
       feat_type: string (default: 'VIP'); types of feature importance metrics to use; accepted: {'VIP', 'Coef', 'Weights'}.

    Returns: dict with keys 'accuracy', 'F1-scores', 'precision', 'recall', 'Q2' and 'imp_feat'.
        accuracy: fraction of correctly predicted samples in each iteration
        F1-scores, precision, recall: binary-averaged scores in each iteration
        Q2: average over the folds of the R2 score on the test groups, in each iteration
        imp_feat: list of tuples (index number of feature, feature importance)
            ordered by decreasing feature importance (averaged over all fitted models).
        The score entries are lists with one value per iteration, or plain values when iter_num == 1.

    Raises: ValueError if feat_type is not one of the accepted types.
    """
    # Setting up lists and matrices to store results
    CVR2 = []
    accuracies = []
    f1_scores = []
    precision = []
    recall = []
    Imp_Feat = np.zeros((iter_num * n_fold, df.shape[1]))
    f = 0

    unique_labels = list(pd.unique(labels))

    is1vector = len(unique_labels) == 2 and encode2as1vector

    matrix = ma._generate_y_PLSDA(labels, unique_labels, is1vector)

    if is1vector:
        # keep a copy to use later
        target1D = matrix.copy()

    # Positive label for the binary scores: position of the first label that is not '1'.
    # The labels are compared one by one; comparing the list itself with '1' is a single
    # bool and would always select position 0, whatever the order of the labels.
    # Falls back to 0 when every label is '1'.
    # NOTE(review): assumes this position matches the encoding made by ma._generate_y_PLSDA - confirm.
    pos_label = next((pos for pos, lbl in enumerate(unique_labels) if lbl != '1'), 0)

    # Number of iterations equal to iter_num
    for i in range(iter_num):
        if kf is None:
            kf = sklearn.model_selection.StratifiedKFold(n_fold, shuffle=True)
        
        # Setting up storing variables for cross-validation
        nright = 0 # For accuracy
        cvr2 = [] # For R2 score
        # To store real and predicted classes to calculate F1-score, precision and recall
        if not is1vector:
            all_preds = pd.DataFrame(columns=matrix.columns, index=matrix.index)
            all_tests = pd.DataFrame(columns=matrix.columns, index=matrix.index)
            a = 0
        else:
            all_preds = []
            all_tests = []

        # Iterate through cross-validation procedure
        for train_index, test_index in kf.split(df, labels):
            plsda = PLSRegression(n_components=n_comp, scale=scale)
            X_train, X_test = df.iloc[train_index, :], df.iloc[test_index, :]
            if not is1vector:
                y_train = matrix.iloc[train_index, :].copy()
                y_test = matrix.iloc[test_index, :].copy()

            else:
                y_train, y_test = target1D[train_index], target1D[test_index]
                correct = target1D[test_index]

            # Fit PLS model
            plsda.fit(X=X_train, Y=y_train)

            # Obtain results with the test group
            y_pred = plsda.predict(X_test)
            cvr2.append(r2_score(y_test, y_pred))

            # Decision rule for classification
            # Decision rule chosen: sample belongs to group where it has max y_pred (closer to 1)
            # In case of 1,0 encoding for two groups, round to nearest integer to compare
            if not is1vector:
                rounded_pred = y_pred.copy()
                for row in range(len(y_pred)):
                    if list(y_test.iloc[row, :]).index(max(y_test.iloc[row, :])) == np.argmax(
                        y_pred[row]
                    ):
                        nright += 1  # Correct prediction
                    
                    # One-hot version of the prediction: 1 in the column with the max value
                    for l in range(len(y_pred[row])):
                        if l == np.argmax(y_pred[row]):
                            rounded_pred[row, l] = 1
                        else:
                            rounded_pred[row, l] = 0
            
                # Save y-test and predictions to calculate F1-score, precision and recall
                all_tests.iloc[a:a+len(y_test)] = y_test
                all_preds.iloc[a:a+len(y_test)] = rounded_pred
                a = a + len(y_test)

            else:
                rounded = np.round(y_pred)
                for p in range(len(y_pred)):
                    # Clip the rounded predictions to the two encoded values, 0 and 1
                    if rounded[p] >= 1:
                        rounded[p] = 1
                    else:
                        rounded[p] = 0
                    if rounded[p] == correct[p]:
                        nright += 1  # Correct prediction
                
                # Save y-test and predictions to calculate F1-score, precision and recall
                # np.ravel handles predictions shaped (n, 1) as well as (n,)
                # NOTE(review): newer scikit-learn releases reportedly return 1-D predictions
                # for a 1-D target - confirm against the installed version.
                all_preds.extend(list(np.ravel(rounded)))
                all_tests.extend(y_test)
            
            # Calculate important features (3 different methods to choose from)
            if feat_type == 'VIP':
                Imp_Feat[f, :] = ma._calculate_vips(plsda)
            elif feat_type == 'Coef':
                Imp_Feat[f, :] = abs(plsda.coef_).sum(axis=1)
            elif feat_type == 'Weights':
                Imp_Feat[f, :] = abs(plsda.x_weights_).sum(axis=1)
            else:
                raise ValueError(
                    'Type not Recognized. Types accepted: "VIP", "Coef", "Weights"'
                )

            f += 1

        # Calculate the accuracy of the group predicted and storing score results
        accuracies.append(nright / len(labels))
        CVR2.append(np.mean(cvr2))
        # Calculate F1-score, precision and recall for the iteration and storing results
        if not is1vector:
            # NOTE(review): average='binary' with one-column-per-class targets is presumably
            # rejected by scikit-learn; confirm before relying on this branch.
            f1_scores.append(f1_score(all_tests.astype(int), all_preds.astype(int), average='binary', pos_label=pos_label))
            precision.append(precision_score(all_tests.astype(int), all_preds.astype(int), average='binary', pos_label=pos_label))
            recall.append(recall_score(all_tests.astype(int), all_preds.astype(int), average='binary', pos_label=pos_label))
        else:
            f1_scores.append(f1_score(all_tests, all_preds, average='binary', pos_label=pos_label))
            precision.append(precision_score(all_tests, all_preds, average='binary', pos_label=pos_label))
            recall.append(recall_score(all_tests, all_preds, average='binary', pos_label=pos_label))


    # Join and sort all important features values from each cross validation group and iteration.
    Imp_sum = Imp_Feat.sum(axis=0) / (iter_num * n_fold)
    imp_features = sorted(enumerate(Imp_sum), key=lambda x: x[1], reverse=True)
    if iter_num == 1:
        return {'accuracy': accuracies[0], 'F1-scores':f1_scores[0], 'precision': precision[0], 'recall':recall[0],
                'Q2': CVR2[0], 'imp_feat': imp_features}
    else:
        return {'accuracy': accuracies, 'F1-scores':f1_scores, 'precision': precision, 'recall':recall,
                'Q2': CVR2, 'imp_feat': imp_features}

In [None]:
%%capture --no-stdout

PLSDA_all = {}

iter_num=20

# Repeated cross-validated PLS-DA models (4 components, VIP importances) for each synthetic dataset
for treatment in dfs:
    print(f'Fitting a PLS-DA model with treatment {treatment}', end=' ...')
    plsdaname = treatment
    PLSDA_all[plsdaname] = {'treatment':treatment}
    n_comp = 4
    n_fold = 5
    fit = PLSDA_model_CV(dfs[treatment], lbls_orig[treatment],
                         n_comp=n_comp, n_fold=n_fold,
                         iter_num=iter_num,
                         feat_type='VIP')
    # Merge the scores and important features into the entry of this dataset
    for result_key, result_value in fit.items():
        PLSDA_all[plsdaname][result_key] = result_value
    print('done')

In [None]:
# Accuracy across the iterations: one column per dataset, one row per iteration
accuracies = pd.DataFrame({name: pls_result['accuracy'] for name, pls_result in PLSDA_all.items()})

# Mean and standard deviation per dataset, plus the treatment identifier
accuracy_stats_PLSDA = pd.DataFrame(
    {'Average accuracy': accuracies.mean(axis=0), 'STD': accuracies.std(axis=0)}
).assign(treatment=[pls_result['treatment'] for pls_result in PLSDA_all.values()])
accuracy_stats_PLSDA

In [None]:
# F1-scores across the iterations: one column per dataset, one row per iteration
F1s = pd.DataFrame({name: pls_result['F1-scores'] for name, pls_result in PLSDA_all.items()})

# NOTE: the 'Average accuracy' column holds the mean F1-score here (column name kept as is)
F1s_stats_PLSDA = pd.DataFrame(
    {'Average accuracy': F1s.mean(axis=0), 'STD': F1s.std(axis=0)}
).assign(treatment=[pls_result['treatment'] for pls_result in PLSDA_all.values()])
F1s_stats_PLSDA

In [None]:
# Bar plots of the average accuracy (with STD error bars): Random Forest (left), PLS-DA (right)
p4 = sns.color_palette('tab20', 9)
with sns.axes_style("whitegrid"):
    with sns.plotting_context("notebook", font_scale=1.3):
        f, (axl, axr) = plt.subplots(1, 2, figsize=(16, 4))
        x = 1  # the label locations
        width = 0.17  # the width of the bars

        # Both panels are drawn in the same way, only the statistics table and title change
        panels = ((axl, accuracy_stats_RF, 'Random Forest'),
                  (axr, accuracy_stats_PLSDA, 'PLS-DA'))
        for panel_ax, stats_table, panel_title in panels:
            for i, treatment in enumerate(dfs.keys()):
                acc_treatment = stats_table[stats_table['treatment']==treatment]
                offset = - 0.5 + i * 0.2
                rects = panel_ax.bar(x + offset, acc_treatment['Average accuracy'], width,
                                     label=treatment, color = p4[i])
                panel_ax.errorbar(x + offset, y=acc_treatment['Average accuracy'],
                                  yerr=acc_treatment['STD'],
                                  ls='none', ecolor='0.2', capsize=3)
            # One tick per dataset, placed under its bar
            panel_ax.set_xticks([1-0.5 +i*0.2 for i in range(len(dfs.keys()))])
            panel_ax.set_xticklabels(dfs.keys())
            panel_ax.set(ylabel='Average accuracy', title='', ylim=(0.4,1.02))
            panel_ax.legend(loc='upper left', bbox_to_anchor=(0.75, 0.85))
            panel_ax.set_title(panel_title)

### Univariate Analysis

In [None]:
# Univariate analysis of each synthetic dataset; the first 40 rows of each result are kept apart
uni_results = {}
uni_results_filt = {}
for sep in dfs:
    sep_results = ma.compute_FC_pvalues_2groups(dfs[sep], dfs[sep],
                                                labels=lbls_orig[sep],
                                                equal_var=True,
                                                alpha=0.05, useMW=False)
    uni_results[sep] = sep_results
    uni_results_filt[sep] = sep_results.iloc[:40]

# Data Augmentation

## Creating Imbalanced Datasets

Each of the six synthetic datasets has two balanced classes. Since both classes are similar in terms of heterogeneity, we will only use one of them as the minority class - the class '1'.

Then, for each synthetic dataset:

We split it in 5 different ways, each with 80 samples of the majority class and 20 samples of the minority class in the training set. This left 20 samples of the majority class and 80 samples of the minority class for the test set. This was made by dividing the 100 samples of each class into 5 folds of 20: for the majority class, 4 folds were combined for the training set and the remaining fold went to the test set, while for the minority class a single fold was used for the training set and the other 4 for the test set. The training set was Pareto scaled, whereas on the test set we performed a 'faux' Pareto scaling using the feature means and standard deviations of the training set. This 'faux' Pareto scaling was applied because the training and test sets have a vastly different balance of class samples, so feature averages and standard deviations can be quite different between them, especially in key features for discrimination.

The untreated training sets were linearly augmented, which was then treated (using a normal Pareto scaling, in this case), to generate samples to train the CWGAN-GP models.

In [None]:
rng = np.random.default_rng(7519)

# These permutations are not used afterwards, but the draws must still be made:
# they advance the generator, so skipping them would change every split created later.
reference_labels = np.array(lbls[0.6])
permutations = {}
for cl in ordered_labels:
    class_positions = np.flatnonzero(reference_labels == cl)
    permutations[cl] = list(rng.permutation(class_positions))

In [None]:
# Storage dicts, all keyed by dataset (class separation) and then by fold number (1 to 5)
df_storage_train = {}   # untreated imbalanced training sets
df_storage_test = {}    # test sets ('faux' Pareto scaled)
lbl_storage_train = {}  # labels of the training sets
lbl_storage_test = {}   # labels of the test sets
real_samples = {}       # Pareto-scaled training sets
permutations = {}       # shuffled sample positions of each class

for sep in dfs:
    fold_len = 100//5
    df_storage_train[sep], df_storage_test[sep] = {}, {}
    lbl_storage_train[sep], lbl_storage_test[sep] = {}, {}
    real_samples[sep], permutations[sep] = {}, {}

    # Shuffle the sample positions of each class; consecutive slices of them are the folds
    sep_labels = np.array(lbls[sep])
    for cl in ordered_labels:
        permutations[sep][cl] = list(rng.permutation(np.flatnonzero(sep_labels == cl)))

    for i in range(5):
        train_idxs = {'1':[], '0':[]}
        test_idxs = {'1':[], '0':[]}

        for cl in ordered_labels:
            shuffled = permutations[sep][cl]
            current_fold = shuffled[i*fold_len: (i+1)*fold_len]
            other_folds = shuffled[: i*fold_len] + shuffled[(i+1)*fold_len:]
            if cl == '1':
                # Minority class: a single fold to train, the other four to test
                train_idxs[cl], test_idxs[cl] = current_fold, other_folds
            else:
                # Majority class: four folds to train, a single fold to test
                train_idxs[cl], test_idxs[cl] = other_folds, current_fold

        print('Synthetic Dataset:', sep, 'Fold nº:', i+1)
        print('Train 1/0:', len(train_idxs['1']), len(train_idxs['0']))
        print('Test 1/0: ', len(test_idxs['1']), len(test_idxs['0']))
        train_idxs = train_idxs['1'] + train_idxs['0']
        test_idxs = test_idxs['1'] + test_idxs['0']

        # Create the imbalanced and test set from the untreated data
        fold = i + 1
        train_raw = dfs_no_t[sep].iloc[train_idxs]
        df_storage_train[sep][fold] = train_raw
        lbl_storage_train[sep][fold] = list(sep_labels[train_idxs])

        test_raw = dfs_no_t[sep].iloc[test_idxs]
        df_storage_test[sep][fold] = test_raw
        lbl_storage_test[sep][fold] = list(sep_labels[test_idxs])

        # Data pretreatment of the imbalanced dataset
        real_samples[sep][fold] = transf.pareto_scale(train_raw)

        # 'Faux' Pareto scale of the test set: centred and scaled with the training set statistics
        df_storage_test[sep][fold] = (test_raw - train_raw.mean())/np.sqrt(train_raw.std())

Linear Augmentation of the Training Sets and Pareto Scale.

In [None]:
# Linearly augmented (and then Pareto-scaled) training sets and their labels,
# keyed by dataset and then by fold number
aug_df_storage_train = {}
aug_lbl_storage_train = {}

# Only generation of samples based on the imbalanced dataset
for sep in dfs:
    aug_df_storage_train[sep] = {}
    aug_lbl_storage_train[sep] = {}
    for i in range(1, 6):
        start = perf_counter()
        # NOTE: this rebinds the global name `lbls` to the labels of the augmented data;
        # the dict with the labels of the synthetic datasets stays reachable as `lbls_orig`.
        data, lbls = laf.artificial_dataset_generator(
            df_storage_train[sep][i], labels=lbl_storage_train[sep][i],
            max_new_samples_per_label=512, binary=False,
            rnd=list(np.linspace(0.2,0.8,3)),
            binary_rnd_state=None, rnd_state=42345)

        data_treated = transf.pareto_scale(data)

        aug_df_storage_train[sep][i] = data_treated.copy()
        aug_lbl_storage_train[sep][i] = lbls

        end = perf_counter()
        print(f'Simple augmentation of data done! Dataset: {sep}, Fold: {i}. Took {(end - start):.3f} s')

Set up colours for each of the classes. Generated samples will have the corresponding label with '- GAN' after.

In [None]:
# Colours used in the plots: each class is followed by its GAN-generated counterpart
colours2 = sns.color_palette('tab20', 4)

ordered_labels_test = []
for i in ['1', '0']:
    ordered_labels_test.append(i)
    ordered_labels_test.append(i + ' - GAN')
label_colors_test = dict(zip(ordered_labels_test, colours2))

# Show the palette with the label names as tick labels
sns.palplot(label_colors_test.values())
new_ticks_test = plt.xticks(range(len(ordered_labels_test)), ordered_labels_test)

## Conditional Wasserstein GAN - GP model

This model was built by joining WGAN-GP models with Conditional GAN models. The WGAN-GP part was originally based on https://keras.io/examples/generative/wgan_gp/#wasserstein-gan-wgan-with-gradient-penalty-gp and the Conditional GAN part on https://machinelearningmastery.com/how-to-develop-a-conditional-generative-adversarial-network-from-scratch/ (generator and discriminator model) and https://keras.io/examples/generative/conditional_gan/ without using OOP (loss functions and training/training steps).

Functions for the generator and critic (discriminator) models

In [None]:
def generator_model(len_input, len_output, n_hidden_nodes, n_labels):
    """Build the generator network of the CWGAN-GP.

    The network takes two inputs: a data vector with `len_input` values and a single label
    value. The label goes through an embedding (30 dimensions) and a linear dense layer
    (256 nodes) before being concatenated with the data vector. The joined vector passes
    through two leaky ReLU dense layers (`n_hidden_nodes` and 256 nodes), a batch
    normalization layer and a final linear dense layer with `len_output` nodes.

    len_input: int, number of values in the data input vector;
    len_output: int, number of features of the samples to generate;
    n_hidden_nodes: int, number of nodes of the first hidden dense layer;
    n_labels: int, size of the label vocabulary given to the embedding layer.

    returns: tf.keras.Model with inputs [data, label] and the generated sample as output.
    """
    layers = tf.keras.layers

    # The two inputs of the model: data values and the class label
    data_input = tf.keras.Input(shape=(len_input,), name='data')
    label_input = tf.keras.Input(shape=(1,), name='label')

    # Label branch: embed the label and flatten it so it can be joined with the data values
    label_branch = layers.Embedding(n_labels, 30, input_length=1)(label_input)
    label_branch = layers.Dense(256, activation='linear', use_bias=True)(label_branch)
    label_branch = layers.Reshape((256,))(label_branch)

    # Join data and label information, then hidden dense layers followed by normalization
    hidden = layers.Concatenate()([data_input, label_branch])
    for width in (n_hidden_nodes, 256):
        hidden = layers.Dense(width, activation=tf.nn.leaky_relu, use_bias=True)(hidden)
    hidden = layers.BatchNormalization()(hidden)

    # Output layer - one node per feature of the sample to make
    output = layers.Dense(len_output, activation='linear', use_bias=True)(hidden)

    return tf.keras.Model(inputs=[data_input, label_input], outputs=output)


def critic_model(len_input, n_hidden_nodes, n_labels):
    """Build the critic network of the CWGAN-GP.

    The network takes two inputs: a data vector with `len_input` values and a single label
    value. The label goes through an embedding (30 dimensions) and a linear dense layer
    (256 nodes) before being concatenated with the data vector. The joined vector passes
    through three leaky ReLU dense layers (`n_hidden_nodes`, 128 and 256 nodes) and a
    final linear dense layer with a single node (the critic score).

    len_input: int, number of features of the samples to evaluate;
    n_hidden_nodes: int, number of nodes of the first hidden dense layer;
    n_labels: int, size of the label vocabulary given to the embedding layer.

    returns: tf.keras.Model with inputs [data, label] and the critic score as output.
    """
    layers = tf.keras.layers

    # The two inputs of the model: the class label and the data values
    label_input = tf.keras.Input(shape=(1,))
    data_input = tf.keras.Input(shape=(len_input,))

    # Label branch: embed the label and flatten it so it can be joined with the data values
    label_branch = layers.Embedding(n_labels, 30, input_length=1)(label_input)
    label_branch = layers.Dense(256, activation='linear', use_bias=True)(label_branch)
    label_branch = layers.Reshape((256,))(label_branch)

    # Join data and label information, then hidden dense layers
    # (no normalization layer here, it worsened results)
    hidden = layers.Concatenate()([data_input, label_branch])
    for width in (n_hidden_nodes, 128, 256):
        hidden = layers.Dense(width, activation=tf.nn.leaky_relu, use_bias=True)(hidden)

    # Output layer - 1 node for the critic decision
    output = layers.Dense(1, activation='linear', use_bias=True)(hidden)

    return tf.keras.Model(inputs=[data_input, label_input], outputs=output)

In [None]:
def generate_predictions(model, num_examples_to_generate, len_input, input_dist, uni_lbls):
    """Generate samples with a generator model, equally split between the classes.

    model: callable generator, called as model([input_values, labels], training=False);
    num_examples_to_generate: int, total number of samples to generate;
    len_input: int, number of input values the generator takes per sample;
    input_dist: distribution object with an `rvs(size=...)` method to sample the input values;
    uni_lbls: sized collection with the unique class labels.

    returns: the output of the generator for the sampled inputs.
    """
    n_classes = len(uni_lbls)

    # Input values sampled from the given distribution, one row per sample to generate
    noise = tf.constant(input_dist.rvs(size=len_input*num_examples_to_generate),
                        shape=[num_examples_to_generate, len_input])

    if n_classes < 3:
        # Two classes: first half of the samples with label 1.0, second half with label 0.0
        half = num_examples_to_generate//2
        labels = tf.constant([1.0]*half + [0.0]*half, shape=(num_examples_to_generate, 1))
    else:
        # More classes: one-hot encoded labels with the same number of samples per class
        per_class = num_examples_to_generate//n_classes
        class_ids = [c for c in range(n_classes) for _ in range(per_class)]
        labels = np.array(pd.get_dummies(class_ids))

    # `training` is set to False since the model is only being used for inference
    return model([noise, labels], training=False)

In [None]:
def training_montage(train_data_o, train_lbls, test_data, test_lbls,
                     epochs, generator, critic, generator_optimizer, critic_optimizer, input_dist,
                    batch_size, grad_pen_weight=10, k_cov_den=50, k_crossLID=15, random_seed=145,
                    n_generated_samples=96):
    """Train a generator and critic of CWGAN-GP.

       Receives training data and respective class labels (train_data_o and train_lbls) and trains a generator and a critic
        model (generator, critic) over a number of epochs (epochs) with a set batch size (batch_size) with the respective
        optimizers and learning rate (generator_optimizer, critic_optimizer). Gradient Penalty is calculated with
        grad_pen_weight as the weight of the penalty.
       At set intervals, the function displays three graphs to evaluate the progression of the models (Loss plots,
        coverage, density, crossLID and correct first cluster plots and PCA plot with generated and training data). To this
        end, samples need to be generated, requiring the distribution to sample the initial input values from (input_dist),
        and test data has to be given (test_data). Finally the number of neighbors to consider for coverage/density and
        crossLID calculation is also needed (k_cov_den, k_crossLID).

       The function returns nothing. It relies on names defined at module level outside of it:
        - the lists gen_loss_all, crit_loss_all, saved_predictions, coverage, density, crossLID and corr1stcluster, which
          are appended to in place and must exist before calling;
        - pca and label_colors_test, used for the PCA plot;
        - lbl and fold, which are printed after each evaluation update;
        - the helpers critic_loss_wgan, generator_loss_wgan, gradient_penalty_cwgan and generate_predictions and the
          module gem.

       train_data_o: Pandas DataFrame with training data;
       train_lbls: List with training data class labels;
       test_data: Pandas DataFrame with test data to evaluate the model;
       test_lbls: List with test data class labels (kept in the signature, currently not used in the evaluation);
       epochs: Int value with the number of epochs to train the model;
       generator: tensorflow keras.engine.functional.Functional model for the generator;
       critic: tensorflow keras.engine.functional.Functional model for the critic;
       generator_optimizer: tensorflow keras optimizer (with learning rate) for generator;
       critic_optimizer: tensorflow keras optimizer (with learning rate) for critic;
       input_dist: scipy.stats._continuous_distns.rv_histogram object - distribution to sample input values for generator;
       batch_size: int value with size of batch for model training;
       grad_pen_weight: int value (default 10) for penalty weight in gradient penalty calculation;
       k_cov_den: int value (default 50) for number of neighbors to consider for coverage and density calculation in
       generated samples evaluation;
       k_crossLID: int value (default 15) for number of neighbors to consider for crossLID calculation in generated samples
        evaluation.
       random_seed: int value (default 145) for numpy random seeding when randomly organizing samples in the data that
        will be split into batches.
       n_generated_samples: int value (default 96) for number of samples generated to test the model during training.
    """

    n_classes = len(set(train_lbls))
    is_binary = n_classes < 3

    # Obtain the train data, randomize its order and divide it by twice the standard deviation of its values.
    # The same (seeded) permutation is used for the data and for the labels so they stay aligned.
    shuffled_idx = np.random.RandomState(seed=random_seed).permutation(len(train_data_o))
    all_data = train_data_o.iloc[shuffled_idx]/(2*train_data_o.values.std())
    train_data = all_data.values
    n_features = all_data.shape[1]

    # Same treatment for the test data (scaled by its own standard deviation)
    test_data = (test_data/(2*test_data.values.std())).values

    # Change class labels to numerical values while following the randomized order of samples
    shuffled_dummies = pd.get_dummies(np.array(train_lbls)[shuffled_idx])
    if is_binary: # Single 1/0 column (first dummy column) for when there are only two classes
        train_labels = shuffled_dummies.values[:,0]
    else: # One hot encoding for when there are more than two classes
        train_labels = shuffled_dummies.values
    # Save the order of the labels
    ordered_labels = shuffled_dummies.columns

    batch_divisions = int(batch_size / n_classes) # See how many samples of each class will be in each batch
    batches_per_epoch = int(all_data.shape[0] / batch_size)
    n_steps = epochs * batches_per_epoch # Number of steps: nº of batches per epoch * nº of epochs
    n_critic = 5

    # Set up the evaluating images printed during training and the intervals they will be updated.
    # Intervals are at least 1 step so that short runs (less than 200 or 20 steps) do not divide by zero.
    f, (axl, axc, axr) = plt.subplots(1, 3, figsize = (16,5))
    update1 = max(1, n_steps//200)
    update2 = max(1, n_steps//20)

    if hasattr(tqdm, '_instances'):
        tqdm._instances.clear() # clear if it exists

    i=0 # Index of the current batch within the epoch

    for step in tqdm(range(n_steps)):

        # Critic Training
        crit_loss_temp = []

        # Select real samples for this batch on training and order samples to put samples of the same class together
        real_samp = train_data[i*batch_size:(i+1)*batch_size]
        real_lbls = train_labels[i*batch_size:(i+1)*batch_size]

        real_samples = np.empty(real_samp.shape)
        real_labels = np.empty(real_lbls.shape)
        if is_binary:
            # Label 1 samples first, label 0 samples after
            sort_key, descending = (lambda pair: pair[0]), True
        else:
            # Ordered by the position of the class in the one hot encoding
            sort_key, descending = (lambda pair: np.argmax(pair[0])), False
        for a, (l, s) in enumerate(sorted(zip(real_lbls, real_samp), key=sort_key, reverse=descending)):
            real_samples[a] = s
            real_labels[a] = l

        for _ in range(n_critic): # For each step, train critic n_critic times

            # Generate input for generator
            artificial_samples = tf.constant(input_dist.rvs(size=n_features*batch_size), shape=[
                batch_size,n_features])
            artificial_labels = real_labels.copy()

            # Generate artificial samples from the latent vector
            artificial_samples = generator([artificial_samples, artificial_labels], training=True)

            with tf.GradientTape() as crit_tape: # See the gradient for the critic

                # Get the logits for the generated samples
                X_artificial = critic([artificial_samples, artificial_labels], training=True)
                # Get the logits for the real samples
                X_true = critic([real_samples, real_labels], training=True)

                # Calculate the critic loss using the generated and real sample results
                c_cost = critic_loss_wgan(X_true, X_artificial)

                # Calculate the gradient penalty
                grad_pen = gradient_penalty_cwgan(batch_size, real_samples, artificial_samples,
                                                  real_labels, artificial_labels, critic)
                # Add the gradient penalty to the original discriminator loss
                crit_loss = c_cost + grad_pen * grad_pen_weight

            crit_loss_temp.append(crit_loss)

            # Calculate and apply the gradients obtained from the loss on the trainable variables
            gradients_of_critic = crit_tape.gradient(crit_loss, critic.trainable_variables)
            critic_optimizer.apply_gradients(zip(gradients_of_critic, critic.trainable_variables))

        # Move to the next batch, going back to the first one at the end of each epoch
        i = i + 1
        if (step+1) % batches_per_epoch == 0:
            i=0

        crit_loss_all.append(np.mean(crit_loss_temp))

        # Generator Training
        # Generate inputs for generator, values and labels
        artificial_samples = tf.constant(input_dist.rvs(size=n_features*batch_size), shape=[
                batch_size,n_features])

        if is_binary:
            artificial_labels = tf.constant([1.0]*(batch_size//2) + [0.0]*(batch_size//2), shape=(batch_size,1))
        else:
            artificial_labels = np.array(pd.get_dummies(list(range(n_classes))*batch_divisions))

        with tf.GradientTape() as gen_tape: # See the gradient for the generator
            # Generate artificial samples
            artificial_samples = generator([artificial_samples, artificial_labels], training=True)

            # Get the critic results for generated samples
            X_artificial = critic([artificial_samples, artificial_labels], training=True)
            # Calculate the generator loss
            gen_loss = generator_loss_wgan(X_artificial)

        # Calculate and apply the gradients obtained from the loss on the trainable variables
        gradients_of_generator = gen_tape.gradient(gen_loss, generator.trainable_variables)
        generator_optimizer.apply_gradients(zip(gradients_of_generator, generator.trainable_variables))
        gen_loss_all.append(gen_loss)

        # Update the progress bar and evaluation graphs every update1 steps for loss plots and update2 for the others.
        if (step + 1) % update1 == 0:

            # Update the evaluating figures at the set intervals
            axl.clear() # Always clear the corresponding ax before redrawing it

            # Loss Plot
            axl.plot(gen_loss_all, color = 'blue', label='Generator Loss')
            axl.plot(crit_loss_all,color = 'red', label='Critic Loss')
            axl.set_xlabel('Number of Steps')
            axl.set_ylabel('Loss')
            axl.legend()

            ipythondisplay.clear_output(wait=True)
            ipythondisplay.display(plt.gcf())

        if (step + 1) % update2 == 0:

            # Generate samples with the current generator, using the distribution given to the function
            saved_predictions.append(generate_predictions(generator, n_generated_samples, n_features,
                                                          input_dist, ordered_labels))
            # See density and coverage and crossLID (divided by 25 to be in the same order as the rest)
            # of latest predictions
            den, cov = gem.evaluation_coverage_density(test_data, saved_predictions[-1], k= k_cov_den, metric='euclidean')
            clid = gem.cross_LID_estimator_byMLE(test_data, saved_predictions[-1], k=k_crossLID, metric='euclidean')/25
            density.append(den)
            coverage.append(cov)
            crossLID.append(clid)

            # PCA of the latest predictions and training data
            # Divide by twice the standard deviation to be the same as the generated data
            dfs_temp = pd.concat((train_data_o/(2*train_data_o.values.std()),pd.DataFrame(
                saved_predictions[-1].numpy(), columns=train_data_o.columns)))
            temp_lbls = train_lbls.copy()
            for l in ordered_labels:
                temp_lbls.extend([l+' - GAN']*(n_generated_samples//len(ordered_labels)))
            principaldf = gem.pca_sample_projection(dfs_temp, temp_lbls, pca, whiten=True,
                                                samp_number=len(train_data_o.index))
            lcolors = label_colors_test

            # Hierarchical clustering of the latest predictions and testing data,
            # saving the correct 1st cluster fraction results
            dfs_temp = np.concatenate((test_data, saved_predictions[-1].numpy()))
            temp_lbls = ['real']*len(test_data) + ['gen']*len(saved_predictions[-1])
            hca_results = gem.perform_HCA(dfs_temp, temp_lbls, metric='euclidean', method='ward')
            corr1stcluster.append(hca_results['correct 1st clustering'])

            # Plots
            eval_steps = range(update2, step+2, update2) # Steps at which the evaluations were made
            axc.clear()
            axc.plot(eval_steps, coverage, label='coverage')
            axc.plot(eval_steps, density, label='density')
            axc.plot(eval_steps, crossLID, color='red', label='crossLID')
            axc.plot(eval_steps, corr1stcluster, color='purple', label='corr_cluster')
            axc.legend()

            axr.clear()
            gem.plot_PCA(principaldf, lcolors, components=(1,2), title='', ax=axr)
            axr.legend(loc='upper right', ncol=1, framealpha=1)

            ipythondisplay.clear_output(wait=True)
            ipythondisplay.display(plt.gcf())
            # NOTE: lbl and fold are module-level names set by the calling loop (dataset and fold being trained)
            print(lbl, fold)

### Training the GAN

In [None]:
GENERATE = True

# Training settings shared by all datasets and folds
epochs, batch_size = 500, 32
k_cov_den, k_crossLID = 20, 15
random_seed = 145
# 48 generated samples per class for the evaluations made during training
n_generated_samples = 48*len(pd.unique(aug_lbl_storage_train[0.6][1]))

if GENERATE:
    # Models and training results, stored as [dataset][fold]
    generator_train, critic_train, results_train = {}, {}, {}

    for lbl in real_samples.keys():
        generator_train[lbl], critic_train[lbl], results_train[lbl] = {}, {}, {}

        for fold in real_samples[lbl].keys():

            print(lbl, fold)
            # Lists that training_montage fills in place - reset for every model
            gen_loss_all, crit_loss_all = [], []
            saved_predictions = []
            coverage, density = [], []
            crossLID, corr1stcluster = [], []

            # Get distribution of intensity values of the dataset
            hist = np.histogram(real_samples[lbl][fold].values.flatten(), bins=100)
            input_realdata_dist = stats.rv_histogram(hist)

            # PCA fitted on the real samples, used in the PCA plots made during training
            df = real_samples[lbl][fold]
            pca = PCA(n_components=2, svd_solver='full', whiten=True)
            pc_coords = pca.fit_transform(df)

            generator_optimizer = tf.keras.optimizers.RMSprop(1e-4)
            critic_optimizer = tf.keras.optimizers.RMSprop(1e-4)

            # Generator input and output have the same size as the number of features of the data
            fold_aug_df = aug_df_storage_train[lbl][fold]
            generator_train[lbl][fold] = generator_model(fold_aug_df.shape[1], fold_aug_df.shape[1], 128, 2)
            critic_train[lbl][fold] = critic_model(fold_aug_df.shape[1], 512, 2)

            training_montage(
                aug_df_storage_train[lbl][fold], aug_lbl_storage_train[lbl][fold],
                real_samples[lbl][fold], lbl_storage_train[lbl][fold],
                epochs, generator_train[lbl][fold], critic_train[lbl][fold],
                generator_optimizer, critic_optimizer, input_realdata_dist, batch_size,
                grad_pen_weight=10, k_cov_den=k_cov_den, k_crossLID=k_crossLID,
                random_seed=random_seed, n_generated_samples=n_generated_samples,
            )

            results_train[lbl][fold] = {
                'gen_loss': gen_loss_all,
                'crit_loss': crit_loss_all,
                'saved_pred': saved_predictions,
                'coverage': coverage,
                'density': density,
                'crossLID': crossLID,
                'corr1st_cluster': corr1stcluster,
            }

            # Save the model weights, files are identified by dataset and fold
            model_tag = str(lbl) + str(fold)
            generator_train[lbl][fold].save_weights('gan_models/Synthetic_gen_imb_' + model_tag)
            critic_train[lbl][fold].save_weights('gan_models/Synthetic_crit_imb_' + model_tag)

            # Save the results from GAN training
            with open('gan_models/Synthetic_results_imb_' + model_tag + '.pickle', 'wb') as handle:
                pickle.dump(results_train[lbl][fold], handle)

In [None]:
if not GENERATE:

    # Models and training results read from disk, stored as [dataset][fold]
    generator_train, critic_train, results_train = {}, {}, {}

    for lbl in df_storage_train:
        generator_train[lbl], critic_train[lbl], results_train[lbl] = {}, {}, {}

        for fold in df_storage_train[lbl]:
            generator_optimizer = tf.keras.optimizers.RMSprop(1e-4)
            critic_optimizer = tf.keras.optimizers.RMSprop(1e-4)

            # Rebuild the models with the same architecture used in training
            fold_train_df = df_storage_train[lbl][fold]
            generator_train[lbl][fold] = generator_model(fold_train_df.shape[1], fold_train_df.shape[1], 128, 2)
            critic_train[lbl][fold] = critic_model(fold_train_df.shape[1], 512, 2)

            # Load the previously saved weights, files are identified by dataset and fold
            model_tag = str(lbl) + str(fold)
            generator_train[lbl][fold].load_weights('./gan_models/Synthetic_gen_imb_' + model_tag)
            critic_train[lbl][fold].load_weights('./gan_models/Synthetic_crit_imb_' + model_tag)

            # Load the previously saved training results
            with open('gan_models/Synthetic_results_imb_' + model_tag + '.pickle', 'rb') as handle:
                results_train[lbl][fold] = pickle.load(handle)

### Comparison of Classification Accuracy

A GAN model is built and trained with each train set. Then, classification models are built with the train set and with samples generated by the GAN models, and their performance in discriminating the test set is compared.

#### Generate a lot of samples and make Random Forests and PLS-DA models

In [None]:
np.random.seed(5402)
# Generated samples for each dataset and fold, stored as [DataFrame of samples, list of labels]
generated_samples = {}

for i in generator_train:
    generated_samples[i] = {}
    for fold in generator_train[i]:
        # Number of samples to generate, half for each class
        num_examples_to_generate = 2048
        half_examples = num_examples_to_generate//2
        feature_names = df_storage_train[i][fold].columns

        # Get distribution of intensity values of the dataset
        hist = np.histogram(real_samples[i][fold].values.flatten(), bins=100)
        input_realdata_dist = stats.rv_histogram(hist)

        # Input to the generator: sampled values and labels (first half 0, second half 1)
        test_input = tf.constant(
            input_realdata_dist.rvs(size=len(feature_names)*num_examples_to_generate),
            shape=[num_examples_to_generate, len(feature_names)])
        test_labels = tf.constant([0]*half_examples + [1]*half_examples,
                                  shape=[num_examples_to_generate, 1])

        # Generate GAN samples
        predictions = generator_train[i][fold]([test_input, test_labels], training=False)
        # Reverse the division done to the data
        predictions = predictions * 2 * aug_df_storage_train[i][fold].values.std()

        # Class names in the order of the dummy columns built from the shuffled labels
        fold_lbls = lbl_storage_train[i][fold]
        shuffled_fold_lbls = np.array(fold_lbls)[
            np.random.RandomState(seed=random_seed).permutation(len(fold_lbls))]
        ordered_labels_fold = pd.get_dummies(shuffled_fold_lbls).columns

        # Numeric label 0 corresponds to the second class name and label 1 to the first
        generated_samples[i][fold] = [
            pd.DataFrame(np.array(predictions), columns=feature_names),
            [ordered_labels_fold[1],]*half_examples + [ordered_labels_fold[0],]*half_examples,
        ]

In [None]:
# Build, for each dataset and fold, a series of training sets where GAN samples are progressively
# added to the imbalanced data. Stored as bal_datasets[dataset][fold][nº of GAN samples] = [data, labels]
bal_datasets = {}
np.random.seed(325)
rng = np.random.default_rng(7519)
for i in generated_samples.keys():
    bal_datasets[i] = {}
    for fold in generator_train[i]:
        bal_datasets[i][fold] = {}
        # Real samples of the '1' class of this fold
        df = real_samples[i][fold].loc[np.array(lbl_storage_train[i][fold]) == '1']
        # Calculate all correlations between all samples of experimental and GAN data and store them in a dataframe
        # (rows: generated samples, columns: real '1' class samples)
        correlations = pd.DataFrame(index=generated_samples[i][fold][0].index, columns=df.index).astype('float')

        # Pearson correlation between every real '1' class sample and every generated sample
        for a in df.index:
            for j in generated_samples[i][fold][0].index:
                correlations.loc[j,a] = stats.pearsonr(df.loc[a],
                                                       generated_samples[i][fold][0].loc[j])[0]

        # For each real sample (column), indexes of the generated samples from most to least correlated
        correlated_samples = pd.DataFrame(columns=df.index)
        for a in correlations:
            correlated_samples[a] = correlations[a].sort_values(ascending=False).index
            
        # Shuffle each row (generated samples with the same correlation rank) so that no real sample
        # is systematically picked first
        permutated = correlated_samples.copy()
        for l in correlated_samples.index:
            permutated.loc[l] = rng.permutation(correlated_samples.loc[l])
        #print(permutated)

        # Order in which generated samples are added: row by row (best ranks first), without repeats
        corr_idxs = pd.unique(permutated.values.flatten())
        
        dataset_len = real_samples[i][fold].shape[0]
        
        # Nº of samples in the '1' class and average nº of samples in the remaining class(es)
        n_min_class = (np.array(lbl_storage_train[i][fold]) == '1').sum()
        n_max_class = (dataset_len - n_min_class)//(len(pd.unique(lbl_storage_train[i][fold]))-1)
        
        # Slowly add samples, in steps of 1/20th of the gap between classes until the gap is filled
        # (original note: 3 at a time until a total of 60)
        # NOTE(review): the step is 0 when the gap between classes is under 19, and range() raises
        # ValueError for a zero step - confirm this cannot happen with the datasets used.
        for num in range(0, n_max_class - n_min_class+1, (n_max_class - n_min_class+1)//20):
            idx_to_keep = corr_idxs[:num]

            corr_preds = generated_samples[i][fold][0].loc[list(pd.unique(idx_to_keep))]
            # NOTE(review): corr_lbls (labels the samples were generated with) is not used below
            corr_lbls  = np.array(generated_samples[i][fold][1])[list(pd.unique(idx_to_keep))]

            # Slowly add the correlated GAN samples to the imbalanced dataset, making it a balanced dataset
            concat_df = pd.concat((corr_preds, real_samples[i][fold]))
            # All GAN samples added are labelled as the '1' minority class
            # NOTE(review): the '1' label is assigned regardless of corr_lbls - confirm this is intended.
            concat_lbls = ['1',]*len(set(idx_to_keep)) + lbl_storage_train[i][fold]
            bal_datasets[i][fold][num] = [concat_df.copy(), concat_lbls.copy()]

In [None]:
# Numbers of GAN samples added to the minority class (keys of one dataset/fold entry;
# `i` and `fold` are whatever values the previously run cells left in them)
bal_datasets[i][fold].keys()

### Fitting RF and PLS-DA models to Imbalanced Datasets and Evaluating them

RF and PLS-DA models are built for each minority class, each fold and each number of GAN samples added.

#### RF

In [None]:
# Fitting and storing Random Forest models for each dataset, number of added GAN samples and fold
# Stored as RF_models_bal[dataset][nº of GAN samples][fold]
RF_models_bal = {}

# Train the Models
for min_class in bal_datasets:
    RF_models_bal[min_class] = {}
    # The numbers of added GAN samples are taken from the dataset being fitted (fold 1 as reference)
    # instead of always from the 0.6 dataset, so every dataset is fitted with its own sizes
    for size in bal_datasets[min_class][1].keys():
        RF_models_bal[min_class][size] = {}
        for fold in bal_datasets[min_class]:
            # Each entry is [data, labels]
            fold_data, fold_lbls = bal_datasets[min_class][fold][size]
            rf_mod = ma.RF_model(fold_data, fold_lbls, return_cv=False, n_trees=200)
            RF_models_bal[min_class][size][fold] = rf_mod

In [None]:
# Evaluate every Random Forest model with the test data of the corresponding dataset and fold
# Results are stored as RF_results_bal[metric][dataset][nº of GAN samples][fold]
RF_results_bal = {'Accuracy':{}, 'F1-Score':{}, 'Precision':{}, 'Recall':{}}
# Evaluate the Models
for min_class in RF_models_bal:
    for metric_name in RF_results_bal:
        RF_results_bal[metric_name][min_class] = {}
    for size in RF_models_bal[min_class].keys():
        for metric_name in RF_results_bal:
            RF_results_bal[metric_name][min_class][size] = {}
        for fold in RF_models_bal[min_class][size]:
            fitted_rf = RF_models_bal[min_class][size][fold]
            fold_test_df = df_storage_test[min_class][fold]
            fold_test_lbls = lbl_storage_test[min_class][fold]

            # Accuracy comes from the model's own scoring
            RF_results_bal['Accuracy'][min_class][size][fold] = fitted_rf.score(fold_test_df, fold_test_lbls)

            # Remaining metrics are calculated with the '1' class as the positive class
            preds = fitted_rf.predict(fold_test_df)
            prec, rec, f1, sup = precision_recall_fscore_support(
                fold_test_lbls, preds, pos_label='1', average='binary', zero_division=1)
            RF_results_bal['F1-Score'][min_class][size][fold] = f1
            RF_results_bal['Precision'][min_class][size][fold] = prec
            RF_results_bal['Recall'][min_class][size][fold] = rec

In [None]:
# RF accuracy for the dataset keyed 0.6 (rows: folds, columns: number of GAN samples added)
pd.DataFrame.from_dict(RF_results_bal['Accuracy'][0.6])

In [None]:
# RF accuracy for the dataset keyed 2.0 (rows: folds, columns: number of GAN samples added)
pd.DataFrame.from_dict(RF_results_bal['Accuracy'][2.0])

Random Forest results

In [None]:
# Plot the fold-averaged performance of the RF models against the amount of augmentation,
# one panel per metric and one line per dataset
f, axs = plt.subplots(2, 2, figsize=(12, 12))
for min_class in RF_results_bal['Accuracy']:
    for t, ax in zip(RF_results_bal, axs.ravel()):
        # Rows: folds, columns: number of GAN samples added
        metric_by_size = pd.DataFrame.from_dict(RF_results_bal[t][min_class])
        # x axis: added samples plus 20 (presumably the original minority class size - TODO confirm)
        # as a percentage of n_max_class
        augmentation_pct = (metric_by_size.columns + 20)/n_max_class*100
        ax.plot(augmentation_pct, metric_by_size.mean(), label=str(min_class))
        ax.set_title(t, fontsize=15)
        ax.set_xlabel('% of Augmentation', fontsize=15)
        ax.set_ylim([0, 1.01])

for left_ax in (axs[0][0], axs[1][0]):
    left_ax.set_ylabel('Performance', fontsize=15)
axs[0][1].legend(fontsize=15, bbox_to_anchor=(1,1))

plt.suptitle('Random Forest', fontsize=18, y=0.93)
plt.show()

In [None]:
# Plot the change in fold-averaged RF performance relative to the dataset with no GAN samples added
# (size 0), one panel per metric and one line per dataset
f, axs = plt.subplots(2, 2, figsize=(12, 12))
for min_class in RF_results_bal['Accuracy']:
    for t, ax in zip(RF_results_bal, axs.ravel()):
        # Rows: folds, columns: number of GAN samples added
        metric_by_size = pd.DataFrame.from_dict(RF_results_bal[t][min_class])
        augmentation_pct = (metric_by_size.columns + 20)/n_max_class*100
        # Difference to the mean performance obtained with 0 added samples
        gain = metric_by_size.mean() - metric_by_size.mean()[0]
        ax.plot(augmentation_pct, gain, label=str(min_class))
        ax.set_title(t, fontsize=15)
        ax.set_xlabel('% of Augmentation', fontsize=15)
        ax.set_ylim([-0.5, 0.5])

for left_ax in (axs[0][0], axs[1][0]):
    left_ax.set_ylabel('Difference in Performance', fontsize=15)
axs[0][1].legend(fontsize=15, bbox_to_anchor=(1,1))

plt.suptitle('Random Forest', fontsize=18, y=0.93)
plt.show()

In [None]:
# Plot the change in fold-averaged RF performance relative to the dataset with no GAN samples added
# (size 0), as a fraction of the maximum possible improvement (1 - baseline),
# one panel per metric and one line per dataset
f, axs = plt.subplots(2, 2, figsize=(12, 12))
for min_class in RF_results_bal['Accuracy']:
    for t, ax in zip(RF_results_bal, axs.ravel()):
        # Rows: folds, columns: number of GAN samples added
        metric_by_size = pd.DataFrame.from_dict(RF_results_bal[t][min_class])
        augmentation_pct = (metric_by_size.columns + 20)/n_max_class*100
        # Gain over the baseline divided by the room left for improvement
        relative_gain = (metric_by_size.mean() - metric_by_size.mean()[0]) / (1-metric_by_size.mean()[0])
        ax.plot(augmentation_pct, relative_gain, label=str(min_class))
        ax.set_title(t, fontsize=15)
        ax.set_xlabel('% of Augmentation', fontsize=15)
        ax.set_ylim([-0.5, 1])

f.supylabel('Difference in Performance (fraction based on maximum possible)', fontsize=15, x=0.07)
axs[0][1].legend(fontsize=15, bbox_to_anchor=(1,1))

plt.suptitle('Random Forest', fontsize=18, y=0.93)
plt.show()

#### PLS-DA

In [None]:
def decision_rule(y_pred, y_true, pos_label, average='binary'):
    """Threshold PLS-DA outputs into 0/1 class calls and score them.

    A prediction is assigned to class 1 when it rounds to 1 or more and to
    class 0 otherwise (two groups encoded as 1/0).

    Parameters
    ----------
    y_pred : array-like
        Continuous predictions of the PLS-DA model.
    y_true : array-like
        True 0/1 encoded classes, indexable by sample position.
    pos_label, average
        Forwarded unchanged to `precision_recall_fscore_support`.

    Returns
    -------
    tuple
        (accuracy, f1, precision, recall)
    """
    rounded = np.round(y_pred)
    is_class_one = rounded >= 1
    # Collapse every rounded value onto the two valid class codes
    rounded[is_class_one] = 1
    rounded[~is_class_one] = 0

    # Count the samples whose thresholded call matches the true class
    nright = 0
    for p, flag in enumerate(is_class_one):
        pred = 1 if flag else 0
        if pred == y_true[p]:
            nright += 1

    accuracy = nright / len(y_pred)
    prec, rec, f1, sup = precision_recall_fscore_support(
        y_true, rounded, pos_label=pos_label, average=average, zero_division=1)
    return accuracy, f1, prec, rec

In [None]:
PLSDA_models_bal = {}
PLSDA_results_bal = {'Accuracy':{}, 'F1-Score':{}, 'Precision':{}, 'Recall':{}}

np.random.seed(325)

# Fit one PLS-DA model per dataset / augmentation size / fold and score it on that fold's test set.
# Results are stored as PLSDA_results_bal[metric][dataset][size][fold].
for min_class in bal_datasets:
    PLSDA_models_bal[min_class] = {}
    for metric in PLSDA_results_bal:
        PLSDA_results_bal[metric][min_class] = {}

    # The available sizes are read from fold 1
    for size in bal_datasets[min_class][1].keys():
        PLSDA_models_bal[min_class][size] = {}
        for metric in PLSDA_results_bal:
            PLSDA_results_bal[metric][min_class][size] = {}

        for fold in bal_datasets[min_class]:
            train_data = bal_datasets[min_class][fold][size][0]
            train_labels = bal_datasets[min_class][fold][size][1]
            class_order = pd.unique(train_labels)

            plsda = ma.fit_PLSDA_model(train_data, train_labels, n_comp=4,
                                       return_scores=False, scale=False, encode2as1vector=True)
            PLSDA_models_bal[min_class][size][fold] = plsda

            # Obtain results with the test group
            y_pred = plsda.predict(df_storage_test[min_class][fold])
            y_true = ma._generate_y_PLSDA(lbl_storage_test[min_class][fold], class_order, True)
            # Position, in order of appearance, of the first class label different from '1'
            pos_label = np.where(class_order != '1')[0][0]

            accuracy, f1, prec, rec = decision_rule(y_pred, y_true, pos_label=pos_label, average='binary')
            fold_metrics = {'Accuracy': accuracy, 'F1-Score': f1, 'Precision': prec, 'Recall': rec}
            for metric, value in fold_metrics.items():
                PLSDA_results_bal[metric][min_class][size][fold] = value

In [None]:
# Show the PLS-DA accuracies stored under dataset key 0.6 as a table
# (the nested dict is {size: {fold: accuracy}}, so sizes become columns and folds rows).
pd.DataFrame.from_dict(PLSDA_results_bal['Accuracy'][0.6])

In [None]:
# Show the PLS-DA accuracies stored under dataset key 2.0 as a table
# (the nested dict is {size: {fold: accuracy}}, so sizes become columns and folds rows).
pd.DataFrame.from_dict(PLSDA_results_bal['Accuracy'][2.0])

PLS-DA Results

In [None]:
# Fold-averaged PLS-DA performance against the amount of augmentation.
# One panel per metric, one line per synthetic dataset.
f, axs = plt.subplots(2, 2, figsize=(12, 12))
for min_class in PLSDA_results_bal['Accuracy']:
    for t, ax in zip(PLSDA_results_bal, axs.ravel()):
        fold_scores = pd.DataFrame.from_dict(PLSDA_results_bal[t][min_class])
        # NOTE(review): 20 is presumably the number of real minority-class samples - confirm.
        x_pct = (fold_scores.columns + 20) / n_max_class * 100
        ax.plot(x_pct, fold_scores.mean(), label=str(min_class))
        ax.set_title(t, fontsize=15)
        ax.set_xlabel('% of Augmentation', fontsize=15)
        ax.set_ylim([0, 1.01])

# Only the left column carries the y-axis label
for left_ax in (axs[0][0], axs[1][0]):
    left_ax.set_ylabel('Performance', fontsize=15)
axs[0][1].legend(fontsize=15, bbox_to_anchor=(1, 1))

plt.suptitle('PLS-DA', fontsize=18, y=0.93)
plt.show()

In [None]:
# Gain of every PLS-DA metric over the non-augmented (size 0) result, averaged across folds.
# One panel per metric, one line per synthetic dataset.
f, axs = plt.subplots(2, 2, figsize=(12, 12))
for min_class in PLSDA_results_bal['Accuracy']:
    for t, ax in zip(PLSDA_results_bal, axs.ravel()):
        fold_scores = pd.DataFrame.from_dict(PLSDA_results_bal[t][min_class])
        mean_scores = fold_scores.mean()
        # NOTE(review): 20 is presumably the number of real minority-class samples - confirm.
        x_pct = (fold_scores.columns + 20) / n_max_class * 100
        ax.plot(x_pct, mean_scores - mean_scores[0], label=str(min_class))
        ax.set_title(t, fontsize=15)
        ax.set_xlabel('% of Augmentation', fontsize=15)
        ax.set_ylim([-0.5, 0.5])

# Only the left column carries the y-axis label
for left_ax in (axs[0][0], axs[1][0]):
    left_ax.set_ylabel('Difference in Performance', fontsize=15)
axs[0][1].legend(fontsize=15, bbox_to_anchor=(1, 1))

plt.suptitle('PLS-DA', fontsize=18, y=0.93)
plt.show()

In [None]:
# Gain of every PLS-DA metric over the non-augmented (size 0) result, expressed as a fraction
# of the largest gain still possible (1 - baseline). Values are averaged across folds.
f, axs = plt.subplots(2, 2, figsize=(12, 12))
for min_class in PLSDA_results_bal['Accuracy']:
    for t, ax in zip(PLSDA_results_bal, axs.ravel()):
        fold_scores = pd.DataFrame.from_dict(PLSDA_results_bal[t][min_class])
        mean_scores = fold_scores.mean()
        baseline = mean_scores[0]
        # NOTE(review): 20 is presumably the number of real minority-class samples - confirm.
        x_pct = (fold_scores.columns + 20) / n_max_class * 100
        ax.plot(x_pct, (mean_scores - baseline) / (1 - baseline), label=str(min_class))
        ax.set_title(t, fontsize=15)
        ax.set_xlabel('% of Augmentation', fontsize=15)
        ax.set_ylim([-0.5, 1])

f.supylabel('Difference in Performance (fraction based on maximum possible)', fontsize=15, x=0.07)
axs[0][1].legend(fontsize=15, bbox_to_anchor=(1, 1))

plt.suptitle('PLS-DA', fontsize=18, y=0.93)
plt.show()

#### Summary of Results

In [None]:
# Summary figure: fold-averaged performance of RF (top row) and PLS-DA (bottom row)
# for every metric except the first key of each results dict.
with plt.style.context('seaborn-whitegrid'):
    f = plt.figure(figsize=(12,10), constrained_layout=True)
    subfigs = f.subfigures(nrows=2, ncols=1)

    panel_specs = (
        (subfigs[0], RF_results_bal, '      Random Forest'),
        (subfigs[1], PLSDA_results_bal, '        PLS-DA'),
    )
    for subfig, results, panel_title in panel_specs:
        axs = subfig.subplots(nrows=1, ncols=3)
        shown_metrics = list(results.keys())[1:]
        for min_class in results['Accuracy']:
            for t, ax in zip(shown_metrics, axs.ravel()):
                fold_scores = pd.DataFrame.from_dict(results[t][min_class])
                # NOTE(review): 20 is presumably the number of real minority-class samples - confirm.
                x_pct = (fold_scores.columns + 20) / n_max_class * 100
                ax.plot(x_pct, fold_scores.mean(), label=str(min_class))
                ax.set_title(t, fontsize=15)
                ax.set_ylim([0, 1.05])
                ax.tick_params(axis='both', which='major', labelsize=14)

        subfig.supxlabel('% of Samples of the Minority Class in Comparison to the Majority Class', fontsize=15)
        axs[0].set_ylabel('Performance', fontsize=15)
        axs[1].legend(fontsize=15)
        subfig.suptitle(panel_title, fontsize=18)

    plt.show()
    # Export the figure in raster and vector form
    for extension in ('png', 'pdf'):
        f.savefig('Images/Synthethic_Imbalanced_Performance_plot.' + extension, dpi=400)

In [None]:
# Fold-averaged RF F1-Score with no added samples (size 0) and with 60 added samples
rf_f1_mean_20 = pd.DataFrame.from_dict(RF_results_bal['F1-Score'][2.0]).mean()
rf_f1_mean_06 = pd.DataFrame.from_dict(RF_results_bal['F1-Score'][0.6]).mean()
print('2.0 min RF', rf_f1_mean_20[0])
print('2.0 max RF', rf_f1_mean_20[60])
print('0.6 min RF', rf_f1_mean_06[0])
print('0.6 max RF', rf_f1_mean_06[60])

In [None]:
# Fold-averaged PLS-DA F1-Score with no added samples (size 0) and with 60 added samples
plsda_f1_mean_20 = pd.DataFrame.from_dict(PLSDA_results_bal['F1-Score'][2.0]).mean()
plsda_f1_mean_06 = pd.DataFrame.from_dict(PLSDA_results_bal['F1-Score'][0.6]).mean()
print('2.0 min PLS-DA', plsda_f1_mean_20[0])
print('2.0 max PLS-DA', plsda_f1_mean_20[60])
print('0.6 min PLS-DA', plsda_f1_mean_06[0])
print('0.6 max PLS-DA', plsda_f1_mean_06[60])

In [None]:
# Summary figure: F1-Score and Recall gain over the non-augmented (size 0) result as a fraction of
# the largest gain still possible, for RF (top row) and PLS-DA (bottom row). Values are fold averages.
with plt.style.context('seaborn-whitegrid'):
    f = plt.figure(figsize=(8,10), constrained_layout=True)
    subfigs = f.subfigures(nrows=2, ncols=1)

    panel_specs = (
        (subfigs[0], RF_results_bal, 'Random Forest          '),
        (subfigs[1], PLSDA_results_bal, 'PLS-DA          '),
    )
    for subfig, results, panel_title in panel_specs:
        axs = subfig.subplots(nrows=1, ncols=2)
        for min_class in results['Accuracy']:
            for t, ax in zip(['F1-Score', 'Recall'], axs.ravel()):
                fold_scores = pd.DataFrame.from_dict(results[t][min_class])
                mean_scores = fold_scores.mean()
                baseline = mean_scores[0]
                # NOTE(review): 20 is presumably the number of real minority-class samples - confirm.
                x_pct = (fold_scores.columns + 20) / n_max_class * 100
                ax.plot(x_pct, (mean_scores - baseline) / (1 - baseline), label=str(min_class))
                ax.set_title(t, fontsize=15)
                ax.set_ylim([-0.5, 1])
                ax.tick_params(axis='both', which='major', labelsize=14)

        subfig.supxlabel('% of Samples of the Minority Class in Comparison to the Majority Class', fontsize=15)
        axs[1].legend(fontsize=15, loc='lower right', ncol=2)
        subfig.suptitle(panel_title, fontsize=18)

    # One y-axis label shared by the whole figure
    f.supylabel('Difference in Performance (fraction based on maximum possible)', fontsize=15)

    plt.show()
    # Export the figure in raster and vector form
    for extension in ('png', 'pdf'):
        f.savefig('Images/Synthethic_Imbalanced_PerfDiff_plot.' + extension, dpi=400)

In [None]:
# Fraction of the possible F1-Score improvement (relative to size 0) reached by RF at 60 added samples
rf_f1_mean = pd.DataFrame.from_dict(RF_results_bal['F1-Score'][2.0]).mean()
difs_f1 = (rf_f1_mean - rf_f1_mean[0]) / (1 - rf_f1_mean[0])
print('2.0 possible improvement RF', difs_f1[60])

rf_f1_mean = pd.DataFrame.from_dict(RF_results_bal['F1-Score'][0.6]).mean()
difs_f1 = (rf_f1_mean - rf_f1_mean[0]) / (1 - rf_f1_mean[0])
print('0.6 possible improvement RF', difs_f1[60])

In [None]:
# Fraction of the possible F1-Score improvement (relative to size 0) reached by PLS-DA at 60 added samples
plsda_f1_mean = pd.DataFrame.from_dict(PLSDA_results_bal['F1-Score'][2.0]).mean()
difs_f1 = (plsda_f1_mean - plsda_f1_mean[0]) / (1 - plsda_f1_mean[0])
print('2.0 possible improvement PLS-DA', difs_f1[60])

plsda_f1_mean = pd.DataFrame.from_dict(PLSDA_results_bal['F1-Score'][0.6]).mean()
difs_f1 = (plsda_f1_mean - plsda_f1_mean[0]) / (1 - plsda_f1_mean[0])
print('0.6 possible improvement PLS-DA', difs_f1[60])

### Fitting RF and PLS-DA models to the complete synthetic datasets

Results and feature importance are estimated by stratified n_fold cross-validation, averaged across 20 iterations.

In [None]:
# Cross-validated RF models on each complete synthetic dataset, plus their ranked feature importances
np.random.seed(485)
n_fold = 5
RF_model_real = {}
RF_feats_real = {}
for sep in dfs:
    cv_output = ma.RF_model_CV(dfs[sep], lbls_orig[sep], iter_num=20, n_fold=n_fold, n_trees=200)
    RF_model_real[sep] = cv_output

    # Column 0 is used as the feature position and column 1 as the sort key (highest first)
    ranked_feats = pd.DataFrame(cv_output['important_features']).set_index(0)
    ranked_feats = ranked_feats.sort_values(by=1, ascending=False)
    # Replace the positions with the corresponding column names of the dataset
    ranked_feats.index = [dfs[sep].columns[i] for i in ranked_feats.index]
    RF_feats_real[sep] = ranked_feats

In [None]:
# Cross-validated PLS-DA models on each complete synthetic dataset, plus their ranked VIP-based importances
np.random.seed(485)
n_fold = 5
PLSDA_model_real = {}
PLSDA_feats_real = {}
for sep in dfs:
    cv_output = ma.PLSDA_model_CV(dfs[sep], lbls_orig[sep], n_comp=4, iter_num=20,
                                  n_fold=n_fold, feat_type='VIP')
    PLSDA_model_real[sep] = cv_output

    # Column 0 is used as the feature position and column 1 as the sort key (highest first)
    ranked_feats = pd.DataFrame(cv_output['important_features']).set_index(0)
    ranked_feats = ranked_feats.sort_values(by=1, ascending=False)
    # Replace the positions with the corresponding column names of the dataset
    ranked_feats.index = [dfs[sep].columns[i] for i in ranked_feats.index]
    PLSDA_feats_real[sep] = ranked_feats

### Extracting Important Features for RF models

In [None]:
# Average the RF feature importances across the folds, for every synthetic dataset and
# augmentation size. Result: rf_feats[dataset][size] = {1-based feature position: mean importance}.
rf_feats = {}
for cl, models in RF_models_bal.items():
    rf_feats[cl] = {}
    for size, folds in models.items():
        # Sum over whatever folds are present. The accumulator no longer depends on a fold
        # keyed 1 being iterated first, which would otherwise carry values over from the
        # previous size or dataset.
        temp_df_bal = sum(mod.feature_importances_ for mod in folds.values()) / len(folds)

        # Feature positions are numbered from 1, following the columns of the fold 1 training set
        n_features = len(bal_datasets[cl][1][size][0].columns)
        rf_feats[cl][size] = dict(zip(range(1, n_features + 1), temp_df_bal))

In [None]:
# Select the top 5% of important features
# Mean importance of the top 5% most important features, per dataset and augmentation size.
# NOTE: `top10` holds the number of features in that top 5% (name kept because later cells use it).
rf_feats_abrev = {}
for min_class in rf_feats:
    temp_df = pd.DataFrame.from_dict(rf_feats[min_class])
    top10 = int(0.05*len(temp_df.index))
    # Explicit float dtype: an empty Series without dtype is version dependent in pandas
    # (deprecation warning, or object dtype) and the values stored are always floats.
    rf_feats_abrev[min_class] = pd.Series(
        {i: temp_df[i].sort_values(ascending=False)[:top10].mean() for i in temp_df.columns},
        index=temp_df.columns, dtype=float)

In [None]:
# Display the number of features in the top 5% selection (value left by the last dataset processed)
top10

Relative importance values depending on the number of GAN samples added

In [None]:
# Mean Gini importance of the top 5% features at each augmentation size, as a percentage of its
# value at the first size, for every synthetic dataset.
with plt.style.context('seaborn-whitegrid'):
    f, ax = plt.subplots(figsize=(6,6))
    for i in rf_feats_abrev:
        # Build the array BEFORE adding n_min_class: `list + int` is not an elementwise sum and
        # raises TypeError when n_min_class is a plain Python int.
        x_pct = (np.array(list(rf_feats_abrev[i].index)) + n_min_class) / n_max_class * 100
        plt.plot(x_pct,
                 rf_feats_abrev[i].values/rf_feats_abrev[i].values[0]*100,
                 label=i)

    plt.ylabel('Avg. Gini Importance of top 5% Imp. features change (%)', fontsize = 12)
    plt.xlabel('% of Augmentation', fontsize = 15)
    plt.ylim(60,140)
    plt.legend(fontsize=13)
    ax.tick_params(axis='both', which='major', labelsize=13)
    plt.title('Random Forest', fontsize=15)

Common top 5% important features compared to the complete models depending on the number of GAN samples added

In [None]:
# Share of the top 5% RF features of each augmented model that is also found in (left) the top 5%
# of the complete-data RF model, (centre) the univariate significant features and (right) the
# first `top10` univariate significant features.
with plt.style.context('seaborn-whitegrid'):
    f, (axl, axc, axr) = plt.subplots(1,3,figsize=(15,5))

    for min_class in rf_feats:
        imp_by_size = pd.DataFrame(rf_feats[min_class])
        feat_names = bal_datasets[min_class][1][0][0].columns

        rf_line_min, rf_line_uni_min, rf_line_unitop_min = [], [], []
        for i in rf_feats[min_class]:
            # Names of the top features at this size (stored positions are 1-based)
            top_positions = imp_by_size[i].sort_values(ascending=False)[:top10].index
            idxs = [feat_names[l-1] for l in top_positions]

            rf_line_min.append(len(np.intersect1d(idxs, RF_feats_real[min_class].index[:top10])))
            rf_line_uni_min.append(len(np.intersect1d(idxs, uni_results_filt[min_class].index)))
            rf_line_unitop_min.append(len(np.intersect1d(idxs, uni_results_filt[min_class].index[:top10])))

        x_pct = (np.array(list(rf_feats[min_class].keys()))+n_min_class)/n_max_class*100
        axl.plot(x_pct, np.array(rf_line_min)/top10*100, label=min_class)
        axc.plot(x_pct, np.array(rf_line_uni_min)/top10*100, label=min_class)
        axr.plot(x_pct, np.array(rf_line_unitop_min)/top10*100, label=min_class)

axl.set_title('Against RF Top 5% Imp. Feats', fontsize=15)
axc.set_title('Against Univariate Significant Feat.', fontsize=15)
axr.set_title('Against Univariate Top 5% Significant Feat.', fontsize=14)
axl.set_ylabel('% of Common Features with Real Model', fontsize = 15)
axc.set_xlabel('% of Samples of the Minority Class in Comparison to the Majority Class', fontsize = 15)
for panel in (axl, axc, axr):
    panel.set_ylim(40,102)
axr.legend(loc='upper left', fontsize=15, bbox_to_anchor=(1,1))

plt.suptitle('Random Forest', fontsize=15)

### Extracting Important Features for PLS-DA models

In [None]:
# Average the PLS-DA VIP scores across the folds, for every synthetic dataset and
# augmentation size. Result: plsda_feats[dataset][size] = {1-based feature position: mean VIP}.
plsda_feats = {}

for cl, models in PLSDA_models_bal.items():
    plsda_feats[cl] = {}
    for size, folds in models.items():
        # Sum over whatever folds are present. The accumulator no longer depends on a fold
        # keyed 1 being iterated first, which would otherwise carry values over from the
        # previous size or dataset.
        temp_df_bal = sum(ma._calculate_vips(mod) for mod in folds.values()) / len(folds)

        # Feature positions are numbered from 1, following the columns of the fold 1 training set
        n_features = len(bal_datasets[cl][1][size][0].columns)
        plsda_feats[cl][size] = dict(zip(range(1, n_features + 1), temp_df_bal))

In [None]:
# Select the top 5% of important features
# Mean VIP score of the top 5% most important features, per dataset and augmentation size.
# NOTE: `top10` holds the number of features in that top 5% (name kept because later cells use it).
plsda_feats_abrev = {}
for min_class in plsda_feats:
    temp_df = pd.DataFrame.from_dict(plsda_feats[min_class])
    top10 = int(0.05*len(temp_df.index))
    # Explicit float dtype: an empty Series without dtype is version dependent in pandas
    # (deprecation warning, or object dtype) and the values stored are always floats.
    plsda_feats_abrev[min_class] = pd.Series(
        {i: temp_df[i].sort_values(ascending=False)[:top10].mean() for i in temp_df.columns},
        index=temp_df.columns, dtype=float)

In [None]:
# Display the number of features in the top 5% selection (value left by the last dataset processed)
top10

Relative importance values depending on the number of GAN samples added

In [None]:
# Mean VIP score of the top 5% features at each augmentation size, as a percentage of its
# value at the first size, for every synthetic dataset.
with plt.style.context('seaborn-whitegrid'):
    f, ax = plt.subplots(figsize=(6,6))

    for i in plsda_feats_abrev:
        # Build the array BEFORE adding n_min_class: `list + int` is not an elementwise sum and
        # raises TypeError when n_min_class is a plain Python int.
        x_pct = (np.array(list(plsda_feats_abrev[i].index)) + n_min_class) / n_max_class * 100
        plt.plot(x_pct,
                 plsda_feats_abrev[i].values/plsda_feats_abrev[i].values[0]*100,
                 label=i)

    # The plotted values are built from the top 5% of features (0.05 fraction), not 2%
    plt.ylabel('Avg. VIP Score of top 5% Imp. features change (%)', fontsize = 12)
    plt.xlabel('% of Augmentation', fontsize = 15)
    plt.ylim(90,120)
    plt.legend(fontsize=13)
    ax.tick_params(axis='both', which='major', labelsize=13)
    plt.title('PLS-DA', fontsize=15)

Common top 5% important features compared to the complete models depending on the number of GAN samples added

In [None]:
# Share of the top 5% PLS-DA features of each augmented model that is also found in (left) the top
# 5% of the complete-data PLS-DA model, (centre) the univariate significant features and (right)
# the first `top10` univariate significant features.
with plt.style.context('seaborn-whitegrid'):
    f, (axl, axc, axr) = plt.subplots(1,3,figsize=(15,5))

    for min_class in plsda_feats:
        imp_by_size = pd.DataFrame(plsda_feats[min_class])
        feat_names = bal_datasets[min_class][1][0][0].columns

        plsda_line_min, plsda_line_uni_min, plsda_line_unitop_min = [], [], []
        for i in plsda_feats[min_class]:
            # Names of the top features at this size (stored positions are 1-based)
            top_positions = imp_by_size[i].sort_values(ascending=False)[:top10].index
            idxs = [feat_names[l-1] for l in top_positions]

            plsda_line_min.append(len(np.intersect1d(idxs, PLSDA_feats_real[min_class].index[:top10])))
            plsda_line_uni_min.append(len(np.intersect1d(idxs, uni_results_filt[min_class].index)))
            plsda_line_unitop_min.append(len(np.intersect1d(idxs, uni_results_filt[min_class].index[:top10])))

        x_pct = (np.array(list(plsda_feats[min_class].keys()))+n_min_class)/n_max_class*100
        axl.plot(x_pct, np.array(plsda_line_min)/top10*100, label=min_class)
        axc.plot(x_pct, np.array(plsda_line_uni_min)/top10*100, label=min_class)
        axr.plot(x_pct, np.array(plsda_line_unitop_min)/top10*100, label=min_class)

axl.set_title('Against PLS-DA Top 5% Imp. Feats', fontsize=15)
axc.set_title('Against Univariate Significant Feat.', fontsize=15)
axr.set_title('Against Univariate Top 5% Significant Feat.', fontsize=14)
axl.set_ylabel('% of Common Features with Real Model', fontsize = 15)
axc.set_xlabel('% of Samples of the Minority Class in Comparison to the Majority Class', fontsize = 15)
for panel in (axl, axc, axr):
    panel.set_ylim(40,102)
axc.legend(loc='lower left', fontsize=13)

plt.suptitle('PLS-DA', fontsize=15)

#### Summary of Results of Feature Importance Analysis

In [None]:
# Summary figure: mean importance of the top 5% features at each augmentation size, as a percentage
# of its value at the first size. Left: RF (Gini importance). Right: PLS-DA (VIP score).
with plt.style.context('seaborn-whitegrid'):
    f, (axl, axr) = plt.subplots(1,2, figsize=(8,4), constrained_layout=True)

    for i in rf_feats_abrev:
        # Build the array BEFORE adding n_min_class: `list + int` is not an elementwise sum and
        # raises TypeError when n_min_class is a plain Python int.
        x_pct = (np.array(list(rf_feats_abrev[i].index)) + n_min_class) / n_max_class * 100
        axl.plot(x_pct,
                 rf_feats_abrev[i].values/rf_feats_abrev[i].values[0]*100,
                 label=i)

    axl.set_ylabel('Avg. Gini Imp. of top 5% Imp. Feat. change (%)', fontsize = 10)
    axl.set_ylim(60,140)
    axl.legend(fontsize=12, ncol=2)
    axl.tick_params(axis='both', which='major', labelsize=11)
    axl.set_title('Random Forest', fontsize=15)

    for i in plsda_feats_abrev:
        x_pct = (np.array(list(plsda_feats_abrev[i].index)) + n_min_class) / n_max_class * 100
        axr.plot(x_pct,
                 plsda_feats_abrev[i].values/plsda_feats_abrev[i].values[0]*100,
                 label=i)

    axr.set_ylabel('Avg. VIP Score of top 5% Imp. Feat. change (%)', fontsize = 10)
    f.supxlabel('% of Samples of the Minority Class in Comparison to the Majority Class', fontsize = 15)
    axr.set_ylim(60,140)
    axr.tick_params(axis='both', which='major', labelsize=11)
    axr.set_title('PLS-DA', fontsize=15)
    # Same 'Images/' folder as every other figure of the notebook; the lower-case spelling
    # points to a different directory on case-sensitive file systems.
    f.savefig('Images/Synthetic_Imbalanced_ImpFeatChange_plot.png', dpi=400)
    f.savefig('Images/Synthetic_Imbalanced_ImpFeatChange_plot.pdf', dpi=400)

In [None]:
# Summary figure: share of the top 5% features of each augmented model that is also found in the
# top 5% of the complete-data model (left column) or among the univariate significant features
# (right column). Top row: RF. Bottom row: PLS-DA.
with plt.style.context('seaborn-whitegrid'):
    f, (axu, axd) = plt.subplots(2,2,figsize=(10,10), constrained_layout=True)
    axul, axur = axu
    axdl, axdr = axd

    # Random Forest (top row)
    for min_class in rf_feats:
        imp_by_size = pd.DataFrame(rf_feats[min_class])
        feat_names = bal_datasets[min_class][1][0][0].columns

        rf_line_min, rf_line_uni_min = [], []
        for i in rf_feats[min_class]:
            # Names of the top features at this size (stored positions are 1-based)
            top_positions = imp_by_size[i].sort_values(ascending=False)[:top10].index
            idxs = [feat_names[l-1] for l in top_positions]
            rf_line_min.append(len(np.intersect1d(idxs, RF_feats_real[min_class].index[:top10])))
            rf_line_uni_min.append(len(np.intersect1d(idxs, uni_results_filt[min_class].index)))

        x_pct = (np.array(list(rf_feats[min_class].keys()))+n_min_class)/n_max_class*100
        axul.plot(x_pct, np.array(rf_line_min)/top10*100, label=min_class)
        axur.plot(x_pct, np.array(rf_line_uni_min)/top10*100, label=min_class)

    axul.set_title('Against RF Top 5% Imp. Feat.', fontsize=15)
    axur.set_title('Against Univariate Significant Feat.', fontsize=15)
    axul.set_ylabel('% of Common Features with RF Real Model', fontsize = 14)
    for panel in (axul, axur):
        panel.set_ylim(0,105)
    axur.legend(loc='lower left', fontsize=15, ncol=2)
    for panel in (axul, axur):
        panel.tick_params(axis='both', which='major', labelsize=14)

    # PLS-DA (bottom row)
    for min_class in plsda_feats:
        imp_by_size = pd.DataFrame(plsda_feats[min_class])
        feat_names = bal_datasets[min_class][1][0][0].columns

        plsda_line_min, plsda_line_uni_min = [], []
        for i in plsda_feats[min_class]:
            top_positions = imp_by_size[i].sort_values(ascending=False)[:top10].index
            idxs = [feat_names[l-1] for l in top_positions]
            plsda_line_min.append(len(np.intersect1d(idxs, PLSDA_feats_real[min_class].index[:top10])))
            plsda_line_uni_min.append(len(np.intersect1d(idxs, uni_results_filt[min_class].index)))

        x_pct = (np.array(list(plsda_feats[min_class].keys()))+n_min_class)/n_max_class*100
        axdl.plot(x_pct, np.array(plsda_line_min)/top10*100, label=min_class)
        axdr.plot(x_pct, np.array(plsda_line_uni_min)/top10*100, label=min_class)

    axdl.set_title('Against PLS-DA Top 5% Imp. Feat.', fontsize=15)
    axdr.set_title('Against Univariate Significant Feat.', fontsize=15)
    axdl.set_ylabel('% of Common Features with Real PLS-DA Model', fontsize = 14)
    f.supxlabel('% of Samples of the Minority Class in Comparison to the Majority Class', fontsize = 15)
    for panel in (axdl, axdr):
        panel.set_ylim(0,105)
    for panel in (axdl, axdr):
        panel.tick_params(axis='both', which='major', labelsize=14)

    # Export the figure in raster and vector form
    for extension in ('png', 'pdf'):
        f.savefig('Images/Synthetic_Imbalanced_ImpFeatCommon_plot.' + extension, dpi=400)

### Comparing Imbalanced Datasets, Imb + Min. Cl. GAN Samples and only GAN samples

Creating and evaluating models trained only on GAN data, and compiling the results already obtained for the imbalanced datasets (no augmentation) and for the imbalanced datasets balanced with minority class GAN samples.

##### RF

In [None]:
# Three RF models per dataset and fold:
#   RF_models_GAN     - fitted here on the generated samples only
#   RF_models_real    - the already fitted model with no added samples (size 0)
#   RF_models_GAN_bal - the already fitted model with n_max_class - n_min_class added samples
RF_models_GAN = {}
RF_models_real = {}
RF_models_GAN_bal = {}

balanced_size = n_max_class - n_min_class

for i in generated_samples:
    RF_models_GAN[i] = {}
    RF_models_real[i] = {}
    RF_models_GAN_bal[i] = {}
    for fold in generated_samples[i]:
        gan_data = generated_samples[i][fold][0]
        gan_labels = generated_samples[i][fold][1]
        rf_mod = ma.RF_model(gan_data, gan_labels, return_cv=False, n_trees=200)

        RF_models_GAN[i][fold] = rf_mod
        RF_models_real[i][fold] = RF_models_bal[i][0][fold]
        RF_models_GAN_bal[i][fold] = RF_models_bal[i][balanced_size][fold]

In [None]:
# Score the GAN-only RF models on each fold's test set, and collect the matching results already
# computed for size 0 (no added samples) and for n_max_class - n_min_class added samples.
metric_names = ('Accuracy', 'F1-Score', 'Precision', 'Recall')
RF_results_GAN = {name: {} for name in metric_names}
RF_results_real = {name: {} for name in metric_names}
RF_results_GAN_bal = {name: {} for name in metric_names}

balanced_size = n_max_class - n_min_class

for min_class in RF_models_bal:
    for l in RF_results_GAN:
        RF_results_GAN[l][min_class] = {}
        RF_results_real[l][min_class] = {}
        RF_results_GAN_bal[l][min_class] = {}

    for fold in generated_samples[min_class]:
        gan_model = RF_models_GAN[min_class][fold]
        test_data = df_storage_test[min_class][fold]
        test_labels = lbl_storage_test[min_class][fold]

        # Models trained on generated samples only
        RF_results_GAN['Accuracy'][min_class][fold] = gan_model.score(test_data, test_labels)
        preds = gan_model.predict(test_data)
        prec, rec, f1, sup = precision_recall_fscore_support(test_labels, preds,
                                                            pos_label='1', average='binary', zero_division=1)
        RF_results_GAN['F1-Score'][min_class][fold] = f1
        RF_results_GAN['Precision'][min_class][fold] = prec
        RF_results_GAN['Recall'][min_class][fold] = rec

        # Results computed earlier are copied, not recomputed
        for name in metric_names:
            RF_results_real[name][min_class][fold] = RF_results_bal[name][min_class][0][fold]
            RF_results_GAN_bal[name][min_class][fold] = RF_results_bal[name][min_class][balanced_size][fold]

In [None]:
# PLS-DA models and their test-set metrics for three training scenarios:
#   *_GAN     - fitted here on the data/labels stored in generated_samples
#   *_real    - copied from PLSDA_*_bal at key 0
#   *_GAN_bal - copied from PLSDA_*_bal at key n_max_class - n_min_class
# (presumably that key is the number of GAN samples added to the imbalanced
#  training set - TODO confirm against the cell that builds PLSDA_models_bal)
PLSDA_models_GAN, PLSDA_models_real, PLSDA_models_GAN_bal = {}, {}, {}
PLSDA_results_GAN = {name: {} for name in ('Accuracy', 'F1-Score', 'Precision', 'Recall')}
PLSDA_results_real = {name: {} for name in PLSDA_results_GAN}
PLSDA_results_GAN_bal = {name: {} for name in PLSDA_results_GAN}

# Train the Models
for min_class in bal_datasets:
    # Per-dataset sub-dictionaries, later keyed by fold
    for l in PLSDA_results_GAN:
        for _store in (PLSDA_results_GAN, PLSDA_results_real, PLSDA_results_GAN_bal):
            _store[l][min_class] = {}
    for _models in (PLSDA_models_GAN, PLSDA_models_real, PLSDA_models_GAN_bal):
        _models[min_class] = {}

    for fold in bal_datasets[min_class]:
        _gan_X = generated_samples[min_class][fold][0]
        _gan_y = generated_samples[min_class][fold][1]
        _gan_classes = pd.unique(_gan_y)

        # Fit a 4-component PLS-DA model on the generated samples of this fold
        plsda = ma.fit_PLSDA_model(_gan_X, _gan_y, n_comp=4, return_scores=False,
                                   scale=False, encode2as1vector=True)
        PLSDA_models_GAN[min_class][fold] = plsda

        # Obtain results with the test group
        y_pred = plsda.predict(df_storage_test[min_class][fold])
        y_true = ma._generate_y_PLSDA(lbl_storage_test[min_class][fold], _gan_classes, True)
        # Position (within the unique training labels) of the first label that is not '1'
        pos_label = np.where(_gan_classes != '1')[0][0]

        # Calculate accuracy, F1-score, precision and recall
        accuracy, f1, prec, rec = decision_rule(y_pred, y_true, pos_label=pos_label, average='binary')
        for _name, _value in zip(PLSDA_results_GAN, (accuracy, f1, prec, rec)):
            PLSDA_results_GAN[_name][min_class][fold] = _value

        # Re-use the already trained models/results of the other two scenarios
        _n_added = n_max_class - n_min_class
        PLSDA_models_real[min_class][fold] = PLSDA_models_bal[min_class][0][fold]
        PLSDA_models_GAN_bal[min_class][fold] = PLSDA_models_bal[min_class][_n_added][fold]
        for _name in PLSDA_results_GAN:
            _per_increment = PLSDA_results_bal[_name][min_class]
            PLSDA_results_real[_name][min_class][fold] = _per_increment[0][fold]
            PLSDA_results_GAN_bal[_name][min_class][fold] = _per_increment[_n_added][fold]

##### Summarising Results

In [None]:
# Plot the mean F1-Score (error bars: standard deviation across folds) against the class
# separation, for Random Forest (left panel) and PLS-DA (right panel), and save the figure.
#
# matplotlib 3.6 renamed the bundled seaborn styles ('seaborn-whitegrid' became
# 'seaborn-v0_8-whitegrid') and later releases removed the old names, so pick whichever
# name the installed matplotlib provides.
_whitegrid_style = ('seaborn-v0_8-whitegrid' if 'seaborn-v0_8-whitegrid' in plt.style.available
                    else 'seaborn-whitegrid')
with plt.style.context(_whitegrid_style):
    f, (axl, axr) = plt.subplots(1,2, figsize=(8,4), constrained_layout=True)
    metric = 'F1-Score'

    # (axis, complete-dataset statistics, its legend label, per-scenario result dictionaries)
    _panels = (
        (axl, F1s_stats_RF, 'Complete Dataset (CV)',
         (RF_results_real, RF_results_GAN_bal, RF_results_GAN)),
        (axr, F1s_stats_PLSDA, 'Complete Dataset',
         (PLSDA_results_real, PLSDA_results_GAN_bal, PLSDA_results_GAN)),
    )
    # Legend label and colour of each scenario, in the order listed in _panels
    _scenario_style = (('Real Imbalanced Data', 'green'),
                       ('Imb. + Min. Cl. GAN Data', 'red'),
                       ('GAN Data', 'blue'))

    for _ax, _full_stats, _full_label, _scenarios in _panels:
        # Reference line: model built with the complete dataset
        _ax.plot(_full_stats['treatment'], _full_stats['Average accuracy'],
                 label = _full_label, color = 'black')
        _ax.errorbar(_full_stats['treatment'], y=_full_stats['Average accuracy'],
                     yerr=_full_stats['STD'], ls='none', ecolor='0.2', capsize=3)

        for _res, (_label, _color) in zip(_scenarios, _scenario_style):
            # results[metric][dataset][fold] -> DataFrame with one column per dataset and
            # one row per fold, so mean()/std() aggregate over the folds
            _per_fold = pd.DataFrame(_res[metric])
            _mean, _std = _per_fold.mean(), _per_fold.std()
            _ax.plot(_mean.index, _mean.values, label = _label, color = _color)
            _ax.errorbar(_mean.index, y=_mean, yerr=_std,
                         ls='none', ecolor='0.2', capsize=3)

    axl.set_title('Random Forest', fontsize=15)
    axl.set_xlabel('Class Separation', fontsize=15)
    axl.set_ylabel(metric, fontsize=15)

    axr.set_title('PLS-DA', fontsize=15)
    axr.set_xlabel('Class Separation', fontsize=15)
    # NOTE(review): only the right panel has a fixed y-range; confirm whether the left
    # panel is meant to autoscale.
    axr.set_ylim([0, 1.05])
    axr.legend(fontsize=13)
    #plt.suptitle('YD', fontsize=18)
    f.savefig('images/Synthetic_Imbalanced_F1Score_plot.png', dpi=400)
    f.savefig('images/Synthetic_Imbalanced_F1Score_plot.pdf', dpi=400)

In [None]:
# Random Forest: one panel per metric (2x2 grid) showing the mean over folds, with the
# standard deviation as error bars, against the class separation for each scenario.
f, axs = plt.subplots(2,2,figsize=(12,12))
_rf_scenarios = ((RF_results_GAN, 'GAN data', 'blue'),
                 (RF_results_GAN_bal, 'Balanced Data', 'red'),
                 (RF_results_real, 'Imbalanced Data', 'green'))
for metric, ax in zip(RF_results_GAN, axs.ravel()):
    for _res, _label, _color in _rf_scenarios:
        # Columns are datasets, rows are folds: aggregate over the folds
        _per_fold = pd.DataFrame(_res[metric])
        _mean = _per_fold.mean()
        ax.plot(_mean.index, _mean.values, label = _label, color = _color)
        ax.errorbar(_mean.index, y=_mean, yerr=_per_fold.std(),
                    ls='none', ecolor='0.2', capsize=3)

    ax.set_title(metric, fontsize=15)
    ax.set_xlabel('Class Separation', fontsize=15)
    ax.set_ylim([0, 1.05])

# Complete-dataset reference lines: accuracy on the first panel, F1-score on the second
for _ax, _full_stats in ((axs[0][0], accuracy_stats_RF), (axs[0][1], F1s_stats_RF)):
    _ax.plot(_full_stats['treatment'], _full_stats['Average accuracy'],
             label = 'Full Data (CV)', color = 'black')
    _ax.errorbar(_full_stats['treatment'], y=_full_stats['Average accuracy'],
                 yerr=_full_stats['STD'], ls='none', ecolor='0.2', capsize=3)

axs[0][1].legend(bbox_to_anchor=(1,1), fontsize=15)
f.supylabel('Performance', fontsize=15, x=0.07)

plt.suptitle('RF', fontsize=18, y=0.93)
plt.show()

In [None]:
# PLS-DA: one panel per metric (2x2 grid) showing the mean over folds, with the standard
# deviation as error bars, against the class separation for each scenario.
f, axs = plt.subplots(2,2,figsize=(12,12))
_plsda_scenarios = ((PLSDA_results_GAN, 'GAN data', 'blue'),
                    (PLSDA_results_GAN_bal, 'Balanced Data', 'red'),
                    (PLSDA_results_real, 'Imbalanced Data', 'green'))
for metric, ax in zip(PLSDA_results_GAN, axs.ravel()):
    for _res, _label, _color in _plsda_scenarios:
        # Columns are datasets, rows are folds: aggregate over the folds
        _per_fold = pd.DataFrame(_res[metric])
        _mean = _per_fold.mean()
        ax.plot(_mean.index, _mean.values, label = _label, color = _color)
        ax.errorbar(_mean.index, y=_mean, yerr=_per_fold.std(),
                    ls='none', ecolor='0.2', capsize=3)

    ax.set_title(metric, fontsize=15)
    ax.set_xlabel('Class Separation', fontsize=15)
    ax.set_ylim([0, 1.05])

# Complete-dataset reference lines: accuracy on the first panel, F1-score on the second.
# The F1 reference mirrors the Random Forest figure; without it the legend, which is taken
# from the F1 panel, has no 'Full Data (CV)' entry for the black line.
for _ax, _full_stats in ((axs[0][0], accuracy_stats_PLSDA), (axs[0][1], F1s_stats_PLSDA)):
    _ax.plot(_full_stats['treatment'], _full_stats['Average accuracy'],
             label = 'Full Data (CV)', color = 'black')
    _ax.errorbar(_full_stats['treatment'], y=_full_stats['Average accuracy'],
                 yerr=_full_stats['STD'], ls='none', ecolor='0.2', capsize=3)

axs[0][1].legend(bbox_to_anchor=(1,1), fontsize=15)
f.supylabel('Performance', fontsize=15, x=0.07)

plt.suptitle('PLS-DA', fontsize=18, y=0.93)
plt.show()

In [None]:
# Grid of grouped bar charts: one row per dataset (up to 6), one column per metric.
# Each panel has an 'RF' and a 'PLSDA' group with three bars (GAN / balanced / real data);
# bar height is the mean over folds and the error bar is NumPy's std (ddof=0) over folds.
with sns.axes_style("whitegrid"):
    with sns.plotting_context("notebook", font_scale=1.5):
        f, axs = plt.subplots(6, 4, figsize=(20, 18), constrained_layout=True)

        x = np.arange(2)  # the label locations
        l = ['RF', 'PLSDA']
        width = 0.23  # the width of the bars

        # (RF results, PLS-DA results, legend label, bar colour), drawn left to right
        _bar_groups = ((RF_results_GAN, PLSDA_results_GAN, 'GAN data', 'blue'),
                       (RF_results_GAN_bal, PLSDA_results_GAN_bal, 'Balanced data', 'red'),
                       (RF_results_real, PLSDA_results_real, 'Real data', 'green'))

        for i, sep in zip(range(0,24,4), RF_results_GAN['Accuracy']):
            # axs.ravel()[i:i+4] is the row of four axes belonging to this dataset
            for metric, axu in zip(PLSDA_results_GAN, axs.ravel()[i:i+4]):
                for _pos, (_rf_res, _plsda_res, _label, _color) in enumerate(_bar_groups):
                    offset = - 0.25 + _pos * 0.25
                    _fold_values = (pd.Series(_rf_res[metric][sep]).values,
                                    pd.Series(_plsda_res[metric][sep]).values)
                    accuracy_stats = pd.DataFrame(
                        {'Average accuracy': [_v.mean() for _v in _fold_values],
                         'STD': [_v.std() for _v in _fold_values]})
                    rects = axu.bar(x + offset, accuracy_stats['Average accuracy'], width,
                                    label=_label, color=_color)
                    axu.errorbar(x + offset, y=accuracy_stats['Average accuracy'],
                                 yerr=accuracy_stats['STD'],
                                 ls='none', ecolor='0.2', capsize=3)

                axu.set_xticks(x)
                axu.set_xticklabels(l, fontsize=16)
                axu.set(ylabel=sep, title=metric, ylim=(0,1.05))
                for spine in axu.spines.values():
                    spine.set_edgecolor('0.1')

    axs[0][3].legend(loc='upper left', fontsize=13, bbox_to_anchor=(1,1))

    plt.show()

### Comparing Important features of models built from Imbalanced Datasets, Imb + Min. Cl. GAN Samples and only GAN samples

#### Comparing Against Important Features of the Complete Synthetic Dataset models

In [None]:
# Second Random Forest model of each complete dataset, built with a single
# cross-validation iteration, used as a reference when comparing important features.
np.random.seed(11485)
n_fold = 5
RF_model_real1 = {}
RF_feats_real1 = {}
for sep in dfs:
    RF_model_real1[sep] = ma.RF_model_CV(dfs[sep], lbls_orig[sep],
                                         iter_num=1, n_fold=n_fold, n_trees=200)

    # 'important_features' is used as (column position, importance) pairs:
    # rank by importance (descending), then swap positions for the column names.
    _ranked = pd.DataFrame(RF_model_real1[sep]['important_features'])
    _ranked = _ranked.set_index(0).sort_values(by=1, ascending=False)
    RF_feats_real1[sep] = _ranked
    RF_feats_real1[sep].index = [dfs[sep].columns[_pos] for _pos in _ranked.index]

In [None]:
# Per-fold feature rankings (by the models' feature_importances_, descending) of the
# Random Forest models from the three scenarios, indexed by feature name.
RF_feats_GAN = {}
RF_feats_imb = {}
RF_feats_GAN_bal = {}

# (models of a scenario, dictionary receiving its rankings)
_rf_scenarios = ((RF_models_real, RF_feats_imb),
                 (RF_models_GAN, RF_feats_GAN),
                 (RF_models_GAN_bal, RF_feats_GAN_bal))

for i in RF_models_real.keys():
    for _, _feats in _rf_scenarios:
        _feats[i] = {}

    for fold in RF_models_real[i]:
        # Feature names are taken from the generated-sample data of this fold
        _feature_names = generated_samples[i][fold][0].columns
        for _models, _feats in _rf_scenarios:
            temp_df = pd.DataFrame(zip(range(len(_feature_names)),
                                       _models[i][fold].feature_importances_))
            temp_df = temp_df.set_index(0).sort_values(by=1, ascending=False)
            temp_df.index = [_feature_names[a] for a in temp_df.index]
            _feats[i][fold] = temp_df.copy()

    print(i)

In [None]:
# Average the Random Forest feature importances over the folds of each dataset and rank
# the features (descending) for the imbalanced, GAN-balanced and GAN-only models.
RF_feats_GAN_mean = {}
RF_feats_imb_mean = {}
RF_feats_GAN_bal_mean = {}

for i in RF_models_real.keys():
    RF_feats_GAN_mean[i] = []
    RF_feats_imb_mean[i] = []
    RF_feats_GAN_bal_mean[i] = []

    # Running sums are reset for every dataset and started by the first fold met, whatever
    # its key is. (Starting them with `fold == 1` only works when the folds are keyed from
    # 1; otherwise the sums are undefined or silently continue from the previous dataset.)
    temp_df_imb = temp_df_bal = temp_df_GAN = None
    for fold in RF_models_real[i]:
        if temp_df_imb is None:
            temp_df_imb = RF_models_real[i][fold].feature_importances_
            temp_df_bal = RF_models_GAN_bal[i][fold].feature_importances_
            temp_df_GAN = RF_models_GAN[i][fold].feature_importances_
        else:
            temp_df_imb = temp_df_imb + RF_models_real[i][fold].feature_importances_
            temp_df_bal = temp_df_bal + RF_models_GAN_bal[i][fold].feature_importances_
            temp_df_GAN = temp_df_GAN + RF_models_GAN[i][fold].feature_importances_

    # Feature names come from the generated-sample data of the last fold iterated
    feature_names = generated_samples[i][fold][0].columns

    # Mean importance -> (position, value) table -> sort descending -> label with names
    temp_df_imb = temp_df_imb / len(RF_models_real[i])
    temp_df_imb = pd.DataFrame(zip(range(len(feature_names)), temp_df_imb))
    temp_df_imb = temp_df_imb.set_index(0).sort_values(by=1, ascending=False)
    temp_df_imb.index = [feature_names[a] for a in temp_df_imb.index]
    RF_feats_imb_mean[i] = temp_df_imb.copy()

    temp_df_bal = temp_df_bal / len(RF_models_GAN_bal[i])
    temp_df_bal = pd.DataFrame(zip(range(len(feature_names)), temp_df_bal))
    temp_df_bal = temp_df_bal.set_index(0).sort_values(by=1, ascending=False)
    temp_df_bal.index = [feature_names[a] for a in temp_df_bal.index]
    RF_feats_GAN_bal_mean[i] = temp_df_bal.copy()

    temp_df_GAN = temp_df_GAN / len(RF_models_GAN[i])
    temp_df_GAN = pd.DataFrame(zip(range(len(feature_names)), temp_df_GAN))
    temp_df_GAN = temp_df_GAN.set_index(0).sort_values(by=1, ascending=False)
    temp_df_GAN.index = [feature_names[a] for a in temp_df_GAN.index]
    RF_feats_GAN_mean[i] = temp_df_GAN.copy()

Calculate the intersection of important features, from the top 1 up to the top N (N = number of features in the dataset), between the complete dataset (averaged over 20 iterations) and each of the following: a single iteration of the complete dataset, the imbalanced dataset, the balanced dataset and the GAN dataset.

In [None]:
# For every dataset, count the features shared by the top-k of the single-iteration
# complete model (RF_feats_real1) and the top-k of RF_feats_real, for k = 1 .. N-1
# (N = number of ranked features).
# NOTE(review): range(1, N) stops at N-1, so the full top-N is not included.
intersections_RF = {}
for sep in RF_feats_real1:
    _single_iter_rank = RF_feats_real1[sep].index
    intersections_RF[sep] = [
        len(np.intersect1d(_single_iter_rank[:_k], RF_feats_real[sep].index[:_k]))
        for _k in range(1, len(RF_feats_real1[sep]))
    ]

In [None]:
# Number of features shared by the top-k of RF_feats_real and the top-k of the
# fold-averaged rankings of each scenario, for k = 1 .. N-1 (N = number of ranked features).
intersections_RF_bal = {}
for cl in RF_feats_GAN_bal_mean:
    _reference = RF_feats_real[cl].index
    int_bal = [len(np.intersect1d(_reference[:_k], RF_feats_GAN_bal_mean[cl].index[:_k]))
               for _k in range(1, len(RF_feats_real[cl]))]
    intersections_RF_bal[cl] = int_bal

print('Intersections - Dataset Balanced - Finished')

intersections_RF_GAN = {}
for cl in RF_feats_GAN_mean:
    _reference = RF_feats_real[cl].index
    int_GAN = [len(np.intersect1d(_reference[:_k], RF_feats_GAN_mean[cl].index[:_k]))
               for _k in range(1, len(RF_feats_real[cl]))]
    intersections_RF_GAN[cl] = int_GAN

print('Intersections - Dataset GAN - Finished')

intersections_RF_imb = {}
for cl in RF_feats_imb_mean:
    _reference = RF_feats_real[cl].index
    int_imb = [len(np.intersect1d(_reference[:_k], RF_feats_imb_mean[cl].index[:_k]))
               for _k in range(1, len(RF_feats_real[cl]))]
    intersections_RF_imb[cl] = int_imb

print('Intersections - Dataset Imbalanced - Finished')

# See intersections if features were randomly shuffled
# (baseline computed once, from the ranking stored under key 1.2)
copy_shuffle = list(RF_feats_real[1.2].index)
np.random.shuffle(copy_shuffle)
random_intersections_RF = [
    len(np.intersect1d(RF_feats_real[1.2].index[:_k], copy_shuffle[:_k]))
    for _k in range(1, len(RF_feats_real[1.2]))
]

In [None]:
# Graph depicting intersection of important features: one panel per dataset, plotting the
# fraction of common features (intersection size / k) against k for every comparison.
f, axs = plt.subplots(3,2,figsize=(12,12))
for i,ax in zip(intersections_RF_imb, axs.ravel()):
    _n_ranks = len(intersections_RF[i])
    _ranks = np.arange(1, _n_ranks + 1)

    # (intersection counts, legend label, colour), in drawing order
    _series = ((intersections_RF[i], 'Real-Real Intersections', 'Black'),
               (intersections_RF_imb[i], 'Imbalanced Real Dataset', 'Green'),
               (intersections_RF_bal[i], 'GAN Augmented Real Dataset', 'Red'),
               (intersections_RF_GAN[i], 'GAN Samples Dataset', 'Blue'),
               (random_intersections_RF, 'Random Intersections', 'Orange'))
    for _counts, _label, _color in _series:
        ax.scatter(_ranks, np.array(_counts) / _ranks,
                   label = _label, color=_color, s=5)

    ax.set_title(i, fontsize=15)
    # Only the first quarter of the ranking is shown
    ax.set_xlim([0,_n_ranks//4])
f.supxlabel('Nº of Top Important (Gini Importance) Compounds', fontsize=15, y=0.05)
f.supylabel('Fraction of Common Compounds', fontsize=15, x=0.06)

axs[0][1].legend(loc='upper left', fontsize=11, bbox_to_anchor=(1,1), ncol=1, markerscale=3)
plt.suptitle('Random Forest', fontsize=18, y=0.93)
plt.show()

In [None]:
# Graph depicting intersection of important features: one panel per scenario, with every
# dataset overlaid (fraction of common features = intersection size / k, against k).
f, axs = plt.subplots(1,3,figsize=(16,4), constrained_layout=True)
_per_panel = (intersections_RF_imb, intersections_RF_bal, intersections_RF_GAN)
for i in intersections_RF_imb:
    _ranks = np.arange(1, len(intersections_RF[i]) + 1)
    for _ax, _counts in zip(axs, _per_panel):
        _ax.scatter(_ranks, np.array(_counts[i]) / _ranks, label =i, s=5)

_titles = ('Imbalanced Real Dataset', 'GAN Augmented Real Dataset', 'GAN Samples Dataset')
for _ax, _title in zip(axs, _titles):
    _ax.set_title(_title, fontsize=15)
    # x-range: first quarter of the ranking of the last dataset iterated above
    _ax.set_xlim([0,len(intersections_RF[i])//4])

f.supxlabel('Nº of Top Important (Gini Importance) Compounds', fontsize=15)
f.supylabel('Fraction of Common Compounds', fontsize=15)

axs[2].legend(loc='upper left', fontsize=11, bbox_to_anchor=(1,1), ncol=1, markerscale=3)
plt.suptitle('Random Forest', fontsize=18)
plt.show()

In [None]:
# Second PLS-DA model of each complete dataset, built with a single cross-validation
# iteration (feat_type='VIP'), used as a reference when comparing important features.
np.random.seed(11485)
n_fold = 5
PLSDA_model_real1 = {}
PLSDA_feats_real1 = {}
for sep in dfs:
    PLSDA_model_real1[sep] = ma.PLSDA_model_CV(dfs[sep], lbls_orig[sep], n_comp=4,
                                               iter_num=1, n_fold=n_fold, feat_type='VIP')

    # 'important_features' is used as (column position, score) pairs:
    # rank by score (descending), then swap positions for the column names.
    _ranked = pd.DataFrame(PLSDA_model_real1[sep]['important_features'])
    _ranked = _ranked.set_index(0).sort_values(by=1, ascending=False)
    PLSDA_feats_real1[sep] = _ranked
    PLSDA_feats_real1[sep].index = [dfs[sep].columns[_pos] for _pos in _ranked.index]

In [None]:
# Per-fold feature rankings (by the value returned from ma._calculate_vips, descending)
# of the PLS-DA models from the three scenarios, indexed by feature name.
PLSDA_feats_GAN = {}
PLSDA_feats_imb = {}
PLSDA_feats_GAN_bal = {}

# (models of a scenario, dictionary receiving its rankings)
_plsda_scenarios = ((PLSDA_models_real, PLSDA_feats_imb),
                    (PLSDA_models_GAN, PLSDA_feats_GAN),
                    (PLSDA_models_GAN_bal, PLSDA_feats_GAN_bal))

for i in PLSDA_models_real.keys():
    for _, _feats in _plsda_scenarios:
        _feats[i] = {}

    for fold in PLSDA_models_real[i]:
        # Feature names are taken from the generated-sample data of this fold
        _feature_names = generated_samples[i][fold][0].columns
        for _models, _feats in _plsda_scenarios:
            vips = ma._calculate_vips(_models[i][fold])
            temp_df = pd.DataFrame(zip(range(len(_feature_names)), vips))
            temp_df = temp_df.set_index(0).sort_values(by=1, ascending=False)
            temp_df.index = [_feature_names[a] for a in temp_df.index]
            _feats[i][fold] = temp_df.copy()

    print(i)

In [None]:
# Average the PLS-DA feature scores (ma._calculate_vips) over the folds of each dataset
# and rank the features (descending) for the imbalanced, GAN-balanced and GAN-only models.
PLSDA_feats_GAN_mean = {}
PLSDA_feats_imb_mean = {}
PLSDA_feats_GAN_bal_mean = {}

for i in PLSDA_models_real.keys():
    PLSDA_feats_GAN_mean[i] = []
    PLSDA_feats_imb_mean[i] = []
    PLSDA_feats_GAN_bal_mean[i] = []

    # Running sums are reset for every dataset and started by the first fold met, whatever
    # its key is. (Starting them with `fold == 1` only works when the folds are keyed from
    # 1; otherwise the sums are undefined or silently continue from the previous dataset.)
    temp_df_imb = temp_df_bal = temp_df_GAN = None
    for fold in PLSDA_models_real[i]:
        if temp_df_imb is None:
            vips = ma._calculate_vips(PLSDA_models_real[i][fold])
            temp_df_imb = vips
            vips = ma._calculate_vips(PLSDA_models_GAN_bal[i][fold])
            temp_df_bal = vips
            vips = ma._calculate_vips(PLSDA_models_GAN[i][fold])
            temp_df_GAN = vips
        else:
            vips = ma._calculate_vips(PLSDA_models_real[i][fold])
            temp_df_imb = temp_df_imb + vips
            vips = ma._calculate_vips(PLSDA_models_GAN_bal[i][fold])
            temp_df_bal = temp_df_bal + vips
            vips = ma._calculate_vips(PLSDA_models_GAN[i][fold])
            temp_df_GAN = temp_df_GAN + vips

    # Feature names come from the generated-sample data of the last fold iterated
    feature_names = generated_samples[i][fold][0].columns

    # Mean score -> (position, value) table -> sort descending -> label with names
    temp_df_imb = temp_df_imb / len(PLSDA_models_real[i])
    temp_df_imb = pd.DataFrame(zip(range(len(feature_names)), temp_df_imb))
    temp_df_imb = temp_df_imb.set_index(0).sort_values(by=1, ascending=False)
    temp_df_imb.index = [feature_names[a] for a in temp_df_imb.index]
    PLSDA_feats_imb_mean[i] = temp_df_imb.copy()

    temp_df_bal = temp_df_bal / len(PLSDA_models_GAN_bal[i])
    temp_df_bal = pd.DataFrame(zip(range(len(feature_names)), temp_df_bal))
    temp_df_bal = temp_df_bal.set_index(0).sort_values(by=1, ascending=False)
    temp_df_bal.index = [feature_names[a] for a in temp_df_bal.index]
    PLSDA_feats_GAN_bal_mean[i] = temp_df_bal.copy()

    temp_df_GAN = temp_df_GAN / len(PLSDA_models_GAN[i])
    temp_df_GAN = pd.DataFrame(zip(range(len(feature_names)), temp_df_GAN))
    temp_df_GAN = temp_df_GAN.set_index(0).sort_values(by=1, ascending=False)
    temp_df_GAN.index = [feature_names[a] for a in temp_df_GAN.index]
    PLSDA_feats_GAN_mean[i] = temp_df_GAN.copy()

Calculate the intersection of important features, from the top 1 up to the top N (N = number of features in the dataset), between the complete dataset (averaged over 20 iterations) and each of the following: a single iteration of the complete dataset, the imbalanced dataset, the balanced dataset and the GAN dataset.

In [None]:
# For every dataset, count the features shared by the top-k of the single-iteration
# complete model (PLSDA_feats_real1) and the top-k of PLSDA_feats_real, for k = 1 .. N-1
# (N = number of ranked features).
# NOTE(review): range(1, N) stops at N-1, so the full top-N is not included.
intersections_PLSDA = {}
for sep in PLSDA_feats_real1:
    _single_iter_rank = PLSDA_feats_real1[sep].index
    intersections_PLSDA[sep] = [
        len(np.intersect1d(_single_iter_rank[:_k], PLSDA_feats_real[sep].index[:_k]))
        for _k in range(1, len(PLSDA_feats_real1[sep]))
    ]

In [None]:
# Number of features shared by the top-k of PLSDA_feats_real and the top-k of the
# fold-averaged rankings of each scenario, for k = 1 .. N-1 (N = number of ranked features).
intersections_PLSDA_bal = {}
for cl in PLSDA_feats_GAN_bal_mean:
    _reference = PLSDA_feats_real[cl].index
    int_bal = [len(np.intersect1d(_reference[:_k], PLSDA_feats_GAN_bal_mean[cl].index[:_k]))
               for _k in range(1, len(PLSDA_feats_real[cl]))]
    intersections_PLSDA_bal[cl] = int_bal

print('Intersections - Dataset Balanced - Finished')

intersections_PLSDA_GAN = {}
for cl in PLSDA_feats_GAN_mean:
    _reference = PLSDA_feats_real[cl].index
    int_GAN = [len(np.intersect1d(_reference[:_k], PLSDA_feats_GAN_mean[cl].index[:_k]))
               for _k in range(1, len(PLSDA_feats_real[cl]))]
    intersections_PLSDA_GAN[cl] = int_GAN

print('Intersections - Dataset GAN - Finished')

intersections_PLSDA_imb = {}
for cl in PLSDA_feats_imb_mean:
    _reference = PLSDA_feats_real[cl].index
    int_imb = [len(np.intersect1d(_reference[:_k], PLSDA_feats_imb_mean[cl].index[:_k]))
               for _k in range(1, len(PLSDA_feats_real[cl]))]
    intersections_PLSDA_imb[cl] = int_imb

print('Intersections - Dataset Imbalanced - Finished')

# See intersections if features were randomly shuffled
# (baseline computed once, from the ranking stored under key 1.2)
copy_shuffle = list(PLSDA_feats_real[1.2].index)
np.random.shuffle(copy_shuffle)
random_intersections_PLSDA = [
    len(np.intersect1d(PLSDA_feats_real[1.2].index[:_k], copy_shuffle[:_k]))
    for _k in range(1, len(PLSDA_feats_real[1.2]))
]

In [None]:
# Graph depicting intersection of important features: one panel per dataset, plotting the
# fraction of common features (intersection size / k) against k for every comparison.
f, axs = plt.subplots(3,2,figsize=(12,12))
for i,ax in zip(intersections_PLSDA_imb, axs.ravel()):
    _n_ranks = len(intersections_PLSDA[i])
    _ranks = np.arange(1, _n_ranks + 1)

    # (intersection counts, legend label, colour), in drawing order
    _series = ((intersections_PLSDA[i], 'Real-Real Intersections', 'Black'),
               (intersections_PLSDA_imb[i], 'Imbalanced Real Dataset', 'Green'),
               (intersections_PLSDA_bal[i], 'GAN Augmented Real Dataset', 'Red'),
               (intersections_PLSDA_GAN[i], 'GAN Samples Dataset', 'Blue'),
               (random_intersections_PLSDA, 'Random Intersections', 'Orange'))
    for _counts, _label, _color in _series:
        ax.scatter(_ranks, np.array(_counts) / _ranks,
                   label = _label, color=_color, s=5)

    ax.set_title(i, fontsize=15)
    # Only the first quarter of the ranking is shown
    ax.set_xlim([0,_n_ranks//4])

f.supxlabel('Nº of Top Important (VIP Score) Compounds', fontsize=15, y=0.05)
f.supylabel('Fraction of Common Compounds', fontsize=15, x=0.06)

axs[0][1].legend(loc='upper left', fontsize=11, bbox_to_anchor=(1,1), ncol=1, markerscale=3)
plt.suptitle('PLS-DA', fontsize=18, y=0.93)
plt.show()

In [None]:
# Graph depicting intersection of important features: one panel per scenario, with every
# dataset overlaid (fraction of common features = intersection size / k, against k).
f, axs = plt.subplots(1,3,figsize=(16,4), constrained_layout=True)
_per_panel = (intersections_PLSDA_imb, intersections_PLSDA_bal, intersections_PLSDA_GAN)
# Iterate over the PLS-DA intersections themselves, not the Random Forest ones, so this
# cell does not depend on the RF results having the same keys.
for i in intersections_PLSDA_imb:
    _ranks = np.arange(1, len(intersections_PLSDA[i]) + 1)
    for _ax, _counts in zip(axs, _per_panel):
        _ax.scatter(_ranks, np.array(_counts[i]) / _ranks, label =i, s=5)

_titles = ('Imbalanced Real Dataset', 'GAN Augmented Real Dataset', 'GAN Samples Dataset')
for _ax, _title in zip(axs, _titles):
    _ax.set_title(_title, fontsize=15)
    # x-range: first quarter of the ranking of the last dataset iterated above
    _ax.set_xlim([0,len(intersections_PLSDA[i])//4])

f.supxlabel('Nº of Top Important (VIP Score) Compounds', fontsize=15)
f.supylabel('Fraction of Common Compounds', fontsize=15)

axs[2].legend(loc='upper left', fontsize=11, bbox_to_anchor=(1,1), ncol=1, markerscale=3)
plt.suptitle('PLS-DA', fontsize=18)
plt.show()

#### Comparing Against Significant Features (by Univariate Analysis) of the Complete Models

Calculate the intersection of important features from top 1 to top (number of features in the dataset) between the complete dataset (averaged over 20 iterations) and an iteration of the complete dataset, the imbalanced dataset, the balanced dataset and the GAN dataset.

In [None]:
# For every dataset in uni_results, count how many entries the first k index
# labels of RF_feats_real1 and of uni_results have in common, for growing k.
intersections_RF_uni = {}
for sep in uni_results:
    uni_index = uni_results[sep].index
    rf_index = RF_feats_real1[sep].index
    intersections_RF_uni[sep] = [
        len(np.intersect1d(rf_index[:top_k], uni_index[:top_k]))
        for top_k in range(1, len(uni_results[sep]))
    ]

In [None]:
# Top-k overlap between the uni_results index and the RF feature index of each
# training-set variant (GAN-balanced, GAN-only, imbalanced), per dataset.

intersections_RF_bal_uni = {}
for cl in RF_feats_GAN_bal_mean:
    uni_index = uni_results[cl].index
    bal_index = RF_feats_GAN_bal_mean[cl].index
    intersections_RF_bal_uni[cl] = [
        len(np.intersect1d(uni_index[:top_k], bal_index[:top_k]))
        for top_k in range(1, len(uni_results[cl]))
    ]

print('Intersections - Dataset Balanced - Finished')

intersections_RF_GAN_uni = {}
for cl in RF_feats_GAN_mean:
    uni_index = uni_results[cl].index
    gan_index = RF_feats_GAN_mean[cl].index
    intersections_RF_GAN_uni[cl] = [
        len(np.intersect1d(uni_index[:top_k], gan_index[:top_k]))
        for top_k in range(1, len(uni_results[cl]))
    ]

print('Intersections - Dataset GAN - Finished')

intersections_RF_imb_uni = {}
for cl in RF_feats_imb_mean:
    uni_index = uni_results[cl].index
    imb_index = RF_feats_imb_mean[cl].index
    intersections_RF_imb_uni[cl] = [
        len(np.intersect1d(uni_index[:top_k], imb_index[:top_k]))
        for top_k in range(1, len(uni_results[cl]))
    ]

print('Intersections - Dataset Imbalanced - Finished')

# Baseline: overlap obtained when the feature order is shuffled at random.
# NOTE: `cl` is still bound to the last key visited by the loop above, so the
# baseline is computed from that single dataset only.
copy_shuffle = list(uni_results[cl].index)
np.random.shuffle(copy_shuffle)
random_intersections_RF_uni = [
    len(np.intersect1d(uni_results[cl].index[:top_k], copy_shuffle[:top_k]))
    for top_k in range(1, len(uni_results[cl]))
]

In [None]:
# Random Forest vs. univariate analysis: one subplot per dataset, one scatter
# series per training-set type plus the random-shuffle baseline.
f, axs = plt.subplots(3, 2, figsize=(12, 12))

# Dataset-specific series in drawing (and therefore legend) order:
# (per-dataset counts, legend label, marker colour)
rf_uni_series = (
    (intersections_RF_uni, 'Real-Real Intersections', 'Black'),
    (intersections_RF_imb_uni, 'Imbalanced Real Dataset', 'Green'),
    (intersections_RF_bal_uni, 'GAN Augmented Real Dataset', 'Red'),
    (intersections_RF_GAN_uni, 'GAN Samples Dataset', 'Blue'),
)

for i, ax in zip(intersections_RF_imb_uni, axs.ravel()):
    # k = 1..n serves as x-axis and as the denominator turning counts into fractions
    ranks = np.arange(1, len(intersections_RF_uni[i]) + 1)

    for counts, series_label, series_color in rf_uni_series:
        ax.scatter(ranks, np.array(counts[i]) / ranks,
                   label=series_label, color=series_color, s=5)

    # The random baseline is a single list shared by all datasets
    ax.scatter(ranks, np.array(random_intersections_RF_uni) / ranks,
               label='Random Intersections', color='Orange', s=5)

    ax.set_title(i, fontsize=15)
    # Restrict the view to the first quarter of the ranking
    ax.set_xlim([0, len(ranks) // 4])

f.supxlabel('Nº of Top Important (Gini Importance) Compounds', fontsize=15, y=0.05)
f.supylabel('Fraction of Common Compounds', fontsize=15, x=0.06)

# Single legend, anchored outside the top-right subplot
axs[0][1].legend(loc='upper left', fontsize=11, bbox_to_anchor=(1, 1), ncol=1, markerscale=3)
plt.suptitle('Random Forest - Univariate Analysis Comparison', fontsize=18, y=0.93)
plt.show()

In [None]:
# Random Forest vs. univariate analysis overview: one panel per training-set
# type, with every dataset overlaid, showing the fraction of common top-k
# features as k grows.
f, axs = plt.subplots(1, 3, figsize=(16, 4), constrained_layout=True)

# (panel title, per-dataset intersection counts), in left-to-right panel order
panels = (
    ('Imbalanced Real Dataset', intersections_RF_imb_uni),
    ('GAN Augmented Real Dataset', intersections_RF_bal_uni),
    ('GAN Samples Dataset', intersections_RF_GAN_uni),
)

for i in intersections_RF_imb_uni:
    # The x-axis must have the same length as the univariate intersection
    # lists being plotted, so it is derived from intersections_RF_uni for all
    # three panels (not from the non-univariate intersections_RF).
    ranks = np.arange(1, len(intersections_RF_uni[i]) + 1)
    for ax, (_, counts) in zip(axs, panels):
        ax.scatter(ranks, np.array(counts[i]) / ranks, label=i, s=5)

for ax, (title, _) in zip(axs, panels):
    ax.set_title(title, fontsize=15)
    # Show only the first quarter of the ranking; `i` is the last dataset key
    # visited by the loop above.
    ax.set_xlim([0, len(intersections_RF_uni[i]) // 4])

f.supxlabel('Nº of Top Important (Gini Importance) Compounds', fontsize=15)
f.supylabel('Fraction of Common Compounds', fontsize=15)

# Legend (one entry per dataset) placed outside the right-most panel
axs[2].legend(loc='upper left', fontsize=11, bbox_to_anchor=(1, 1), ncol=1, markerscale=3)
plt.suptitle('Random Forest', fontsize=18)
plt.show()

Calculate the intersection of important features from top 1 to top (number of features in the dataset) between the complete dataset (averaged over 20 iterations) and an iteration of the complete dataset, the imbalanced dataset, the balanced dataset and the GAN dataset.

In [None]:
# For every dataset in uni_results, count how many entries the first k index
# labels of PLSDA_feats_real1 and of uni_results have in common, for growing k.
intersections_PLSDA_uni = {}
for sep in uni_results:
    uni_index = uni_results[sep].index
    plsda_index = PLSDA_feats_real1[sep].index
    intersections_PLSDA_uni[sep] = [
        len(np.intersect1d(plsda_index[:top_k], uni_index[:top_k]))
        for top_k in range(1, len(uni_results[sep]))
    ]

In [None]:
# Top-k overlap between the uni_results index and the PLS-DA feature index of
# each training-set variant (GAN-balanced, GAN-only, imbalanced), per dataset.

intersections_PLSDA_bal_uni = {}
for cl in PLSDA_feats_GAN_bal_mean:
    uni_index = uni_results[cl].index
    bal_index = PLSDA_feats_GAN_bal_mean[cl].index
    intersections_PLSDA_bal_uni[cl] = [
        len(np.intersect1d(uni_index[:top_k], bal_index[:top_k]))
        for top_k in range(1, len(uni_results[cl]))
    ]

print('Intersections - Dataset Balanced - Finished')

intersections_PLSDA_GAN_uni = {}
for cl in PLSDA_feats_GAN_mean:
    uni_index = uni_results[cl].index
    gan_index = PLSDA_feats_GAN_mean[cl].index
    intersections_PLSDA_GAN_uni[cl] = [
        len(np.intersect1d(uni_index[:top_k], gan_index[:top_k]))
        for top_k in range(1, len(uni_results[cl]))
    ]

print('Intersections - Dataset GAN - Finished')

intersections_PLSDA_imb_uni = {}
for cl in PLSDA_feats_imb_mean:
    uni_index = uni_results[cl].index
    imb_index = PLSDA_feats_imb_mean[cl].index
    intersections_PLSDA_imb_uni[cl] = [
        len(np.intersect1d(uni_index[:top_k], imb_index[:top_k]))
        for top_k in range(1, len(uni_results[cl]))
    ]

print('Intersections - Dataset Imbalanced - Finished')

# Baseline: overlap obtained when the feature order is shuffled at random.
# NOTE: `cl` is still bound to the last key visited by the loop above, so the
# baseline is computed from that single dataset only.
copy_shuffle = list(uni_results[cl].index)
np.random.shuffle(copy_shuffle)
random_intersections_PLSDA_uni = [
    len(np.intersect1d(uni_results[cl].index[:top_k], copy_shuffle[:top_k]))
    for top_k in range(1, len(uni_results[cl]))
]

In [None]:
# PLS-DA vs. univariate analysis: one subplot per dataset, one scatter series
# per training-set type plus the random-shuffle baseline.
f, axs = plt.subplots(3, 2, figsize=(12, 12))

# Dataset-specific series in drawing (and therefore legend) order:
# (per-dataset counts, legend label, marker colour)
# NOTE(review): the former 'UnderSampled Dataset' series was removed because
# `intersections_PLSDA_RUS_uni` is not computed together with the imb/bal/GAN
# univariate intersections and has no counterpart in the RF figure; restore it
# here if that dictionary is produced elsewhere.
plsda_uni_series = (
    (intersections_PLSDA_uni, 'Real-Real Intersections', 'Black'),
    (intersections_PLSDA_imb_uni, 'Imbalanced Real Dataset', 'Green'),
    (intersections_PLSDA_bal_uni, 'GAN Augmented Real Dataset', 'Red'),
    (intersections_PLSDA_GAN_uni, 'GAN Samples Dataset', 'Blue'),
)

for i, ax in zip(intersections_PLSDA_imb_uni, axs.ravel()):
    # k = 1..n serves as x-axis and as the denominator turning counts into fractions
    ranks = np.arange(1, len(intersections_PLSDA_uni[i]) + 1)

    for counts, series_label, series_color in plsda_uni_series:
        ax.scatter(ranks, np.array(counts[i]) / ranks,
                   label=series_label, color=series_color, s=5)

    # The random baseline is a single list shared by all datasets
    ax.scatter(ranks, np.array(random_intersections_PLSDA_uni) / ranks,
               label='Random Intersections', color='Orange', s=5)

    ax.set_title(i, fontsize=15)
    # Restrict the view to the first quarter of the ranking
    ax.set_xlim([0, len(ranks) // 4])

f.supxlabel('Nº of Top Important (VIP Score) Compounds', fontsize=15, y=0.05)
f.supylabel('Fraction of Common Compounds', fontsize=15, x=0.06)

# Single legend, anchored outside the top-right subplot
axs[0][1].legend(loc='upper left', fontsize=11, bbox_to_anchor=(1, 1), ncol=1, markerscale=3)
plt.suptitle('PLS-DA - Univariate Analysis Comparison', fontsize=18, y=0.93)
plt.show()

In [None]:
# PLS-DA vs. univariate analysis overview: one panel per training-set type,
# with every dataset overlaid, showing the fraction of common top-k features
# as k grows.
f, axs = plt.subplots(1, 3, figsize=(16, 4), constrained_layout=True)

# (panel title, per-dataset intersection counts), in left-to-right panel order
panels = (
    ('Imbalanced Real Dataset', intersections_PLSDA_imb_uni),
    ('GAN Augmented Real Dataset', intersections_PLSDA_bal_uni),
    ('GAN Samples Dataset', intersections_PLSDA_GAN_uni),
)

# Iterate over the PLS-DA univariate results themselves (not the RF
# dictionary) so the dataset keys always match the dictionaries indexed below.
for i in intersections_PLSDA_imb_uni:
    # k = 1..n, used both as x-axis and as the normalising denominator
    ranks = np.arange(1, len(intersections_PLSDA_uni[i]) + 1)
    for ax, (_, counts) in zip(axs, panels):
        ax.scatter(ranks, np.array(counts[i]) / ranks, label=i, s=5)

for ax, (title, _) in zip(axs, panels):
    ax.set_title(title, fontsize=15)
    # Show only the first quarter of the ranking; `i` is the last dataset key
    # visited by the loop above.
    ax.set_xlim([0, len(intersections_PLSDA_uni[i]) // 4])

f.supxlabel('Nº of Top Important (VIP Score) Compounds', fontsize=15)
f.supylabel('Fraction of Common Compounds', fontsize=15)

# Legend (one entry per dataset) placed outside the right-most panel
axs[2].legend(loc='upper left', fontsize=11, bbox_to_anchor=(1, 1), ncol=1, markerscale=3)
plt.suptitle('PLS-DA', fontsize=18)
plt.show()