## Diversity 

In [1]:
import warnings
warnings.filterwarnings('ignore') 

import math 
from os import path
import pandas as pd 
from tqdm import tqdm  
import matplotlib.pyplot as plt

from functions import *  

### preprocessing libraries 
from sklearn.model_selection import train_test_split  
from sklearn.preprocessing import MinMaxScaler, StandardScaler


### model libraries 
from sklearn.tree import DecisionTreeClassifier 
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.svm import SVC 
from sklearn.linear_model import LogisticRegression 
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier


### static 
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier 
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier 


### libraries for metrics 
from sklearn.metrics import (accuracy_score,
                             precision_score,
                             recall_score, 
                             f1_score,
                             roc_auc_score, 
                             precision_recall_curve,
                             balanced_accuracy_score,
                             auc) 

### data balancing libraries 
from imblearn.over_sampling import SMOTE 

In [2]:
def side_by_side(*objs, **kwds):
    from pandas.io.formats.printing import adjoin
    space = kwds.get('space', 4)
    reprs = [repr(obj).split('\n') for obj in objs]
    print (adjoin(space, *reprs))
    print()
    return

#### Load dataset 

In [3]:
assessment_filename       = "data/assessment_statistics.csv"
cogniteive_score_filename = "data/cogniteive_score_statistics.csv"
mri_filename              = "data/mri_statistics.csv"   
four_labels_filename      = "data/four_labels.csv" 

baselineDF = pd.read_csv("data/Baseline_final.csv") 
assessmentDF      = pd.read_csv(assessment_filename)
cognitive_scoreDF = pd.read_csv(cogniteive_score_filename)
mriDF             = pd.read_csv(mri_filename) 
four_labelsDF     = pd.read_csv(four_labels_filename)

In [187]:
complete_datasetDF = assessmentDF[:]
# complete_datasetDF = pd.merge(assessmentDF, cognitive_scoreDF, on="RID", how="inner") 
complete_datasetDF = pd.merge(complete_datasetDF, mriDF, on="RID", how="inner") 
complete_datasetDF = pd.merge(complete_datasetDF, baselineDF, on="RID", how="inner") 

# complete_datasetDF = pd.merge(complete_datasetDF, four_labelsDF, on="RID", how="inner") 
print("Shape: {}".format(complete_datasetDF.shape)) 

Shape: (1371, 146)


In [188]:
mapping = {"AD": 0, "sMCI": 1, "CN": 2, "pMCI": 3} 

### Single Best 

In [189]:
def _process_predictions(y, y_pred1, y_pred2):
    """Pre-process the predictions of a pair of base classifiers for the
    computation of the diversity measures
    Parameters
    ----------
    y : array of shape (n_samples):
        class labels of each sample.
    y_pred1 : array of shape (n_samples):
              predicted class labels by the classifier 1 for each sample.
    y_pred2 : array of shape (n_samples):
              predicted class labels by the classifier 2 for each sample.
    Returns
    -------
    N00 : Percentage of samples that both classifiers predict the wrong label
    N10 : Percentage of samples that only classifier 2 predicts the wrong label
    N10 : Percentage of samples that only classifier 1 predicts the wrong label
    N11 : Percentage of samples that both classifiers predict the correct label
    """
    size_y = len(y)
    if size_y != len(y_pred1) or size_y != len(y_pred2):
        raise ValueError(
            'The vector with class labels must have the same size.')

    N00, N10, N01, N11 = 0.00001, 0.00001, 0.00001, 0.00001
    for index in range(size_y):
        if y_pred1[index] == y[index] and y_pred2[index] == y[index]:
            N11 += 1.0
        elif y_pred1[index] == y[index] and y_pred2[index] != y[index]:
            N10 += 1.0
        elif y_pred1[index] != y[index] and y_pred2[index] == y[index]:
            N01 += 1.0
        else:
            N00 += 1.0

    return N00 / size_y, N10 / size_y, N01 / size_y, N11 / size_y

In [190]:
import numpy as np 

def Q_statistic(y, y_pred1, y_pred2):
    """Calculates the Q-statistics diversity measure between a pair of
    classifiers. The Q value is in a range [-1, 1]. Classifiers that tend to
    classify the same object correctly will have positive values of Q, and
    Q = 0 for two independent classifiers.
    Parameters
    ----------
    y : array of shape (n_samples):
        class labels of each sample.
    y_pred1 : array of shape (n_samples):
              predicted class labels by the classifier 1 for each sample.
    y_pred2 : array of shape (n_samples):
              predicted class labels by the classifier 2 for each sample.
    Returns
    -------
    Q : The q-statistic measure between two classifiers
    """
    N00, N10, N01, N11 = _process_predictions(y, y_pred1, y_pred2)
    Q = ((N11 * N00) - (N01 * N10)) / ((N11 * N00) + (N01 * N10))
    return Q


def correlation_coefficient(y, y_pred1, y_pred2):
    """Calculates the correlation  between two classifiers using oracle
    outputs. Coefficient is a value in a range [-1, 1].
    Parameters
    ----------
    y : array of shape (n_samples):
        class labels of each sample.
    y_pred1 : array of shape (n_samples):
              predicted class labels by the classifier 1 for each sample.
    y_pred2 : array of shape (n_samples):
              predicted class labels by the classifier 2 for each sample.
    Returns
    -------
    rho : The correlation coefficient measured between two classifiers
    """
    N00, N10, N01, N11 = _process_predictions(y, y_pred1, y_pred2)
    tmp = (N11 * N00) - (N10 * N01)
    rho = tmp / np.sqrt((N11 + N01) * (N10 + N00) * (N11 + N10) * (N01 + N00))
    return rho


def disagreement_measure(y, y_pred1, y_pred2):
    """Calculates the disagreement measure between a pair of classifiers. This
    measure is calculated by the frequency that only one classifier makes the
    correct prediction.
    Parameters
    ----------
    y : array of shape (n_samples):
        class labels of each sample.
    y_pred1 : array of shape (n_samples):
              predicted class labels by the classifier 1 for each sample.
    y_pred2 : array of shape (n_samples):
              predicted class labels by the classifier 2 for each sample.
    Returns
    -------
    disagreement : The frequency at which both classifiers disagrees
    """
    _, N10, N01, _ = _process_predictions(y, y_pred1, y_pred2)
    disagreement = N10 + N01
    return disagreement

In [191]:
def measure_diversity(model1, model2, dataset, target_column, test_size, seed, scaler_type): 
    X_train, X_test, y_train, y_test = split_dataset(dataset, target_column, test_size, seed) 
    
    scaled_X_train, scaled_X_test = normalize_dataset(X_train, X_test, scaler_type) 
    
    ## 4. balancing data 
    balanced_trainX, balanced_trainY, balanced_testX, balanced_testY = balance_data(scaled_X_train, y_train, 
                                                                                        scaled_X_test, y_test) 
    
    model1.fit(balanced_trainX, balanced_trainY)
    model2.fit(balanced_testX, balanced_testY) 
    
    y_pred1 = model1.predict(balanced_testX)
    y_pred2 = model2.predict(balanced_testX)
    
    q_statistic       = Q_statistic(balanced_testY.values, y_pred1, y_pred2) 
    corr_score        = correlation_coefficient(balanced_testY, y_pred1, y_pred2) 
    disagreement      = disagreement_measure(balanced_testY, y_pred1, y_pred2) 
    
    return q_statistic, corr_score, disagreement 


def get_diversity(model, compared_models, dataset, target_column, test_size, seed, scaler_type): 
    
    q_statistic_list  = []
    corr_score_list   = [] 
    disagreement_list = [] 
    names = [] 
    
    for c_model in compared_models:  
        q, c, d = measure_diversity(model, c_model, dataset, target_column, test_size, seed, scaler_type) 
        q_statistic_list.append(q)
        corr_score_list.append(c)
        disagreement_list.append(d)
        
        names.append(c_model.__class__.__name__)
    
    result = {"model": names, 
              "q_statistic": q_statistic_list, 
              "corr_score": corr_score_list, 
              "disagreement": disagreement_list}
    
    return pd.DataFrame(result)

In [192]:
TARGET_COLUMN = "DX" 
TEST_SIZE     = 0.20 
SPLIT_SEEDS   = [45, 78, 95, 15, 53, 12, 85, 61, 77, 10] 
SCALER_TYPE   = "mm" # ss for Sdandard Scaler, mm for MinMax Scaler 
CORR_LIMIT    = 0.70 
CV_K          = 5 

dataset       = complete_datasetDF.drop(['RID'], axis=1) 

mapping = {"AD": 0, "sMCI": 1, "CN": 2, "pMCI": 3}

dataset['DX'] = dataset['DX'].map(mapping)  

In [220]:
model_list = [KNeighborsClassifier(), 
              RandomForestClassifier(random_state=42),  
              GaussianNB(), 
              MLPClassifier(solver='adam', random_state=42),  
              SVC(probability=True, random_state=42),   
              LGBMClassifier(random_state=42), 
              LogisticRegression(), 
              XGBClassifier(random_state=42), 
             ] 

diversityDF = pd.DataFrame() 

model = DecisionTreeClassifier(random_state=42)

SPLIT_SEEDS = [45, 78, 95, 15, 53, 12, 85, 61, 77, 10]

for seed in SPLIT_SEEDS: 
    res = get_diversity(model, model_list, dataset, TARGET_COLUMN, TEST_SIZE, seed, SCALER_TYPE) 
    diversityDF = pd.concat([diversityDF, res])

In [221]:
def result_formatter(df): 
    columns = df.columns[1:]
    result = {}
    
    for col in columns: 
        result[col] = "{}±{}".format(round(df[col].mean(), 3), round(df[col].std(), 3))  
    
    return pd.DataFrame(result, index = [0])

In [222]:
result_formatter(diversityDF)

Unnamed: 0,q_statistic,corr_score,disagreement
0,0.554±0.146,0.136±0.12,0.269±0.021
