In [1]:
import pandas as pd 
import numpy as np
from scipy import linalg
from scipy.spatial.distance import pdist
import networkx as nx
import sys
import random
from tqdm import tqdm

from sklearn.preprocessing import StandardScaler, MinMaxScaler, normalize
from sklearn.metrics import DistanceMetric
from sklearn.metrics import pairwise_distances
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.cluster import OPTICS
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import SpectralClustering
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import classification_report
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.metrics import adjusted_rand_score
from sklearn.neighbors import NearestCentroid
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import matthews_corrcoef as mcc
from sklearn.tree import plot_tree

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

import matplotlib.pyplot as plt
from adjustText import adjust_text
import seaborn as sns
from matplotlib import rcParams, cycler

In [2]:
def norm_and_pca_from_df(df):
    '''
    Function processes the data using Principal component analysis(PCA)
    Args:
        df : Dataframe 
    Returns dataframe containing rincipal component of df
    '''
    
    df_normalized = pd.DataFrame()
    cols = list(df.columns)
    df_normalized[cols] = pd.DataFrame(normalize(MinMaxScaler().fit_transform(df[cols])))
    df_normalized.index = df.index
    
    df_pca = pd.DataFrame(PCA(n_components = 2).fit_transform(df_normalized))
    df_pca.columns = ['P1', 'P2']
    df_pca.index = df.index
    
    return df_normalized, df_pca

########################################################################################

def calculate_eigen_self_tuning(df, k):
    
    dimension = df.shape[0]
    dist_ = pdist(df)
    pd = np.zeros([dimension, dimension])
    dist = iter(dist_)
    for i in range(dimension):
        for j in range(i+1, dimension):  
            d = next(dist)
            pd[i,j] = d
            pd[j,i] = d
            
    #calculate local sigma
    sigmas = np.zeros(dimension)
    for i in tqdm(range(len(pd))):
        sigmas[i] = sorted(pd[i])[7]
    
    adjacency_matrix = np.zeros([dimension, dimension])
    dist = iter(dist_)
    for i in tqdm(range(dimension)):
        for j in range(i+1, dimension):  
            d = np.exp(-1*next(dist)**2/(sigmas[i]*sigmas[j]))
            adjacency_matrix[i,j] = d
            adjacency_matrix[j,i] = d
            
    degree_matrix = np.sum(adjacency_matrix, axis=0) * np.eye(dimension)
    
    # Normalized laplacian matrix
    d_half = linalg.fractional_matrix_power(degree_matrix, -0.5)
    laplacian_matrix_normalized = np.matmul(np.matmul(d_half, adjacency_matrix), d_half)
    
    e, v = np.linalg.eigh(laplacian_matrix_normalized)
    X = v[:, -1*k:]
    
    row_sums = X.sum(axis=1)
    Y = X / row_sums[:, np.newaxis]
            
    return X, Y

############################################################################################

def ensemble_classification(df_unlabelled, test_index_list, class_label_list, n):
    
    df = df_unlabelled.copy()
    df['class_label'] = class_label_list
    
    train_df = df.loc[~ df.index.isin(test_index_list)]
    test_df = df.loc[df.index.isin(test_index_list)]

    X_train = train_df.drop(['class_label'],axis=1)
    y_train = train_df[['class_label']]

    X_test = test_df.drop(['class_label'],axis=1)
    y_test = test_df[['class_label']]
    
    acc_1 = [] 
    mcc_1 = []
    cf_1 = []
    acc_2 = []
    mcc_2 = []
    cf_2 = []
    acc_3 = []
    mcc_3 = []
    cf_3 = []
    
    
    for values in range(0,n):
        
        classifier1 = NearestCentroid()
        classifier1.fit(X_train, y_train.values.ravel())
        acc_1.append(classifier1.score(X_test, y_test))
        mcc_1.append(mcc(classifier1.predict(X_test), y_test))
        cf_1.append(confusion_matrix(y_test, classifier1.predict(X_test)))
        print(classification_report(y_test, classifier1.predict(X_test)))
        
        classifier2 = KNeighborsClassifier(n_neighbors=5)
        classifier2.fit(X_train, y_train.values.ravel())
        acc_2.append(classifier2.score(X_test, y_test))
        mcc_2.append(mcc(classifier2.predict(X_test), y_test))
        cf_2.append(confusion_matrix(y_test, classifier2.predict(X_test)))
        print(classification_report(y_test, classifier2.predict(X_test)))
        
        classifier3 = DecisionTreeClassifier(criterion = "gini")
        classifier3.fit(X_train, y_train)
        acc_3.append(classifier3.score(X_test, y_test))
        mcc_3.append(mcc(classifier3.predict(X_test), y_test))
        cf_3.append(confusion_matrix(y_test, classifier3.predict(X_test)))
        print(classification_report(y_test, classifier3.predict(X_test)))
        
    print('Nearest Centroid (acc, mcc) -',sum(acc_1)/len(acc_1), sum(mcc_1)/len(mcc_1))
    print('K nearest neighbours (acc, mcc) -',sum(acc_2)/len(acc_2),sum(mcc_2)/len(mcc_2) )
    print('Decision Tree (acc, mcc) -',sum(acc_3)/len(acc_3),sum(mcc_3)/len(mcc_3) )
    
    return cf_1, cf_2, cf_3
    
    
############################################################################################

def ensemble_crossfold(df_unlabelled, class_label_list):
    
    df = df_unlabelled.copy()
    X = df
    y = class_label_list
    
    n_splits = 5
    n_repeats = 10000
    
    
    cv = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=42)
    
    acc_1 = [] 
    mcc_1 = []
    acc_2 = []
    mcc_2 = []
    acc_3 = []
    mcc_3 = []
    
        
    classifier1 = NearestCentroid()
    for train_index, test_index in cv.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        classifier1.fit(X_train, y_train)
        acc_1.append(classifier1.score(X_test, y_test))
        mcc_1.append(mcc(classifier1.predict(X_test), y_test))
    print(classification_report(y_test, classifier1.predict(X_test)))

    classifier2 = KNeighborsClassifier(n_neighbors=5)
    for train_index, test_index in cv.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        classifier2.fit(X_train, y_train)
        acc_2.append(classifier1.score(X_test, y_test))
        mcc_2.append(mcc(classifier1.predict(X_test), y_test))
    print(classification_report(y_test, classifier1.predict(X_test)))

    classifier3 = DecisionTreeClassifier(criterion = "gini")
    for train_index, test_index in cv.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        classifier3.fit(X_train, y_train)
        acc_3.append(classifier1.score(X_test, y_test))
        mcc_3.append(mcc(classifier1.predict(X_test), y_test))
    print(classification_report(y_test, classifier1.predict(X_test)))
        
    print('Nearest Centroid (acc, mcc) -',sum(acc_1)/len(acc_1), sum(mcc_1)/len(mcc_1))
    print('K nearest neighbours (acc, mcc) -',sum(acc_2)/len(acc_2),sum(mcc_2)/len(mcc_2) )
    print('Decision Tree (acc, mcc) -',sum(acc_3)/len(acc_3),sum(mcc_3)/len(mcc_3) )
    
############################################################################################    

In [10]:
# Load dataset 1
df_csv_data = pd.read_csv("dataset_1/d1_mp.csv", index_col = 0)
df_normalized, df_pca = norm_and_pca_from_df(df_csv_data)

df_pca['class_label'] = ['CD' if 'CD' in index 
                         else 'IBS' if 'IBS' in index
                         else 'UCr' if 'UCr' in index 
                         else 'UCa' if 'UCa' in index 
                         else 'GCA' if 'GCA' in index 
                         else 'A' if 'C' not in index else 'C' 
                         for index, patient in df_pca.iterrows()]
df_pca['class_label'] = df_pca['class_label'].astype('category').cat.codes

df_pca_control = df_pca[['P1','P2']]

index_list = ['P13_C','P14_C','P15_C','P16_C','P17_C',
              'P25_CD','P26_CD','P27_CD',
              'P34_UCr',
              'P41_UCa', 'P42_UCa',
              'P52_IBS', 'P53_IBS', 'P54_IBS','P55_IBS',
              'P62_A', 'P63_A',
              'P70_GCA', 'P71_GCA']

  df_normalized[cols] = pd.DataFrame(normalize(MinMaxScaler().fit_transform(df[cols])))
  df_normalized[cols] = pd.DataFrame(normalize(MinMaxScaler().fit_transform(df[cols])))
  df_normalized[cols] = pd.DataFrame(normalize(MinMaxScaler().fit_transform(df[cols])))
  df_normalized[cols] = pd.DataFrame(normalize(MinMaxScaler().fit_transform(df[cols])))
  df_normalized[cols] = pd.DataFrame(normalize(MinMaxScaler().fit_transform(df[cols])))
  df_normalized[cols] = pd.DataFrame(normalize(MinMaxScaler().fit_transform(df[cols])))
  df_normalized[cols] = pd.DataFrame(normalize(MinMaxScaler().fit_transform(df[cols])))
  df_normalized[cols] = pd.DataFrame(normalize(MinMaxScaler().fit_transform(df[cols])))
  df_normalized[cols] = pd.DataFrame(normalize(MinMaxScaler().fit_transform(df[cols])))
  df_normalized[cols] = pd.DataFrame(normalize(MinMaxScaler().fit_transform(df[cols])))
  df_normalized[cols] = pd.DataFrame(normalize(MinMaxScaler().fit_transform(df[cols])))
  df_normalized[cols] = pd.DataF

  df_normalized[cols] = pd.DataFrame(normalize(MinMaxScaler().fit_transform(df[cols])))
  df_normalized[cols] = pd.DataFrame(normalize(MinMaxScaler().fit_transform(df[cols])))
  df_normalized[cols] = pd.DataFrame(normalize(MinMaxScaler().fit_transform(df[cols])))
  df_normalized[cols] = pd.DataFrame(normalize(MinMaxScaler().fit_transform(df[cols])))
  df_normalized[cols] = pd.DataFrame(normalize(MinMaxScaler().fit_transform(df[cols])))
  df_normalized[cols] = pd.DataFrame(normalize(MinMaxScaler().fit_transform(df[cols])))
  df_normalized[cols] = pd.DataFrame(normalize(MinMaxScaler().fit_transform(df[cols])))
  df_normalized[cols] = pd.DataFrame(normalize(MinMaxScaler().fit_transform(df[cols])))
  df_normalized[cols] = pd.DataFrame(normalize(MinMaxScaler().fit_transform(df[cols])))
  df_normalized[cols] = pd.DataFrame(normalize(MinMaxScaler().fit_transform(df[cols])))
  df_normalized[cols] = pd.DataFrame(normalize(MinMaxScaler().fit_transform(df[cols])))
  df_normalized[cols] = pd.DataF

  df_normalized[cols] = pd.DataFrame(normalize(MinMaxScaler().fit_transform(df[cols])))
  df_normalized[cols] = pd.DataFrame(normalize(MinMaxScaler().fit_transform(df[cols])))
  df_normalized[cols] = pd.DataFrame(normalize(MinMaxScaler().fit_transform(df[cols])))
  df_normalized[cols] = pd.DataFrame(normalize(MinMaxScaler().fit_transform(df[cols])))
  df_normalized[cols] = pd.DataFrame(normalize(MinMaxScaler().fit_transform(df[cols])))
  df_normalized[cols] = pd.DataFrame(normalize(MinMaxScaler().fit_transform(df[cols])))
  df_normalized[cols] = pd.DataFrame(normalize(MinMaxScaler().fit_transform(df[cols])))
  df_normalized[cols] = pd.DataFrame(normalize(MinMaxScaler().fit_transform(df[cols])))
  df_normalized[cols] = pd.DataFrame(normalize(MinMaxScaler().fit_transform(df[cols])))
  df_normalized[cols] = pd.DataFrame(normalize(MinMaxScaler().fit_transform(df[cols])))
  df_normalized[cols] = pd.DataFrame(normalize(MinMaxScaler().fit_transform(df[cols])))
  df_normalized[cols] = pd.DataF

  df_normalized[cols] = pd.DataFrame(normalize(MinMaxScaler().fit_transform(df[cols])))
  df_normalized[cols] = pd.DataFrame(normalize(MinMaxScaler().fit_transform(df[cols])))
  df_normalized[cols] = pd.DataFrame(normalize(MinMaxScaler().fit_transform(df[cols])))
  df_normalized[cols] = pd.DataFrame(normalize(MinMaxScaler().fit_transform(df[cols])))
  df_normalized[cols] = pd.DataFrame(normalize(MinMaxScaler().fit_transform(df[cols])))
  df_normalized[cols] = pd.DataFrame(normalize(MinMaxScaler().fit_transform(df[cols])))
  df_normalized[cols] = pd.DataFrame(normalize(MinMaxScaler().fit_transform(df[cols])))
  df_normalized[cols] = pd.DataFrame(normalize(MinMaxScaler().fit_transform(df[cols])))
  df_normalized[cols] = pd.DataFrame(normalize(MinMaxScaler().fit_transform(df[cols])))
  df_normalized[cols] = pd.DataFrame(normalize(MinMaxScaler().fit_transform(df[cols])))
  df_normalized[cols] = pd.DataFrame(normalize(MinMaxScaler().fit_transform(df[cols])))
  df_normalized[cols] = pd.DataF

  df_normalized[cols] = pd.DataFrame(normalize(MinMaxScaler().fit_transform(df[cols])))
  df_normalized[cols] = pd.DataFrame(normalize(MinMaxScaler().fit_transform(df[cols])))
  df_normalized[cols] = pd.DataFrame(normalize(MinMaxScaler().fit_transform(df[cols])))
  df_normalized[cols] = pd.DataFrame(normalize(MinMaxScaler().fit_transform(df[cols])))
  df_normalized[cols] = pd.DataFrame(normalize(MinMaxScaler().fit_transform(df[cols])))
  df_normalized[cols] = pd.DataFrame(normalize(MinMaxScaler().fit_transform(df[cols])))
  df_normalized[cols] = pd.DataFrame(normalize(MinMaxScaler().fit_transform(df[cols])))
  df_normalized[cols] = pd.DataFrame(normalize(MinMaxScaler().fit_transform(df[cols])))
  df_normalized[cols] = pd.DataFrame(normalize(MinMaxScaler().fit_transform(df[cols])))
  df_normalized[cols] = pd.DataFrame(normalize(MinMaxScaler().fit_transform(df[cols])))
  df_normalized[cols] = pd.DataFrame(normalize(MinMaxScaler().fit_transform(df[cols])))
  df_normalized[cols] = pd.DataF

  df_normalized[cols] = pd.DataFrame(normalize(MinMaxScaler().fit_transform(df[cols])))
  df_normalized[cols] = pd.DataFrame(normalize(MinMaxScaler().fit_transform(df[cols])))
  df_normalized[cols] = pd.DataFrame(normalize(MinMaxScaler().fit_transform(df[cols])))
  df_normalized[cols] = pd.DataFrame(normalize(MinMaxScaler().fit_transform(df[cols])))
  df_normalized[cols] = pd.DataFrame(normalize(MinMaxScaler().fit_transform(df[cols])))
  df_normalized[cols] = pd.DataFrame(normalize(MinMaxScaler().fit_transform(df[cols])))
  df_normalized[cols] = pd.DataFrame(normalize(MinMaxScaler().fit_transform(df[cols])))
  df_normalized[cols] = pd.DataFrame(normalize(MinMaxScaler().fit_transform(df[cols])))
  df_normalized[cols] = pd.DataFrame(normalize(MinMaxScaler().fit_transform(df[cols])))
  df_normalized[cols] = pd.DataFrame(normalize(MinMaxScaler().fit_transform(df[cols])))
  df_normalized[cols] = pd.DataFrame(normalize(MinMaxScaler().fit_transform(df[cols])))
  df_normalized[cols] = pd.DataF

  df_normalized[cols] = pd.DataFrame(normalize(MinMaxScaler().fit_transform(df[cols])))
  df_normalized[cols] = pd.DataFrame(normalize(MinMaxScaler().fit_transform(df[cols])))
  df_normalized[cols] = pd.DataFrame(normalize(MinMaxScaler().fit_transform(df[cols])))
  df_normalized[cols] = pd.DataFrame(normalize(MinMaxScaler().fit_transform(df[cols])))
  df_normalized[cols] = pd.DataFrame(normalize(MinMaxScaler().fit_transform(df[cols])))
  df_normalized[cols] = pd.DataFrame(normalize(MinMaxScaler().fit_transform(df[cols])))
  df_normalized[cols] = pd.DataFrame(normalize(MinMaxScaler().fit_transform(df[cols])))
  df_normalized[cols] = pd.DataFrame(normalize(MinMaxScaler().fit_transform(df[cols])))
  df_normalized[cols] = pd.DataFrame(normalize(MinMaxScaler().fit_transform(df[cols])))
  df_normalized[cols] = pd.DataFrame(normalize(MinMaxScaler().fit_transform(df[cols])))
  df_normalized[cols] = pd.DataFrame(normalize(MinMaxScaler().fit_transform(df[cols])))
  df_normalized[cols] = pd.DataF

In [None]:
# ## Load dataset 2
# df_csv_data = pd.read_csv("dataset_2/d2_mp.csv", index_col=0)

# df_normalized, df_pca = norm_and_pca_from_df(df_csv_data)

# df_pca['class_label'] = ['C' if 'C' in index else 'N' if 'N' in index else 'H' for index, patient in df_pca.iterrows()]
# df_pca['class_label'] = df_pca['class_label'].astype('category').cat.codes
# df_pca_control = df_pca[['P1','P2']]

# index_list = ['C_15','C_16','C_17','C_18','C_19',
#               'N_44','N_45','N_46','N_47','N_48','N_49','N_50','N_51',
#               'H_74','H_75','H_76','H_77','H_78','H_79','H_80']

In [14]:
# Load benchmark data for dataset 2

df_csv_data = pd.read_csv("dataset_2/d2_benchmark.csv", index_col=0)

df_normalized, df_pca = norm_and_pca_from_df(df_csv_data)

df_pca['class_label'] = ['C' if 'C' in index else 'N' if 'N' in index else 'H' for index, patient in df_pca.iterrows()]
df_pca['class_label'] = df_pca['class_label'].astype('category').cat.codes
df_pca_control = df_pca[['P1','P2']]

index_list = ['C_15','C_16','C_17','C_18','C_19',
              'N_44','N_45','N_46','N_47','N_48','N_49','N_50','N_51',
              'H_74','H_75','H_76','H_77','H_78','H_79','H_80']

In [20]:
## Load combined dataset
df_csv = pd.read_csv("dataset_3/d3_mp.csv", index_col=0)
df_csv_data = df_csv.iloc[:, 0:170]

df_normalized, df_pca = norm_and_pca_from_df(df_csv_data)

df_pca['class_label'] = ['N' if 'N' in index
                         else 'H' if 'H' in index
                         else 'CD' if 'CD' in index 
                         else 'IBS' if 'IBS' in  index
                         else 'UCr' if 'UCr' in index 
                         else 'UCa' if 'UCa' in index
                         else 'GCA' if 'GCA' in index 
                         else 'A' if '_A' in index else 'C' 
                         for index, patient in df_pca.iterrows()]

df_pca['class_label'] = df_pca['class_label'].astype('category').cat.codes

index_list = ['P13_C','P14_C','P15_C','P16_C','P17_C',
              'P25_CD','P26_CD','P27_CD',
              'P34_UCr',
              'P41_UCa', 'P42_UCa',
              'P52_IBS', 'P53_IBS', 'P54_IBS','P55_IBS',
              'P62_A', 'P63_A',
              'P70_GCA', 'P71_GCA',
              'C_15','C_16','C_17','C_18','C_19',
              'N_44','N_45','N_46','N_47','N_48','N_49','N_50','N_51',
              'H_74','H_75','H_76','H_77','H_78','H_79','H_80']

  df_normalized[cols] = pd.DataFrame(normalize(MinMaxScaler().fit_transform(df[cols])))
  df_normalized[cols] = pd.DataFrame(normalize(MinMaxScaler().fit_transform(df[cols])))
  df_normalized[cols] = pd.DataFrame(normalize(MinMaxScaler().fit_transform(df[cols])))
  df_normalized[cols] = pd.DataFrame(normalize(MinMaxScaler().fit_transform(df[cols])))
  df_normalized[cols] = pd.DataFrame(normalize(MinMaxScaler().fit_transform(df[cols])))
  df_normalized[cols] = pd.DataFrame(normalize(MinMaxScaler().fit_transform(df[cols])))
  df_normalized[cols] = pd.DataFrame(normalize(MinMaxScaler().fit_transform(df[cols])))
  df_normalized[cols] = pd.DataFrame(normalize(MinMaxScaler().fit_transform(df[cols])))
  df_normalized[cols] = pd.DataFrame(normalize(MinMaxScaler().fit_transform(df[cols])))
  df_normalized[cols] = pd.DataFrame(normalize(MinMaxScaler().fit_transform(df[cols])))
  df_normalized[cols] = pd.DataFrame(normalize(MinMaxScaler().fit_transform(df[cols])))
  df_normalized[cols] = pd.DataF

### Non-benchmark

In [22]:
df_pca.class_label.value_counts()

1    40
6    32
4    29
5    13
2    12
7     8
0     8
3     8
8     6
Name: class_label, dtype: int64

In [21]:
# Original
cf_1, cf_2, cf_3 = ensemble_classification(df_csv_data, index_list, df_pca.class_label, 1)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.42      0.50      0.45        10
           2       0.50      0.33      0.40         3
           3       0.00      0.00      0.00         2
           4       1.00      0.14      0.25         7
           5       0.00      0.00      0.00         4
           6       0.33      0.12      0.18         8
           7       0.00      0.00      0.00         2
           8       0.00      0.00      0.00         1

    accuracy                           0.21        39
   macro avg       0.25      0.12      0.14        39
weighted avg       0.39      0.21      0.23        39

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.33      0.90      0.49        10
           2       0.00      0.00      0.00         3
           3       0.00      0.00      0.00         2
           4       0.29 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [23]:
# Standardized
cf_1, cf_2, cf_3 = ensemble_classification(df_normalized, index_list, df_pca.class_label, 1)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.44      0.40      0.42        10
           2       0.50      0.33      0.40         3
           3       0.67      1.00      0.80         2
           4       0.40      0.57      0.47         7
           5       0.33      0.25      0.29         4
           6       0.40      0.50      0.44         8
           7       1.00      0.50      0.67         2
           8       0.00      0.00      0.00         1

    accuracy                           0.44        39
   macro avg       0.42      0.39      0.39        39
weighted avg       0.43      0.44      0.42        39

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.40      0.60      0.48        10
           2       0.50      0.33      0.40         3
           3       0.00      0.00      0.00         2
           4       0.43 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [25]:
# Eigen transformation
Xst, Yst = calculate_eigen_self_tuning(df_normalized, 18)

df_U = pd.DataFrame(Xst)
df_U.index = df_csv_data.index

df_N = pd.DataFrame(Yst)
df_N.index = df_csv_data.index

cf_1, cf_2, cf_3 = ensemble_classification(df_U, index_list, df_pca.class_label, 1)

100%|█████████████████████████████████████████████████████████████████████████████| 156/156 [00:00<00:00, 11998.01it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 156/156 [00:00<00:00, 2716.30it/s]
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.17      0.50      0.25         2
           1       0.67      0.20      0.31        10
           2       0.50      0.33      0.40         3
           3       0.40      1.00      0.57         2
           4       0.25      0.29      0.27         7
           5       0.33      0.25      0.29         4
           6       0.33      0.50      0.40         8
           7       0.00      0.00      0.00         2
           8       0.00      0.00      0.00         1

    accuracy                           0.33        39
   macro avg       0.29      0.34      0.28        39
weighted avg       0.39      0.33      0.31        39

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.50      0.60      0.55        10
           2       0.50      0.33      0.40         3
           3       0.50      0.50      0.50         2
           4       0.33 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Benchmark (dataset 2)

In [None]:
# Original
ensemble_crossfold(df_csv_data, df_pca.class_label)

In [None]:
# Standardized
ensemble_crossfold(df_normalized, df_pca.class_label)

In [None]:
# Eigen transformation
Xst, Yst = calculate_eigen_self_tuning(df_normalized, 7)

df_U = pd.DataFrame(Xst)
df_U.index = df_csv_data.index

df_N = pd.DataFrame(Yst)
df_N.index = df_csv_data.index

ensemble_crossfold(df_U, df_pca.class_label)

In [None]:
ConfusionMatrixDisplay(cf_1[0]).plot()

In [None]:
sum(cf_1[0])

In [None]:
len(cf_1)

In [None]:
# Dataset 2 subset to benchmark dataset

# Create a list of column names to search for
col_names = ['BLAST_KCP_MOUSE', 'BLAST_UGPC3_RHIEC',
             'ILVC_CLOBB', 'S10A9_BOVIN',
             'SCONB_ARTOC', 'RS3_AGARV',
             'BLAST_FTHS2_DESHY', 'BLAST_RS2_DESRM',
             'BLAST_ACDS_CLOAB', 'BLAST_G3P_CLOPA']

# Filter the dataframe 2 based on column names
filtered_df = df_csv_data[[col for col in col_names if col in df_csv_data.columns]]
filtered_df