In [1]:
# pip install -U scikit-fuzzy
# pip install skfuzzy

In [2]:
import time
import numpy as np
import pandas as pd
import random
import shap
import pickle

from sklearn.cluster import KMeans

import skfuzzy as fuzz
from scipy import linalg as la
from sklearn.metrics import accuracy_score
from sklearn.base import MultiOutputMixin, BaseEstimator

import skfuzzy as fuzz

import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

In [3]:
%%html
<style>
/* Let output containers size to their content, up to a very large cap. */
.output_wrapper, .output {
    height:auto !important;
    max-height:10000px;  /* your desired max-height here */
}
/* Drop the box shadow on elements with the .output_scroll class.
   The vendor-prefixed property needs its leading dash to be recognised. */
.output_scroll {
    box-shadow:none !important;
    -webkit-box-shadow:none !important;
}
</style>

In [4]:
# Names of the datasets used in the experiments; each name is later turned
# into a path of the form './datasets/<name>.csv'.
datasets = [
    'ecoli',
    'glass',
    'heart-statlog',
    'iris',
    'liver-disorders',
    'pima',
    'vehicle',
    'wine-quality-red',
    'yeast',
    'vertebra-column-2c',
    'saheart',
    'new-thyroid',
    'echocardiogram',
    'appendicitis',
    'hayes-roth',
]

In [5]:
# Local imports
from FuzzyCMeansWrapper import FuzzyCMeans
from KMeansfunctions import f_importance
from miscellaneous import featuresorting
from plotting import resultplot
from KMeansfunctions import kmeans_performance

In [7]:
# Maximum number of rows for which SHAP values are computed per dataset.
shap_max_size = 5
from sklearn.cluster import KMeans
import numpy as np


def _mean_abs_shap(shap_values):
    """Collapse SHAP values into one importance score per feature.

    For every model output the absolute SHAP values are averaged over the
    explained samples, and the per-output averages are summed.

    Parameters
    ----------
    shap_values : list of 2-D arrays, 2-D array, or 3-D array
        Result of ``KernelExplainer.shap_values``. A list is treated as one
        ``(n_samples, n_features)`` array per model output. A 2-D array is
        treated as a single output. A 3-D array is assumed to be laid out as
        ``(n_samples, n_features, n_outputs)`` -- this is believed to be the
        format of newer shap releases; TODO confirm against the installed
        version.

    Returns
    -------
    numpy.ndarray
        1-D array with one score per feature.
    """
    if isinstance(shap_values, list):
        per_output = shap_values
    else:
        values = np.asarray(shap_values)
        if values.ndim == 3:
            per_output = [values[:, :, k] for k in range(values.shape[2])]
        else:
            per_output = [values]

    importance = np.zeros(per_output[0].shape[1])
    for output_values in per_output:
        importance += np.mean(np.absolute(output_values), axis=0)
    return importance


def _fcm_perturbation_scores(df, ordered_features, fcm, y_true):
    """Score the FCM model while features are progressively neutralised.

    Following ``ordered_features``, one more column per step is replaced by
    its mean (cumulatively), the hard cluster assignment is recomputed as the
    argmax of ``fcm.predict``, and its agreement with ``y_true`` is measured
    with ``accuracy_score``.

    Returns
    -------
    list of float
        Starts with 1.0 (no feature perturbed), followed by one score per
        entry of ``ordered_features``.
    """
    scores = [1.0]
    excluded = []
    for feature in ordered_features:
        excluded.append(feature)

        # Start from the untouched data every step so that each column mean
        # is always taken over the original values.
        df_temp = df.copy()
        for column in excluded:
            df_temp[column] = df_temp[column].mean()

        y_pred = np.argmax(fcm.predict(df_temp.values), axis=1)
        scores.append(accuracy_score(y_true, y_pred))
    return scores


for dataset in datasets:

    # Load the current dataset. The number of distinct values in the last
    # column sets the number of clusters; that column is then dropped.
    df = pd.read_csv('./datasets/'+dataset+'.csv')
    n_centers = len(np.unique(df.values[:,-1]))
    df = df.drop(df.columns[-1], axis='columns')

    # KMeans clusters; its own predictions are the reference labels
    kmeans = KMeans(init="random", n_clusters=n_centers, n_init=10, max_iter=1000, random_state=0)
    kmeans.fit(df.values)
    y_true_kmeans = kmeans.predict(df.values)

    # FuzzyCMeans clusters; reference labels are the argmax of its predictions
    fcm = FuzzyCMeans(n_centers, seed=0).fit(df.values)
    y_true_fcm = np.argmax(fcm.predict(df.values), axis=1)

    # Only explain a sample of rows when the dataset exceeds shap_max_size
    if df.shape[0] > shap_max_size:
        shapdf = shap.sample(df, shap_max_size)
    else:
        shapdf = df

    # NOTE(review): both explainers use the full dataset as background data,
    # which is what triggers SHAP's "could cause slower run times" warning.
    # Summarising the background (shap.sample / shap.kmeans) would be faster
    # but changes the resulting scores, so it is left as is.

    # average shap values FCM
    fcm_ex = shap.KernelExplainer(fcm.predict, df)
    ave_shap_values_fcm = _mean_abs_shap(fcm_ex.shap_values(shapdf))

    # average shap values KMeans
    kmeans_ex = shap.KernelExplainer(kmeans.predict, df)
    ave_shap_values_kmeans = _mean_abs_shap(kmeans_ex.shap_values(shapdf))

    # Dataframe with importance scores (the Spectral columns are declared but
    # not filled here)
    pbfi_df = pd.DataFrame(columns=['features','PFBI FuzzyCMeans',"SHAP FuzzyCMeans", "PFBI KMeans", "SHAP KMeans", "PFBI Spectral", "SHAP Spectral"])
    pbfi_df['features'] = df.columns
    pbfi_df['PFBI FuzzyCMeans'] = fcm.f_importance(df.values)
    pbfi_df['SHAP FuzzyCMeans'] = ave_shap_values_fcm
    pbfi_df['PFBI KMeans'] = f_importance(kmeans.cluster_centers_, df.values)
    pbfi_df['SHAP KMeans'] = ave_shap_values_kmeans
    pbfi_df.to_excel(f"importance_scores_{dataset}.xlsx")

    # Sorting features in order of importance
    FuzzyCMeans_labels, KMeans_labels, Shap_labels_fcm, Shap_labels_kmeans = featuresorting(pbfi_df)

    # Perturbation scores for each (ranking method, model) pair. Despite the
    # "errors" naming, the FCM values are accuracy_score agreements with the
    # unperturbed assignment.
    errors_kmeans = kmeans_performance(df, KMeans_labels, kmeans, y_true_kmeans)
    errors_fuzzycmeans = _fcm_perturbation_scores(df, FuzzyCMeans_labels, fcm, y_true_fcm)
    errors_shap_fuzzycmeans = _fcm_perturbation_scores(df, Shap_labels_fcm, fcm, y_true_fcm)
    errors_shap_kmeans = kmeans_performance(df, Shap_labels_kmeans, kmeans, y_true_kmeans)

    # Built from a dict so the column names are written exactly once; the
    # previous header list misspelled 'PFBI FuzzyCMeans' as 'PBFI FuzzyCMeans',
    # leaving an empty extra column in the exported sheet.
    error_df = pd.DataFrame({
        'PFBI FuzzyCMeans': errors_fuzzycmeans,
        'SHAP FuzzyCMeans': errors_shap_fuzzycmeans,
        'PFBI KMeans': errors_kmeans,
        'SHAP KMeans': errors_shap_kmeans,
    })
    error_df.to_excel(f"error_scores_{dataset}.xlsx")



Using 336 background data samples could cause slower run times. Consider using shap.sample(data, K) or shap.kmeans(data, K) to summarize the background as K samples.


  0%|          | 0/5 [00:00<?, ?it/s]

Using 336 background data samples could cause slower run times. Consider using shap.sample(data, K) or shap.kmeans(data, K) to summarize the background as K samples.


  0%|          | 0/5 [00:00<?, ?it/s]

[5 5 5 1 0 7 0 0 0 5 5 0 0 5 0 5 0 5 5 0 0 5 1 0 0 0 5 7 0 0 0 0 7 0 0 0 0
 1 0 0 0 0 5 7 1 0 0 5 0 0 0 5 0 0 7 7 7 0 7 0 7 0 0 5 5 0 0 1 0 0 0 0 5 0
 0 5 0 0 5 0 0 0 0 0 0 0 0 7 0 7 0 0 7 0 0 1 7 1 1 1 5 0 5 0 0 5 1 0 1 5 7
 0 0 5 0 0 0 1 0 7 0 0 0 0 1 0 0 0 0 0 0 0 0 5 0 0 0 5 0 0 0 0 0 0 0 6 4 0
 5 6 4 6 4 1 4 4 6 6 7 7 6 4 1 6 0 6 6 6 5 6 5 6 4 5 5 4 6 4 6 0 6 4 3 0 5
 4 6 4 4 1 4 6 4 4 6 6 5 4 4 4 6 4 4 4 7 6 6 5 4 6 6 6 1 0 0 0 0 7 1 6 1 6
 3 3 6 6 6 6 1 6 6 6 6 6 6 7 6 6 6 5 1 6 1 6 6 6 7 4 6 1 6 3 6 1 6 6 7 7 6
 1 1 1 1 1 5 1 1 1 1 1 1 1 1 7 3 1 1 1 1 3 3 3 3 3 1 0 1 1 1 1 5 1 1 5 1 7
 1 1 1 1 0 0 1 1 1 7 7 7 1 1 7 1 1 7 4 5 1 5 1 1 1 7 1 1 1 0 1 1 1 1 0 1 1
 5 1 1]
[5 5 5 1 0 7 0 0 0 5 5 0 0 5 0 5 0 5 5 0 0 5 1 0 0 0 5 7 0 0 0 0 7 0 0 0 0
 1 0 0 0 0 5 7 1 0 0 5 0 0 0 5 0 0 7 7 7 0 7 0 7 0 0 5 5 0 0 1 0 0 0 0 5 0
 0 5 0 0 5 0 0 0 0 0 0 0 0 7 0 7 0 0 7 0 0 1 7 1 1 1 5 0 5 0 0 5 1 0 1 5 7
 0 0 5 0 0 0 1 0 7 0 0 0 0 1 0 0 0 0 0 0 0 0 5 0 0 0 5 0 0 0 0 0 0 0 6 4 0
 5 6 4 6 4 1 4 4 

Using 214 background data samples could cause slower run times. Consider using shap.sample(data, K) or shap.kmeans(data, K) to summarize the background as K samples.


  0%|          | 0/5 [00:00<?, ?it/s]

Using 214 background data samples could cause slower run times. Consider using shap.sample(data, K) or shap.kmeans(data, K) to summarize the background as K samples.


  0%|          | 0/5 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [1]:
# Reduced dataset list, used to test the plotting on a smaller subset.
datasets = [
    'ecoli',
    'glass',
]

# The `which` keyword controls whether to plot both methods ("both"),
# only KMeans ("Kmeans"), or only Fuzzy C-Means ("Fuzzycmeans").
resultplot(datasets, which="both")

Completed figure 1 out of 2 total
Completed figure 2 out of 2 total
