# Prototypes and Criticisms using MMD-critic

In [1]:
import os
import glob
import pandas as pd
import numpy as np
import sys
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.model_selection import train_test_split

In [2]:
# select prototypes with code copied from https://github.com/BeenKim/MMD-critic/blob/master/mmd.py

##############################################################################################################################
# Function choose m of all rows by MMD as per kernelfunc
# ARGS:
# K : kernel matrix
# candidate_indices : array of potential choices for selections, returned values are chosen from these  indices
# m: number of selections to be made
# is_K_sparse:  True means K is the pre-computed  csc sparse matrix? False means it is a dense matrix.
# RETURNS: subset of candidate_indices which are selected as prototypes
##############################################################################################################################

def greedy_select_protos(K, candidate_indices, m, is_K_sparse=False):

    if len(candidate_indices) != np.shape(K)[0]:
        K = K[:,candidate_indices][candidate_indices,:]

    n = len(candidate_indices)

    # colsum = np.array(K.sum(0)).ravel() # same as rowsum
    if is_K_sparse:
        colsum = 2*np.array(K.sum(0)).ravel() / n
    else:
        colsum = 2*np.sum(K, axis=0) / n

    selected = np.array([], dtype=int)
    value = np.array([])
    for i in range(m):
        maxx = -sys.float_info.max
        argmax = -1
        candidates = np.setdiff1d(range(n), selected)

        s1array = colsum[candidates]
        if len(selected) > 0:
            temp = K[selected, :][:, candidates]
            if is_K_sparse:
                # s2array = temp.sum(0) *2
                s2array = temp.sum(0) * 2 + K.diagonal()[candidates]

            else:
                s2array = np.sum(temp, axis=0) *2 + np.diagonal(K)[candidates]

            s2array = s2array/(len(selected) + 1)

            s1array = s1array - s2array

        else:
            if is_K_sparse:
                s1array = s1array - (np.abs(K.diagonal()[candidates]))
            else:
                s1array = s1array - (np.abs(np.diagonal(K)[candidates]))

        argmax = candidates[np.argmax(s1array)]
        # print("max %f" %np.max(s1array))

        selected = np.append(selected, argmax)
        # value = np.append(value,maxx)
        KK = K[selected, :][:, selected]
        if is_K_sparse:
            KK = KK.todense()

        inverse_of_prev_selected = np.linalg.inv(KK)  # shortcut

    return candidate_indices[selected]

In [3]:
# select criticisms with code copied from https://github.com/BeenKim/MMD-critic/blob/master/mmd.py

##############################################################################################################################
# function to select criticisms
# ARGS:
# K: Kernel matrix
# selectedprotos: prototypes already selected
# m : number of criticisms to be selected
# reg: regularizer type.
# is_K_sparse:  True means K is the pre-computed  csc sparse matrix? False means it is a dense matrix.
# RETURNS: indices selected as criticisms
##############################################################################################################################

def select_criticism_regularized(K, selectedprotos, m, reg='logdet', is_K_sparse=True):

    n = np.shape(K)[0]
    if reg in ['None','logdet','iterative']:
        pass
    else:
        print("wrong regularizer :" + reg)
        exit(1)
    options = dict()

    selected = np.array([], dtype=int)
    candidates2 = np.setdiff1d(range(n), selectedprotos)
    inverse_of_prev_selected = None  # should be a matrix

    if is_K_sparse:
        colsum = np.array(K.sum(0)).ravel()/n
    else:
        colsum = np.sum(K, axis=0)/n

    for i in range(m):
        maxx = -sys.float_info.max
        argmax = -1
        candidates = np.setdiff1d(candidates2, selected)

        s1array = colsum[candidates]

        temp = K[selectedprotos, :][:, candidates]
        if is_K_sparse:
            s2array = temp.sum(0)
        else:
            s2array = np.sum(temp, axis=0)

        s2array = s2array / (len(selectedprotos))

        s1array = np.abs(s1array - s2array)
        if reg == 'logdet':
            if inverse_of_prev_selected is not None: # first call has been made already
                temp = K[selected, :][:, candidates]
                if is_K_sparse:
                    temp2 = temp.transpose().dot(inverse_of_prev_selected)
                    regularizer = temp.transpose().multiply(temp2)
                    regcolsum = regularizer.sum(1).ravel()# np.sum(regularizer, axis=0)
                    regularizer = np.abs(K.diagonal()[candidates] - regcolsum)

                else:
                    # hadamard product
                    temp2 = np.array(np.dot(inverse_of_prev_selected, temp))
                    regularizer = temp2 * temp
                    regcolsum = np.sum(regularizer, axis=0)
                    regularizer = np.log(np.abs(np.diagonal(K)[candidates] - regcolsum))
                s1array = s1array + regularizer
            else:
                if is_K_sparse:
                    s1array = s1array - np.log(np.abs(K.diagonal()[candidates]))
                else:
                    s1array = s1array - np.log(np.abs(np.diagonal(K)[candidates]))
        argmax = candidates[np.argmax(s1array)]
        maxx = np.max(s1array)

        selected = np.append(selected, argmax)
        if reg == 'logdet':
            KK = K[selected,:][:,selected]
            if is_K_sparse:
                KK = KK.todense()

            inverse_of_prev_selected = np.linalg.inv(KK) # shortcut
        if reg == 'iterative':
            selectedprotos = np.append(selectedprotos, argmax)

    return selected

In [4]:
# function to compute the kernel matrix using the radial basis function
def compute_kernel_matrix(df, gamma=0.1):
    data = df.to_numpy()
    K = rbf_kernel(data, gamma=gamma)
    return K

In [5]:
# Load x_bins and y_raw into memory

random_seed = 10

# 1. Load raw data

def load_all_partitions(directory):
    all_files = glob.glob(os.path.join(directory, '*.csv'))
    df_list = [pd.read_csv(file) for file in all_files]
    return pd.concat(df_list, ignore_index=True)

x_raw = load_all_partitions('data/AF-Raw-Data/AF Data/all_raw_data_csv')

# Remove rows that are duplicates or have values > 1800
n_before = x_raw.shape[0]
x_raw = x_raw.drop_duplicates()
x_raw = x_raw[(x_raw.T < 1800).all()]
print(f"Removed {n_before - x_raw.shape[0]} rows with values > 1800 or are duplicate")

# Balance the dataset, by taking the same number of samples from each class
class_1 = x_raw[x_raw['Class_Label'] == 1]
class_0 = x_raw[x_raw['Class_Label'] == 0].sample(len(class_1), random_state=random_seed)
x_raw = pd.concat([class_1, class_0])
print(f"Balanced dataset: {x_raw.shape[0]} samples in total")

x_raw.reset_index(drop=True, inplace=True)
x_raw['Sample_id'] = x_raw.index

# Split the class label from the features and split the data into train and test
y_raw = x_raw[['Class_Label', 'Sample_id']]
x_raw = x_raw.drop(columns=['Class_Label'])
x_raw_train, x_raw_test, y_train, y_test = train_test_split(x_raw, y_raw, test_size=0.2, random_state=random_seed)


Removed 136070 rows with values > 1800 or are duplicate
Balanced dataset: 109272 samples in total


In [6]:
# 2. Create preprocessed data

# 30 bins (of 50 milliseconds) are created covering R-R intervals of 200 ms up to 1700 ms. For each
#sample the frequency of an R-R interval occurring in a certain bin was counted.

# Function to bin and count intervals for a row
def bin_row(row, bin_edges):
    intervals = row[:-1].values # Exclude the class label from binning
    bin_indices = np.digitize(intervals, bins=bin_edges, right=False)
    bin_counts = np.bincount(bin_indices, minlength=len(bin_edges)+1)[1:-1]  # Exclude counts outside defined bins
    return bin_counts

bin_edges = np.arange(200, 1751, 50) #ms
bin_column_names = [f'bin_{i + 1}' for i in range(len(bin_edges) - 1)]

def preprocess_into_bins(x_data):
    x_bins = x_data.apply(lambda row: bin_row(row, bin_edges), axis=1, result_type='expand')
    x_bins.columns = bin_column_names
    x_bins['Sample_id'] = x_data['Sample_id']
    return x_bins

x_bins_train = preprocess_into_bins(x_raw_train)
x_bins_test = preprocess_into_bins(x_raw_test)
x_bins = pd.concat([x_bins_train, x_bins_test], ignore_index=True)
x_bins = x_bins.sort_values('Sample_id')

## Here starts my code

In [7]:
# sample 20 percent of rows and observe the class distribution after sampl
sampling_ratio = 0.2
sample_indices = x_bins.sample(frac=sampling_ratio, random_state=1).Sample_id
x_bins_sampled = x_bins[x_bins['Sample_id'].isin(sample_indices)]
y_raw_sampled = y_raw[y_raw['Sample_id'].isin(sample_indices)]
print(y_raw_sampled.Class_Label.value_counts())

Class_Label
1.0    11033
0.0    10821
Name: count, dtype: int64


In [8]:
# select 10 prototypes and 10 criticisms from data of the different classes separately
num_prototypes=10
num_criticisms=10

## Prototypes and Criticisms for subset of data points with a positive class label

In [9]:
positive_indices = y_raw_sampled[y_raw_sampled.Class_Label == 1].Sample_id
sampled_positive = x_bins_sampled[x_bins_sampled['Sample_id'].isin(positive_indices)]
sampled_positive.head(5)

Unnamed: 0,bin_1,bin_2,bin_3,bin_4,bin_5,bin_6,bin_7,bin_8,bin_9,bin_10,...,bin_23,bin_24,bin_25,bin_26,bin_27,bin_28,bin_29,bin_30,bin_31,Sample_id
43522,2,16,10,10,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
45366,0,0,0,0,0,0,1,39,0,0,...,0,0,0,0,0,0,0,0,0,4
27921,1,14,16,4,4,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,6
88322,1,7,16,10,2,2,0,0,1,0,...,0,0,0,0,0,0,0,0,0,8
68659,2,10,16,7,3,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,10


In [10]:
K = compute_kernel_matrix(sampled_positive, gamma=0.0001)

In [11]:
prototypes = greedy_select_protos(K=K, candidate_indices=np.arange(len(K)), m=num_prototypes, is_K_sparse=False)
for prototype in prototypes:
    positive_index = sampled_positive.iloc[prototype].Sample_id
    print(f"Looking into prototype with index {prototype} and class label {y_raw_sampled[y_raw_sampled['Sample_id'] == positive_index]['Class_Label'].iloc[0]}")

Looking into prototype with index 1702 and class label 1.0
Looking into prototype with index 8543 and class label 1.0
Looking into prototype with index 7302 and class label 1.0
Looking into prototype with index 3768 and class label 1.0
Looking into prototype with index 4920 and class label 1.0
Looking into prototype with index 9472 and class label 1.0
Looking into prototype with index 4205 and class label 1.0
Looking into prototype with index 7162 and class label 1.0
Looking into prototype with index 3604 and class label 1.0
Looking into prototype with index 2648 and class label 1.0


In [12]:
criticisms = select_criticism_regularized(K, selectedprotos=prototypes, m=num_criticisms, is_K_sparse=False)
for criticism in criticisms:
    positive_index = sampled_positive.iloc[criticism].Sample_id
    print(f"Looking into criticism with index {criticism} and class label {y_raw_sampled[y_raw_sampled['Sample_id'] == positive_index]['Class_Label'].iloc[0]}")

Looking into criticism with index 7163 and class label 1.0
Looking into criticism with index 7301 and class label 1.0
Looking into criticism with index 3605 and class label 1.0
Looking into criticism with index 8544 and class label 1.0
Looking into criticism with index 4918 and class label 1.0
Looking into criticism with index 2647 and class label 1.0
Looking into criticism with index 9473 and class label 1.0
Looking into criticism with index 3769 and class label 1.0
Looking into criticism with index 4204 and class label 1.0
Looking into criticism with index 1700 and class label 1.0


In [13]:
positive_indices = sampled_positive.iloc[prototypes].Sample_id
df_prototypes = pd.merge(x_bins_sampled, y_raw_sampled[y_raw_sampled['Sample_id'].isin(positive_indices)], on='Sample_id', how='inner')
df_prototypes['Type'] = 'prototype'

positive_indices = sampled_positive.iloc[criticisms].Sample_id
df_criticisms = pd.merge(x_bins_sampled, y_raw_sampled[y_raw_sampled['Sample_id'].isin(positive_indices)], on='Sample_id', how='inner')
df_criticisms['Type'] = 'criticism'

df = pd.concat([df_prototypes, df_criticisms]).reset_index(drop=True)
print(df.head(20))

    bin_1  bin_2  bin_3  bin_4  bin_5  bin_6  bin_7  bin_8  bin_9  bin_10  \
0       0      0      0      0      2      8      7      8      6       3   
1       0      0      0      5      7      7      3      2      4       7   
2       0      0      6     13     10      4      1      2      3       0   
3       0      0      0     11     16      6      1      1      2       1   
4       0      0      0      2     10      9      6      6      6       1   
5       0      0      0      0      3      7      6      3      6       4   
6       0      0      0     40      0      0      0      0      0       0   
7       0      0      0     40      0      0      0      0      0       0   
8       0      0      1     39      0      0      0      0      0       0   
9       0      0      0      6      5      8      5      6      4       3   
10      0      0      0      0      1      3     11      9      7       4   
11      0      0      0      6      6      6      3      2      7       8   

In [14]:
df.to_csv('protocisms_positive.csv')

## Prototypes and Criticisms for subset of data points with a negative class label

In [15]:
negative_indices = y_raw_sampled[y_raw_sampled.Class_Label == 0].Sample_id
sampled_negative = x_bins_sampled[x_bins_sampled['Sample_id'].isin(negative_indices)]
sampled_negative.head(5)

Unnamed: 0,bin_1,bin_2,bin_3,bin_4,bin_5,bin_6,bin_7,bin_8,bin_9,bin_10,...,bin_23,bin_24,bin_25,bin_26,bin_27,bin_28,bin_29,bin_30,bin_31,Sample_id
76679,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,54636
80506,0,0,0,0,0,0,22,18,0,0,...,0,0,0,0,0,0,0,0,0,54637
19479,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,54644
99117,0,0,0,0,0,0,40,0,0,0,...,0,0,0,0,0,0,0,0,0,54645
20031,0,0,0,0,0,0,0,0,40,0,...,0,0,0,0,0,0,0,0,0,54650


In [16]:
num_prototypes=10
num_criticisms=10
K = compute_kernel_matrix(sampled_negative, gamma=0.0001)

In [17]:
prototypes = greedy_select_protos(K=K, candidate_indices=np.arange(len(K)), m=num_prototypes, is_K_sparse=False)
for prototype in prototypes:
    negative_index = sampled_negative.iloc[prototype].Sample_id
    print(f"Looking into prototype with index {prototype} and class label {y_raw_sampled[y_raw_sampled['Sample_id'] == negative_index]['Class_Label'].iloc[0]}")

Looking into prototype with index 10447 and class label 0.0
Looking into prototype with index 4487 and class label 0.0
Looking into prototype with index 9831 and class label 0.0
Looking into prototype with index 6533 and class label 0.0
Looking into prototype with index 1810 and class label 0.0
Looking into prototype with index 3326 and class label 0.0
Looking into prototype with index 8637 and class label 0.0
Looking into prototype with index 4384 and class label 0.0
Looking into prototype with index 8498 and class label 0.0
Looking into prototype with index 1461 and class label 0.0


In [18]:
criticisms = select_criticism_regularized(K, selectedprotos=prototypes, m=num_criticisms, is_K_sparse=False)
for criticism in criticisms:
    negative_index = sampled_negative.iloc[criticism].Sample_id
    print(f"Looking into prototype with index {criticism} and class label {y_raw_sampled[y_raw_sampled['Sample_id'] == negative_index]['Class_Label'].iloc[0]}")

Looking into prototype with index 1459 and class label 0.0
Looking into prototype with index 8636 and class label 0.0
Looking into prototype with index 4382 and class label 0.0
Looking into prototype with index 9832 and class label 0.0
Looking into prototype with index 6531 and class label 0.0
Looking into prototype with index 4484 and class label 0.0
Looking into prototype with index 1809 and class label 0.0
Looking into prototype with index 8505 and class label 0.0
Looking into prototype with index 3325 and class label 0.0
Looking into prototype with index 10454 and class label 0.0


In [19]:
negative_indices = sampled_negative.iloc[prototypes].Sample_id
df_prototypes = pd.merge(x_bins_sampled, y_raw_sampled[y_raw_sampled['Sample_id'].isin(negative_indices)], on='Sample_id', how='inner')
df_prototypes['Type'] = 'prototype'

negative_indices = sampled_negative.iloc[criticisms].Sample_id
df_criticisms = pd.merge(x_bins_sampled, y_raw_sampled[y_raw_sampled['Sample_id'].isin(negative_indices)], on='Sample_id', how='inner')
df_criticisms['Type'] = 'criticism'

df = pd.concat([df_prototypes, df_criticisms]).reset_index(drop=True)
print(df.head(20))

    bin_1  bin_2  bin_3  bin_4  bin_5  bin_6  bin_7  bin_8  bin_9  bin_10  \
0       0      0      0      2     10      7      4      4      1       6   
1       0      0      0      0      0      0      1      1      7       5   
2       0      0      0      0      0      0      0      0     17      15   
3       0      0      0      0      0      0      0      0      4      18   
4       0      0      0      0      0      0      0      0      0       1   
5       0      0      4      5      2      4      9      6      4       5   
6       0      0      0      0      0      0      0      1      2       4   
7       0      0      0      0      0      0      0      0      0      25   
8       0      0      0      0      0      0      2      1      4       4   
9       0      0      0      0      0      0      0      4      0       0   
10      0      0      1     10      7      5      5      3      4       0   
11      0      0      0      4      4      7      7      4      8       2   

In [20]:
df.to_csv('protocisms_negative.csv')

## Combine negative and positive protocisms

In [21]:
neg = pd.read_csv('protocisms_negative.csv')
pos = pd.read_csv('protocisms_positive.csv')
df = pd.concat([neg,pos]).reset_index(drop=True)
df.drop(columns='Unnamed: 0', inplace=True)
df.head(40)

Unnamed: 0,bin_1,bin_2,bin_3,bin_4,bin_5,bin_6,bin_7,bin_8,bin_9,bin_10,...,bin_25,bin_26,bin_27,bin_28,bin_29,bin_30,bin_31,Sample_id,Class_Label,Type
0,0,0,0,2,10,7,4,4,1,6,...,0,0,0,0,0,0,0,62211,0.0,prototype
1,0,0,0,0,0,0,1,1,7,5,...,0,0,0,0,0,0,0,63996,0.0,prototype
2,0,0,0,0,0,0,0,0,17,15,...,0,0,0,0,0,0,0,71545,0.0,prototype
3,0,0,0,0,0,0,0,0,4,18,...,0,0,0,0,0,0,0,76671,0.0,prototype
4,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,77103,0.0,prototype
5,0,0,4,5,2,4,9,6,4,5,...,0,0,0,0,0,0,0,87342,0.0,prototype
6,0,0,0,0,0,0,0,1,2,4,...,0,0,0,0,0,0,0,97517,0.0,prototype
7,0,0,0,0,0,0,0,0,0,25,...,0,0,0,0,0,0,0,98203,0.0,prototype
8,0,0,0,0,0,0,2,1,4,4,...,0,0,0,0,0,0,0,104214,0.0,prototype
9,0,0,0,0,0,0,0,4,0,0,...,0,0,0,0,0,0,0,107466,0.0,prototype


In [22]:
df.to_csv('protocisms.csv')