In [4]:
import os
import numpy as np
import pandas as pd
import util as u

# Statistical tests
from scipy.stats import wilcoxon

In [15]:
# Experiment configuration shared by every cell below.

# Factor count; it selects the `k_<n>` results folder that is read.
k_factors = 2

# Classifier whose results are used as the baseline ('opf' or 'knn').
baseline = 'opf' # or knn

# Data sets, in the order they will appear in the final table.
datasets = [
    'Blood',
    'Cancer',
    'CMC',
    'Digits',
    'Iris',
]

# Accuracy columns compared against each other: baseline first, then each
# matrix-factorization method.
cols_acc = [f'acc_{method}' for method in ('baseline', 'nmf', 'pmf', 'svd')]

In [16]:
# Root folder of the experiment outputs for the configured number of factors,
# resolved against the current working directory.
project_dir = f'{os.getcwd()}/out/KIS_21/k_{k_factors}'

### Calculate statistics (Wilcoxon signed-rank test)

In [17]:
def _read_mf_means(results_dir, sheet, usecols, col_names):
    """Average one sheet of every ``.xlsx`` results file found in a directory.

    Parameters
    ----------
    results_dir : str
        Directory containing the experimental-results workbooks.
    sheet : str
        Name of the worksheet to read from each workbook.
    usecols : str
        Excel column letters forwarded to ``pd.read_excel``.
    col_names : list of str
        Labels assigned to the selected columns, in sheet order.

    Returns
    -------
    pd.DataFrame
        One column per workbook (labelled '1', '2', ...) and one row per
        entry of ``col_names`` holding that workbook's column means.

    Raises
    ------
    FileNotFoundError
        If ``results_dir`` does not exist (raised by ``os.scandir``) or
        contains no ``.xlsx`` file.
    """
    with os.scandir(results_dir) as entries:
        # os.scandir yields entries in arbitrary order. The run number given
        # below is what pairs the samples of the different methods in the
        # Wilcoxon test, so the order must be deterministic and identical in
        # every method's folder: sort by path.
        xlsx_paths = sorted(e.path for e in entries if e.path.endswith('.xlsx'))

    if not xlsx_paths:
        raise FileNotFoundError(f'No .xlsx results found in {results_dir}')

    means_by_run = {}
    for run, path in enumerate(xlsx_paths, start=1):
        df_run = pd.read_excel(path, sheet_name=sheet, usecols=usecols)
        df_run.columns = col_names
        means_by_run[str(run)] = df_run.mean(axis=0)

    return pd.DataFrame(means_by_run)


def _fmt_mean_std(mean, std):
    """Render ``mean +- std`` as an inline-math LaTeX cell with 4 decimals.

    Fixed-width formatting keeps trailing zeros (0.9500 rather than 0.95) so
    every cell of the table shows the same number of digits.
    """
    return f'${mean:.4f} \\pm {std:.4f}$'


# Sparsity levels (%) to read; each one is stored in its own sheet.
sparsity_levels = range(10, 80 + 10, 10)

# Significance level of the Wilcoxon signed-rank test.
alpha = 0.05

# Seeded generator: the randomly selected baseline (and therefore the whole
# table) is the same on every Restart & Run All.
rng = np.random.default_rng(0)

# One dict per table row; the DataFrame is built once at the end.
latex_rows = []

for dataset in datasets:

    dataset = dataset.lower()

    # One dataframe of per-run means for each sparsity level.
    l_stats = []

    for j in sparsity_levels:

        sheet = f'S_{j:.1f}'

        df_nmf = _read_mf_means(
            '/'.join([project_dir, baseline, 'nmf', dataset]),
            sheet, 'C, J', ['acc_nmf', 'time_nmf'])

        df_pmf = _read_mf_means(
            '/'.join([project_dir, baseline, 'pmf', dataset]),
            sheet, 'C, J', ['acc_pmf', 'time_pmf'])

        # The baseline columns are taken from the SVD workbooks.
        df_svd = _read_mf_means(
            '/'.join([project_dir, baseline, 'svd', dataset]),
            sheet, 'B, C, G, J',
            ['acc_baseline', 'acc_svd', 'time_baseline', 'time_svd'])

        # Rows = runs, columns = baseline / NMF / PMF / SVD metrics.
        df_res = pd.concat(
            [df_svd.T[['acc_baseline', 'time_baseline']],
             df_nmf.T,
             df_pmf.T,
             df_svd.T[['acc_svd', 'time_svd']]],
            axis=1)

        l_stats.append(df_res)

    # Select at random the sparsity level whose baseline accuracies are
    # reported for every row of this data set.
    idx_baseline = rng.integers(0, len(l_stats))
    baseline_acc = l_stats[idx_baseline]['acc_baseline']

    for df in l_stats:

        # Replace the baseline results with the randomly selected ones.
        df['acc_baseline'] = baseline_acc

        # Mean / standard deviation of the accuracy over the runs.
        avg_acc, std_acc = df[cols_acc].mean(), df[cols_acc].std()

        # The best mean accuracy is the control group (G1), always in bold.
        idx_best = avg_acc.idxmax()
        g_crt = df[idx_best].values

        new_row = {
            idx_best: '\\bm{' + _fmt_mean_std(avg_acc[idx_best], std_acc[idx_best]) + '}'
        }

        for idx_cmp in avg_acc.index:

            if idx_cmp == idx_best:
                continue

            # Paired Wilcoxon signed-rank test of G1 against this method (G2).
            _, p = wilcoxon(g_crt, df[idx_cmp].values, zero_method='zsplit')

            avg_std_tex = _fmt_mean_std(avg_acc[idx_cmp], std_acc[idx_cmp])

            if p > alpha:
                # Fail to reject H0: no statistically significant difference
                # from the best method, so this result is also set in bold.
                avg_std_tex = '\\bm{' + avg_std_tex + '}'

            new_row[idx_cmp] = avg_std_tex

        latex_rows.append(new_row)

df_acc_latex = pd.DataFrame(latex_rows, columns=cols_acc)

### Building the LaTeX table from the statistical results (pandas.DataFrame)

In [18]:
# Set hierarchical indexes for rows: one (dataset, sparsity level) pair per
# row, in the same order in which the rows were produced.
idx_iter = [datasets, list(range(10, 90, 10))]
df_acc_latex.index = pd.MultiIndex.from_product(
    idx_iter,
    # Raw string: '\%' is not a valid Python escape sequence and triggers a
    # deprecation/syntax warning in a normal literal. The value is unchanged
    # (backslash + percent), which LaTeX needs for a literal '%'.
    names=('Dataset', r'Sparsity (\%)'),
)

# Set hierarchical indexes for columns: the baseline classifier on its own,
# the three factorization methods grouped under a common header.
col_iter = [
    ('Baseline', baseline.upper()),
    ('Proposed methods', 'NMF'),
    ('Proposed methods', 'PMF'),
    ('Proposed methods', 'SVD'),
]
df_acc_latex.columns = pd.MultiIndex.from_tuples(col_iter)

In [19]:
# Write the table next to the results it was computed from. Deriving the
# folder from `project_dir` keeps it in sync with `k_factors` (it used to be
# hard-coded to k_2).
stats_dir = '/'.join([project_dir, 'stats'])
os.makedirs(stats_dir, exist_ok=True)

# The LaTeX column spec needs one entry per rendered column: every index
# level plus every data column. A single 'c' yields an invalid tabular.
n_tex_cols = df_acc_latex.index.nlevels + len(df_acc_latex.columns)

df_acc_latex.to_latex(
    '/'.join([stats_dir, f'tex_table_{baseline}_accuracy_{u.get_datetime()}']),
    index=True,
    sparsify=False,
    float_format='%.4f',
    column_format='c' * n_tex_cols,
    multicolumn=True,
    multicolumn_format='c',
    multirow=True,
    caption='Average accuracy comparison.',
    label=f'tab_{baseline}_accuracy',
    # Cells already contain LaTeX markup (\bm, \pm, $...$); do not escape it.
    escape=False,
    position='!ht')