We use this notebook to compute McNemar test p-values for every pair of trained models.  It is a little messy, but we basically just used the pandas DataFrame to count how many p-values fell below the .05 threshold for each type of comparison.

In [1]:
import glob
import tensorflow as tf
from tensorflow.keras.models import load_model
import tensorflow_datasets as tfds
from statsmodels.stats.contingency_tables import mcnemar
import numpy as np
import pandas as pd

In [2]:
# Gather every saved .h5 model found anywhere under ./models, then drop the
# one checkpoint that is excluded from the comparison.
model_glob = './models/**/*.h5'
excluded_model_path = './models/pretrained_traditional/cifar10_1e_neg_4.h5'

model_paths = glob.glob(model_glob, recursive=True)
model_paths.remove(excluded_model_path)  # raises ValueError if the file was not found
model_paths

['./models/from_start_baseline/model_3/from_start_normal_model_3.h5',
 './models/from_start_baseline/model_2/from_start_normal_model_2.h5',
 './models/from_start_baseline/model_1/from_start_normal_model.h5',
 './models/from_start_5e-1/model_3/from_start_5e-1_model_3.h5',
 './models/from_start_5e-1/model_2/from_start_5e-1_model_2.h5',
 './models/from_start_5e-1/model_1/from_start_5e-1_model.h5',
 './models/from_start_reversed/model_3/from_start_reversed_model_3.h5',
 './models/from_start_reversed/model_2/from_start_reversed_model_2.h5',
 './models/from_start_reversed/model_1/from_start_reversed_model.h5',
 './models/from_start_1e-2/model_3/from_start_model_3.h5',
 './models/from_start_1e-2/model_2/from_start_model_2.h5',
 './models/from_start_1e-2/model_1/from_start_model.h5']

In [3]:
# Load the CIFAR-10 test split; as_supervised=True makes each element a
# two-item tuple, the second of which is used as the label below.
dset, info = tfds.load('cifar10', split='test', with_info=True, as_supervised=True)
batch_size = 32

# A single pass over the unbatched dataset collects the labels; the number of
# observations follows from it, so no second pass is needed just to count.
labels = np.array([y for _, y in dset])
n_observations = len(labels)

# Batch with the same `batch_size` that `steps` is derived from, so the two
# cannot drift apart if the constant above is changed.
dset = dset.batch(batch_size)
steps = n_observations//batch_size + int(n_observations%batch_size > 0)  # ceil division
print(f'n_observations: {n_observations}, steps: {steps}')
print(labels)

n_observations: 10000, steps: 313
[7 0 6 ... 8 6 0]


# Get Predictions

In [4]:
%%time
model_predictions = {}
for path in model_paths:
    model = load_model(path, compile=False)
    preds = model.predict(dset)
    int_preds = np.argmax(preds, axis=1)
    model_predictions[path] = int_preds

CPU times: user 3min 45s, sys: 13.1 s, total: 3min 58s
Wall time: 29.9 s


# Run Pairwise McNemar

In [5]:
def _as_indicator_array(values):
    """Return `values` as a NumPy array, materialising other iterables first."""
    if isinstance(values, np.ndarray):
        return values
    return np.asarray(list(values))


def make_contingency_table(classifer_1_correct, classifer_2_correct):
    """Build the 2x2 agreement table of two classifiers' per-example correctness.

    Parameters
    ----------
    classifer_1_correct, classifer_2_correct : iterable of {0, 1} (or bool)
        Paired indicators, one entry per example: 1 where the classifier was
        correct, 0 where it was wrong.  Entries equal to neither 0 nor 1 are
        not counted in any cell.

    Returns
    -------
    np.ndarray of shape (2, 2)
        Rows refer to classifier 1, columns to classifier 2::

            [[both correct,             only classifier 1 correct],
             [only classifier 2 correct, both wrong              ]]

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.  Pairing them anyway
        would silently drop the unmatched tail and skew the counts.
    """
    first = _as_indicator_array(classifer_1_correct)
    second = _as_indicator_array(classifer_2_correct)
    if first.shape != second.shape:
        raise ValueError(
            'correctness vectors must have the same shape, got '
            f'{first.shape} and {second.shape}'
        )

    # Explicit ==1 / ==0 masks (rather than truthiness) so that any value
    # outside {0, 1} falls into no cell at all.
    first_yes, first_no = first == 1, first == 0
    second_yes, second_no = second == 1, second == 0

    y_y = int((first_yes & second_yes).sum())
    y_n = int((first_yes & second_no).sum())
    n_y = int((first_no & second_yes).sum())
    n_n = int((first_no & second_no).sum())

    return np.array([[y_y, y_n], [n_y, n_n]])

In [8]:
# Run McNemar's test once for every unordered pair of distinct models and
# record [first group, second group, first path, second path, p-value].

# De-duplicate while keeping order so each unordered pair is visited exactly
# once without tracking already-seen pairs in a list.
unique_paths = list(dict.fromkeys(model_paths))

# Correct (1) / incorrect (0) indicator per test example, computed once per
# model rather than once per pair.
is_correct = {
    path: (labels == model_predictions[path]).astype(int)
    for path in unique_paths
}

computed_p_values = []

for i, first_path in enumerate(unique_paths):
    # Only pair with models later in the list: (a, b) and (b, a) are the same
    # comparison, and a model is never compared with itself.
    for second_path in unique_paths[i + 1:]:
        # Get the contingency table
        tb = make_contingency_table(is_correct[first_path], is_correct[second_path])

        # Get the mcnemar stats
        p_value = mcnemar(tb).pvalue

        # Get the model names: the third '/'-separated component of the path,
        # e.g. './models/<group>/...' -> '<group>'
        f_model = first_path.split('/')[2]
        s_model = second_path.split('/')[2]

        computed_p_values.append([f_model, s_model, first_path, second_path, p_value])

In [9]:
# Build the results table straight from the list of records.  Stacking the
# mixed str/float rows into one NumPy array first would turn every p-value
# into a string, and re-parsing with downcast='float' would then store them
# as float32; building the frame directly keeps them as float64.
df = pd.DataFrame(
    computed_p_values,
    columns=['First', 'Second', 'f_path', 's_path', 'p-value'],
)
# Explicit cast so the column is numeric even when there are no rows.
df['p-value'] = df['p-value'].astype('float64')
df

Unnamed: 0,First,Second,f_path,s_path,p-value
0,from_start_baseline,from_start_baseline,./models/from_start_baseline/model_3/from_star...,./models/from_start_baseline/model_2/from_star...,2.865375e-01
1,from_start_baseline,from_start_baseline,./models/from_start_baseline/model_3/from_star...,./models/from_start_baseline/model_1/from_star...,9.506769e-01
2,from_start_baseline,from_start_5e-1,./models/from_start_baseline/model_3/from_star...,./models/from_start_5e-1/model_3/from_start_5e...,6.107711e-23
3,from_start_baseline,from_start_5e-1,./models/from_start_baseline/model_3/from_star...,./models/from_start_5e-1/model_2/from_start_5e...,2.462936e-07
4,from_start_baseline,from_start_5e-1,./models/from_start_baseline/model_3/from_star...,./models/from_start_5e-1/model_1/from_start_5e...,3.430616e-10
...,...,...,...,...,...
61,from_start_reversed,from_start_1e-2,./models/from_start_reversed/model_1/from_star...,./models/from_start_1e-2/model_2/from_start_mo...,1.533904e-02
62,from_start_reversed,from_start_1e-2,./models/from_start_reversed/model_1/from_star...,./models/from_start_1e-2/model_1/from_start_mo...,1.067672e-02
63,from_start_1e-2,from_start_1e-2,./models/from_start_1e-2/model_3/from_start_mo...,./models/from_start_1e-2/model_2/from_start_mo...,6.957254e-01
64,from_start_1e-2,from_start_1e-2,./models/from_start_1e-2/model_3/from_start_mo...,./models/from_start_1e-2/model_1/from_start_mo...,7.738170e-01


In [10]:
def get_comparison_row(df, row_title, alpha=.05, symmetric=False):
    """Summarise how often `row_title` differs significantly from each group.

    Parameters
    ----------
    df : pd.DataFrame
        Pairwise results with columns 'First', 'Second' and 'p-value'.
    row_title : str
        Group name to summarise (matched against the 'First' column).
    alpha : float, default .05
        A comparison counts as significant when its p-value is strictly
        below this threshold.
    symmetric : bool, default False
        With the default, only rows where `row_title` is in the 'First'
        column are counted.  If the frame stores each unordered pair once
        (as the pairwise loop in this notebook does), comparisons recorded
        with `row_title` in the 'Second' column are then missed and show up
        as '0/0'.  Pass True to count a pair in either orientation.

    Returns
    -------
    str
        'significant/total' counts against the baseline, 1e-2, 5e-1 and
        reversed groups, in that order, joined by ' | '.
    """
    # Column order of the summary row.
    groups = (
        'from_start_baseline',
        'from_start_1e-2',
        'from_start_5e-1',
        'from_start_reversed',
    )

    cells = []
    for other in groups:
        mask = (df['First'] == row_title) & (df['Second'] == other)
        if symmetric:
            # Same pair recorded the other way round; OR-ing masks cannot
            # double count a row.
            mask = mask | ((df['First'] == other) & (df['Second'] == row_title))
        pairs = df[mask]
        n_significant = int((pairs['p-value'] < alpha).sum())
        cells.append(f'{n_significant}/{len(pairs)}')

    return ' | '.join(cells)


    

In [11]:
# Significant/total counts for the baseline group; columns are
# baseline | 1e-2 | 5e-1 | reversed.
row_title = 'from_start_baseline'
get_comparison_row(df, row_title=row_title)

'0/3 | 0/9 | 9/9 | 7/9'

In [12]:
# Significant/total counts for the 1e-2 group; columns are
# baseline | 1e-2 | 5e-1 | reversed.
row_title = 'from_start_1e-2'
get_comparison_row(df, row_title=row_title)

'0/0 | 0/3 | 0/0 | 0/0'

In [13]:
# Significant/total counts for the 5e-1 group; columns are
# baseline | 1e-2 | 5e-1 | reversed.
row_title = 'from_start_5e-1'
get_comparison_row(df, row_title=row_title)

'0/0 | 9/9 | 2/3 | 7/9'

In [14]:
# Significant/total counts for the reversed group; columns are
# baseline | 1e-2 | 5e-1 | reversed.
row_title = 'from_start_reversed'
get_comparison_row(df, row_title=row_title)

'0/0 | 7/9 | 0/0 | 1/3'

In [15]:
# Show the individual comparisons between models of the reversed group.
df.loc[df['First'].eq('from_start_reversed') & df['Second'].eq('from_start_reversed')]

Unnamed: 0,First,Second,f_path,s_path,p-value
51,from_start_reversed,from_start_reversed,./models/from_start_reversed/model_3/from_star...,./models/from_start_reversed/model_2/from_star...,0.014889
52,from_start_reversed,from_start_reversed,./models/from_start_reversed/model_3/from_star...,./models/from_start_reversed/model_1/from_star...,0.449533
56,from_start_reversed,from_start_reversed,./models/from_start_reversed/model_2/from_star...,./models/from_start_reversed/model_1/from_star...,0.094086
