In [1]:
%matplotlib inline
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd
from statsmodels.stats.contingency_tables import mcnemar                                                           

In [2]:
def compute_power(prob_table, dataset_size, alpha=0.05, r=5000):

    if prob_table[0, 1] == prob_table[1, 0]:
        raise RuntimeError("Power is undefined when the true effect is zero.")

    pvals = []
    diffs = []
    for i in range(r):  # number of simulations
        sample = np.random.multinomial(n=dataset_size, pvals=prob_table.reshape((4,))).reshape((2,2))
        acc_diff = (sample[0,1] - sample[1, 0]) / dataset_size
        test_results = mcnemar(sample)
        pvals.append(test_results.pvalue)
        diffs.append(acc_diff)

    true_diff = prob_table[0, 1] - prob_table[1, 0]
    true_sign = np.sign(true_diff) 
    sig_diffs = [d for i, d in enumerate(diffs) if pvals[i] <= alpha]
    power = len([d for i, d in enumerate(diffs) if pvals[i] <= alpha and np.sign(d) == true_sign]) / r
    mean_effect = np.mean(diffs)
    type_m = np.mean(np.abs(sig_diffs) / np.abs(true_diff))
    type_s = np.mean(np.sign(sig_diffs) != true_sign)
    return power, mean_effect, type_m, type_s


In [3]:
# If we ONLY know the difference in accuracy, one option is to compute bounds, assuming the best and worst case scneario

baseline_acc = 0.7
delta_acc = 0.02
dataset_size = 1000
r = 10000
alpha = 0.05

#### Case 1: maximal agreement on instances
acc1 = baseline_acc
acc2 = baseline_acc + delta_acc
p_both_correct = min(acc1, acc2)
p_diff = abs(acc1-acc2)
p_both_incorrect = 1.0 - max(acc1, acc2)
# create probability table [[both correct, only M1 correct], [only M2 correct, both incorrect]]
if acc2 > acc1:
    prob_table = np.array([[p_both_incorrect, 0], [p_diff, p_both_correct]]) 
else:
    prob_table = np.array([[p_both_incorrect, p_diff], [0, p_both_correct]]) 
print("Probability table for maximal agreement:")
print(prob_table)
print("acc1 =", prob_table[1, :].sum())
print("acc2 =", prob_table[:, 1].sum())

power, mean_effect, type_m, type_s = compute_power(prob_table, dataset_size, alpha=alpha, r=r)

print("\nUpper bounds:")
print("Approx power = {:.3f}".format(power))
print("Approx Type-M error = {:.3f}".format(type_m))
print("Approx Type-S error = {:.3f}".format(type_s))

#### Case 2: maximal disagreement on instances (assume acc > 0.5)

error_rate_1 = 1.0 - acc1
error_rate_2 = 1.0 - acc2
p_both_correct = 1.0 - error_rate_1 - error_rate_2
# create probability table [[cc, ci], [ic, ii]]
prob_table = np.array([[0, error_rate_1], [error_rate_2, p_both_correct]])
print("\nProbability table for minimal agreement:")
print(prob_table)
print("acc1 =", prob_table[1, :].sum())
print("acc2 =", prob_table[:, 1].sum())
print()

power, mean_effect, type_m, type_s = compute_power(prob_table, dataset_size, alpha=alpha, r=r)

print("\nLower bounds:")
print("Approx power = {:.3f}".format(power))
print("Approx Type-M error = {:.3f}".format(type_m))
print("Approx Type-S error = {:.3f}".format(type_s))




Probability table for maximal agreement:
[[0.28 0.  ]
 [0.02 0.7 ]]
acc1 = 0.72
acc2 = 0.7

Upper bounds:
Approx power = 1.000
Approx Type-M error = 0.995
Approx Type-S error = 0.000

Probability table for minimal agreement:
[[0.   0.3 ]
 [0.28 0.42]]
acc1 = 0.7
acc2 = 0.72


Lower bounds:
Approx power = 0.119
Approx Type-M error = 3.004
Approx Type-S error = 0.022


In [4]:
# Alternatively, we can an use estimated values for agreement

agreement_rate = 0.975
baseline_acc = 0.8
delta_acc = 0.02
dataset_size = 500
r = 10000
alpha = 0.05


acc1 = baseline_acc
acc2 = baseline_acc + delta_acc

disagreement_rate = 1 - agreement_rate
if delta_acc > 0:
    p_only_1_correct = (disagreement_rate - delta_acc) / 2 
    p_only_2_correct = (disagreement_rate - delta_acc) / 2 + delta_acc
else:
    p_only_1_correct = (disagreement_rate + delta_acc) / 2 - delta_acc
    p_only_2_correct = (disagreement_rate + delta_acc) / 2 

p_both_correct = acc1 - p_only_1_correct
assert np.abs(p_both_correct - (acc2 - p_only_2_correct)) < 1e-4
p_both_incorrect = 1. - p_both_correct - p_only_1_correct - p_only_2_correct

for p in [p_both_correct, p_only_1_correct, p_only_2_correct, p_both_incorrect]:
    assert p >= 0

prob_table = np.array([[p_both_incorrect, p_only_2_correct], [p_only_1_correct, p_both_correct]]) 
print("Probability table:")
print(prob_table)
print("acc1 = {:.3f}".format(prob_table[1, :].sum()))
print("acc2 = {:.3f}".format(prob_table[:, 1].sum()))

power, mean_effect, type_m, type_s = compute_power(prob_table, dataset_size, alpha=alpha, r=r)

print("")
print("Approx power = {:.3f}".format(power))
print("Approx Type-M error = {:.3f}".format(type_m))
print("Approx Type-S error = {:.3f}".format(type_s))


Probability table:
[[0.1775 0.0225]
 [0.0025 0.7975]]
acc1 = 0.800
acc2 = 0.820

Approx power = 0.800
Approx Type-M error = 1.118
Approx Type-S error = 0.000
