In [None]:
%matplotlib inline
import os
from collections import defaultdict

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.stats.contingency_tables import mcnemar

In [None]:
# Load the Drive helper and mount
# NOTE: `google.colab` is only importable inside a Google Colab runtime.
from google.colab import drive

# This will prompt for authorization.
# The Drive contents become visible under /content/drive; the data-loading
# cells below build their paths from that mount point.
drive.mount('/content/drive')

In [None]:
# Folder (under the Drive mount point) that holds the input tables.
base_path = "/content/drive/My Drive/NLP Power Analysis/data/squad"

# Per-model table: tab-separated, first row is the header and the first
# column is used as the DataFrame index.
file1 = 'models.tsv'
models_path = os.path.join(base_path, file1)
models_df = pd.read_csv(
    models_path,
    sep='\t',
    header=0,
    index_col=0,
)


In [None]:
# Per-pair table: tab-separated with a header row; no column is promoted to
# the index, so rows keep the default integer index.
file2 = 'pairs.tsv'
pairs_path = os.path.join(base_path, file2)
pairs_df = pd.read_csv(
    pairs_path,
    sep='\t',
    header=0,
    index_col=None,
)


In [None]:
# Totals used later as the dataset sizes for the dev and test tables.
n_dev = 11873
n_test = 8862

# Per-model columns as plain arrays; position k refers to the k-th row of models_df.
model_index = list(models_df.index)
dev_ems = models_df['dev_em'].values
test_ems = models_df['test_em'].values
sota = models_df['sota'].values
descriptions = models_df['description'].values
dates = models_df['date'].values

# Nested lookups keyed as table[i][j], filled from the (i, j) rows of pairs_df.
dev_deltas, dev_disagreements = defaultdict(dict), defaultdict(dict)
test_deltas, test_disagreements = defaultdict(dict), defaultdict(dict)

i_list = pairs_df['i'].values
j_list = pairs_df['j'].values

# collect the deltas and disagreements for each pair of models
dev_deltas_list = pairs_df['dev_delta'].values
dev_disagree_list = pairs_df['dev_disagree'].values
test_deltas_list = pairs_df['test_delta'].values
test_disagree_list = pairs_df['test_disagree'].values
pair_rows = zip(
    i_list,
    j_list,
    dev_deltas_list,
    dev_disagree_list,
    test_deltas_list,
    test_disagree_list,
)
for pair_i, pair_j, dev_d, dev_dis, test_d, test_dis in pair_rows:
    dev_deltas[pair_i][pair_j] = dev_d
    dev_disagreements[pair_i][pair_j] = dev_dis
    test_deltas[pair_i][pair_j] = test_d
    test_disagreements[pair_i][pair_j] = test_dis

In [None]:
# simulate many draws
def compute_power(table, dataset_size,  n_sim=5000, alpha=0.05, rng=None):
    """Estimate the power of McNemar's test by Monte Carlo simulation.

    Repeatedly draws a 2x2 contingency table of counts from a multinomial
    distribution with the given cell probabilities, runs `mcnemar` (called
    with its default settings) on each draw, and reports the fraction of
    draws whose p-value is below `alpha`.

    Parameters
    ----------
    table : array-like with 4 elements (e.g. shape (2, 2))
        Cell probabilities; they are flattened in row-major order and passed
        as `pvals` to the multinomial sampler, so they must sum to 1.
    dataset_size : int
        Number of examples drawn in each simulated dataset.
    n_sim : int, default 5000
        Number of simulated datasets.
    alpha : float, default 0.05
        Significance threshold; a draw counts as a rejection when
        `pvalue < alpha`.
    rng : optional
        Object exposing `multinomial(n=..., pvals=...)`, e.g. a
        `numpy.random.Generator` or `numpy.random.RandomState`. Pass a seeded
        generator for reproducible results. When None (the default) the
        global `np.random` state is used, exactly as before.

    Returns
    -------
    float
        Fraction of simulations with `pvalue < alpha`.
    """
    sampler = np.random if rng is None else rng
    # The cell probabilities do not change between draws, so flatten once.
    cell_probs = np.asarray(table, dtype=float).reshape((4,))
    pvals = []
    for _ in range(n_sim):  # number of simulations
        sample = sampler.multinomial(n=dataset_size, pvals=cell_probs).reshape((2, 2))
        pvals.append(mcnemar(sample).pvalue)
    pvals = np.array(pvals)
    return np.mean(pvals < alpha)


In [None]:
def make_prob_table(baseline_em, new_em, n_delta, n_disagreement, n_total):
    """Build a 2x2 table of joint outcome probabilities for two models.

    Parameters
    ----------
    baseline_em, new_em : float
        Scores of the baseline and the new model, in percent (each is divided
        by 100 below).
    n_delta : int
        Number of examples the new model wins minus the number the baseline
        wins.
    n_disagreement : int
        Number of examples on which exactly one of the two models is correct.
    n_total : int
        Total number of examples.

    Returns
    -------
    numpy.ndarray, shape (2, 2)
        `[[p_neither, p_new_win], [p_old_win, p_both]]`, where `p_new_win` is
        the probability that only the new model is correct and `p_old_win`
        the probability that only the baseline is correct.

    Raises
    ------
    AssertionError
        If the two ways of deriving `p_both` differ by 0.001 or more, or if
        the implied `p_neither` is not positive.
    """
    # Solve  new_win + old_win = n_disagreement,  new_win - old_win = n_delta.
    # Halving the sum (rather than each term separately) avoids dropping one
    # example when both counts are odd; deriving old_win by subtraction keeps
    # the two win counts summing to n_disagreement in every case.
    n_new_win = (n_disagreement + n_delta) // 2
    n_old_win = n_disagreement - n_new_win
    p_new_win = n_new_win / n_total
    p_old_win = n_old_win / n_total
    # P(both correct) can be derived from either model's score; the two
    # derivations must agree if the inputs are mutually consistent.
    p_both_1 = baseline_em / 100 - p_old_win
    p_both_2 = new_em / 100 - p_new_win
    if not np.abs(p_both_1 - p_both_2) < 0.001:
        raise AssertionError(
            "inconsistent inputs: baseline_em={}, new_em={}, n_delta={}, "
            "n_disagreement={}, n_total={} (p_both {} vs {})".format(
                baseline_em, new_em, n_delta, n_disagreement, n_total, p_both_1, p_both_2))
    p_neither = 1.0 - p_both_1 - p_new_win - p_old_win
    if not p_neither > 0:
        raise AssertionError("p_neither must be positive, got {}".format(p_neither))
    table = np.array([[p_neither, p_new_win], [p_old_win, p_both_1]])
    return table


In [None]:
def make_count_table(baseline_em, new_em, n_delta, n_disagreement, n_total):
    """Build a 2x2 table of joint outcome counts for two models.

    Parameters
    ----------
    baseline_em, new_em : float
        Scores of the baseline and the new model, in percent.
    n_delta : int
        Accepted for signature compatibility, but not used: the delta is
        recomputed from the rounded correct counts so that it is consistent
        with the two scores.
    n_disagreement : int
        Number of examples on which exactly one of the two models is correct.
    n_total : int
        Total number of examples in the dataset.

    Returns
    -------
    numpy.ndarray, shape (2, 2)
        `[[n_neither, n_new_win], [n_old_win, n_both]]`, where `n_new_win`
        counts examples only the new model gets right and `n_old_win` those
        only the baseline gets right.

    Raises
    ------
    AssertionError
        If the two derivations of `n_both` differ by 2 or more, or if the
        implied `n_neither` is not positive.
    """
    n_old_correct = int(np.round(n_total * baseline_em / 100))
    n_new_correct = int(np.round(n_total * new_em / 100))
    # Overwrite the passed-in delta with the one implied by the rounded counts.
    n_delta = n_new_correct - n_old_correct
    # Solve  new_win + old_win = n_disagreement,  new_win - old_win = n_delta.
    # Halving the sum (rather than each term separately) avoids dropping one
    # example when both counts are odd.
    n_new_win = (n_disagreement + n_delta) // 2
    n_old_win = n_disagreement - n_new_win
    n_both_1 = n_old_correct - n_old_win
    n_both_2 = n_new_correct - n_new_win
    if not np.abs(n_both_1 - n_both_2) < 2:
        raise AssertionError(
            "inconsistent inputs: baseline_em={}, new_em={}, n_delta={}, "
            "n_disagreement={}, n_total={} (n_both {} vs {})".format(
                baseline_em, new_em, n_delta, n_disagreement, n_total, n_both_1, n_both_2))
    # Use the dataset size that was passed in, not a module-level global.
    n_neither = n_total - n_both_1 - n_new_win - n_old_win
    if not n_neither > 0:
        raise AssertionError("n_neither must be positive, got {}".format(n_neither))
    table = np.array([[n_neither, n_new_win], [n_old_win, n_both_1]])
    return table

In [None]:
# Walk through the models in table order, with model 0 as the initial SOTA.
# NOTE: the result lists below get one entry per candidate model 1..N-1, so
# entry k of each list describes model k + 1.
sota_index = 0
print(descriptions[sota_index])
powers, pvals = [], []
test_improvement, dev_improvement = [], []
new_sotas, new_sotas_info = [], []
for i in range(1, len(model_index)):
    # scores of the reigning SOTA
    sota_test_em = test_ems[sota_index]
    sota_dev_em = dev_ems[sota_index]
    # scores of the candidate model
    dev_em = dev_ems[i]
    test_em = test_ems[i]
    # pairwise statistics for (current SOTA, candidate)
    dev_delta = dev_deltas[sota_index][i]
    dev_disagreement = dev_disagreements[sota_index][i]
    test_delta = test_deltas[sota_index][i]
    test_disagreement = test_disagreements[sota_index][i]

    # power is simulated from the dev-set probability table
    table = make_prob_table(sota_dev_em, dev_em, dev_delta, dev_disagreement, n_dev)
    power = compute_power(table, n_dev)

    # significance is tested on the test-set count table
    table = make_count_table(sota_test_em, test_em, test_delta, test_disagreement, n_test)
    result = mcnemar(table)
    pval = result.pvalue

    powers.append(power)
    pvals.append(pval)
    dev_improvement.append(1 if dev_em > sota_dev_em else 0)
    test_improvement.append(1 if test_em > sota_test_em else 0)

    # only count as SOTA if there is a powered dev improvement and a
    # significant test improvement
    better_on_dev = dev_em > sota_dev_em
    better_on_test = test_em > sota_test_em
    new_sota = bool(better_on_dev and power >= 0.8 and pval <= 0.05 and better_on_test)
    if new_sota:
        #print(descriptions[i])
        sota_index = i
        new_sotas.append(i)
        # (previous SOTA test score, test gain, test disagreement rate), all as fractions
        sota_record = (
            sota_test_em / 100.0,
            (test_em - sota_test_em) / 100.0,
            test_disagreement / n_test,
        )
        new_sotas_info.append(sota_record)
        print(new_sotas_info[-1])
    #print('{:s} {:.3f} {:.3f} {:.3f} {:.3f} {:.3f} {:.9f} {:d}'.format(descriptions[i][:30], sota_dev_em, dev_em, sota_test_em, test_em, power, pval, new_sota))

print(len(new_sotas_info))

In [None]:
import statsmodels.api as sm

# Each new_sotas_info entry is
# (previous SOTA test score, test gain, test disagreement rate), as fractions.

print("---------------- Regressing Effect Size ----------------")
print(new_sotas_info)
# Regress the test gain on the previous SOTA's test score (with intercept).
x = np.array([prev_score for prev_score, _, _ in new_sotas_info]).reshape(-1, 1)
y = [gain for _, gain, _ in new_sotas_info]

xt = sm.add_constant(x)
print(x)

print(y)
lm_1 = sm.OLS(np.array(y), xt).fit()
print(lm_1.summary().as_latex())

# Predict the gain at one new score value.
# NOTE(review): 0.90724 is presumably a current SOTA score as a fraction - confirm.
new_point = np.array([[.90724]])
print(new_point.shape)

# has_constant='add' forces the intercept column even though a single-row
# input already looks constant.
new_pointt = sm.add_constant(new_point, has_constant='add')
print(new_pointt.shape)

print(lm_1.predict(new_pointt))

print("---------------- Regressing Overlap ----------------")

print(new_sotas_info)
# Regress overall agreement (1 - disagreement rate) on the previous SOTA's
# score and the gain (with intercept).
x = np.array(
    [[prev_score, gain] for prev_score, gain, _ in new_sotas_info]
).reshape(-1, 2)
y = [1.0 - disagree_rate for _, _, disagree_rate in new_sotas_info]  # predict agreement overall

xt = sm.add_constant(x)
print(x)

print(y)
lm_1 = sm.OLS(np.array(y), xt).fit()
print(lm_1.summary().as_latex())


In [None]:
# group by categories and count numbers in each
# Every counter below is a length-3 vector ordered like `cats`:
# [no sig test diff, sig test improvement, sig test decline].
cats = ['no sig test diff', 'sig test improvement', 'sig test decline']
unpowered_dev_improvement = np.zeros(3)
unpowered_no_dev_improvement = np.zeros(3)
powered_dev_improvement = np.zeros(3)
powered_no_dev_improvement = np.zeros(3)
unpowered = np.zeros(3)
beta = 0.8  # minimum power for a comparison to count as "powered"
sig_level = 0.05  # p-values above this count as "no significant difference"
for i, power in enumerate(powers):
    # Entry i of powers/pvals/dev_improvement/test_improvement was appended
    # for model i + 1 (the loop that built them starts at model 1), so the
    # model-level arrays must be indexed with i + 1.
    model_pos = i + 1

    # Under-powered comparisons are pooled regardless of the dev outcome.
    if power < beta:
        #target = unpowered_dev_improvement / unpowered_no_dev_improvement
        target = unpowered
    elif dev_improvement[i]:
        target = powered_dev_improvement
    else:
        target = powered_no_dev_improvement

    # Report significant test improvements that lacked a powered dev
    # improvement; uses the same threshold as the classification below.
    if dev_improvement[i] == 0 or power < beta:
        if test_improvement[i] and pvals[i] <= sig_level:
            print(power, pvals[i], dev_improvement[i], test_improvement[i], descriptions[model_pos])

    if pvals[i] > sig_level:
        target[0] += 1
    elif test_improvement[i]:
        target[1] += 1
    else:
        target[2] += 1


#print("Unpowered dev improvement", unpowered_dev_improvement, unpowered_dev_improvement/np.sum(unpowered_dev_improvement))
print("Unpowered", unpowered, unpowered/np.sum(unpowered))
print("Powered no dev improvement", powered_no_dev_improvement, powered_no_dev_improvement/np.sum(powered_no_dev_improvement))
print("Powered dev improvement", powered_dev_improvement, powered_dev_improvement/np.sum(powered_dev_improvement))
# add together those without a powered dev improvement
print("No powered dev improvement", (unpowered + powered_no_dev_improvement), (unpowered + powered_no_dev_improvement)/ (unpowered + powered_no_dev_improvement).sum())


In [None]:
# construct the plot somewhat manually
# The count vectors are ordered [no sig. diff, sig. improvement, sig. decline];
# the bars are drawn bottom-to-top as decline, no difference, improvement,
# i.e. in index order [2, 0, 1].
other = unpowered + powered_no_dev_improvement
x_offset = 0.5
y_offset = -0.05
fig, axes = plt.subplots(ncols=2, sharey=True)
plt.subplots_adjust(wspace=0.1)
axes[0].barh(range(3), [powered_dev_improvement[2], powered_dev_improvement[0], powered_dev_improvement[1]], label='Powered dev improvement')
# Label each bar with its actual count (the first label used to be a hard-coded '0').
axes[0].text(powered_dev_improvement[2]+x_offset, y_offset, str(int(powered_dev_improvement[2])), size=12)
axes[0].text(powered_dev_improvement[0]+x_offset-0.2, 1+y_offset, str(int(powered_dev_improvement[0])), size=12)
axes[0].text(powered_dev_improvement[1]+x_offset-0.2, 2+y_offset, str(int(powered_dev_improvement[1])), size=12)
axes[0].set_yticks(range(3))
axes[0].set_xlim(0, 17)
axes[0].set_yticklabels(['Sig. worse on test', 'No sig. difference', 'Sig. better on test'])
axes[0].set_title('Validation improvement\nwith power > 80%')
axes[0].set_xlabel('Count')
axes[1].barh(range(3), [other[2], other[0], other[1]], label='Other')
axes[1].text(other[2]+x_offset, y_offset, str(int(other[2])), size=12)
axes[1].text(other[0]+x_offset, 1+y_offset, str(int(other[0])), size=12)
axes[1].text(other[1]+x_offset, 2+y_offset, str(int(other[1])), size=12)
axes[1].set_xlim(0, 130)
axes[1].set_title('Other')
axes[1].set_xlabel('Count')
# savefig does not create missing directories, so make sure the target exists.
os.makedirs('figures', exist_ok=True)
plt.savefig('figures/squad_control.pdf', bbox_inches='tight')
plt.show();