In [None]:
# Mount Google Drive inside the Colab runtime. The rest of the notebook reads
# its inputs from, and saves its plots to, paths under /content/drive.
from google.colab import drive

# This will prompt for authorization.
drive.mount('/content/drive')

In [None]:
# Sanity check: list the contents of the GLUE submissions folder on the mounted Drive.
# NOTE(review): the folder is spelled "glue_submissoins" everywhere in this notebook;
# presumably that matches the on-disk name -- confirm before "fixing" the spelling.
!ls "/content/drive/My Drive/NLP Power Analysis/data/glue_submissoins"

In [None]:
import pandas as pd

# Parse the saved GLUE leaderboard page. read_html returns one DataFrame per
# <table> it finds; the first table is used, with its first row as the header.
glue_leaderboard = pd.read_html("/content/drive/My Drive/NLP Power Analysis/data/glue_submissoins/GLUE Benchmark.html", header=0)[0]

# Normalise the "Model" column so it can be matched against lower-cased lookup
# keys: drop the stray "Microsoft Research Paraphrase Corpus-F1 / Accuracy" text
# that ends up inside some cells (presumably scraped tooltip text -- confirm),
# lower-case, and trim whitespace.
# The vectorised .str methods leave a missing model name as NaN, whereas calling
# x.replace(...) on each value raises AttributeError on the first non-string cell.
glue_leaderboard["Model"] = (
    glue_leaderboard["Model"]
    .str.replace("Microsoft Research Paraphrase Corpus-F1 / Accuracy", "", regex=False)
    .str.lower()
    .str.strip()
)



In [None]:
import os
from itertools import combinations

# Root folder holding the submissions plus the saved leaderboard HTML page.
base_path = "/content/drive/My Drive/NLP Power Analysis/data/glue_submissoins"

# Every entry that is not an .html file is treated as a model's submission folder.
models = [entry for entry in os.listdir(base_path) if not entry.endswith(".html")]
print(models)

# The task files are taken from whichever model folder happens to be listed first.
first_model_dir = os.path.join(base_path, models[0])
tasks = os.listdir(first_model_dir)
print(tasks)

# All unordered pairs of models, and the single task inspected in this cell.
pairwise = list(combinations(models, 2))
task = "MRPC.tsv"

# Submission folder name -> model name as written on the leaderboard.
# Folder names missing from this table are looked up under their own name.
mapping = {
    "electra_small" : "ELECTRA-Small",
    "electra_base" : "ELECTRA-Base",
    "electra_large" : "ELECTRA-Large",
    "electra_large_tricks" : "ELECTRA-Large + Standard Tricks",
    "albert" : "Albert (Ensemble)",
    "XLNET" : "XlNet (ensemble)",
    "BAM" : "BERT + BAM",
    "BERT" : "BERT: 24-layers, 16-heads, 1024-hidden"
}

def _get_table_key(modelname):
  """Return the lower-cased leaderboard name for a submission folder name.

  Names present in the module-level `mapping` dict are translated to their
  mapped value first; any other name is used unchanged. The result is
  lower-cased in both cases.
  """
  return mapping.get(modelname, modelname).lower()


# Pairwise prediction agreement on the single `task` chosen above.
# For each pair of models:
#   x     -- the lower of the two models' leaderboard scores on this task
#   y     -- fraction of rows on which the two prediction files agree
#   texts -- "<model1> v. <model2>" label for the pair
score_column = task.replace(".tsv", "")

x = []
texts=[]
y = []
for model1, model2 in pairwise:
  # Sort both prediction files by their "index" column so rows line up.
  model1_preds = pd.read_csv(os.path.join(base_path, model1, task), delimiter="\t").sort_values(by="index").reset_index(drop=True)
  model2_preds = pd.read_csv(os.path.join(base_path, model2, task), delimiter="\t").sort_values(by="index").reset_index(drop=True)

  print(f"{model1} v. {model2}")

  if "MRPC" in task or "QQP" in task:
    # These leaderboard cells hold two "/"-separated numbers; the second is used
    # (presumably accuracy, given the "F1 / Accuracy" column text -- confirm).
    model1_performance = float(glue_leaderboard[glue_leaderboard["Model"] == _get_table_key(model1)][score_column].values[0].split("/")[1])
    model2_performance = float(glue_leaderboard[glue_leaderboard["Model"] == _get_table_key(model2)][score_column].values[0].split("/")[1])
  else:
    model1_performance = float(glue_leaderboard[glue_leaderboard["Model"] == _get_table_key(model1)][score_column].values[0])
    model2_performance = float(glue_leaderboard[glue_leaderboard["Model"] == _get_table_key(model2)][score_column].values[0])

  # Compare the second column of each file (the predictions) row by row.
  y.append((model1_preds.iloc[:,1] == model2_preds.iloc[:,1]).mean())

  # Report the value that is actually stored in x. The previous version printed
  # model1's score under the "min performance" label even when model2 was lower.
  min_performance = min(model1_performance, model2_performance)
  print(f"min performance: {min_performance}")
  x.append(min_performance)
  texts.append(f"{model1} v. {model2}")
  print(f"overlap: {y[-1]}")




In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
# !pip install adjustText
from adjustText import adjust_text
import seaborn as sns

# For every task file: compute pairwise prediction agreement between models,
# scatter it against the lower leaderboard score of each pair, label the points,
# and save one PDF per task.
for task in tasks:
  print(task)
  score_column = task.replace(".tsv", "")
  x = []
  texts=[]
  y = []
  for model1, model2 in pairwise:
    # Sort both prediction files by their "index" column so rows line up.
    model1_preds = pd.read_csv(os.path.join(base_path, model1, task), delimiter="\t").sort_values(by="index").reset_index(drop=True)
    model2_preds = pd.read_csv(os.path.join(base_path, model2, task), delimiter="\t").sort_values(by="index").reset_index(drop=True)

    print(f"{model1} v. {model2}")

    if "MRPC" in task or "QQP" in task:
      # These leaderboard cells hold two "/"-separated numbers; the second is used.
      model1_performance = float(glue_leaderboard[glue_leaderboard["Model"] == _get_table_key(model1)][score_column].values[0].split("/")[1])
      model2_performance = float(glue_leaderboard[glue_leaderboard["Model"] == _get_table_key(model2)][score_column].values[0].split("/")[1])
    else:
      try:
        model1_performance = float(glue_leaderboard[glue_leaderboard["Model"] == _get_table_key(model1)][score_column].values[0])
        model2_performance = float(glue_leaderboard[glue_leaderboard["Model"] == _get_table_key(model2)][score_column].values[0])
      except (KeyError, IndexError, ValueError, TypeError) as err:
        # Skip pairs whose score is missing or unparseable: no such leaderboard
        # column (KeyError), no matching row (IndexError), or a non-numeric cell
        # (ValueError/TypeError). A bare `except:` would also swallow
        # KeyboardInterrupt, and either model may be the one that failed.
        print(f"Issue parsing model {model1} or {model2} on {task}: {err!r}")
        continue

    # Compare the second column of each file (the predictions) row by row.
    y.append((model1_preds.iloc[:,1] == model2_preds.iloc[:,1]).mean())

    # Print the value that is actually plotted, not just model1's score.
    min_performance = min(model1_performance, model2_performance)
    print(f"min performance: {min_performance}")
    x.append(min_performance)
    texts.append(f"{model1} v. {model2}")
    print(f"overlap: {y[-1]}")

  # One figure per task. The stray plt.figure() that used to precede this only
  # created an extra, empty figure on every iteration.
  sns.set(color_codes=True, font_scale=1.5)
  fig, ax = plt.subplots(figsize=(20,20))
  plt.title(score_column)
  plt.xlabel("Min. Accuracy")
  plt.ylabel("Percent Overlap")
  ax.scatter(x, y)
  annotations = [ax.annotate(txt, (x[i], y[i])) for i, txt in enumerate(texts)]

  # Nudge the labels apart so they do not overlap each other or the points.
  print(adjust_text(annotations, y=y, x=x, ax=ax, arrowprops=dict(arrowstyle="->", color='r', lw=1.0), expand_points=(1.3, 1.2), expand_text=(1.3, 1.2), precision=0.001, force_text= (1.1, .6), force_points=(1.0, 1.0), force_objects=(1.0, 0.6)))

  plt.savefig(f'/content/drive/My Drive/NLP Power Analysis/plots/glue/percent_overlap_glue_{task.replace(".tsv","").lower()}.pdf')

In [None]:
# Repeat the pairwise-agreement computation across several tasks at once.
# These are the per-task prediction file names that get pooled together.

tasks_to_cover = ["MRPC.tsv", "QQP.tsv", "SST-2.tsv", "RTE.tsv", "MNLI-m.tsv", "MNLI-mm.tsv", "WNLI.tsv"]

def _get_table_key(modelname):
  """Map a submission folder name to its lower-cased leaderboard name.

  Uses the module-level `mapping` dict when it has an entry for `modelname`
  and falls back to `modelname` itself otherwise.

  NOTE: this re-declares the identical helper defined in an earlier cell of
  this notebook; it is kept so this cell can be run on its own.
  """
  leaderboard_name = mapping[modelname] if modelname in mapping else modelname
  return leaderboard_name.lower()


# Pool pairwise agreement over every task in tasks_to_cover.
# For each (task, model pair):
#   x     -- [lower score, score gap], with leaderboard scores divided by 100
#   y     -- fraction of rows on which the two prediction files agree
#   texts -- "<task> <model1> v. <model2>" label
x = []
texts=[]
y = []
for task in tasks_to_cover:
  task_name = task.replace(".tsv", "")
  has_two_scores = "MRPC" in task or "QQP" in task

  for model1, model2 in pairwise:
    # WNLI is skipped for any pair where either model name contains one of these markers.
    # NOTE(review): "bert" is also a substring of "albert", so albert pairs are
    # skipped on WNLI as well -- confirm whether that is intended.
    if task == "WNLI.tsv" and any(
        marker in name.lower()
        for name in (model1, model2)
        for marker in ("electra", "bert", "bam")
    ):
      print(f"skipping {task} {model1} {model2}")
      continue

    # Load both prediction files, ordered by their "index" column so rows line up.
    model1_path = os.path.join(base_path, model1, task)
    model2_path = os.path.join(base_path, model2, task)
    model1_preds = pd.read_csv(model1_path, delimiter="\t").sort_values(by="index").reset_index(drop=True)
    model2_preds = pd.read_csv(model2_path, delimiter="\t").sort_values(by="index").reset_index(drop=True)

    # Leaderboard score of each model on this task, rescaled by 1/100.
    scores = []
    for name in (model1, model2):
      cell = glue_leaderboard[glue_leaderboard["Model"] == _get_table_key(name)][task_name].values[0]
      if has_two_scores:
        # Two "/"-separated numbers in the cell; the second one is used.
        cell = cell.split("/")[1]
      scores.append(float(cell) / 100.0)
    model1_performance, model2_performance = scores

    # Agreement on the second column (the predictions) of the two files.
    agreement = (model1_preds.iloc[:,1] == model2_preds.iloc[:,1]).mean()
    y.append(agreement)

    lower_score = min(model1_performance, model2_performance)
    higher_score = max(model1_performance, model2_performance)
    x.append([lower_score, higher_score - lower_score])
    texts.append(f"{task_name} {model1} v. {model2}")





In [None]:
import statsmodels.api as sm     
import numpy as np 
from scipy import stats

# Rows are (task, model pair) observations; columns are [lower score, score gap].
x = np.array(x)
print(x.shape)

# Ordinary least squares of agreement on both features plus an intercept column.
xt = sm.add_constant(x)
lm_1 = sm.OLS(np.array(y), xt).fit()
print(lm_1.summary().as_latex())

plt.figure(figsize=(6,6))

sns.set(color_codes=True, font_scale=1.5, style='white')

# Single-feature fit (agreement vs. lower score) used only for the legend text.
# linregress returns the correlation coefficient r, so it must be squared to
# match the "R^2" label; the previous version displayed r itself.
slope, intercept, r_value, p_value, std_err = stats.linregress(x[:,0], y)
label_line_2 = r'$R^2:{0:.5f}$'.format(r_value ** 2)

# The x axis is shown on a 0-100 scale; rescaling x does not change r.
# NOTE(review): y is plotted as a 0-1 fraction while the axis label says "%"
# -- confirm which unit is intended.
ax = sns.regplot(x=x[:,0] * 100.0, y=y, label=label_line_2)
ax.legend()
ax.set_xlabel("Baseline Accuracy")
ax.set_ylabel("Test % Agreement")

plt.savefig('/content/drive/My Drive/NLP Power Analysis/plots/glue/glue_baseline_v_disagreement.pdf', bbox_inches='tight', 
               transparent=True,
               pad_inches=0)

In [None]:
# Show the fitted OLS coefficients. With add_constant's default (prepend) the
# intercept comes first, followed by the two feature columns of x.
print(lm_1.params)