In [1]:
import pandas as pd
import numpy as np
import json

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [2]:
# Task names; used as keys of the transition table and of the per-task frames.
ft_tasks = [
    "cola",
    "sst2",
    "mrpc",
    "stsb",
    "qnli",
    "rte",
]

# Probing tasks; this order is the column order of every probing array below.
probing_tasks = [
    "Length",
    "Depth",
    "TopConstituents",
    "BigramShift",
    "Tense",
    "SubjNumber",
    "ObjNumber",
    "OddManOut",
    "CoordinationInversion",
]

# Seeds and encoder names appear as directory components of the results path.
random_seeds = [1, 10, 42, 123, 1234]
encoders = ["bert", "roberta"]

# Experiments are numbered 0 .. num_exp - 1 for each (encoder, seed) pair.
num_exp = 34

In [3]:
# Load the probing accuracies of every (encoder, seed, experiment) combination.
# Resulting layout: probing_dict[enc][rs] is a (num_exp, len(probing_tasks))
# array; the row is the experiment index, the column follows probing_tasks.
exp_dir = "../experiments"
probing_dict = {}
for enc in encoders:
    probing_dict[enc] = {}
    for rs in random_seeds:
        probing_data = np.zeros((num_exp, len(probing_tasks)))
        for exp in range(num_exp):
            file_path = f"{exp_dir}/{enc}/{rs}/{exp}/probe_results.json"
            # The context manager closes the handle even if json.load raises,
            # so a malformed or truncated result file cannot leak it.
            with open(file_path) as f:
                probing_res = json.load(f)
            # Only the 'acc' entry of each probing task is kept.
            for i, task in enumerate(probing_tasks):
                probing_data[exp, i] = probing_res[task]['acc']
        probing_dict[enc][rs] = probing_data
probing_dict

{'bert': {1: array([[59.3, 27.3, 66.1, 83.2, 88.6, 79.6, 75. , 59.4, 64.8],
         [56. , 27.5, 64. , 89.9, 87.5, 80.6, 76.5, 65.1, 64.3],
         [56.7, 26.5, 62.8, 81.3, 87.7, 78.1, 74.4, 61. , 65.9],
         [56.3, 28.2, 57.2, 86. , 87.1, 78.9, 75.6, 62.9, 64.8],
         [58.9, 30. , 62.7, 82.7, 85.6, 79.4, 77.7, 61.6, 62.8],
         [56.4, 29.5, 68.2, 82.2, 87.4, 84.3, 77.3, 61.1, 67. ],
         [56.7, 29.3, 66.2, 83.1, 88. , 82.9, 77.1, 60.7, 64.9],
         [53.6, 25.5, 63.4, 88.9, 87.8, 81.2, 75. , 64. , 68.3],
         [56.5, 27.4, 63.4, 89.6, 87.2, 79.2, 76.2, 63.9, 65.8],
         [57.7, 24.1, 63.9, 89.7, 88.4, 81.2, 76.7, 62.1, 64.4],
         [59.5, 29.2, 66.7, 89.1, 88.1, 81. , 77. , 63.8, 66.7],
         [65.2, 27.9, 64.2, 88.4, 88.2, 82. , 77.4, 63.8, 66.3],
         [54.4, 29.8, 61. , 81.8, 87.1, 78.8, 76.3, 61. , 66.7],
         [52.1, 26.6, 62.2, 81.6, 86.4, 75. , 73.9, 58.7, 67.6],
         [51.7, 28.1, 61.3, 79.6, 87.2, 79.3, 73.2, 57.9, 64.6],
         [57.7

In [4]:
# Transition table: for each task, ten [start, finish] pairs of experiment
# indices. The cells below subtract the probing accuracies of the start
# experiment from those of the finish experiment.
# NOTE(review): presumably "finish" is "start" after fine-tuning on the key's
# task -- confirm against the experiment definitions.
ind_effect = {
    "cola": [
        [0, 1], [2, 7], [3, 8], [4, 9], [5, 10],
        [6, 11], [17, 22], [18, 23], [19, 24], [20, 25],
    ],
    "sst2": [
        [0, 2], [1, 7], [3, 12], [4, 13], [5, 14],
        [6, 15], [17, 26], [18, 27], [19, 28], [20, 29],
    ],
    "mrpc": [
        [0, 3], [4, 16], [1, 8], [2, 12], [5, 17],
        [6, 18], [10, 22], [11, 23], [14, 26], [15, 27],
    ],
    "stsb": [
        [0, 4], [3, 16], [1, 9], [2, 13], [5, 19],
        [6, 20], [10, 24], [11, 25], [14, 28], [15, 29],
    ],
    "qnli": [
        [0, 5], [6, 21], [1, 10], [2, 14], [3, 17],
        [4, 19], [8, 22], [9, 24], [12, 26], [13, 28],
    ],
    "rte": [
        [0, 6], [5, 21], [1, 11], [2, 15], [3, 18],
        [4, 20], [8, 23], [9, 25], [12, 27], [13, 29],
    ],
}

In [5]:
# Change in probing accuracy for every transition (finish row minus start row).
# Layout: ind_effect_dict[enc][rs][task] has one row per [start, finish] pair
# of ind_effect[task] and one column per entry of probing_tasks.
ind_effect_dict = {}
for enc in encoders:
    per_seed = ind_effect_dict[enc] = {}
    for rs in random_seeds:
        acc = probing_dict[enc][rs]  # rows are indexed by experiment number
        per_task = per_seed[rs] = {}
        for task, states in ind_effect.items():
            deltas = np.zeros((len(states), len(probing_tasks)))
            for row, pair in enumerate(states):
                start, finish = pair
                deltas[row] = acc[finish] - acc[start]
            per_task[task] = deltas
ind_effect_dict

{'bert': {1: {'cola': array([[-3.3,  0.2, -2.1,  6.7, -1.1,  1. ,  1.5,  5.7, -0.5],
          [-3.1, -1. ,  0.6,  7.6,  0.1,  3.1,  0.6,  3. ,  2.4],
          [ 0.2, -0.8,  6.2,  3.6,  0.1,  0.3,  0.6,  1. ,  1. ],
          [-1.2, -5.9,  1.2,  7. ,  2.8,  1.8, -1. ,  0.5,  1.6],
          [ 3.1, -0.3, -1.5,  6.9,  0.7, -3.3, -0.3,  2.7, -0.3],
          [ 8.5, -1.4, -2. ,  5.3,  0.2, -0.9,  0.3,  3.1,  1.4],
          [-0.9,  0.3, -1.2,  5. , -1. , -0.9,  0. ,  3. , -4. ],
          [ 0.4, -0.3,  1.9,  4.4, -0.6,  0.9, -1. ,  1.8, -1.2],
          [-2.7,  0.1, -3.3,  6.1, -0.1, -1.5, -0.1, -0.3,  0.5],
          [-2. ,  0. ,  3.1,  8.3,  1.1,  2.8,  0.6,  1.3,  2.7]]),
   'sst2': array([[-2.6, -0.8, -3.3, -1.9, -0.9, -1.5, -0.6,  1.6,  1.1],
          [-2.4, -2. , -0.6, -1. ,  0.3,  0.6, -1.5, -1.1,  4. ],
          [-1.9,  1.6,  3.8, -4.2,  0. , -0.1,  0.7, -1.9,  1.9],
          [-6.8, -3.4, -0.5, -1.1,  0.8, -4.4, -3.8, -2.9,  4.8],
          [-4.7, -1.4, -6.9, -2.6, -0.2, -5. , 

In [6]:
# One DataFrame per task: a row per (encoder, seed, transition) carrying the
# start/finish experiment indices followed by the accuracy change of each
# probing task, rounded to one decimal.
ind_effect_dfs_dict = {}
frame_columns = ["enc", "seed", "start", "finish"] + probing_tasks
for task in ft_tasks:
    transitions = ind_effect[task]
    df_data = []
    for enc in encoders:
        for rs in random_seeds:
            deltas = ind_effect_dict[enc][rs][task]
            for idx, row in enumerate(deltas):
                start, finish = transitions[idx][0], transitions[idx][1]
                df_data.append([enc, rs, start, finish, *np.round(row, 1).tolist()])
    df = pd.DataFrame(data=df_data, columns=frame_columns)
    ind_effect_dfs_dict[task] = df

In [7]:
# Pick the frame stored under "cola" (plain assignment, so no copy is made);
# the cells below summarise it at several grouping levels.
cola_df = ind_effect_dfs_dict["cola"]
cola_df

Unnamed: 0,enc,seed,start,finish,Length,Depth,TopConstituents,BigramShift,Tense,SubjNumber,ObjNumber,OddManOut,CoordinationInversion
0,bert,1,0,1,-3.3,0.2,-2.1,6.7,-1.1,1.0,1.5,5.7,-0.5
1,bert,1,2,7,-3.1,-1.0,0.6,7.6,0.1,3.1,0.6,3.0,2.4
2,bert,1,3,8,0.2,-0.8,6.2,3.6,0.1,0.3,0.6,1.0,1.0
3,bert,1,4,9,-1.2,-5.9,1.2,7.0,2.8,1.8,-1.0,0.5,1.6
4,bert,1,5,10,3.1,-0.3,-1.5,6.9,0.7,-3.3,-0.3,2.7,-0.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,roberta,1234,6,11,2.2,2.5,3.9,8.0,0.9,0.3,2.7,5.5,7.5
96,roberta,1234,17,22,-1.2,2.0,5.6,10.4,0.9,1.4,0.4,4.0,0.7
97,roberta,1234,18,23,2.4,2.6,8.0,11.9,3.8,1.1,5.1,4.9,6.1
98,roberta,1234,19,24,1.7,1.4,3.0,11.8,-1.0,1.2,1.8,0.9,-0.9


In [8]:
# Column-wise mean of the probing-task columns over every row of the frame.
cola_df.loc[:, probing_tasks].mean()

Length                  -1.556
Depth                   -0.091
TopConstituents          0.671
BigramShift              7.757
Tense                    0.490
SubjNumber              -0.062
ObjNumber               -0.046
OddManOut                2.798
CoordinationInversion    1.544
dtype: float64

In [9]:
# Mean of each probing-task column, computed separately per encoder.
cola_df.groupby(by="enc")[probing_tasks].mean()

Unnamed: 0_level_0,Length,Depth,TopConstituents,BigramShift,Tense,SubjNumber,ObjNumber,OddManOut,CoordinationInversion
enc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
bert,-0.364,-0.466,-0.826,6.348,-0.232,-0.38,-0.686,1.934,1.088
roberta,-2.748,0.284,2.168,9.166,1.212,0.256,0.594,3.662,2.0


In [10]:
# Mean of each probing-task column for every (encoder, seed) combination.
cola_df.groupby(by=["enc", "seed"])[probing_tasks].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,Length,Depth,TopConstituents,BigramShift,Tense,SubjNumber,ObjNumber,OddManOut,CoordinationInversion
enc,seed,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
bert,1,-0.1,-0.91,0.29,6.09,0.22,0.33,0.12,2.18,0.36
bert,10,-1.56,0.01,-0.37,5.73,-0.34,0.16,-0.33,1.27,1.25
bert,42,-0.31,-0.12,-1.13,5.78,-0.45,-0.18,-0.28,1.31,0.55
bert,123,-0.09,-0.45,-1.74,7.13,-0.54,-1.73,-1.61,3.04,1.16
bert,1234,0.24,-0.86,-1.18,7.01,-0.05,-0.48,-1.33,1.87,2.12
roberta,1,-2.01,-0.18,0.25,8.33,1.62,-0.51,-0.05,2.78,0.18
roberta,10,-4.78,0.3,1.35,9.04,2.07,1.6,1.55,2.54,1.88
roberta,42,-3.77,0.09,0.87,9.31,-0.11,-1.25,-1.41,2.75,1.22
roberta,123,-2.47,-0.04,5.26,9.21,0.36,2.9,2.51,5.4,3.7
roberta,1234,-0.71,1.25,3.11,9.94,2.12,-1.46,0.37,4.84,3.02


In [11]:
# Mean of each probing-task column for every (encoder, start, finish) group,
# i.e. averaged over the rows that share a transition (one per seed).
# round(2) strips floating-point residue (an exact zero otherwise shows up as
# -1.776357e-16). It loses no information here: the inputs were rounded to one
# decimal when the frame was built and each group averages five rows.
cola_df.groupby(['enc', 'start', 'finish'])[probing_tasks].mean().round(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Length,Depth,TopConstituents,BigramShift,Tense,SubjNumber,ObjNumber,OddManOut,CoordinationInversion
enc,start,finish,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
bert,0,1,-1.32,0.8,-2.94,6.32,-1.14,-0.02,-0.34,3.56,0.3
bert,2,7,-1.16,-1.2,-1.776357e-16,7.4,-0.7,0.22,-0.7,2.22,1.82
bert,3,8,-0.6,0.78,-0.06,5.6,0.44,0.14,-0.18,0.56,0.92
bert,4,9,-2.84,-1.0,-0.8,6.48,0.94,0.08,-1.94,3.34,3.54
bert,5,10,1.24,-1.26,-1.26,7.2,-0.24,-0.96,-0.66,2.46,0.58
bert,6,11,1.5,-1.82,-3.3,5.94,-0.26,-2.02,-0.68,2.16,0.96
bert,17,22,0.62,0.46,-2.86,5.22,-1.08,-0.62,-0.46,1.12,0.28
bert,18,23,-0.58,-0.64,0.92,5.52,-0.32,1.34,-0.24,1.46,1.02
bert,19,24,-0.32,-0.52,1.12,6.56,0.24,-1.26,-0.44,1.38,0.38
bert,20,25,-0.18,-0.26,0.92,7.24,-0.2,-0.7,-1.22,1.08,1.08
