## Split Rules based on similarity

In [1]:
import os

In [1072]:
loc = os.path.expanduser('~/checkpoint/lgw/data/comp_r10_n100_ov/')

In [3]:
all_paths = {}
modes = ["train","valid","test"]
for mode in modes:
    data_path = os.path.join(loc, mode)
    all_paths[mode] = [
        folder
        for folder in os.listdir(data_path)
        if os.path.isdir(os.path.join(data_path, folder)) and os.path.exists(os.path.join(data_path, folder, 'graph_prop.json'))
    ] 

In [4]:
len(all_paths['train'])

51

In [5]:
import sys
sys.path.append(os.path.expanduser('~/mlp/lgw/'))
from lgw.graph_generator import GraphGen, BigGraphGen
from lgw.args import get_args

In [6]:
args = get_args("--num_rel_choices 10 --num_splits 100 --graphs_per_world 5000 --num_worlds 100 --per_inverse_choices 0.5 --corrupt_eps_choices 0 --expand_steps_choices 5 --uniform_prob --fix_num_relations --policy overlap --folder ~/checkpoint/lgw/data/comp_r10_n100_ov --save_path ~/checkpoint/lgw/data/comp_r10_n100_ov/ --num_nodes 5000 --sample_graphs --num_train_rows 5000 --num_valid_rows 1000 --num_test_rows 1000 --world_train_val_test_split 0.95")

In [9]:
num_des = []
all_rules_mat = {}
all_big_gs = {}
for rule_world_mode, rule_worlds in all_paths.items():
    for rule_world in rule_worlds:
        rule_folder = os.path.join(loc, rule_world_mode, rule_world)
        big_g = BigGraphGen(None, args, gen_graph=False)
        big_g.load(rule_folder)
        num_des.append((rule_world, sum([len(big_g.path_dict[p]) for p in big_g.path_dict.keys()])))
        all_rules_mat[rule_world] = set(get_rule_key(big_g.rules))
        all_big_gs[rule_world] = big_g

In [10]:
sum([len(big_g.path_dict[p]) for p in big_g.path_dict.keys()])

259

In [607]:
len(num_des)

57

In [608]:
import re

def atoi(text):
    return int(text) if text.isdigit() else text

def natural_keys(text):
    '''
    alist.sort(key=natural_keys) sorts in human order
    http://nedbatchelder.com/blog/200712/human_sorting.html
    (See Toothy's implementation in the comments)
    '''
    return [ atoi(c) for c in re.split(r'(\d+)', text) ]

In [613]:
training_worlds = sorted(num_des, key=lambda x: natural_keys(x[0]))[:51]
valid_worlds = sorted(num_des, key=lambda x: natural_keys(x[0]))[51:54]
test_worlds = sorted(num_des, key=lambda x: natural_keys(x[0]))[54:]

In [12]:
similarity_dist = []
rules = list(all_rules_mat.keys())
for a in rules:
    sim = []
    for b in rules:
        sim.append(len(all_rules_mat[a].intersection(all_rules_mat[b])))
    similarity_dist.append(sim)

In [8]:
def get_rule_key(insp_rule):
    key = []
    for body,head in insp_rule.items():
        if type(body) == tuple:
            key.append('{},{}->{}'.format(body[0], body[1], head))
    return key

In [13]:
import numpy as np
similarity_dist = np.array(similarity_dist)

In [29]:
rules[:5]

['rule_49', 'rule_39', 'rule_13', 'rule_32', 'rule_45']

In [31]:
similarity_dist[0]

array([20, 10,  0,  3, 16,  0,  0, 11, 12,  0,  5,  0,  6, 14,  0,  0,  9,
        0,  0,  0, 13,  0,  4, 19,  0, 17, 19,  8,  0, 15,  0,  0,  0,  0,
        0,  0,  0,  0,  0, 18,  2,  0,  0,  0,  0,  1,  0,  7,  0,  0,  0,
       18, 17, 16, 15, 14, 13])

In [15]:
from sklearn.model_selection import train_test_split

In [16]:
import random

### Calculate splits

For each world, make sure all test descriptors are unique

In [17]:
split_per = 0.8
global_train = []
global_test = []
all_splits = {}
com_f = 0
for rule_world, big_g in all_big_gs.items():
    splits_d = {}
    for target, p in big_g.path_dict.items():
        desc_lens = {}  # length -> descriptors
        desc = list(p.keys())
        if len(desc) > 0:
            for d in desc:
                d_len = len(d.split(","))
                if d_len not in desc_lens:
                    desc_lens[d_len] = []
                desc_lens[d_len].append(d)
            splits_d[target] = {"train": [], "valid": [], "test": []}
            for dlen, len_ds in desc_lens.items():
                num_train_used = 0
                num_test_used = 0
                if dlen == 2:
                    splits_d[target]["train"].extend(len_ds)
                    global_train.extend(len_ds)
                else:
                    # keep the common ones in train
                    common_train = set(len_ds).intersection(set(global_train))
                    if len(common_train) > 0:
                        splits_d[target]["train"].extend(list(common_train))
                        global_train.extend(list(common_train))
                        num_train_used = len(list(common_train))
                        com_f += 1
                    rest_ds = set(len_ds) - set(global_train)
                    # keep the common tests in test
                    common_test = set(rest_ds).intersection(set(global_test))
                    if len(common_test) > 0:
                        splits_d[target]["test"].extend(list(common_test))
                        global_test.extend(list(common_test))
                        num_test_used = len(list(common_test))
                        com_f += 1
                    rest_ds = set(rest_ds) - set(global_test)
                    if len(rest_ds) > 1:
                        train, test = train_test_split(
                            list(rest_ds), shuffle=True, train_size=split_per
                        )
                        if num_train_used > 0:
                            # adjust
                            test = test + train[-num_train_used:]
                            train = train[:-num_train_used]
                        assert len(set(test).intersection(global_train)) == 0
                        splits_d[target]["train"].extend(train)
                        global_train.extend(train)
                        splits_d[target]["test"].extend(test)
                        global_test.extend(test)
                        assert len(set(global_train).intersection(set(global_test))) == 0
                valid = random.sample(splits_d[target]["train"], int(len(splits_d[target]["train"]) * (1 - split_per)))
                splits_d[target]["valid"].extend(valid)
        all_splits[rule_world] = splits_d
        

In [18]:
len(set(global_train).intersection(set(global_test)))

0

In [19]:
## verify
## pool all train descriptors
## for each rule_world, check if there is no overlap

pooled_train = []
for rule_mode, split_d in all_splits.items():
    for target, split in split_d.items():
        pooled_train.extend(split['train'])
pooled_train = set(pooled_train)

pooled_test = []
for rule_mode, split_d in all_splits.items():
    for target, split in split_d.items():
        pooled_test.extend(split['test'])
pooled_test = set(pooled_test)

In [20]:
len(pooled_train)

16763

In [21]:
len(pooled_test)

6058

In [22]:
leak = 0
for rule_mode, split_d in all_splits.items():
    for target, split in split_d.items():
        leak += len(pooled_train.intersection(set(split['test'])))

In [23]:
leak

0

In [1073]:
## put the splits in graphs
for rule_mode, split_d in all_splits.items():
    all_big_gs[rule_mode].descriptor_splits = split_d
    if rule_mode in all_paths['train']:
        mode = 'train'
    elif rule_mode in all_paths['valid']:
        mode = 'valid'
    else:
        mode = 'test'
    rule_folder = os.path.join(loc, mode, rule_mode)
    # all_big_gs[rule_mode].save(rule_folder)

In [68]:
com_f

1272

## Calculate Rule overlaps

In [32]:
similarity_dist

array([[20, 10,  0, ..., 15, 14, 13],
       [10, 20,  0, ...,  5,  4,  3],
       [ 0,  0, 20, ...,  0,  0,  0],
       ...,
       [15,  5,  0, ..., 20, 19, 18],
       [14,  4,  0, ..., 19, 20, 19],
       [13,  3,  0, ..., 18, 19, 20]])

In [33]:
delta = 0.5
gs = np.exp(- np.array(similarity_dist) ** 2 / (2. * delta ** 2))

In [34]:
gs

array([[0.00000000e+000, 1.38389653e-087, 1.00000000e+000, ...,
        3.69388307e-196, 5.70904011e-171, 1.61608841e-147],
       [1.38389653e-087, 0.00000000e+000, 1.00000000e+000, ...,
        1.92874985e-022, 1.26641655e-014, 1.52299797e-008],
       [1.00000000e+000, 1.00000000e+000, 0.00000000e+000, ...,
        1.00000000e+000, 1.00000000e+000, 1.00000000e+000],
       ...,
       [3.69388307e-196, 1.92874985e-022, 1.00000000e+000, ...,
        0.00000000e+000, 2.75032531e-314, 3.77724997e-282],
       [5.70904011e-171, 1.26641655e-014, 1.00000000e+000, ...,
        2.75032531e-314, 0.00000000e+000, 2.75032531e-314],
       [1.61608841e-147, 1.52299797e-008, 1.00000000e+000, ...,
        3.77724997e-282, 2.75032531e-314, 0.00000000e+000]])

In [35]:
from sklearn.cluster import SpectralClustering
# mat = np.matrix(similarity)
sp = SpectralClustering(5).fit_predict(gs)



In [37]:
from collections import Counter

In [38]:
Counter(sp)

Counter({4: 11, 2: 12, 3: 12, 1: 11, 0: 11})

In [39]:
rule_splits = []
for i in range(5):
    rs = []
    for ri, rule in enumerate(rules):
        if sp[ri] == i:
            rs.append(rule)
    rule_splits.append(rs)

In [1077]:
rule_splits

[['rule_7',
  'rule_0',
  'rule_9',
  'rule_2',
  'rule_5',
  'rule_4',
  'rule_3',
  'rule_1',
  'rule_6',
  'rule_10',
  'rule_8'],
 ['rule_32',
  'rule_29',
  'rule_23',
  'rule_33',
  'rule_28',
  'rule_25',
  'rule_31',
  'rule_26',
  'rule_30',
  'rule_24',
  'rule_27'],
 ['rule_39',
  'rule_45',
  'rule_40',
  'rule_41',
  'rule_34',
  'rule_35',
  'rule_43',
  'rule_38',
  'rule_42',
  'rule_37',
  'rule_44',
  'rule_36'],
 ['rule_13',
  'rule_12',
  'rule_21',
  'rule_14',
  'rule_22',
  'rule_16',
  'rule_19',
  'rule_15',
  'rule_17',
  'rule_11',
  'rule_18',
  'rule_20'],
 ['rule_49',
  'rule_50',
  'rule_46',
  'rule_48',
  'rule_47',
  'rule_51',
  'rule_52',
  'rule_53',
  'rule_54',
  'rule_55',
  'rule_56']]

In [None]:
all_rules_mat['rule_7']

In [43]:
rules.index('rule_7')

9

In [64]:
similarity_dist[rules.index('rule_5')][rules.index('rule_56')]

0

In [51]:
## Create train and test splits
rule_train_test_splits = {}
for ri,rs in enumerate(rule_splits):
    train,test = train_test_split(rs, train_size=0.8)
    rule_train_test_splits[ri] = {'train': train, 'test': test}

## Dataset Statistics

In [620]:
print("Total number of worlds : {}".format(len(num_des)))
print("Total number of distinct rules : {}".format(len(set([r for k,v in all_rules_mat.items() for r in v]))))

Total number of worlds : 57
Total number of distinct rules : 76


In [619]:
len(all_rules_mat)

57

In [621]:
print("Average number of descriptors : {}".format(np.mean([n[1] for n in num_des])))

Average number of descriptors : 522.0701754385965


In [623]:
num_des_dict = {n[0]:n[1] for n in num_des}

In [633]:
max(num_des_dict.values())

5230

In [634]:
min(num_des_dict.values())

114

In [631]:
easy_num_des = [num_des_dict[n] for n in easy_rules]
print("Average number of descriptors for easy rules : {}".format(round(np.mean(easy_num_des),2)))
medium_num_des = [num_des_dict[n] for n in medium_rules]
print("Average number of descriptors for medium rules : {}".format(round(np.mean(medium_num_des),2)))
hard_num_des = [num_des_dict[n] for n in hard_rules]
print("Average number of descriptors for hard rules : {}".format(round(np.mean(hard_num_des),2)))

Average number of descriptors for easy rules : 1055.5
Average number of descriptors for medium rules : 385.76
Average number of descriptors for hard rules : 240.53


## Experiment 1 - Train on one similar split and evaluate on a similar split

`multitask_sim_sim`

In [54]:
# train rules
','.join(rule_train_test_splits[0]['train'])

'rule_5,rule_7,rule_1,rule_6,rule_9,rule_3,rule_2,rule_10'

In [58]:
# test rules
','.join(rule_train_test_splits[0]['test'])

'rule_4,rule_8,rule_0'

In [142]:
similar_train = 'rule_5,rule_7,rule_1,rule_6,rule_9,rule_3,rule_2,rule_10'

In [104]:
mean_max_overlap('rule_5,rule_7,rule_1,rule_6,rule_9,rule_3,rule_2,rule_10','rule_4,rule_8,rule_0')

19.0

In [134]:
perc_overlap('rule_5,rule_7,rule_1,rule_6,rule_9,rule_3,rule_2,rule_10','rule_4')

0.6896551724137931

In [133]:
perc_overlap('rule_5,rule_7,rule_1,rule_6,rule_9,rule_3,rule_2,rule_10','rule_8')

0.6896551724137931

In [132]:
perc_overlap('rule_5,rule_7,rule_1,rule_6,rule_9,rule_3,rule_2,rule_10','rule_0')

0.6551724137931034

|    | rep_fn   | comp_fn   | mode   | test_rule   |   updates |   accuracy |   minibatch |   epoch |   loss |   k | top_mode   | rule_world           |
|----|----------|-----------|--------|-------------|-----------|------------|-------------|---------|--------|-----|------------|----------------------|
|  0 | GAT      | E-GAT     | test   |             |         0 |      0.225 |           0 |    1500 |  5.213 |   0 | test       | rule_4,rule_8,rule_0 |
|  1 | Param    | E-GAT     | test   |             |         0 |      0.213 |           0 |    1500 |  5.723 |   0 | test       | rule_4,rule_8,rule_0 |
|  2 | GCN      | E-GAT     | test   |             |         0 |      0.219 |           0 |    1500 |  5.39  |   0 | test       | rule_4,rule_8,rule_0 |
|  3 | GAT      | RGCN      | test   |             |         0 |      0.181 |           0 |    1500 |  7.347 |   0 | test       | rule_4,rule_8,rule_0 |
|  4 | Param    | RGCN      | test   |             |         0 |      0.224 |           0 |    1500 |  4.975 |   0 | test       | rule_4,rule_8,rule_0 |
|  5 | GCN      | RGCN      | test   |             |         0 |      0.186 |           0 |    1500 |  9.465 |   0 | test       | rule_4,rule_8,rule_0 |

In [297]:
gradient = ['rule_4,rule_8,rule_1',
'rule_15,rule_20,rule_25',
'rule_40,rule_45,rule_50']
for g in gradient:
    print(perc_overlap_list(similar_train, g))

1.0
0.5
0.0


In [298]:
gradient_names = {g:round(perc_overlap_list(similar_train, g), 3) for g in gradient}

In [299]:
gradient_names

{'rule_4,rule_8,rule_1': 1.0,
 'rule_15,rule_20,rule_25': 0.5,
 'rule_40,rule_45,rule_50': 0.0}

In [None]:
import matplotlib.pyplt

In [243]:
import pandas as pd

In [300]:
sim_df = pd.read_csv('../scripts/multitask_logic_sim_sim_eval_results.csv')

In [301]:
sim_df = sim_df[sim_df.rule_world.isin(gradient_names.keys())]

In [397]:
sim_df_b = calc_recovery(sim_df)

In [304]:
sim_df['rule_sp'] = sim_df.rule_world.apply(lambda x: gradient_names[x])

In [400]:
sim_df_b.to_csv('clean_data/multitask_logic_sim_sim_eval_rec.csv')

In [344]:
### Correlate with absolute performance in the supervised setup
### Add a new metric : `recovery` - if it is > 100%, then its awesome

In [396]:
def calc_recovery(df):
    df['baseline'] = 0.0
    for i,row in df.iterrows():
        rule_worlds = row['rule_world'].split(',')
        #print(rule_worlds)
        #print("get_baseline({}, {}, rule)".format(row['rep_fn'], row['comp_fn']))
        mean_baseline = np.mean([get_baseline(row['rep_fn'], row['comp_fn'], rule) for rule in rule_worlds])
        df.at[i,'baseline'] = mean_baseline
    df['recovery'] = df.accuracy / df.baseline
    return df

In [345]:
supervised_all = pd.read_csv('supervised_result_complete.csv')

In [361]:
clean_model_names = {
    "GatedNodeGatEncoder": "GAT",
    "GatedGatEncoder": "E-GAT",
    "RepresentationGCNEncoder": "GCN",
    "CompositionRGCNEncoder": "RGCN",
    "Param": "Param",
}
supervised_all['rep_fn'] = supervised_all.model_representation_fn_path.apply(lambda x: clean_model_names[x.split('.')[-1]])
supervised_all['comp_fn'] = supervised_all.model_composition_fn_path.apply(lambda x: clean_model_names[x.split('.')[-1]])

In [640]:
supervised_test = pd.read_csv('supervised_result_complete_test.csv')
supervised_test['rep_fn'] = supervised_test.model_representation_fn_path.apply(lambda x: clean_model_names[x.split('.')[-1]])
supervised_test['comp_fn'] = supervised_test.model_composition_fn_path.apply(lambda x: clean_model_names[x.split('.')[-1]])

In [593]:
def get_baseline(rep_fn, comp_fn, rule):
    if rule in ['rule_54','rule_55','rule_56']:
        rez = supervised_test[(supervised_test.rep_fn == rep_fn) & (supervised_test.comp_fn == comp_fn) & (supervised_test.general_train_rule == rule)]
        rez['train_test_accuracy'] = rez.test_test_accuracy
    else:
        rez = supervised_all[(supervised_all.rep_fn == rep_fn) & (supervised_all.comp_fn == comp_fn) & (supervised_all.general_train_rule == rule)]
    return rez['train_test_accuracy'].values[0]

In [358]:
get_baseline('codes.model.gat.sig_edge_gat.GatedNodeGatEncoder', 'codes.model.gat.edge_gat.GatedGatEncoder', 'rule_34')

0.9083378165960312

In [594]:
get_baseline('GAT', 'E-GAT', 'rule_54')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


0.6379983872175217

In [415]:
conv_sim_sim = pd.read_csv('raw_data/convergence_sim_sim.csv')

In [417]:
conv_sim_sim['rep_fn'] = conv_sim_sim.model_representation_fn_path.apply(lambda x: clean_model_names[x.split('.')[-1]])
conv_sim_sim['comp_fn'] = conv_sim_sim.model_composition_fn_path.apply(lambda x: clean_model_names[x.split('.')[-1]])

In [418]:
def get_conv_sim(rep_fn, comp_fn, rule):
    rez = conv_sim_sim[(conv_sim_sim.rep_fn == rep_fn) & (conv_sim_sim.comp_fn == comp_fn) & (conv_sim_sim.general_train_rule == rule)]
    return rez['train_test_accuracy'].values[0]

In [422]:
get_conv_sim('GAT', 'E-GAT', 'rule_40')

0.5059267207980156

In [426]:
def calc_conv_sim(df):
    df['k_conv'] = 0.0
    for i,row in df.iterrows():
        rule_worlds = row['rule_world'].split(',')
        #print(rule_worlds)
        #print("get_baseline({}, {}, rule)".format(row['rep_fn'], row['comp_fn']))
        mean_baseline = np.mean([get_conv_sim(row['rep_fn'], row['comp_fn'], rule) for rule in rule_worlds])
        df.at[i,'k_conv'] = mean_baseline
    df['recovery_conv'] = df.k_conv / df.baseline
    return df

In [427]:
sim_df_c = calc_conv_sim(sim_df_b)

In [429]:
sim_df_c.to_csv('clean_data/multitask_logic_sim_sim_eval_conv.csv')

## Table - Inductive Training on Similar datasets

In [1074]:
sim_sim_all = pd.read_csv('raw_data/multitask_logic_sim_sim_eval_all_results.csv')

In [1075]:
sim_sim_all.sort_values(by="rep_fn")

Unnamed: 0.1,Unnamed: 0,rep_fn,comp_fn,mode,test_rule,updates,accuracy,minibatch,epoch,acc_std,loss,k,top_mode,rule_world
0,0,GAT,E-GAT,test,,0,0.534,0,1900,0.109,2.597,0,test,"rule_5,rule_7,rule_1,rule_6,rule_9,rule_3,rule..."
3,3,GAT,RGCN,test,,0,0.474,0,1900,0.109,3.262,0,test,"rule_5,rule_7,rule_1,rule_6,rule_9,rule_3,rule..."
2,2,GCN,E-GAT,test,,0,0.522,0,1900,0.1,2.609,0,test,"rule_5,rule_7,rule_1,rule_6,rule_9,rule_3,rule..."
5,5,GCN,RGCN,test,,0,0.448,0,1900,0.089,4.142,0,test,"rule_5,rule_7,rule_1,rule_6,rule_9,rule_3,rule..."
1,1,Param,E-GAT,test,,0,0.507,0,1900,0.091,2.625,0,test,"rule_5,rule_7,rule_1,rule_6,rule_9,rule_3,rule..."
4,4,Param,RGCN,test,,0,0.416,0,1900,0.075,2.488,0,test,"rule_5,rule_7,rule_1,rule_6,rule_9,rule_3,rule..."


In [497]:
# print latex
def print_latex_body(df):
    df = df.sort_values(by='rep_fn')
    for i,row in df.iterrows():
        print("{} & {} & {} \pm{{{}}}".format(row['rep_fn'],row['comp_fn'],round(row['accuracy'],3),round(row['acc_std'],2)))

In [498]:
print_latex_body(sim_sim_all)

GAT & E-GAT & 0.534 \pm{0.11}
GAT & RGCN & 0.474 \pm{0.11}
GCN & E-GAT & 0.522 \pm{0.1}
GCN & RGCN & 0.448 \pm{0.09}
Param & E-GAT & 0.507 \pm{0.09}
Param & RGCN & 0.416 \pm{0.07}


In [650]:
supervised_test['ind_acc'] = 0.0
for i,row in supervised_test.iterrows():
    if not pd.isnull(row['test_test_accuracy']):
        supervised_test.at[i,'ind_acc'] = row['test_test_accuracy']
    else:
        supervised_test.at[i,'ind_acc'] = get_baseline(row['rep_fn'], row['comp_fn'], row['general_train_rule'])
    supervised_test.at[i,'num_des'] = num_des_dict[row['general_train_rule']]

In [653]:
supervised_test['rule_id'] = supervised_test.general_train_rule.apply(lambda x: int(x.split('_')[-1]))

In [657]:
supervised_test['model'] = supervised_test.rep_fn + '_' + supervised_test.comp_fn

In [660]:
cleaned_sup_test = supervised_test.sort_values(by='rule_id')[['general_train_rule','num_des','model','ind_acc']]

In [667]:
row_lines = []
last_row_w = ''
row_j = {}
for i,row in cleaned_sup_test.iterrows():
    if last_row_w != row['general_train_rule']:
        if len(row_j) > 0:
            row_lines.append(row_j)
        row_j = {'World ID':row['general_train_rule'], 'Number of Descriptors': int(row['num_des'])}
        last_row_w = row['general_train_rule']
    row_j.update({row['model']: round(row['ind_acc'],3)})
row_lines.append(row_j)

In [668]:
all_models_table = pd.DataFrame(row_lines)

In [671]:
all_models_table.std()

Number of Descriptors    776.417417
GAT_RGCN                   0.136884
GCN_E-GAT                  0.139301
Param_RGCN                 0.133182
GCN_RGCN                   0.136612
Param_E-GAT                0.144128
GAT_E-GAT                  0.141933
dtype: float64

In [672]:
### Adaptation on Similar to Sim + Diss
ad_5 = pd.read_csv('../scripts/multitask_logic_sim_sim_k5eval_results.csv')
ad_10 = pd.read_csv('../scripts/multitask_logic_sim_sim_k10eval_results.csv')
ad_15 = pd.read_csv('../scripts/multitask_logic_sim_sim_k15eval_results.csv')
ad_20 = pd.read_csv('../scripts/multitask_logic_sim_sim_k20eval_results.csv')

In [695]:
rule_grouping = {
    'rule_4,': 1,
    'rule_8,': 1,
    'rule_1,': 1,
    'rule_15,': 2,
    'rule_20,': 2,
    'rule_25,': 2,
    'rule_40,': 3,
    'rule_45,': 3,
    'rule_50,': 3
}
rule_grouping_gr = {
    'rule_4,rule_8,rule_1': 1,
    'rule_15,rule_20,rule_25': 2,
    'rule_40,rule_45,rule_50': 3
}

In [694]:
ad_0 = sim_df_c.copy()

In [696]:
ad_0['model'] = ad_0.rep_fn + '_' + ad_0.comp_fn
ad_0['group'] = ad_0.rule_world.apply(lambda x: rule_grouping_gr[x])

In [691]:
ad_5['model'] = ad_5.rep_fn + '_' + ad_5.comp_fn
ad_5['group'] = ad_5.rule_world.apply(lambda x: rule_grouping[x])
ad_10['model'] = ad_10.rep_fn + '_' + ad_10.comp_fn
ad_10['group'] = ad_10.rule_world.apply(lambda x: rule_grouping[x])
ad_15['model'] = ad_15.rep_fn + '_' + ad_15.comp_fn
ad_15['group'] = ad_15.rule_world.apply(lambda x: rule_grouping[x])
ad_20['model'] = ad_20.rep_fn + '_' + ad_20.comp_fn
ad_20['group'] = ad_20.rule_world.apply(lambda x: rule_grouping[x])

In [698]:
ad_0 = ad_0.groupby(['group','model']).mean().reset_index()
ad_5 = ad_5.groupby(['group','model']).mean().reset_index()
ad_10 = ad_10.groupby(['group','model']).mean().reset_index()
ad_15 = ad_15.groupby(['group','model']).mean().reset_index()
ad_20 = ad_20.groupby(['group','model']).mean().reset_index()

In [701]:
ad_0['k'] = 0
ad_5['k'] = 5
ad_10['k'] = 10
ad_15['k'] = 15
ad_20['k'] = 20

In [702]:
ad_all = pd.concat([ad_0, ad_5, ad_10, ad_15, ad_20])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [704]:
ad_all = ad_all[['model','accuracy','k','group']]

In [706]:
ad_all.to_csv('clean_data/multitask_sim_all_adapt.csv')

## Experiment 2 - Train on one similar split and evaluate on a dissimilar split

In [60]:
# train rules
','.join(rule_train_test_splits[0]['train'])

'rule_5,rule_7,rule_1,rule_6,rule_9,rule_3,rule_2,rule_10'

In [61]:
# test rules
','.join(rule_train_test_splits[2]['test'])

'rule_39,rule_34,rule_43'

In [135]:
','.join(rule_train_test_splits[1]['test'])

'rule_29,rule_32,rule_23'

In [136]:
','.join(rule_train_test_splits[3]['test'])

'rule_11,rule_12,rule_18'

In [137]:
','.join(rule_train_test_splits[4]['test'])

'rule_47,rule_51,rule_55'

|    | rep_fn   | comp_fn   | mode   | test_rule   |   updates |   accuracy |   minibatch |   epoch |   loss |   k | top_mode   | rule_world              |
|----|----------|-----------|--------|-------------|-----------|------------|-------------|---------|--------|-----|------------|-------------------------|
|  0 | GAT      | E-GAT     | test   |             |         0 |      0.099 |           0 |    1500 |  8.129 |   0 | test       | rule_39,rule_34,rule_43 |
|  1 | Param    | E-GAT     | test   |             |         0 |      0.095 |           0 |    1500 |  8.604 |   0 | test       | rule_39,rule_34,rule_43 |
|  2 | GCN      | E-GAT     | test   |             |         0 |      0.109 |           0 |    1500 |  8.375 |   0 | test       | rule_39,rule_34,rule_43 |
|  3 | GAT      | RGCN      | test   |             |         0 |      0.108 |           0 |    1500 | 10.618 |   0 | test       | rule_39,rule_34,rule_43 |
|  4 | Param    | RGCN      | test   |             |         0 |      0.092 |           0 |    1500 |  9.011 |   0 | test       | rule_39,rule_34,rule_43 |
|  5 | GCN      | RGCN      | test   |             |         0 |      0.104 |           0 |    1500 | 14.914 |   0 | test       | rule_39,rule_34,rule_43 |

## Experiment 3 - Train on mix of similar and dissimilar split and evaluate the change in the above

In [57]:
# train_rules
','.join(rule_train_test_splits[0]['train'][:-3] + rule_train_test_splits[4]['train'][0:3])

'rule_5,rule_7,rule_1,rule_6,rule_9,rule_53,rule_50,rule_56'

In [59]:
# test rules
','.join(rule_train_test_splits[0]['test'])

'rule_4,rule_8,rule_0'

In [266]:
sim_dis_train = 'rule_5,rule_7,rule_1,rule_6,rule_9,rule_53,rule_50,rule_56'

In [106]:
mean_max_overlap('rule_5,rule_7,rule_1,rule_6,rule_9,rule_53,rule_50,rule_56','rule_4,rule_8,rule_0')

19.0

|    | rep_fn   | comp_fn   | mode   | test_rule   |   updates |   accuracy |   minibatch |   epoch |   loss |   k | top_mode   | rule_world           |
|----|----------|-----------|--------|-------------|-----------|------------|-------------|---------|--------|-----|------------|----------------------|
|  0 | GAT      | E-GAT     | test   |             |         0 |      0.203 |           0 |    1500 |  5.994 |   0 | test       | rule_4,rule_8,rule_0 |
|  1 | Param    | E-GAT     | test   |             |         0 |      0.186 |           0 |    1500 |  6.042 |   0 | test       | rule_4,rule_8,rule_0 |
|  2 | GCN      | E-GAT     | test   |             |         0 |      0.223 |           0 |    1500 |  5.872 |   0 | test       | rule_4,rule_8,rule_0 |
|  3 | GAT      | RGCN      | test   |             |         0 |      0.196 |           0 |    1500 |  8.36  |   0 | test       | rule_4,rule_8,rule_0 |
|  4 | Param    | RGCN      | test   |             |         0 |      0.195 |           0 |    1500 |  6.435 |   0 | test       | rule_4,rule_8,rule_0 |
|  5 | GCN      | RGCN      | test   |             |         0 |      0.201 |           0 |    1500 |  9.655 |   0 | test       | rule_4,rule_8,rule_0 |

In [404]:
gradient = ['rule_4,rule_8,rule_1',
'rule_15,rule_20,rule_23',
'rule_30,rule_31,rule_32']

In [405]:
for g in gradient:
    print(perc_overlap_list(sim_dis_train, g))

1.0
0.5
0.09090909090909091


In [406]:
gradient_names = {g:round(perc_overlap_list(sim_dis_train, g), 3) for g in gradient}

In [407]:
gradient_names

{'rule_4,rule_8,rule_1': 1.0,
 'rule_15,rule_20,rule_23': 0.5,
 'rule_30,rule_31,rule_32': 0.091}

In [408]:
gradient_names['rule_30,rule_31,rule_32'] = 0

In [437]:
sim_dis_df = pd.read_csv('../scripts/multitask_logic_sim_dis_eval_results.csv')
sim_dis_df['rule_sp'] = sim_dis_df.rule_world.apply(lambda x: gradient_names[x])
#sim_dis_df.to_csv('clean_data/multitask_logic_sim_dis_eval.csv')

In [438]:
sim_dis_df_b = calc_recovery(sim_dis_df)

In [411]:
sim_dis_df_b.to_csv('clean_data/multitask_logic_sim_dis_eval_rec.csv')

In [446]:
conv_sim_dis = pd.read_csv('raw_data/convergence_sim_dis.csv')
conv_sim_dis['rep_fn'] = conv_sim_dis.model_representation_fn_path.apply(lambda x: clean_model_names[x.split('.')[-1]])
conv_sim_dis['comp_fn'] = conv_sim_dis.model_composition_fn_path.apply(lambda x: clean_model_names[x.split('.')[-1]])

In [447]:
def get_conv_sim_dis(rep_fn, comp_fn, rule):
    rez = conv_sim_dis[(conv_sim_dis.rep_fn == rep_fn) & (conv_sim_dis.comp_fn == comp_fn) & (conv_sim_dis.general_train_rule == rule)]
    return rez['train_test_accuracy'].values[0]

In [448]:
get_conv_sim_dis('GAT', 'E-GAT', 'rule_23')

0.486328125

In [450]:
def calc_conv_sim_dis(df):
    df['k_conv'] = 0.0
    for i,row in df.iterrows():
        rule_worlds = row['rule_world'].split(',')
        #print(rule_worlds)
        #print("get_baseline({}, {}, rule)".format(row['rep_fn'], row['comp_fn']))
        mean_baseline = np.mean([get_conv_sim_dis(row['rep_fn'], row['comp_fn'], rule) for rule in rule_worlds])
        df.at[i,'k_conv'] = mean_baseline
    df['recovery_conv'] = df.k_conv / df.baseline
    return df

In [451]:
sim_dis_df_c = calc_conv_sim_dis(sim_dis_df_b)

In [453]:
sim_dis_df_c.to_csv('clean_data/multitask_logic_sim_dis_eval_conv.csv')

## Table : Inductive Training on Dissimilar datasets

In [1076]:
sim_dis_all = pd.read_csv('raw_data/multitask_logic_sim_dis_eval_all_results.csv')
sim_dis_all.sort_values(by="rep_fn")

Unnamed: 0.1,Unnamed: 0,rep_fn,comp_fn,mode,test_rule,updates,accuracy,minibatch,epoch,acc_std,loss,k,top_mode,rule_world
0,0,GAT,E-GAT,test,,0,0.534,0,1900,0.092,2.838,0,test,"rule_5,rule_7,rule_1,rule_6,rule_9,rule_53,rul..."
3,3,GAT,RGCN,test,,0,0.502,0,1900,0.085,3.34,0,test,"rule_5,rule_7,rule_1,rule_6,rule_9,rule_53,rul..."
2,2,GCN,E-GAT,test,,0,0.533,0,1900,0.093,2.686,0,test,"rule_5,rule_7,rule_1,rule_6,rule_9,rule_53,rul..."
5,5,GCN,RGCN,test,,0,0.476,0,1900,0.089,4.232,0,test,"rule_5,rule_7,rule_1,rule_6,rule_9,rule_53,rul..."
1,1,Param,E-GAT,test,,0,0.5,0,1900,0.089,2.967,0,test,"rule_5,rule_7,rule_1,rule_6,rule_9,rule_53,rul..."
4,4,Param,RGCN,test,,0,0.449,0,1900,0.07,2.605,0,test,"rule_5,rule_7,rule_1,rule_6,rule_9,rule_53,rul..."


In [496]:
print_latex_body(sim_dis_all)

GAT & E-GAT & 0.534 & \pm{0.09}
GAT & RGCN & 0.502 & \pm{0.09}
GCN & E-GAT & 0.533 & \pm{0.09}
GCN & RGCN & 0.476 & \pm{0.09}
Param & E-GAT & 0.5 & \pm{0.09}
Param & RGCN & 0.449 & \pm{0.07}


## Experiment 3.1 - Train on a mix of similar and dissimilar, where the distribution of dissimilars are from a different distribution

In [109]:
# train_rules
','.join(rule_train_test_splits[0]['train'][:-3] + [rule_train_test_splits[4]['train'][0], rule_train_test_splits[3]['train'][2], rule_train_test_splits[2]['train'][4]])

'rule_5,rule_7,rule_1,rule_6,rule_9,rule_53,rule_22,rule_38'

In [110]:
# test rules
','.join(rule_train_test_splits[0]['test'])

'rule_4,rule_8,rule_0'

|    | rep_fn   | comp_fn   | mode   | test_rule   |   updates |   accuracy |   minibatch |   epoch |   loss |   k | top_mode   | rule_world           |
|----|----------|-----------|--------|-------------|-----------|------------|-------------|---------|--------|-----|------------|----------------------|
|  0 | GAT      | E-GAT     | test   |             |         0 |      0.209 |           0 |    1500 |  5.71  |   0 | test       | rule_4,rule_8,rule_0 |
|  1 | Param    | E-GAT     | test   |             |         0 |      0.182 |           0 |    1500 |  5.862 |   0 | test       | rule_4,rule_8,rule_0 |
|  2 | GCN      | E-GAT     | test   |             |         0 |      0.19  |           0 |    1500 |  5.343 |   0 | test       | rule_4,rule_8,rule_0 |
|  3 | GAT      | RGCN      | test   |             |         0 |      0.172 |           0 |    1500 |  7.361 |   0 | test       | rule_4,rule_8,rule_0 |
|  4 | Param    | RGCN      | test   |             |         0 |      0.165 |           0 |    1500 |  5.822 |   0 | test       | rule_4,rule_8,rule_0 |
|  5 | GCN      | RGCN      | test   |             |         0 |      0.157 |           0 |    1500 |  9.7   |   0 | test       | rule_4,rule_8,rule_0 |

In [336]:
## evidence experiment

In [339]:
sim_diff = 'rule_5,rule_7,rule_1,rule_6,rule_9,rule_53,rule_22,rule_38'
for g in gradient:
    print(perc_overlap_list(sim_diff, g))

1.0
1.0
1.0


In [343]:
for g in gradient:
    print(count_overlap_list(sim_diff, g))

1.8641975308641976
1.7678571428571428
0.46969696969696967


In [324]:
gradient

['rule_4,rule_8,rule_1', 'rule_15,rule_20,rule_23', 'rule_30,rule_31,rule_32']

In [325]:
gradient_names = {g:round(perc_overlap_list(sim_diff, g), 3) for g in gradient}

In [328]:
gradient_names

{'rule_4,rule_8,rule_1': 'Set A',
 'rule_15,rule_20,rule_23': 'Set B',
 'rule_30,rule_31,rule_32': 'Set C'}

In [327]:
gradient_names['rule_4,rule_8,rule_1'] = 'Set A'
gradient_names['rule_15,rule_20,rule_23'] = 'Set B'
gradient_names['rule_30,rule_31,rule_32'] = 'Set C'

In [329]:
sim_dis_df = pd.read_csv('../scripts/multitask_logic_sim_diff_eval_results.csv')
sim_dis_df['rule_sp'] = sim_dis_df.rule_world.apply(lambda x: gradient_names[x])
sim_dis_df.to_csv('clean_data/multitask_logic_sim_diff_eval.csv')

In [175]:
asl = [all_rules_mat[r] for r in sim_diff.split(',')]
asl = [x for r in asl for x in r]

In [178]:
len(asl)

160

In [180]:
tp = [all_rules_mat[r] for r in gradient[-1].split(',')]
tp = [x for r in tp for x in r]

In [182]:
aslc = Counter(asl)
tpc = Counter(tp)

In [185]:
len(tpc) / len(aslc)

0.4166666666666667

## Experiment: Train on entirely dissiimilar datasets

In [550]:
diff_diff = 'rule_0,rule_1,rule_2,rule_16,rule_17,rule_18,rule_32,rule_33,rule_34'
gradient = [
    'rule_10,rule_20,rule_30',
    'rule_6,rule_12,rule_25',
    'rule_54,rule_55,rule_56'
]

In [551]:
for g in gradient:
    print(count_overlap_list(diff_diff, g))

2.6625
2.6794871794871793
0.0


In [552]:
for g in gradient:
    print(perc_overlap_list(diff_diff, g))

1.0
1.0
0.0


In [553]:
gradient_names = {g:round(perc_overlap_list(sim_diff, g), 3) for g in gradient} = {g:round(perc_overlap_list(diff_diff, g), 3) for g in gradient}

In [582]:
gradient_names

{'rule_10,rule_20,rule_30': 1.0,
 'rule_6,rule_12,rule_25': 0.9,
 'rule_54,rule_55,rule_56': 0.0}

In [555]:
gradient_names['rule_6,rule_12,rule_25'] = 0.9

In [602]:
conv_diff = pd.read_csv('raw_data/convergence_diff_diff.csv')
conv_diff['rep_fn'] = conv_diff.model_representation_fn_path.apply(lambda x: clean_model_names[x.split('.')[-1]])
conv_diff['comp_fn'] = conv_diff.model_composition_fn_path.apply(lambda x: clean_model_names[x.split('.')[-1]])
def get_conv_diff(rep_fn, comp_fn, rule):
    rez = conv_diff[(conv_diff.rep_fn == rep_fn) & (conv_diff.comp_fn == comp_fn) & (conv_diff.general_train_rule == rule)]
    if rule in ['rule_54','rule_55','rule_56']:
        return rez['test_test_accuracy'].values[0]
    else:
        return rez['train_test_accuracy'].values[0]

In [603]:
def calc_conv_diff(df):
    df['k_conv'] = 0.0
    for i,row in df.iterrows():
        rule_worlds = row['rule_world'].split(',')
        #rint(rule_worlds)
        #print("get_baseline({}, {}, rule)".format(row['rep_fn'], row['comp_fn']))
        mean_baseline = np.mean([get_conv_diff(row['rep_fn'], row['comp_fn'], rule) for rule in rule_worlds])
        df.at[i,'k_conv'] = mean_baseline
    df['recovery_conv'] = df.k_conv / df.baseline
    return df

In [581]:
diff_df = pd.read_csv('../scripts/multitask_logic_diff_diff_results.csv')

In [500]:
# inductive test on the multitask train
diff_all = pd.read_csv('../scripts/multitask_logic_diff_diff_eval_all_results.csv')

In [501]:
print_latex_body(diff_all)

GAT & E-GAT & 0.515 \pm{0.16}
GAT & RGCN & 0.45 \pm{0.17}
GCN & E-GAT & 0.499 \pm{0.15}
GCN & RGCN & 0.451 \pm{0.17}
Param & E-GAT & 0.504 \pm{0.14}
Param & RGCN & 0.396 \pm{0.13}


In [583]:
diff_df['rule_sp'] = diff_df.rule_world.apply(lambda x: gradient_names[x])

In [595]:
diff_df_b = calc_recovery(diff_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [604]:
diff_df_c = calc_conv_diff(diff_df_b)

In [606]:
diff_df_c.to_csv('clean_data/multitask_logic_diff_diff_eval_conv.csv')

In [567]:
conv_diff.general_train_rule.unique()

array(['rule_25', 'rule_12', 'rule_10', 'rule_56', 'rule_20', 'rule_30',
       'rule_6', 'rule_54', 'rule_55'], dtype=object)

In [568]:
conv_diff.rep_fn.unique()

array(['GCN', 'Param', 'GAT'], dtype=object)

In [578]:
get_conv_diff('GAT','E-GAT','rule_10')

Series([], Name: train_test_accuracy, dtype: float64)

In [557]:
diff_df.to_csv('clean_data/multitask_logic_diff_diff_eval.csv')

## Splitting rules based on difficulty

In [65]:
scores = [(0.9336, "rule_34"),
(0.8482, "rule_46"),
(0.8416, "rule_37"),
(0.8286, "rule_20"),
(0.8083, "rule_48"),
(0.7993, "rule_31"),
(0.7914, "rule_35"),
(0.7868, "rule_19"),
(0.7593, "rule_9"),
(0.7438, "rule_39"),
(0.7415, "rule_49"),
(0.74, "rule_12"),
(0.7377, "rule_38"),
(0.7273, "rule_47"),
(0.7265, "rule_36"),
(0.713, "rule_24"),
(0.6921, "rule_18"),
(0.6898, "rule_33"),
(0.6715, "rule_32"),
(0.6645, "rule_50"),
(0.6422, "rule_8"),
(0.64, "rule_10"),
(0.6392, "rule_7"),
(0.6225, "rule_4"),
(0.6099, "rule_41"),
(0.6082, "rule_21"),
(0.5977, "rule_22"),
(0.5814, "rule_43"),
(0.5773, "rule_26"),
(0.572, "rule_28"),
(0.5692, "rule_45"),
(0.5561, "rule_44"),
(0.5523, "rule_11"),
(0.5336, "rule_40"),
(0.5316, "rule_42"),
(0.5165, "rule_6"),
(0.516, "rule_5"),
(0.5157, "rule_23"),
(0.4964, "rule_0"),
(0.4698, "rule_15"),
(0.4605, "rule_13"),
(0.432, "rule_1"),
(0.417, "rule_3"),
(0.4159, "rule_25"),
(0.4089, "rule_14"),
(0.4078, "rule_29"),
(0.4048, "rule_30"),
(0.3667, "rule_2"),
(0.3417, "rule_17"),
(0.3222, "rule_16")]

In [84]:
easy_rules = [s[1] for s in scores if s[0] >= 0.70]
medium_rules = [s[1] for s in scores if s[0] > 0.54 and s[0] < 0.7]
hard_rules = [s[1] for s in scores if s[0] <= 0.54]

In [708]:
difficulty_dict = {}
for e in easy_rules:
    difficulty_dict[e] = 'Easy World'
for e in medium_rules:
    difficulty_dict[e] = 'Medium World'
for e in hard_rules:
    difficulty_dict[e] = 'Hard World'

In [710]:
import json

In [711]:
json.dump(difficulty_dict, open('clean_data/difficulty_dict.json','w'))

In [707]:
easy_rules

['rule_34',
 'rule_46',
 'rule_37',
 'rule_20',
 'rule_48',
 'rule_31',
 'rule_35',
 'rule_19',
 'rule_9',
 'rule_39',
 'rule_49',
 'rule_12',
 'rule_38',
 'rule_47',
 'rule_36',
 'rule_24']

In [712]:
hard_rules

['rule_40',
 'rule_42',
 'rule_6',
 'rule_5',
 'rule_23',
 'rule_0',
 'rule_15',
 'rule_13',
 'rule_1',
 'rule_3',
 'rule_25',
 'rule_14',
 'rule_29',
 'rule_30',
 'rule_2',
 'rule_17',
 'rule_16']

In [96]:
def mean_max_overlap(tr,ts):
    tr = tr.split(',')
    ts = ts.split(',')
    scores = []
    for ti in ts:
        scores.append(max([similarity_dist[rules.index(p)][rules.index(ti)] for p in tr]))
    return np.mean(scores)

In [120]:
def mean_overlap(tr, ts):
    tr = tr.split(',')
    scores = []
    for ti in tr:
        scores.append(similarity_dist[rules.index(ti)][rules.index(ts)])
    return np.mean(scores)

In [131]:
def perc_overlap(tr, ts):
    tr = tr.split(',')
    scores = []
    all_rules = set([r for t in tr for r in all_rules_mat[t]])
    test_rule = all_rules_mat[ts]
    return len(all_rules_mat[ts].intersection(all_rules)) / len(all_rules)

In [191]:
def perc_overlap_list(tr, ts):
    tr = tr.split(',')
    ts = ts.split(',')
    scores = []
    base_rules = [all_rules_mat[r] for r in tr]
    base_rules = set([x for r in base_rules for x in r])
    comp_rules = [all_rules_mat[r] for r in ts]
    comp_rules = set([x for r in comp_rules for x in r])
    return len(comp_rules.intersection(base_rules)) / len(base_rules)

In [236]:
def perc_overlap_list(tr, ts):
    tr = tr.split(',')
    ts = ts.split(',')
    scores = []
    base_rules = [all_rules_mat[r] for r in tr]
    base_rules = set([x for r in base_rules for x in r])
    comp_rules = [all_rules_mat[r] for r in ts]
    comp_rules = set([x for r in comp_rules for x in r])
    return len(comp_rules.intersection(base_rules)) / len(comp_rules)

In [342]:
def count_overlap_list(tr, ts):
    tr = tr.split(',')
    ts = ts.split(',')
    scores = []
    base_rules = [all_rules_mat[r] for r in tr]
    base_rules = Counter([x for r in base_rules for x in r])
    comp_rules = [all_rules_mat[r] for r in ts]
    comp_rules = Counter([x for r in comp_rules for x in r])
    return np.mean([base_rules[c]/comp_rules[c] if c in base_rules else 0 for c in comp_rules])

In [85]:
len(easy_rules)

16

In [86]:
len(medium_rules)

17

In [87]:
len(hard_rules)

17

# Difficulty Experiments

## Experiment 4: Multitask Train on Easy, eval on Easy

In [89]:
## train_easy
','.join(easy_rules[:8])

'rule_34,rule_46,rule_37,rule_20,rule_48,rule_31,rule_35,rule_19'

In [93]:
## test easy
','.join(easy_rules[-3:])

'rule_47,rule_36,rule_24'

In [713]:
train_easy_worlds = 'rule_34,rule_46,rule_37,rule_20,rule_48,rule_31,rule_35,rule_19'

In [727]:
test_easy_worlds = 'rule_47,rule_36,rule_24'

In [745]:
test_easy_worlds = 'rule_9,rule_47,rule_24'

- **config_path** : `multitask/multitask_logic_easy_easy`

In [746]:
mean_max_overlap(train_easy_worlds, test_easy_worlds)

15.0

In [730]:
easy_rules

['rule_34',
 'rule_46',
 'rule_37',
 'rule_20',
 'rule_48',
 'rule_31',
 'rule_35',
 'rule_19',
 'rule_9',
 'rule_39',
 'rule_49',
 'rule_12',
 'rule_38',
 'rule_47',
 'rule_36',
 'rule_24']

|    | rep_fn   | comp_fn   | mode   | test_rule   |   updates |   accuracy |   minibatch |   epoch |   loss |   k | top_mode   | rule_world              |
|----|----------|-----------|--------|-------------|-----------|------------|-------------|---------|--------|-----|------------|-------------------------|
|  0 | GAT      | E-GAT     | test   |             |         0 |      0.23  |           0 |    1500 |  5.368 |   0 | test       | rule_47,rule_36,rule_24 |
|  1 | Param    | E-GAT     | test   |             |         0 |      0.255 |           0 |    1500 |  5.339 |   0 | test       | rule_47,rule_36,rule_24 |
|  2 | GCN      | E-GAT     | test   |             |         0 |      0.258 |           0 |    1500 |  4.685 |   0 | test       | rule_47,rule_36,rule_24 |
|  3 | GAT      | RGCN      | test   |             |         0 |      0.24  |           0 |    1500 |  6.054 |   0 | test       | rule_47,rule_36,rule_24 |
|  4 | Param    | RGCN      | test   |             |         0 |      0.208 |           0 |    1500 |  4.369 |   0 | test       | rule_47,rule_36,rule_24 |
|  5 | GCN      | RGCN      | test   |             |         0 |      0.218 |           0 |    1500 |  7.875 |   0 | test       | rule_47,rule_36,rule_24 |

## Experiment 7: Train on Easy, eval on Medium

In [741]:
mean_max_overlap(train_easy_worlds, test_medium_worlds)

16.0

|    | rep_fn   | comp_fn   | mode   | test_rule   |   updates |   accuracy |   minibatch |   epoch |   loss |   k | top_mode   | rule_world              |
|----|----------|-----------|--------|-------------|-----------|------------|-------------|---------|--------|-----|------------|-------------------------|
|  0 | GAT      | E-GAT     | test   |             |         0 |      0.244 |           0 |    1500 |  5.611 |   0 | test       | rule_45,rule_28,rule_11 |
|  1 | Param    | E-GAT     | test   |             |         0 |      0.225 |           0 |    1500 |  5.496 |   0 | test       | rule_45,rule_28,rule_11 |
|  2 | GCN      | E-GAT     | test   |             |         0 |      0.231 |           0 |    1500 |  5.131 |   0 | test       | rule_45,rule_28,rule_11 |
|  3 | GAT      | RGCN      | test   |             |         0 |      0.212 |           0 |    1500 |  6.997 |   0 | test       | rule_45,rule_28,rule_11 |
|  4 | Param    | RGCN      | test   |             |         0 |      0.192 |           0 |    1500 |  4.262 |   0 | test       | rule_45,rule_28,rule_11 |
|  5 | GCN      | RGCN      | test   |             |         0 |      0.191 |           0 |    1500 |  7.291 |   0 | test       | rule_45,rule_28,rule_11 |

## Experiment 8: Train on Easy, eval on Hard

In [742]:
mean_max_overlap(train_easy_worlds, test_hard_worlds)

11.666666666666666

|    | rep_fn   | comp_fn   | mode   | test_rule   |   updates |   accuracy |   minibatch |   epoch |   loss |   k | top_mode   | rule_world             |
|----|----------|-----------|--------|-------------|-----------|------------|-------------|---------|--------|-----|------------|------------------------|
|  0 | GAT      | E-GAT     | test   |             |         0 |      0.256 |           0 |    1500 |  4.933 |   0 | test       | rule_2,rule_17,rule_16 |
|  1 | Param    | E-GAT     | test   |             |         0 |      0.208 |           0 |    1500 |  4.939 |   0 | test       | rule_2,rule_17,rule_16 |
|  2 | GCN      | E-GAT     | test   |             |         0 |      0.232 |           0 |    1500 |  4.93  |   0 | test       | rule_2,rule_17,rule_16 |
|  3 | GAT      | RGCN      | test   |             |         0 |      0.198 |           0 |    1500 |  7.765 |   0 | test       | rule_2,rule_17,rule_16 |
|  4 | Param    | RGCN      | test   |             |         0 |      0.168 |           0 |    1500 |  4.182 |   0 | test       | rule_2,rule_17,rule_16 |
|  5 | GCN      | RGCN      | test   |             |         0 |      0.19  |           0 |    1500 |  7.841 |   0 | test       | rule_2,rule_17,rule_16 |

## Experiment 5: Multitask, Train on Medium, eval on Medium

In [90]:
## train medium
','.join(medium_rules[:8])

'rule_18,rule_33,rule_32,rule_50,rule_8,rule_10,rule_7,rule_4'

In [714]:
train_medium_worlds = 'rule_18,rule_33,rule_32,rule_50,rule_8,rule_10,rule_7,rule_4'

In [94]:
## test medium
','.join(medium_rules[-3:])

'rule_45,rule_44,rule_11'

In [724]:
test_medium_worlds = 'rule_45,rule_28,rule_11'

In [726]:
mean_max_overlap(train_medium_worlds, test_medium_worlds)

16.666666666666668

|    | rep_fn   | comp_fn   | mode   | test_rule   |   updates |   accuracy |   minibatch |   epoch |   loss |   k | top_mode   | rule_world              |
|----|----------|-----------|--------|-------------|-----------|------------|-------------|---------|--------|-----|------------|-------------------------|
|  0 | GAT      | E-GAT     | test   |             |         0 |      0.221 |           0 |    1500 |  5.507 |   0 | test       | rule_45,rule_44,rule_11 |
|  1 | Param    | E-GAT     | test   |             |         0 |      0.188 |           0 |    1500 |  5.904 |   0 | test       | rule_45,rule_44,rule_11 |
|  2 | GCN      | E-GAT     | test   |             |         0 |      0.175 |           0 |    1500 |  5.446 |   0 | test       | rule_45,rule_44,rule_11 |
|  3 | GAT      | RGCN      | test   |             |         0 |      0.191 |           0 |    1500 |  6.609 |   0 | test       | rule_45,rule_44,rule_11 |
|  4 | Param    | RGCN      | test   |             |         0 |      0.169 |           0 |    1500 |  4.964 |   0 | test       | rule_45,rule_44,rule_11 |
|  5 | GCN      | RGCN      | test   |             |         0 |      0.164 |           0 |    1500 |  8.957 |   0 | test       | rule_45,rule_44,rule_11 |

## Experiment 9: Train on Medium, eval on Easy

In [747]:
mean_max_overlap(train_medium_worlds, test_easy_worlds)

16.666666666666668

|    | rep_fn   | comp_fn   | mode   | test_rule   |   updates |   accuracy |   minibatch |   epoch |   loss |   k | top_mode   | rule_world              |
|----|----------|-----------|--------|-------------|-----------|------------|-------------|---------|--------|-----|------------|-------------------------|
|  0 | GAT      | E-GAT     | test   |             |         0 |      0.163 |           0 |    1500 |  6.486 |   0 | test       | rule_47,rule_36,rule_24 |
|  1 | Param    | E-GAT     | test   |             |         0 |      0.167 |           0 |    1500 |  6.207 |   0 | test       | rule_47,rule_36,rule_24 |
|  2 | GCN      | E-GAT     | test   |             |         0 |      0.145 |           0 |    1500 |  6.148 |   0 | test       | rule_47,rule_36,rule_24 |
|  3 | GAT      | RGCN      | test   |             |         0 |      0.131 |           0 |    1500 |  7.765 |   0 | test       | rule_47,rule_36,rule_24 |
|  4 | Param    | RGCN      | test   |             |         0 |      0.129 |           0 |    1500 |  5.567 |   0 | test       | rule_47,rule_36,rule_24 |
|  5 | GCN      | RGCN      | test   |             |         0 |      0.145 |           0 |    1500 | 10.325 |   0 | test       | rule_47,rule_36,rule_24 |

## Experiment 10: Train on Medium, eval on Hard

In [736]:
mean_max_overlap(train_medium_worlds, test_hard_worlds)

16.333333333333332

|    | rep_fn   | comp_fn   | mode   | test_rule   |   updates |   accuracy |   minibatch |   epoch |   loss |   k | top_mode   | rule_world             |
|----|----------|-----------|--------|-------------|-----------|------------|-------------|---------|--------|-----|------------|------------------------|
|  0 | GAT      | E-GAT     | test   |             |         0 |      0.252 |           0 |    1500 |  5.207 |   0 | test       | rule_30,rule_2,rule_16 |
|  1 | Param    | E-GAT     | test   |             |         0 |      0.23  |           0 |    1500 |  5.483 |   0 | test       | rule_30,rule_2,rule_16 |
|  2 | GCN      | E-GAT     | test   |             |         0 |      0.214 |           0 |    1500 |  5.019 |   0 | test       | rule_30,rule_2,rule_16 |
|  3 | GAT      | RGCN      | test   |             |         0 |      0.163 |           0 |    1500 |  6.343 |   0 | test       | rule_30,rule_2,rule_16 |
|  4 | Param    | RGCN      | test   |             |         0 |      0.17  |           0 |    1500 |  4.487 |   0 | test       | rule_30,rule_2,rule_16 |
|  5 | GCN      | RGCN      | test   |             |         0 |      0.158 |           0 |    1500 |  8.024 |   0 | test       | rule_30,rule_2,rule_16 |

## Experiment 6: Multitask, Train on Hard, Eval on Hard

In [91]:
## train hard
','.join(hard_rules[:8])

'rule_40,rule_42,rule_6,rule_5,rule_23,rule_0,rule_15,rule_13'

In [715]:
train_hard_worlds = 'rule_40,rule_42,rule_6,rule_5,rule_23,rule_0,rule_15,rule_13'

In [113]:
## test hard
','.join(random.sample(hard_rules[8:],3))

'rule_30,rule_2,rule_16'

In [722]:
mean_max_overlap(train_easy_worlds, 'rule_30,rule_2,rule_16')

11.666666666666666

In [723]:
test_hard_worlds = 'rule_25,rule_2,rule_16'

|    | rep_fn   | comp_fn   | mode   | test_rule   |   updates |   accuracy |   minibatch |   epoch |   loss |   k | top_mode   | rule_world             |
|----|----------|-----------|--------|-------------|-----------|------------|-------------|---------|--------|-----|------------|------------------------|
|  0 | GAT      | E-GAT     | test   |             |         0 |      0.177 |           0 |    1500 |  6.25  |   0 | test       | rule_30,rule_2,rule_16 |
|  1 | Param    | E-GAT     | test   |             |         0 |      0.143 |           0 |    1500 |  6.39  |   0 | test       | rule_30,rule_2,rule_16 |
|  2 | GCN      | E-GAT     | test   |             |         0 |      0.152 |           0 |    1500 |  5.444 |   0 | test       | rule_30,rule_2,rule_16 |
|  3 | GAT      | RGCN      | test   |             |         0 |      0.181 |           0 |    1500 |  7.46  |   0 | test       | rule_30,rule_2,rule_16 |
|  4 | Param    | RGCN      | test   |             |         0 |      0.138 |           0 |    1500 |  5.354 |   0 | test       | rule_30,rule_2,rule_16 |
|  5 | GCN      | RGCN      | test   |             |         0 |      0.168 |           0 |    1500 |  8.855 |   0 | test       | rule_30,rule_2,rule_16 |

## Experiment 11: Train on Hard, eval on Easy

In [793]:
from itertools import combinations

In [802]:
def find_highest_overlap(all_rules, t1):
    t1 = t1.split(',')
    avbl = [t for t in all_rules if t not in t1]
    print(len(avbl))
    test_s = []
    for comb in combinations(avbl, 3):
        t = ','.join(comb)
        test_s.append((t, perc_overlap_list(','.join(t1),t)))
    test_s = sorted(test_s, key=lambda x: x[1], reverse=True)
    return test_s[0]

In [923]:
def find_lowest_overlap(all_rules, t1):
    t1 = t1.split(',')
    avbl = [t for t in all_rules if t not in t1]
    print(len(avbl))
    test_s = []
    for comb in combinations(avbl, 3):
        t = ','.join(comb)
        test_s.append((t, perc_overlap_list(','.join(t1),t)))
    test_s = sorted(test_s, key=lambda x: x[1])
    return test_s[0]

In [771]:
perc_overlap_list(train_easy_worlds, test_easy_worlds)

0.8181818181818182

In [772]:
perc_overlap_list(train_easy_worlds, test_medium_worlds)

0.8518518518518519

In [773]:
perc_overlap_list(train_easy_worlds, test_hard_worlds)

0.6046511627906976

In [928]:
train_easy_worlds

'rule_34,rule_46,rule_37,rule_20,rule_48,rule_31,rule_35,rule_19'

In [929]:
train_medium_worlds

'rule_18,rule_33,rule_32,rule_50,rule_8,rule_10,rule_7,rule_4'

In [930]:
train_hard_worlds

'rule_40,rule_42,rule_6,rule_5,rule_23,rule_0,rule_15,rule_13'

In [952]:
easy_rules

['rule_34',
 'rule_46',
 'rule_37',
 'rule_20',
 'rule_48',
 'rule_31',
 'rule_35',
 'rule_19',
 'rule_9',
 'rule_39',
 'rule_49',
 'rule_12',
 'rule_38',
 'rule_47',
 'rule_36',
 'rule_24']

In [958]:
c1 = ['rule_34','rule_20','rule_31','rule_19','rule_9','rule_12','rule_24']

In [967]:
','.join(c1)

'rule_34,rule_20,rule_31,rule_19,rule_9,rule_12,rule_24'

In [960]:
c1_m = (np.mean([get_baseline('GAT','E-GAT',c) for c in c1]), np.std([get_baseline('GAT','E-GAT',c) for c in c1]))

In [961]:
c1_m

(0.7932621964386531, 0.05851431226229156)

In [962]:
c2 = ['rule_18','rule_33','rule_32','rule_8','rule_10','rule_7','rule_26']
c2_m = (np.mean([get_baseline('GAT','E-GAT',c) for c in c2]), np.std([get_baseline('GAT','E-GAT',c) for c in c2]))

In [968]:
','.join(c2)

'rule_18,rule_33,rule_32,rule_8,rule_10,rule_7,rule_26'

In [963]:
c2_m

(0.6472204050847462, 0.04482468531224333)

In [964]:
c3 = ['rule_6','rule_5','rule_23','rule_15','rule_29','rule_16','rule_30']
c3_m = (np.mean([get_baseline('GAT','E-GAT',c) for c in c3]),np.std([get_baseline('GAT','E-GAT',c) for c in c3]))

In [969]:
','.join(c3)

'rule_6,rule_5,rule_23,rule_15,rule_29,rule_16,rule_30'

In [965]:
c3_m

(0.4604997304933412, 0.07448555465864513)

## Debiasing

In [803]:
find_highest_overlap(easy_rules, train_easy_worlds)

8


('rule_39,rule_38,rule_47', 1.0)

In [924]:
find_lowest_overlap(easy_rules, train_easy_worlds)

8


('rule_9,rule_12,rule_24', 0.7142857142857143)

In [817]:
test_easy_easy = 'rule_39,rule_38,rule_47'

In [805]:
find_highest_overlap(medium_rules, train_easy_worlds)

17


('rule_33,rule_32,rule_41', 1.0)

In [925]:
find_lowest_overlap(medium_rules, train_easy_worlds)

17


('rule_8,rule_7,rule_4', 0.375)

In [818]:
test_easy_med = 'rule_33,rule_32,rule_41'

In [806]:
find_highest_overlap(hard_rules, train_easy_worlds)

17


('rule_40,rule_42,rule_23', 1.0)

In [926]:
find_lowest_overlap(hard_rules, train_easy_worlds)

17


('rule_0,rule_1,rule_2', 0.13636363636363635)

In [819]:
test_easy_hard = 'rule_40,rule_42,rule_23'

In [807]:
find_highest_overlap(easy_rules, train_medium_worlds)

16


('rule_34,rule_46,rule_37', 1.0)

In [823]:
test_med_easy = 'rule_34,rule_46,rule_37'

In [808]:
find_highest_overlap(medium_rules, train_medium_worlds)

9


('rule_41,rule_21,rule_22', 1.0)

In [824]:
test_med_med = 'rule_41,rule_21,rule_22'

In [809]:
find_highest_overlap(hard_rules, train_medium_worlds)

17


('rule_40,rule_42,rule_6', 1.0)

In [825]:
test_med_hard = 'rule_40,rule_42,rule_6'

In [810]:
find_highest_overlap(easy_rules, train_hard_worlds)

16


('rule_34,rule_37,rule_20', 1.0)

In [826]:
test_hard_easy = 'rule_34,rule_37,rule_20'

In [811]:
find_highest_overlap(medium_rules, train_hard_worlds)

17


('rule_18,rule_33,rule_32', 1.0)

In [827]:
test_hard_med = 'rule_18,rule_33,rule_32'

In [812]:
find_highest_overlap(hard_rules, train_hard_worlds)

9


('rule_1,rule_3,rule_25', 1.0)

In [828]:
test_hard_hard = 'rule_1,rule_3,rule_25'

In [813]:
train_hard_worlds

'rule_40,rule_42,rule_6,rule_5,rule_23,rule_0,rule_15,rule_13'

In [776]:
perc_overlap_list(train_hard_worlds, test_hard_worlds)

1.0

In [783]:
perc_overlap_list(train_medium_worlds, test_easy_worlds)

1.0

In [784]:
perc_overlap_list(train_medium_worlds, test_medium_worlds)

1.0

In [785]:
perc_overlap_list(train_medium_worlds, test_hard_worlds)

0.9534883720930233

## Experiment 12: Train on Hard, eval on Medium

In [526]:
df_easy = pd.read_csv('../scripts/multitask_logic_easy_easy_eval_results.csv')
df_med = pd.read_csv('../scripts/multitask_logic_med_med_eval_results.csv')
df_hard = pd.read_csv('../scripts/multitask_logic_hard_hard_eval_results.csv')

In [537]:
df_easy = pd.read_csv('../scripts/multitask_logic_easy_easy_eval_indv_results.csv')
df_med = pd.read_csv('../scripts/multitask_logic_med_med_eval_indv_results.csv')
df_hard = pd.read_csv('../scripts/multitask_logic_hard_hard_eval_indv_results.csv')

In [532]:
clean_rule_world = {
    'rule_47,rule_36,rule_24': 'easy',
    'rule_45,rule_28,rule_11': 'medium',
    'rule_30,rule_2,rule_16': 'hard',
    'rule_47': 'easy',
    'rule_36': 'easy',
    'rule_24': 'easy',
    'rule_45': 'medium',
    'rule_28': 'medium',
    'rule_11': 'medium',
    'rule_30': 'hard',
    'rule_2': 'hard',
    'rule_16': 'hard'
}

In [538]:
df_easy['test_world'] = df_easy.rule_world.apply(lambda x: clean_rule_world[x.split(',')[0]])
df_easy['train_world'] = 'easy'
df_med['test_world'] = df_med.rule_world.apply(lambda x: clean_rule_world[x.split(',')[0]])
df_med['train_world'] = 'medium'
df_hard['test_world'] = df_hard.rule_world.apply(lambda x: clean_rule_world[x.split(',')[0]])
df_hard['train_world'] = 'hard'

In [539]:
df_diff = pd.concat([df_easy, df_med, df_hard])

In [540]:
df_diff.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,Unnamed: 0.1.1.1,Unnamed: 0.1.1.1.1,Unnamed: 0.1.1.1.1.1,Unnamed: 0.1.1.1.1.1.1,Unnamed: 0.1.1.1.1.1.1.1,Unnamed: 0.1.1.1.1.1.1.1.1,acc_std,...,loss,minibatch,mode,rep_fn,rule_world,test_rule,top_mode,updates,test_world,train_world
0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.514,0,test,GAT,"rule_36,",,test,0,easy,easy
1,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,...,4.486,0,test,Param,"rule_36,",,test,0,easy,easy
2,2,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,0.0,...,3.817,0,test,GCN,"rule_36,",,test,0,easy,easy
3,3,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,0.0,...,5.632,0,test,GAT,"rule_36,",,test,0,easy,easy
4,4,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,0.0,...,3.802,0,test,Param,"rule_36,",,test,0,easy,easy


In [542]:
df_diff['recovery'] = 0.0
for i,row in df_diff.iterrows():
    base = get_baseline(row['rep_fn'],row['comp_fn'],row['rule_world'].split(',')[0])
    df_diff.at[i,'recovery'] = row['accuracy'] / base

In [543]:
df_diff.to_csv('clean_data/multitask_logic_diff_eval.csv')

In [502]:
## convergence results

In [514]:
df_easy_all = pd.read_csv('raw_data/convergence_easy_all.csv')
df_easy_all['test_world'] = df_easy_all.general_train_rule.apply(lambda x: clean_rule_world[x])
df_easy_all['train_world'] = 'easy'

In [515]:
df_medium_all = pd.read_csv('raw_data/convergence_medium_all.csv')
df_medium_all['test_world'] = df_medium_all.general_train_rule.apply(lambda x: clean_rule_world[x])
df_medium_all['train_world'] = 'medium'

In [516]:
df_hard_all = pd.read_csv('raw_data/convergence_hard_all.csv')
df_hard_all['test_world'] = df_hard_all.general_train_rule.apply(lambda x: clean_rule_world[x])
df_hard_all['train_world'] = 'hard'

In [517]:
df_e_m_h = pd.concat([df_easy_all,df_medium_all,df_hard_all])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [519]:
clean_model_names = {
    "GatedNodeGatEncoder": "GAT",
    "GatedGatEncoder": "E-GAT",
    "RepresentationGCNEncoder": "GCN",
    "CompositionRGCNEncoder": "RGCN",
    "Param": "Param",
}
df_e_m_h['rep_fn'] = df_e_m_h.model_representation_fn_path.apply(lambda x: clean_model_names[x.split('.')[-1]])
df_e_m_h['comp_fn'] = df_e_m_h.model_composition_fn_path.apply(lambda x: clean_model_names[x.split('.')[-1]])

In [525]:
df_e_m_h.to_csv('clean_data/multitask_diff_conv_eval.csv')

In [524]:
df_e_m_h['recovery'] = 0.0
for i,row in df_e_m_h.iterrows():
    base = get_baseline(row['rep_fn'],row['comp_fn'],row['general_train_rule'])
    df_e_m_h.at[i,'recovery'] = row['train_test_accuracy'] / base

In [523]:
get_baseline('GCN','RGCN','rule_16')

0.30630388110876083

In [769]:
train_easy_worlds

'rule_34,rule_46,rule_37,rule_20,rule_48,rule_31,rule_35,rule_19'

In [750]:
test_easy_worlds

'rule_9,rule_47,rule_24'

In [752]:
test_medium_worlds

'rule_45,rule_28,rule_11'

In [751]:
test_hard_worlds

'rule_25,rule_2,rule_16'

In [1062]:
# easy_all = pd.read_csv('../scripts/multitask_logic_easy_easy_k20eval_results.csv')
# med_all = pd.read_csv('../scripts/multitask_logic_med_med_k20eval_results.csv')
# hard_all = pd.read_csv('../scripts/multitask_logic_hard_hard_k20eval_results.csv')
# easy_all = pd.read_csv('../scripts/multitask_logic_easy_easy_k10eval_results.csv')
# med_all = pd.read_csv('../scripts/multitask_logic_med_med_k10eval_results.csv')
# hard_all = pd.read_csv('../scripts/multitask_logic_hard_hard_k10eval_results.csv')
# easy_all = pd.read_csv('../scripts/multitask_logic_easy_easy_k30eval_results.csv')
# med_all = pd.read_csv('../scripts/multitask_logic_med_med_k30eval_results.csv')
# hard_all = pd.read_csv('../scripts/multitask_logic_hard_hard_k30eval_results.csv')
# easy_all = pd.read_csv('../scripts/multitask_logic_easy_easy_k5eval_results.csv')
# med_all = pd.read_csv('../scripts/multitask_logic_med_med_k5eval_results.csv')
# hard_all = pd.read_csv('../scripts/multitask_logic_hard_hard_k5eval_results.csv')
# easy_all = pd.read_csv('../scripts/multitask_logic_easy_easy_k15eval_results.csv')
# med_all = pd.read_csv('../scripts/multitask_logic_med_med_k15eval_results.csv')
# hard_all = pd.read_csv('../scripts/multitask_logic_hard_hard_k15eval_results.csv')
# easy_all = pd.read_csv('../scripts/multitask_logic_easy_easy_k25eval_results.csv')
# med_all = pd.read_csv('../scripts/multitask_logic_med_med_k25eval_results.csv')
# hard_all = pd.read_csv('../scripts/multitask_logic_hard_hard_k25eval_results.csv')
# easy_all = pd.read_csv('../scripts/multitask_logic_easy_easy_k0eval_results.csv')
# med_all = pd.read_csv('../scripts/multitask_logic_med_med_k0eval_results.csv')
# hard_all = pd.read_csv('../scripts/multitask_logic_hard_hard_k0eval_results.csv')
# easy_all = pd.read_csv('../scripts/multitask_logic_easy_easy_k40eval_results.csv')
# med_all = pd.read_csv('../scripts/multitask_logic_med_med_k40eval_results.csv')
# hard_all = pd.read_csv('../scripts/multitask_logic_hard_hard_k40eval_results.csv')
easy_all = pd.read_csv('../scripts/multitask_logic_easy_easy_k-1eval_results.csv')
med_all = pd.read_csv('../scripts/multitask_logic_med_med_k-1eval_results.csv')
hard_all = pd.read_csv('../scripts/multitask_logic_hard_hard_k-1eval_results.csv')


In [1063]:
easy_all.rule_world = easy_all.rule_world.apply(lambda x: x.split(',')[0])
med_all.rule_world = med_all.rule_world.apply(lambda x: x.split(',')[0])
hard_all.rule_world = hard_all.rule_world.apply(lambda x: x.split(',')[0])

In [982]:
test_comb = 'rule_54,rule_55,rule_56'

In [1064]:
easy_all[easy_all.rule_world.isin(test_comb.split(','))].accuracy.mean()

0.5213148148148149

In [1065]:
med_all[med_all.rule_world.isin(test_comb.split(','))].accuracy.mean()

0.5397962962962963

In [1066]:
hard_all[hard_all.rule_world.isin(test_comb.split(','))].accuracy.mean()

0.5388333333333333

In [974]:
easy_all[easy_all.rule_world.isin(test_easy_med.split(','))].accuracy.mean()

0.49283333333333335

In [975]:
easy_all[easy_all.rule_world.isin(test_easy_hard.split(','))].accuracy.mean()

0.4297777777777778

In [916]:
hard_all[hard_all.rule_world.isin(test_hard_easy.split(','))].accuracy.mean()

0.204

In [917]:
hard_all[hard_all.rule_world.isin(test_hard_med.split(','))].accuracy.mean()

0.11325

In [918]:
hard_all[hard_all.rule_world.isin(test_hard_hard.split(','))].accuracy.mean()

0.18241666666666667

In [919]:
test_easy_modes = {}
for t in test_easy_easy.split(','):
    test_easy_modes[t] = 'easy'
for t in test_easy_med.split(','):
    test_easy_modes[t] = 'medium'
for t in test_easy_hard.split(','):
    test_easy_modes[t] = 'hard'

test_med_modes = {}
for t in test_med_easy.split(','):
    test_med_modes[t] = 'easy'
for t in test_med_med.split(','):
    test_med_modes[t] = 'medium'
for t in test_med_hard.split(','):
    test_med_modes[t] = 'hard'
    
test_hard_modes = {}
for t in test_hard_easy.split(','):
    test_hard_modes[t] = 'easy'
for t in test_hard_med.split(','):
    test_hard_modes[t] = 'medium'
for t in test_hard_hard.split(','):
    test_hard_modes[t] = 'hard'

In [1009]:
easy_all['train_world'] = 'easy'
easy_all['test_world'] = easy_all.rule_world.apply(lambda x: test_easy_modes[x])
med_all['train_world'] = 'medium'
med_all['test_world'] = med_all.rule_world.apply(lambda x: test_med_modes[x])
hard_all['train_world'] = 'hard'
hard_all['test_world'] = hard_all.rule_world.apply(lambda x: test_hard_modes[x])

KeyError: 'rule_54'

In [1067]:
easy_all['train_world'] = 'easy'
easy_all['test_world'] = 'novel'
med_all['train_world'] = 'medium'
med_all['test_world'] = 'novel'
hard_all['train_world'] = 'hard'
hard_all['test_world'] = 'novel'

In [1068]:
emh_all = pd.concat([easy_all, med_all, hard_all])

In [1070]:
# emh_all.to_csv('clean_data/k_20_difficulty.csv')
# emh_all.to_csv('clean_data/k_10_difficulty.csv')
# emh_all.to_csv('clean_data/k_30_difficulty.csv')
# emh_all.to_csv('clean_data/k_5_difficulty.csv')
# emh_all.to_csv('clean_data/k_15_difficulty.csv')
# emh_all.to_csv('clean_data/k_25_difficulty.csv')
# emh_all.to_csv('clean_data/k_0_difficulty.csv')
# emh_all.to_csv('clean_data/k_40_difficulty.csv')
# emh_all.to_csv('clean_data/novel_k_0_difficulty.csv')
# emh_all.to_csv('clean_data/novel_k_5_difficulty.csv')
# emh_all.to_csv('clean_data/novel_k_10_difficulty.csv')
# emh_all.to_csv('clean_data/novel_k_15_difficulty.csv')
# emh_all.to_csv('clean_data/novel_k_20_difficulty.csv')
# emh_all.to_csv('clean_data/novel_k_30_difficulty.csv')
# emh_all.to_csv('clean_data/novel_k_40_difficulty.csv')
emh_all.to_csv('clean_data/novel_k_100_difficulty.csv')

In [822]:
easy_all[easy_all.rule_world == test_easy_hard].accuracy.mean()

0.27566666666666667

In [829]:
hard_all[hard_all.rule_world == test_hard_easy].accuracy.mean()

0.20333333333333334

In [830]:
hard_all[hard_all.rule_world == test_hard_med].accuracy.mean()

0.11183333333333333

In [831]:
hard_all[hard_all.rule_world == test_hard_hard].accuracy.mean()

0.1835

In [832]:
med_all[med_all.rule_world == test_med_easy].accuracy.mean()

0.1645

In [833]:
med_all[med_all.rule_world == test_med_med].accuracy.mean()

0.17300000000000001

In [834]:
med_all[med_all.rule_world == test_med_hard].accuracy.mean()

0.1775