In [1]:
import pandas as pd
import warnings
import os
import numpy as np
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
plt.rc('font', family='serif')
plt.rc('xtick', labelsize='x-small')
plt.rc('ytick', labelsize='x-small')
plt.rc('text', usetex=False)
import matplotlib as mpl
mpl.rcParams['figure.dpi'] = 300

### Results Loading Code

In [2]:
sns.set(style="ticks", font_scale=1.5, color_codes=True)
# sns.set(, rc={'text.usetex' : False})

In [3]:
def get_splits(fname):
    splits = !grep -hnr 'TEST EVALS\|VAL EVALS\|TEST EASY\|VAL EASY' {fname}
    length = !wc -l {fname}
    length = int(length[0].split(' ')[0])
    starts = []
    names  = []
    for s in splits:
        start,sname = s.split(':')
        starts.append(int(start))
        names.append(sname)
    starts.append(length)
    sdict = {}
    for i in range(0,len(starts)-1):
        sdict[names[i]] = slice(starts[i],starts[i+1])
    return sdict   

In [4]:
def get_tokens(line):
    return line.split(" ")[1:]

In [5]:
def find_next_matching_block(lines,start):
    index = start
    found = False
    for l in lines[start:]:
        if l.startswith('INPUT:'):
            found = True
            break
        index+=1
    if found:
        inp   = lines[index]
        ref   = get_tokens(lines[index+1])
        pred  = get_tokens(lines[index+2])
        return (inp,ref,pred), index+3
    else:
        return None, None

In [6]:
def calculate_scores(lines):
    start=0 
    finished = False
    preds, tps, fps, fns, f1s= [],[],[],[],[]
    while not finished:
        data, start = find_next_matching_block(lines,start)
        if data is not None:
            inp, ref, pred_here = data
            tp = (len([p for p in pred_here if p in ref]))
            fp = (len([p for p in pred_here if p not in ref]))
            fn = (len([p for p in ref if p not in pred_here]))
            prec = tp / (tp + fp)
            rec = tp / (tp + fn)
            if prec == 0 or rec == 0:
                f1 = 0
            else:
                f1 = 2 * prec * rec / (prec + rec)
            f1s.append(f1)
            tps.append(tp)
            fps.append(fp)
            fns.append(fn)
            preds.append(pred_here == ref)
        else:
            finished = True
    tp, fp, fn = np.sum(tps), np.sum(fps), np.sum(fns)
    prec = tp / (tp + fp)
    rec = tp / (tp + fn)
    if prec == 0 or rec == 0:
        f1 = 0
    else:
        f1 = 2 * prec * rec / (prec + rec)
    return np.mean(preds), np.std(preds),f1,np.std(f1s)

In [7]:
def calculate_scores_unreduced(lines):
    start=0 
    finished = False
    preds, tps, fps, fns, f1s= [],[],[],[],[]
    while not finished:
        data, start = find_next_matching_block(lines,start)
        if data is not None:
            inp, ref, pred_here = data
            tp = (len([p for p in pred_here if p in ref]))
            fp = (len([p for p in pred_here if p not in ref]))
            fn = (len([p for p in ref if p not in pred_here]))
            prec = tp / (tp + fp)
            rec = tp / (tp + fn)
            if prec == 0 or rec == 0:
                f1 = 0
            else:
                f1 = 2 * prec * rec / (prec + rec)
            f1s.append(f1)
            tps.append(tp)
            fps.append(fp)
            fns.append(fn)
            preds.append(pred_here == ref)
        else:
            finished = True
    tp, fp, fn = np.sum(tps), np.sum(fps), np.sum(fns)
    prec = tp / (tp + fp)
    rec = tp / (tp + fn)
    if prec == 0 or rec == 0:
        f1 = 0
    else:
        f1 = 2 * prec * rec / (prec + rec)
    return np.mean(preds), preds, f1, f1s

In [8]:
testfile = "./checkpoints/SIGDataSet/spanish/logs/2proto.vae.true.hints.4.seed.0.cond.log"
testlines  = open(testfile,'r').readlines()
print(testfile)
for (s,r) in get_splits(testfile).items():
         print(f"split: {s}\n",calculate_scores(testlines[r]))

./checkpoints/SIGDataSet/spanish/logs/2proto.vae.true.hints.4.seed.0.cond.log
split: TEST EVALS
 (0.14, 0.34698703145794946, 0.7411865864144453, 0.18678766329735408)
split: VAL EVALS
 (0.02, 0.13999999999999999, 0.47993579454253615, 0.19854781800481414)
split: TEST EASY
 (0.61, 0.4877499359302879, 0.8627450980392157, 0.19229000089563794)
split: VAL EASY
 (0.62, 0.48538644398046393, 0.8636763412489007, 0.22038132700600857)


In [9]:
def get_lang_scores(df=None,
                    langs=("spanish","turkish","swahili"),
                    hintss=(4,8,16),
                    seeds=(0,1,2,3,4),
                    vaes =("true","false"),
                    models=("baseline","0proto","1proto","2proto"),
                    exppath="./checkpoints",
                   ):
    for lang in langs:
        for hints in hintss:
            for seed in seeds:
                for vae in vaes:
                    for model in  models:
                        langpath=os.path.join(exppath,"SIGDataSet",lang)
                        if model == "baseline" or model == "geca":
                            identifier ="{}.hints.{}.seed.{}".format(model,hints,seed)
                        else:
                            identifier ="{}.vae.{}.hints.{}.seed.{}".format(model,vae,hints,seed)
                        condfile=os.path.join(langpath,"logs",identifier+".cond.log") 
                        if os.path.exists(condfile):
                            lines  = open(condfile,'r').readlines()
                            if len(lines) < 2142:
                                print("format broken in "+condfile)
                                continue
#                             print("processing: "+condfile)
                            for (s,r) in get_splits(condfile).items():#splitinfo.items():
                                acc, accstd, f1, f1std = calculate_scores(lines[r])  
                                df.loc[len(df.index)] = (lang,hints,seed,vae,model,s,acc,accstd,f1,f1std)
                        else:
                            print(f"file doesnot exist: {condfile}")

### Morph Results

In [10]:
df = pd.DataFrame(columns=('Language', 'Hints', 'Seed', 'Vae','Model','Split','Acc','Acc_std','F1','F1_std',))
get_lang_scores(df=df,exppath="./checkpoints")
get_lang_scores(df=df,exppath="./checkpoints_gecaexp",
                models=("geca",))
df

Unnamed: 0,Language,Hints,Seed,Vae,Model,Split,Acc,Acc_std,F1,F1_std
0,spanish,4,0,true,baseline,TEST EVALS,0.01,0.099499,0.595486,0.172985
1,spanish,4,0,true,baseline,VAL EVALS,0.03,0.170587,0.465753,0.201385
2,spanish,4,0,true,baseline,TEST EASY,0.64,0.480000,0.873440,0.191549
3,spanish,4,0,true,baseline,VAL EASY,0.61,0.487750,0.866667,0.201838
4,spanish,4,0,true,0proto,TEST EVALS,0.20,0.400000,0.719523,0.207692
...,...,...,...,...,...,...,...,...,...,...
1795,swahili,16,4,true,geca,VAL EASY,0.57,0.495076,0.893037,0.161466
1796,swahili,16,4,false,geca,TEST EVALS,0.04,0.195959,0.746372,0.127119
1797,swahili,16,4,false,geca,VAL EVALS,0.12,0.324962,0.707844,0.228641
1798,swahili,16,4,false,geca,TEST EASY,0.51,0.499900,0.882096,0.167574


#### Graph

In [11]:
# #options
# splits_s = ["TEST EVALS", "TEST EASY", "VAL EVALS"]
# splits_s_alt = ["Future Tense", "Present Tense", "Past Tense"]
# score = "F1"
# cols = ["Language", "Seed", "Model", "Split", score]
# vae = "false"
# hints = 8
def get_morph_results_graph(df):

    #filter based on options
    df_filtered = df.replace(splits_s, splits_s_alt).loc[(df['Split'].isin(splits_s)) & (df['Vae'] == vae)  & (df["Hints"]==hints), cols]
    df_filtered.head()
    print(len(df_filtered))

    #aggregate to get the mean
    agg = df_filtered.groupby(by=["Model","Split","Seed"]).agg("mean"). \
          reset_index()
    agg["Language"] = "Average"+score
    agg = agg[df_filtered.columns]
    print(len(agg))

    #new df with mean
    df_filtered_avg = df_filtered.append(agg, ignore_index=True)
    return(df_filtered_avg)

In [12]:
sns.set_style({'font.family':'serif', 'font.serif':'Times New Roman'})
# sns.set_style({'font.family': 'Times New Roman'})

##### Hints=4, NOVAE

In [13]:
#options
splits_s = ["TEST EVALS", "TEST EASY", "VAL EVALS"]
splits_s_alt = ["Future Tense", "Present Tense", "Past Tense"]
score = "F1"
cols = ["Language", "Seed", "Model", "Split", score]
# vae = "false"
# hints = 4
df_filtered_avg = get_morph_results_graph(df)
df_filtered_avg.rename(columns={'Split':'Set'}, inplace=True)
palette = {"baseline":"grey",
          "0proto":"lightsalmon",
          "1proto":"salmon",
          "2proto":"coral",
          "geca":"cornflowerblue"}


NameError: name 'vae' is not defined

In [None]:
df_filtered_avg.head()

In [None]:
def get_morph_graph(df_filtered_avg, hints=4, vae="false"):
    g = sns.catplot(x="Set",
               y=score,
               col="Language",
               hue="Model",
               col_order=["Average"+score, "spanish","turkish","swahili"],
               kind="bar",
               data=df_filtered_avg,
               hue_order=["baseline","geca","0proto","1proto","2proto"],
               ci='sd',
               palette=palette#sns.color_palette("RdBu", n_colors=5)
               )

    #fix labels and save
    axes = g.axes.flatten()
    axes[0].set_title("Average")
    axes[0].set_ylim(0.25,)
    axes[1].set_title("Spanish")
    axes[2].set_title("Turkish")
    axes[3].set_title("Swahili")
    g.set_xticklabels(rotation=15)
    return(g)
    

In [None]:
# #options
# splits_s = ["TEST EVALS", "TEST EASY", "VAL EVALS"]
# splits_s_alt = ["Future Tense", "Present Tense", "Past Tense"]
# score = "F1"
# cols = ["Language", "Seed", "Model", "Split", score]
# vae = "false"
# hints = 4
# df_filtered_avg = get_morph_results_graph(df)

# df_filtered_avg = df_filtered_avg.rename(columns={'Split':'Set'})

g = get_morph_graph(df_filtered_avg, hints=4, vae="false")
g.savefig("morph_results_hints_{}_vae_{}_{}.pdf".format(hints,vae,score), dpi=300, verbose=True)

##### Hints=4, VAE

In [None]:
g = get_morph_graph(df_filtered_avg, hints=4, vae="true")
g.savefig("morph_results_hints_{}_vae_{}_{}.pdf".format(hints,vae,score), dpi=300, verbose=True)

# options
# splits_s = ["TEST EVALS", "TEST EASY", "VAL EVALS"]
# splits_s_alt = ["Future Tense", "Present Tense", "Past Tense"]
# score = "F1"
# cols = ["Language", "Seed", "Model", "Split", score]
# vae = "true"
# hints = 4
# df_filtered_avg = get_morph_results_graph(df)

# df_filtered_avg = df_filtered_avg.rename(columns={'Split':'Set'})

# palette = {"baseline":"grey",
#           "0proto":"lightsalmon",
#           "1proto":"salmon",
#           "2proto":"coral",
#           "geca":"cornflowerblue"}

# g = sns.catplot(x="Set",
#                y=score,
#                col="Language",
#                hue="Model",
#                col_order=["Average"+score, "spanish","turkish","swahili"],
#                kind="bar",
#                data=df_filtered_avg,
#                hue_order=["baseline","geca","0proto","1proto","2proto"],
#                ci='sd',
#                palette=palette#sns.color_palette("RdBu", n_colors=5)
#                )

# #fix labels and save
# axes = g.axes.flatten()
# axes[0].set_title("Average")
# axes[0].set_ylim(0.25,)
# axes[1].set_title("Spanish")
# axes[2].set_title("Turkish")
# axes[3].set_title("Swahili")
# g.set_xticklabels(rotation=15)
# g.savefig("morph_results_hints_{}_vae_{}_{}.pdf".format(hints,vae,score), dpi=300, verbose=True)


##### Hints=8, NOVAE

In [None]:
g = get_morph_graph(df_filtered_avg, hints=8, vae="false")
g.savefig("morph_results_hints_{}_vae_{}_{}.pdf".format(hints,vae,score), dpi=300, verbose=True)
#options
# splits_s = ["TEST EVALS", "TEST EASY", "VAL EVALS"]
# splits_s_alt = ["Future Tense", "Present Tense", "Past Tense"]
# score = "F1"
# cols = ["Language", "Seed", "Model", "Split", score]
# vae = "false"
# hints = 8
# df_filtered_avg = get_morph_results_graph(df)

# df_filtered_avg = df_filtered_avg.rename(columns={'Split':'Set'})

# palette = {"baseline":"grey",
#           "0proto":"lightsalmon",
#           "1proto":"salmon",
#           "2proto":"coral",
#           "geca":"cornflowerblue"}

# g = sns.catplot(x="Set",
#                y=score,
#                col="Language",
#                hue="Model",
#                col_order=["Average"+score, "spanish","turkish","swahili"],
#                kind="bar",
#                data=df_filtered_avg,
#                hue_order=["baseline","geca","0proto","1proto","2proto"],
#                ci='sd',
#                palette=palette#sns.color_palette("RdBu", n_colors=5)
#                )

# #fix labels and save
# axes = g.axes.flatten()
# axes[0].set_title("Average")
# axes[0].set_ylim(0.25,)
# axes[1].set_title("Spanish")
# axes[2].set_title("Turkish")
# axes[3].set_title("Swahili")
# g.set_xticklabels(rotation=15)
# g.savefig("morph_results_hints_{}_vae_{}_{}.pdf".format(hints,vae,score), dpi=300, verbose=True)

# g = sns.relplot(x="Model", y=score, #units="Seed", 
#                 #estimator=None, 
#                 ci="sd",
#                 hue="Split", style="Split", col_order=["Average"+score, "spanish","turkish","swahili"],
#                 col="Language", kind="line", markers=True, data=df_filtered_avg, sort=True, err_style="bars").\
#             map(sns.lineplot, "Model", "F1", order=[3,0,1,2,4])

##### Hints=8, VAE

In [None]:
g = get_morph_graph(df_filtered_avg, hints=8, vae="true")
g.savefig("morph_results_hints_{}_vae_{}_{}.pdf".format(hints,vae,score), dpi=300, verbose=True)
#options
# splits_s = ["TEST EVALS", "TEST EASY", "VAL EVALS"]
# splits_s_alt = ["Future Tense", "Present Tense", "Past Tense"]
# score = "F1"
# cols = ["Language", "Seed", "Model", "Split", score]
# vae = "true"
# hints = 8
# df_filtered_avg = get_morph_results_graph(df)

# df_filtered_avg = df_filtered_avg.rename(columns={'Split':'Set'})

# palette = {"baseline":"grey",
#           "0proto":"lightsalmon",
#           "1proto":"salmon",
#           "2proto":"coral",
#           "geca":"cornflowerblue"}

# g = sns.catplot(x="Set",
#                y=score,
#                col="Language",
#                hue="Model",
#                col_order=["Average"+score, "spanish","turkish","swahili"],
#                kind="bar",
#                data=df_filtered_avg,
#                hue_order=["baseline","geca","0proto","1proto","2proto"],
#                ci='sd',
#                palette=palette#sns.color_palette("RdBu", n_colors=5)
#                )

# #fix labels and save
# axes = g.axes.flatten()
# axes[0].set_title("Average")
# axes[0].set_ylim(0.25,)
# axes[1].set_title("Spanish")
# axes[2].set_title("Turkish")
# axes[3].set_title("Swahili")
# g.set_xticklabels(rotation=15)
# g.savefig("morph_results_hints_{}_vae_{}_{}.pdf".format(hints,vae,score), dpi=300, verbose=True)


#### Table

In [None]:
df

In [None]:
def mean_std(x):
    return(str(round(np.mean(x),2))+" (${+-}"+str(round(np.std(x),2))+"$)")

In [None]:
# def get_morph_results_table(df, hints=4, vae="false"):
#     hints=4
#     vae="false"
#     splits_s = ["TEST EVALS", "VAL EVALS", "TEST EASY"]
#     splits_s_alt = ["Future Tense", "Past Tense", "Present Tense"]
#     score_s = ["Acc", "F1"]
#     cols_s = ["Language", "Seed", "Model", "Split"] + score_s

#     df_filtered_s = df.replace(splits_s, splits_s_alt).\
#                     loc[(df['Split'].isin(splits_s)) & (df['Vae'] == vae) & (df['Hints'] == hints), cols_s].\
#                     reset_index().\
#                     drop(columns=['index'])

#     print("Len: ", len(df_filtered_s))
#     df_mean_std = df_filtered_s.groupby(by=["Model","Split", "Language"]).\
#                     agg({"Acc":["mean","std"], "F1":["mean","std"]}).\
#                     reset_index("Split").pivot(columns="Split")
#     df_mean_std.columns = df_mean_std.columns.swaplevel(1,0)
#     return(df_mean_std.iloc[:,[0,3,2,5,1,4]])


In [None]:
def my_mean(x):
    return(str(round(np.mean(x),2)))

In [None]:
def get_morph_results_table_alt_std(df, hints=4, vae="false", markdown=False):
    splits_s = ["TEST EVALS", "VAL EVALS", "TEST EASY"]
    splits_s_alt = ["Future Tense", "Past Tense", "Present Tense"]
    score_s = ["Acc", "F1"]
    cols_s = ["Language", "Seed", "Model", "Split"] + score_s

    df_filtered_s = df.replace(splits_s, splits_s_alt).\
                    loc[(df['Split'].isin(splits_s)) & (df['Vae'] == vae) & (df['Hints'] == hints), cols_s].\
                    reset_index().\
                    drop(columns=['index'])

    func = my_mean if markdown else mean_std
    print("Len: ", len(df_filtered_s))
    df_mean_std = df_filtered_s.groupby(by=["Model","Split", "Seed"]).\
                    agg({"Acc":"mean", "F1":"mean"}).\
                    reset_index().\
                    groupby(by=["Model","Split"]).\
                    agg({'Acc':func, 'F1':func}).\
                    reset_index("Split").pivot(columns="Split")

    df_mean_std.columns = df_mean_std.columns.swaplevel(1,0)
    return(df_mean_std.iloc[[3,4,0,1,2,],[0,3,1,4,2,5]])

##### Hints=4, NOVAE

In [None]:
df_mean_std = get_morph_results_table_alt_std(df)
display(df_mean_std)
print(df_mean_std.to_latex())

In [None]:
df_mean_std = get_morph_results_table_alt_std(df, hints=4, vae="true")
display(df_mean_std)
print(df_mean_std.to_latex())

In [None]:
df_mean_std = get_morph_results_table_alt_std(df, hints=8, vae="false")
display(df_mean_std)
print(df_mean_std.to_latex())

In [None]:
df_mean_std = get_morph_results_table_alt_std(df, hints=8, vae="true")
display(df_mean_std)
print(df_mean_std.to_latex())

In [None]:
df_mean_std = get_morph_results_table_alt_std(df, hints=4, vae="false", markdown=True)
display(df_mean_std)
print(df_mean_std.to_markdown())

### SCAN Results Graph

In [None]:
def get_scan_scores(df=None,
                    tasks=("jump","around_right"),
                    seeds=(0,1,2,3,4),
                    vaes =("true","false"),
                    models=("0proto","1proto","2proto"),
                    exppath="./checkpoints",
                   ):
    for task in tasks:
            for seed in seeds:
                for vae in vaes:
                    for model in  models:
                        taskpath=os.path.join(exppath,"SCANDataSet")
                        identifier ="{}.vae.{}.{}.seed.{}".format(model,vae,task,seed)
                        condfile=os.path.join(taskpath,"logs",identifier+".cond.log") 
                        if os.path.exists(condfile):
                            lines  = open(condfile,'r').readlines()
                            print("processing: "+condfile)
                            for (s,r) in get_splits(condfile).items():
                                acc, accstd, f1, f1std = calculate_scores(lines[r])  
                                df.loc[len(df.index)] = (task,s,seed,vae,model,acc,accstd,f1,f1std)
                        else:
                            print(f"file doesnot exist: {condfile}")

In [None]:
#create SCAN table
dfscan = pd.DataFrame(columns=("Task",'Split', 'Seed', 'Vae','Model','Acc','Acc_std','F1','F1_std',))
get_scan_scores(df=dfscan)
dfscan_other = pd.DataFrame(columns=("Task",'Split', 'Seed', 'Vae','Model','Acc','Acc_std','F1','F1_std',))
get_scan_scores(df=dfscan_other,
                models=("2proto",),
                tasks=("jump",),
                exppath="./checkpoints_dgx_scan/",
               )
# results with seed 5-9
dfscan_other['Seed'] = dfscan_other['Seed'] + 5
dfscan_all=dfscan.append(dfscan_other, ignore_index=True)
dfscan_all

In [None]:
dfscan_all

In [None]:
def get_scan_results_table(dfscan_all, markdown=False):
    splits_s = ["TEST EVALS"]
    score_s = "Acc"
    cols_s = ["Task", "Seed", "Model", "Split", score_s]
    vae_s = "false"
    
    df_filtered_s = dfscan_all.loc[(dfscan_all['Split'].isin(splits_s)) & (dfscan_all['Vae'] == vae_s), cols_s].\
                    reset_index().\
                    drop(columns=['index', 'Split'])
                    
    df_filtered_s.head()
    print("Len: ", len(df_filtered_s))
    
    # add geca and baseline scores
    geca_baseline_s = pd.read_csv("stats/scan-geca-baseline.csv", header=None)
    print("Total of {} GECA and baseline records.".format(len(geca_baseline_s)))
    
    # append
    for index,row in geca_baseline_s.iterrows():
    
        task = row[0]
        seed = row[1]
        model = row[2]
        val = float(row[3])

        df_filtered_s.loc[len(df_filtered_s)] = [task, seed, model, val]
    
    func = my_mean if markdown else mean_std
    dfscan_mean_std = df_filtered_s.groupby(by=["Model","Task"]).\
                    agg({"Acc":func}).\
                    reset_index("Task").\
                    pivot(columns="Task").\
                    rename(columns={"around_right":"AROUND RIGHT", "jump":"JUMP"})

    return(dfscan_mean_std.iloc[:,:])



In [None]:
dfscan_mean_std = get_scan_results_table(dfscan_all)
display(dfscan_mean_std)
print(dfscan_mean_std.to_latex())

In [None]:
dfscan_mean_std = get_scan_results_table(dfscan_all, markdown=True)
print(dfscan_mean_std.to_markdown())

In [None]:
def get_lang_scores_unreduced(df=None,
                    langs=("spanish","turkish","swahili"),
                    hintss=(4,8,16),
                    seeds=(0,1,2,3,4),
                    vaes =("true","false"),
                    models=("baseline","0proto","1proto","2proto"),
                    exppath="./checkpoints",
                   ):
    for lang in langs:
        for hints in hintss:
            for seed in seeds:
                for vae in vaes:
                    for model in  models:
                        langpath=os.path.join(exppath,"SIGDataSet",lang)
                        if model == "baseline" or model == "geca":
                            identifier ="{}.hints.{}.seed.{}".format(model,hints,seed)
                        else:
                            identifier ="{}.vae.{}.hints.{}.seed.{}".format(model,vae,hints,seed)
                        condfile=os.path.join(langpath,"logs",identifier+".cond.log") 
                        if os.path.exists(condfile):
                            lines  = open(condfile,'r').readlines()
                            if len(lines) < 2142:
                                print("format broken in "+condfile)
                                continue
#                             print("processing: "+condfile)
                            for (s,r) in get_splits(condfile).items():#splitinfo.items():
                                acc, accstd, f1, f1std = calculate_scores_unreduced(lines[r])  
                                df.loc[len(df.index)] = (lang,hints,seed,vae,model,s,acc,accstd,f1,f1std)
                        else:
                            print(f"file doesnot exist: {condfile}")

In [None]:
dfunreduced = pd.DataFrame(columns=('Language', 'Hints', 'Seed', 'Vae','Model','Split','Acc','Acc_std','F1','F1_std',))
get_lang_scores_unreduced(df=dfunreduced,exppath="./checkpoints")
get_lang_scores_unreduced(df=dfunreduced,exppath="./checkpoints_gecaexp",
                models=("geca",))
dfunreduced

In [None]:
dfunreduced.head()

In [None]:
d = {}
models=("baseline","geca","0proto","1proto","2proto")
langs = ("spanish", "turkish", "swahili")
hints=4
vae="false"
split="TEST EVALS"
cols = ["Language", "Seed", "Model", "Acc_std", "F1_std"]
conds = (dfunreduced["Hints"]==hints) & (dfunreduced['Vae']==vae) & (dfunreduced["Split"]==split)
dfunreduced_filtered = dfunreduced.loc[conds, cols]


In [None]:
dfunreduced_filtered

In [None]:
for m in models:
    d[m] = {}
    for t in ["Acc_std", "F1_std"]:
        d[m][t] = []
        for l in langs:
            for s in range(5):
                cond1 = (dfunreduced_filtered["Model"]==m) & (dfunreduced_filtered["Language"]==l)
                cond2 = (dfunreduced_filtered["Seed"]==s)
#                 pdb.set_trace()
                if t == "Acc_std":
                    d[m][t].extend([int(el) for el in dfunreduced_filtered.loc[cond1 & cond2, t].tolist()[0]])
                else:
                    d[m][t].extend(dfunreduced_filtered.loc[cond1 & cond2, t].tolist()[0])
#         d[m][t].update({t:ls})

In [None]:
len(d["baseline"]["Acc_std"])

In [None]:
sum(d["baseline"]["Acc_std"])

In [None]:
sum(d["0proto"]["Acc_std"])

In [None]:
d_avg = {}
for m in models:
    d_avg[m] = {}
    for t in ["Acc_std", "F1_std"]:
#         d_avg[m][t] = {}
        d_avg[m][t] = np.mean(d[m][t])
#         d_avg[m][t]["std"] = np.std(d[m][t])

In [None]:
pd.DataFrame(d_avg)

In [None]:
from scipy import stats

In [None]:
def t_test(model1, model2, t):
    return(stats.ttest_rel(d[model1][t],d[model2][t]).pvalue)

def get_pvals(t):
    sign = {}
    for m1 in models:
        sign[m1] = {}
        for m2 in models:
            sign[m1][m2] = t_test(m1,m2,t)
    return(pd.DataFrame(sign))
print("acc")       
display(get_pvals("Acc_std"))
print("f1")
display(get_pvals("F1_std"))

In [None]:
f1pvalsfut = get_pvals("F1_std")
print(f1pvalsfut.to_latex())

In [None]:
d = {}
models=("baseline","geca","0proto","1proto","2proto")
langs = ("spanish", "turkish", "swahili")
hints=4
vae="false"
split="VAL EVALS"
cols = ["Language", "Seed", "Model", "Acc_std", "F1_std"]
conds = (dfunreduced["Hints"]==hints) & (dfunreduced['Vae']==vae) & (dfunreduced["Split"]==split)
dfunreduced_filtered = dfunreduced.loc[conds, cols]

for m in models:
    d[m] = {}
    for t in ["Acc_std", "F1_std"]:
        d[m][t] = []
        for l in langs:
            for s in range(5):
                cond1 = (dfunreduced_filtered["Model"]==m) & (dfunreduced_filtered["Language"]==l)
                cond2 = (dfunreduced_filtered["Seed"]==s)
#                 pdb.set_trace()
                if t == "Acc_std":
                    d[m][t].extend([int(el) for el in dfunreduced_filtered.loc[cond1 & cond2, t].tolist()[0]])
                else:
                    d[m][t].extend(dfunreduced_filtered.loc[cond1 & cond2, t].tolist()[0])
                    
                    
d_avg = {}
for m in models:
    d_avg[m] = {}
    for t in ["Acc_std", "F1_std"]:
#         d_avg[m][t] = {}
        d_avg[m][t] = np.mean(d[m][t])
#         d_avg[m][t]["std"] = np.std(d[m][t])

f1pvalspst = get_pvals("F1_std")
print(f1pvalspst.to_latex())

In [None]:
def get_scan_scores_unreduced(df=None,
                    tasks=("jump","around_right"),
                    seeds=(0,1,2,3,4),
                    vaes =("true","false"),
                    models=("0proto","1proto","2proto"),
                    exppath="./checkpoints",
                   ):
    for task in tasks:
            for seed in seeds:
                for vae in vaes:
                    for model in  models:
                        taskpath=os.path.join(exppath,"SCANDataSet")
                        identifier ="{}.vae.{}.{}.seed.{}".format(model,vae,task,seed)
                        condfile=os.path.join(taskpath,"logs",identifier+".cond.log") 
                        if os.path.exists(condfile):
                            lines  = open(condfile,'r').readlines()
                            print("processing: "+condfile)
                            for (s,r) in get_splits(condfile).items():
                                acc, accstd, f1, f1std = calculate_scores_unreduced(lines[r])  
                                df.loc[len(df.index)] = (task,s,seed,vae,model,acc,accstd,f1,f1std)
                        else:
                            print(f"file doesnot exist: {condfile}")

#### Ablations

In [None]:
dfscan_ablations = pd.DataFrame(columns=("Task",'Split', 'Seed', 'Vae','Model','Acc','Acc_std','F1','F1_std',))
get_scan_scores(df=dfscan_ablations,
                models=("ID.1proto","nocopy.1proto", "nocopy.2proto"),
                exppath="./checkpoints_ablations/",
               )

In [None]:
dfscan_ablations[(dfscan_ablations["Split"]=="TEST EVALS") & (dfscan_ablations["Task"]=="around_right")]