In [1]:
%matplotlib inline
import pandas as pd
import warnings
import os
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as mpl
from uncertainties import ufloat
from scipy import stats

In [2]:
# Global plotting style shared by every figure in this notebook.
plt.rc('font', family='serif')
plt.rc('xtick', labelsize='x-small')
plt.rc('ytick', labelsize='x-small')
plt.rc('text', usetex=False)
# NOTE(review): sns.set() installs seaborn's own rc defaults (scaled by font_scale),
# so it presumably overrides the font/tick sizes set by the plt.rc calls above -- confirm
# that this ordering is intended.
sns.set(style="ticks", font_scale=1.5, color_codes=True)
sns.set_style({'font.family':'serif', 'font.serif':'Times New Roman'})
# Figures are rendered at 300 dpi.
mpl.rcParams['figure.dpi'] = 300

### Results Loading Code

In [3]:
def mean_std(x):
    """Summarise ``x`` as a ``ufloat`` whose nominal value is the mean and whose
    uncertainty is the (population) standard deviation of ``x``."""
    center = np.mean(x)
    spread = np.std(x)
    return ufloat(center, spread)
def my_mean(x):
    """Return the mean of ``x``, rounded to two decimals, as a string."""
    rounded = round(np.mean(x), 2)
    return str(rounded)
def sort_by(list1, list2):
    """Return the items of ``list1`` reordered by ascending ``list2`` keys.

    Only the keys are compared. Items whose keys are equal keep their original
    relative order (the sort is stable), so several payload lists sorted with
    the same keys stay aligned with each other, and payloads that cannot be
    ordered themselves (e.g. dicts) no longer raise ``TypeError`` on ties.

    Args:
        list1: the payload items to reorder.
        list2: one sort key per payload item.

    Returns:
        A new list containing the items of ``list1`` in key order.
    """
    ranked = sorted(zip(list2, list1), key=lambda pair: pair[0])
    return [item for _, item in ranked]

In [4]:
def get_splits(fname):
    """Map each split header found in ``fname`` to the slice of lines that follows it.

    A header is any line containing one of 'TEST EVALS', 'VAL EVALS',
    'TEST EASY' or 'VAL EASY'. The returned slices are meant to be applied to
    the list produced by ``readlines()`` on the same file: each one starts on
    the line right after its header and runs up to the next header's 1-based
    line number (or the end of the file for the last header).

    The file is scanned in pure Python instead of shelling out to ``grep`` and
    ``wc``, so it works regardless of platform, of spaces in ``fname``, and of
    colons inside a header line.

    Args:
        fname: path of the log file to scan.

    Returns:
        dict mapping the header line (without its newline) to a ``slice``.
        If a header text occurs more than once, the last occurrence wins.
    """
    markers = ("TEST EVALS", "VAL EVALS", "TEST EASY", "VAL EASY")
    starts, names = [], []
    n_lines = 0
    with open(fname, 'r') as handle:
        for lineno, line in enumerate(handle, start=1):
            n_lines = lineno
            if any(marker in line for marker in markers):
                starts.append(lineno)
                names.append(line.rstrip("\r\n"))
    # Sentinel so the last split extends to the end of the file.
    starts.append(n_lines)
    return {names[i]: slice(starts[i], starts[i + 1]) for i in range(len(names))}

def get_tokens(line):
    """Split a log line on single spaces and drop its first (label) token.

    Trailing whitespace is removed before splitting.
    """
    _label, *tokens = line.rstrip().split(" ")
    return tokens

In [5]:
def find_next_matching_block(lines, start):
    """Find the next INPUT / reference / prediction triple at or after ``start``.

    A block is a line starting with 'INPUT:' followed by two more lines; the
    tokens of those two lines (label token dropped, see ``get_tokens``) are
    taken as the reference and the prediction respectively.

    Args:
        lines: list of log lines.
        start: index at which to start searching.

    Returns:
        ``((input_line, ref_tokens, pred_tokens), next_index)`` where
        ``next_index`` is the index just past the block, or ``(None, None)``
        when no complete block remains. An 'INPUT:' line that is not followed
        by two more lines (truncated log) is treated as "no block" instead of
        raising ``IndexError``.
    """
    total = len(lines)
    for index in range(start, total):
        if not lines[index].startswith('INPUT:'):
            continue
        if index + 2 >= total:
            # Truncated trailing block: nothing complete can follow it either.
            break
        inp = lines[index]
        ref = get_tokens(lines[index + 1])
        pred = get_tokens(lines[index + 2])
        return (inp, ref, pred), index + 3
    return None, None

In [6]:
def calculate_scores(lines, train_tags=None, reduced=True):
    """Score every INPUT/reference/prediction block found in ``lines``.

    For each block the token-level true positives, false positives and false
    negatives are counted by list membership, a per-example F1 is derived, and
    an exact-match flag (prediction tokens == reference tokens) is recorded.

    Args:
        lines: log lines of one split.
        train_tags: optional collection of tag strings. When given, only
            examples whose ``";".join(ref[1:])`` is NOT in ``train_tags`` are
            kept for the final statistics.
        reduced: selects the return shape (see below).

    Returns:
        If ``reduced``: ``(acc, acc_std, f1, f1_std)`` where ``acc`` / ``acc_std``
        are the mean / std of the exact-match flags, ``f1`` is computed from the
        summed counts, and ``f1_std`` is the std of the per-example F1 values.
        Otherwise: ``(acc, exact_match_list, f1, per_example_f1_list)`` with the
        per-example lists ordered by the example key (input line + reference).

        When no example is selected the aggregate values are NaN.
    """
    start = 0
    preds, tps, fps, fns, f1s, novel, sorter = [], [], [], [], [], [], []
    while True:
        data, start = find_next_matching_block(lines, start)
        if data is None:
            break
        inp, ref, pred_here = data
        tp = len([p for p in pred_here if p in ref])
        fp = len([p for p in pred_here if p not in ref])
        fn = len([p for p in ref if p not in pred_here])
        # An empty prediction (tp + fp == 0) or empty reference (tp + fn == 0)
        # would divide by zero; score such an example as 0 instead of crashing.
        prec = tp / (tp + fp) if (tp + fp) > 0 else 0
        rec = tp / (tp + fn) if (tp + fn) > 0 else 0
        if prec == 0 or rec == 0:
            f1 = 0
        else:
            f1 = 2 * prec * rec / (prec + rec)
        # Key used to put examples in a reproducible order when reduced=False.
        sorter.append(inp + "\t".join(ref))
        f1s.append(f1)
        tps.append(tp)
        fps.append(fp)
        fns.append(fn)
        preds.append(pred_here == ref)
        if train_tags is not None:
            novel.append(";".join(ref[1:]) not in train_tags)

    # Without train_tags every example is kept.
    if len(novel) == 0:
        novel = [True] * len(f1s)

    tps, fps, fns, preds, f1s, sorter = map(lambda x: np.array(x)[novel], (tps, fps, fns, preds, f1s, sorter))
    tp, fp, fn = np.sum(tps), np.sum(fps), np.sum(fns)
    # NumPy scalars: 0/0 yields NaN (with a RuntimeWarning) for empty selections.
    prec = tp / (tp + fp)
    rec = tp / (tp + fn)
    if prec == 0 or rec == 0:
        f1 = 0
    else:
        f1 = 2 * prec * rec / (prec + rec)

    if not reduced:
        tps, fps, fns, preds, f1s = map(lambda x: sort_by(x, sorter), (tps, fps, fns, preds, f1s))
        return np.mean(preds), preds, f1, f1s
    return np.mean(preds), np.std(preds), f1, np.std(f1s)

In [7]:
import json
def get_tags(datafolder, hints, seed):
    """Collect the set of values found in the third tab-separated column of the
    training file ``train.hints-{hints}.{seed}.txt`` inside ``datafolder``.

    Args:
        datafolder: directory holding the data files (a trailing slash is
            accepted but no longer required).
        hints: value substituted into the file name after ``hints-``.
        seed: value substituted into the file name after the hints value.

    Returns:
        set of str, one entry per distinct third-column value.
    """
    train_path = os.path.join(datafolder, f"train.hints-{hints}.{seed}.txt")
    # Context manager so the file handle is closed deterministically.
    with open(train_path) as train_file:
        train_tags = {line.rstrip().split('\t')[2] for line in train_file}
    return train_tags

In [8]:
get_tags("./data/SIGDataSet.large/spanish/", hints=4, seed=0);

In [9]:
# Smoke test: score every split of a single log file.
# NOTE(review): the log is a hints.16 run while the training tags are loaded with hints=4 -- confirm this is intended.
testfile = "./checkpoints_seperate_large/SIGDataSet/spanish/logs/2proto.vae.true.hints.16.seed.0.cond.log"
train_tags = get_tags('data/SIGDataSet.large/spanish/', hints=4,seed=0)
with open(testfile, 'r') as logfile:
    testlines = logfile.readlines()
print(testfile)
for (s, r) in get_splits(testfile).items():
    print(f"split: {s}\n", calculate_scores(testlines[r], train_tags=train_tags))

./checkpoints_seperate_large/SIGDataSet/spanish/logs/2proto.vae.true.hints.16.seed.0.cond.log
split: TEST EVALS
 (0.515625, 0.49975579974123363, 0.8501291989664083, 0.17237705255322905)
split: VAL EVALS
 (0.2441860465116279, 0.42960356283514334, 0.7231329690346084, 0.2502584644495325)
split: TEST EASY
 (nan, nan, nan, nan)
split: VAL EASY
 (nan, nan, nan, nan)


  keepdims=keepdims)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


## SIGMorphon
### Scores

In [10]:
def get_lang_scores(df=None,
                    langs=("spanish","turkish","swahili"),
                    hintss=(4,8,16),
                    seeds=(0,1,2,3,4),
                    vaes =("true","false"),
                    models=("baseline","0proto","1proto","2proto"),
                    exppath="./checkpoints",
                    datapath="./data/SIGDataSet.large/",
                    novel=False,
                    reduced=True,
                    min_lines=2142,
                   ):
    """Score every ``*.cond.log`` of the requested configurations and append the
    results to ``df`` in place.

    For each (lang, hints, seed, vae, model) combination the log
    ``{exppath}/SIGDataSet/{lang}/logs/{identifier}.cond.log`` is read. The
    identifier of "baseline" and "geca" models does not contain the vae value,
    so those files are scored once per entry of ``vaes``.

    Args:
        df: DataFrame receiving one row per (configuration, split) in the order
            (lang, hints, seed, vae, model, split, acc, acc_std, f1, f1_std).
            Must be provided; rows are appended with ``df.loc``.
        langs, hintss, seeds, vaes, models: values to iterate over.
        exppath: root directory of the experiment outputs.
        datapath: root directory of the data; only read when ``novel`` is true,
            as ``datapath + lang + '/'``.
        novel: when true, training tags are loaded and passed to
            ``calculate_scores`` so that only unseen tag strings are scored.
        reduced: forwarded to ``calculate_scores``.
        min_lines: logs with fewer lines than this are reported as broken and
            skipped.

    Missing and broken files are reported on stdout and skipped.
    """
    train_tags = None
    for lang in langs:
        langpath = os.path.join(exppath, "SIGDataSet", lang)
        for hints in hintss:
            for seed in seeds:
                if novel:
                    train_tags = get_tags(datapath + lang + '/', hints=hints, seed=seed)
                for vae in vaes:
                    for model in models:
                        if model in ("baseline", "geca"):
                            identifier = "{}.hints.{}.seed.{}".format(model, hints, seed)
                        else:
                            identifier = "{}.vae.{}.hints.{}.seed.{}".format(model, vae, hints, seed)
                        condfile = os.path.join(langpath, "logs", identifier + ".cond.log")
                        if not os.path.exists(condfile):
                            print(f"file doesnot exist: {condfile}")
                            continue
                        # Context manager so each log's handle is closed right away.
                        with open(condfile, 'r') as logfile:
                            lines = logfile.readlines()
                        if len(lines) < min_lines:
                            print("format broken in "+condfile)
                            continue
                        for (s, r) in get_splits(condfile).items():
                            acc, accstd, f1, f1std = calculate_scores(lines[r], train_tags=train_tags, reduced=reduced)
                            df.loc[len(df.index)] = (lang, hints, seed, vae, model, s, acc, accstd, f1, f1std)

In [18]:
# Baseline-only scores read from the logs under ./checkpoints_sig_copy (all examples kept, novel=False).
dfcopy = pd.DataFrame(columns=('Language', 'Hints', 'Seed', 'Vae','Model','Split','Acc','Acc_std','F1','F1_std',))
get_lang_scores(df=dfcopy,exppath="./checkpoints_sig_copy",datapath="./data/SIGDataSet.large.copy/",  models=("baseline",), novel=False)

format broken in ./checkpoints_sig_copy/SIGDataSet/turkish/logs/baseline.hints.4.seed.3.cond.log
format broken in ./checkpoints_sig_copy/SIGDataSet/turkish/logs/baseline.hints.4.seed.3.cond.log
file doesnot exist: ./checkpoints_sig_copy/SIGDataSet/turkish/logs/baseline.hints.4.seed.4.cond.log
file doesnot exist: ./checkpoints_sig_copy/SIGDataSet/turkish/logs/baseline.hints.4.seed.4.cond.log
file doesnot exist: ./checkpoints_sig_copy/SIGDataSet/turkish/logs/baseline.hints.8.seed.0.cond.log
file doesnot exist: ./checkpoints_sig_copy/SIGDataSet/turkish/logs/baseline.hints.8.seed.0.cond.log
file doesnot exist: ./checkpoints_sig_copy/SIGDataSet/turkish/logs/baseline.hints.8.seed.1.cond.log
file doesnot exist: ./checkpoints_sig_copy/SIGDataSet/turkish/logs/baseline.hints.8.seed.1.cond.log
file doesnot exist: ./checkpoints_sig_copy/SIGDataSet/turkish/logs/baseline.hints.8.seed.2.cond.log
file doesnot exist: ./checkpoints_sig_copy/SIGDataSet/turkish/logs/baseline.hints.8.seed.2.cond.log
file d

In [None]:
# Scores for the proto models, the baseline and geca, each read from its own checkpoint
# directory and accumulated into one frame (all examples kept, novel=False).
dfrare = pd.DataFrame(columns=('Language', 'Hints', 'Seed', 'Vae','Model','Split','Acc','Acc_std','F1','F1_std',))
get_lang_scores(df=dfrare,exppath="./checkpoints_seperate_large",datapath="./data/SIGDataSet.large/",  models=("0proto","1proto","2proto"), novel=False)
get_lang_scores(df=dfrare,exppath="./checkpoints_large_test",datapath="./data/SIGDataSet.large/",  models=("baseline",), novel=False)
get_lang_scores(df=dfrare,exppath="./checkpoints_large_test_geca",datapath="./data/SIGDataSet.large/",models=("geca",), novel=False)

In [None]:
# Same three checkpoint directories as dfrare, but with novel=True: only examples whose
# tag string does not occur in the matching training file are scored.
dfrare_novel = pd.DataFrame(columns=('Language', 'Hints', 'Seed', 'Vae','Model','Split','Acc','Acc_std','F1','F1_std',))
get_lang_scores(df=dfrare_novel,exppath="./checkpoints_seperate_large",datapath="./data/SIGDataSet.large/",  models=("0proto","1proto","2proto"), novel=True)
get_lang_scores(df=dfrare_novel,exppath="./checkpoints_large_test",datapath="./data/SIGDataSet.large/",  models=("baseline",), novel=True)
get_lang_scores(df=dfrare_novel,exppath="./checkpoints_large_test_geca",datapath="./data/SIGDataSet.large/",models=("geca",), novel=True)

In [None]:
# Scores for the "norare" checkpoint directories, restricted to vae="false" (novel=False).
# NOTE(review): the geca call leaves `vaes` at its default ("true","false"); geca log names do
# not contain the vae value, so each geca file is scored once per vae value -- confirm intended.
dfnorare = pd.DataFrame(columns=('Language', 'Hints', 'Seed', 'Vae','Model','Split','Acc','Acc_std','F1','F1_std',))
get_lang_scores(df=dfnorare,exppath="./checkpoints_morph_norare",datapath="./data/SIGDataSet.large/",  models=("0proto","1proto","2proto"), novel=False, vaes=("false",))
get_lang_scores(df=dfnorare,exppath="./checkpoints_large_test",datapath="./data/SIGDataSet.large/",  models=("baseline",), novel=False, vaes=("false",))
get_lang_scores(df=dfnorare,exppath="./checkpoints_large_test_geca_norare",datapath="./data/SIGDataSet.large/",models=("geca",), novel=False)

In [None]:
# Same directories as dfnorare, but with novel=True: only examples whose tag string does
# not occur in the matching training file are scored.
dfnorare_novel = pd.DataFrame(columns=('Language', 'Hints', 'Seed', 'Vae','Model','Split','Acc','Acc_std','F1','F1_std',))
get_lang_scores(df=dfnorare_novel,exppath="./checkpoints_morph_norare",datapath="./data/SIGDataSet.large/",  models=("0proto","1proto","2proto"), novel=True, vaes=("false",))
get_lang_scores(df=dfnorare_novel,exppath="./checkpoints_large_test",datapath="./data/SIGDataSet.large/",  models=("baseline",), novel=True, vaes=("false",))
get_lang_scores(df=dfnorare_novel,exppath="./checkpoints_large_test_geca_norare",datapath="./data/SIGDataSet.large/",models=("geca",), novel=True)

### Tables

In [15]:
def get_morph_results_table(df, hints=8, vae="false", score="F1", markdown=False):
    """Build a per-model summary table of ``score`` for one hints/vae setting.

    Rows of ``df`` are filtered to the given ``hints`` and ``vae`` values and to
    the splits 'TEST EVALS', 'VAL EVALS' and 'TEST EASY', which are relabelled
    'Fut', 'Past' and 'Pres'. Scores are aggregated over seeds with
    ``mean_std`` per (model, split, language). 'Fut' and 'Past' are then
    averaged into a single 'Fut-Past <score>' column and dropped.

    Args:
        df: score frame with at least the columns Language, Hints, Seed, Vae,
            Model, Split and ``score``.
        hints: value of the 'Hints' column to keep.
        vae: value of the 'Vae' column to keep.
        score: name of the score column to summarise.
        markdown: unused by this function.

    Returns:
        DataFrame indexed by 'Model' with one title-cased column per
        (language, split-group) combination, in the order produced by the pivot.
    """
    splits_s = ["TEST EVALS", "VAL EVALS", "TEST EASY"]
    splits_s_alt = ["Fut", "Past", "Pres"]
    cols_s = ["Language", "Seed", "Model", "Split"] + [score]
    # The row mask is computed on the incoming frame, i.e. on the raw split names.
    keep = (df['Split'].isin(splits_s)) & (df['Vae'] == vae) & (df['Hints'] == hints)
    df = (
        df.replace(splits_s, splits_s_alt)
        .loc[keep, cols_s]
        .reset_index()
        .drop(columns=['index'])
        .groupby(by=["Model", "Split", "Language"])
        .agg({score: mean_std})
        .reset_index("Split")
        .pivot(columns="Split")
    )

    # Put the split label on the outer column level so df['Fut'] / df['Past'] select by split.
    df.columns = df.columns.swaplevel(1,0)
    df['Fut-Past '+ score] =  (df['Fut'] + df['Past'])/2
    df = df.drop(columns=['Fut','Past'])
    df.columns = [' '.join(col).strip() for col in df.columns.values]
    # Second pivot: spread the languages over the columns.
    df = df.reset_index("Language").pivot(columns="Language")
    df.columns = df.columns.swaplevel(1,0)
    df.columns = [' '.join(col).strip().title() for col in df.columns.values]
    # The former second `return df.iloc[:,[3,0,4,1,5,2]]` was unreachable and has been removed.
    return df

In [19]:
df_copy_table = get_morph_results_table(dfcopy, hints=8, vae="false", score="F1", markdown=False)

In [22]:
print(df_copy_table.to_markdown())

| Model    | Spanish Pres F1   | Swahili Pres F1   | Spanish Fut-Past F1   | Swahili Fut-Past F1   |
|:---------|:------------------|:------------------|:----------------------|:----------------------|
| baseline | 0.882+/-0.017     | 0.900+/-0.019     | 0.652+/-0.006         | 0.770+/-0.010         |


In [None]:
df_rare_table = get_morph_results_table(dfrare, hints=16, vae="false", score="Acc", markdown=False)

In [None]:
df_norare_table = get_morph_results_table(dfnorare, hints=16, vae="false", score="Acc", markdown=False)

In [None]:
# Turn the "Model" index into a regular column and tag every model name with " +rare".
df_rare_table = df_rare_table.reset_index("Model")
df_rare_table["Model"] = df_rare_table["Model"] .astype(str) + ' +rare'
# NOTE(review): the set_index result is only displayed, not assigned, so "Model" stays a
# column; re-running this cell presumably fails at reset_index -- confirm.
df_rare_table.set_index("Model")

In [None]:
print(df_rare_table.to_latex(index=False,float_format="{:0.2f}", caption="Morphology Results"))

In [None]:
# Turn the "Model" index into a regular column holding plain strings.
df_norare_table = df_norare_table.reset_index("Model")
df_norare_table["Model"] = df_norare_table["Model"] .astype(str)
# NOTE(review): the set_index result is only displayed, not assigned, so "Model" stays a
# column; re-running this cell presumably fails at reset_index -- confirm.
df_norare_table.set_index("Model")

In [None]:
# Combined table: rows 3-4 of the no-rare table first, then row 4 of the "+rare" table,
# then rows 0-2 of both tables interleaved (no-rare before "+rare").
# pd.concat replaces DataFrame.append, which no longer exists in pandas 2.
df_morph = pd.concat(
    [df_norare_table[3:5],
     pd.DataFrame([df_rare_table.iloc[4],
                   df_norare_table.iloc[0],
                   df_rare_table.iloc[0],
                   df_norare_table.iloc[1],
                   df_rare_table.iloc[1],
                   df_norare_table.iloc[2],
                   df_rare_table.iloc[2]])],
    ignore_index=True)

In [None]:
df_morph 

In [None]:
print(df_morph.to_latex(index=False,float_format="{:0.2f}", caption="Morphology Results"))

In [None]:
# F1 table (hints=8, vae="false") for the novel-only "+rare" scores; "Model" becomes a
# regular column and every model name gets the " +rare" suffix.
df_rare_novel_table = get_morph_results_table(dfrare_novel, hints=8, vae="false", score="F1", markdown=False)
df_rare_novel_table = df_rare_novel_table.reset_index("Model")
df_rare_novel_table["Model"] = df_rare_novel_table["Model"] .astype(str) + ' +rare'
# The set_index result is only displayed, not assigned.
df_rare_novel_table.set_index("Model")

In [None]:
# F1 table (hints=8, vae="false") for the novel-only no-rare scores; "Model" becomes a
# regular column holding plain strings.
df_norare_novel_table = get_morph_results_table(dfnorare_novel, hints=8, vae="false", score="F1", markdown=False)
df_norare_novel_table = df_norare_novel_table.reset_index("Model")
df_norare_novel_table["Model"] = df_norare_novel_table["Model"] .astype(str) 
# The set_index result is only displayed, not assigned.
df_norare_novel_table.set_index("Model")

In [None]:
# Combined novel-only table: rows 3-4 of the no-rare table first, then row 4 of the
# "+rare" table, then rows 0-2 of both tables interleaved (no-rare before "+rare").
# pd.concat replaces DataFrame.append, which no longer exists in pandas 2.
df_morph_novel = pd.concat(
    [df_norare_novel_table[3:5],
     pd.DataFrame([df_rare_novel_table.iloc[4],
                   df_norare_novel_table.iloc[0],
                   df_rare_novel_table.iloc[0],
                   df_norare_novel_table.iloc[1],
                   df_rare_novel_table.iloc[1],
                   df_norare_novel_table.iloc[2],
                   df_rare_novel_table.iloc[2]])],
    ignore_index=True)

In [None]:
print(df_morph_novel.to_latex(index=False,float_format="{:0.2f}", caption="Morphology Results"))

#### Plots

In [None]:
# Plot options read as module-level globals by the plotting helpers in this cell.
# Raw split names as stored in the 'Split' column ...
splits_s = ["TEST EVALS", "TEST EASY", "VAL EVALS"]
# ... and the display labels substituted for them (same order as splits_s).
splits_s_alt = ["Future Tense", "Present Tense", "Past Tense"]
# Bar colour per model name, handed to seaborn as `palette`.
palette = {"baseline":"grey",
          "0proto":"lightsalmon",
          "1proto":"salmon",
          "2proto":"coral",
          "geca":"cornflowerblue"}
def get_morph_results_graph(df, hints, vae, score):
    """Prepare the long-format frame used for the bar plots.

    Rows are filtered to the given ``hints`` / ``vae`` values and to the splits
    listed in the module-level ``splits_s``, whose names are replaced by the
    labels in ``splits_s_alt``. A synthetic language called
    ``"Average" + score`` is then appended: for every (model, split, seed) it
    holds the mean of ``score`` over the rows of that group.

    Args:
        df: score frame with the columns Language, Hints, Seed, Vae, Model,
            Split and ``score``.
        hints: value of the 'Hints' column to keep.
        vae: value of the 'Vae' column to keep.
        score: name of the score column.

    Returns:
        DataFrame with the columns Language, Seed, Model, Split and ``score``.
    """
    cols = ["Language", "Seed", "Model", "Split", score]
    # The mask is computed on the incoming frame, i.e. on the raw split names.
    keep = (df['Split'].isin(splits_s)) & (df['Vae'] == vae) & (df["Hints"] == hints)
    df = df.replace(splits_s, splits_s_alt).loc[keep, cols]
    # Select the score column explicitly so the string-valued 'Language' column is
    # never passed to mean(); recent pandas raises instead of silently dropping it.
    agg = df.groupby(by=["Model", "Split", "Seed"])[[score]].mean().reset_index()
    agg["Language"] = "Average"+score
    agg = agg[df.columns]
    # pd.concat replaces DataFrame.append, which no longer exists in pandas 2.
    return pd.concat([df, agg], ignore_index=True)

def show_values_on_bars(axs):
    """Write each bar's height (two decimals, vertical text) onto the bar.

    ``axs`` may be a single axes object or a NumPy array of axes. The label is
    centred horizontally on the bar and placed 0.02 above the bar's ``get_y()``.
    """
    if isinstance(axs, np.ndarray):
        targets = [ax for _, ax in np.ndenumerate(axs)]
    else:
        targets = [axs]

    for ax in targets:
        for bar in ax.patches:
            center_x = bar.get_x() + bar.get_width() / 2
            label_y = bar.get_y() + 0.02
            label = '{:.2f}'.format(bar.get_height())
            ax.text(center_x, label_y, label, ha="center", rotation="vertical", fontsize=12)
        
def get_morph_graph(df, hints, vae, score, ylim=0.0):
    """Draw one bar panel per language (plus the cross-language average) for ``score``.

    The data comes from ``get_morph_results_graph``; bars are grouped by split
    (shown as "Set") and coloured by model using the module-level ``palette``.
    Bar values are printed on the bars, ``ylim`` becomes the lower y-limit of
    the first panel, and the legend title records ``hints`` and ``vae``.

    Returns:
        The seaborn grid object created by ``sns.catplot``.
    """
    average_label = "Average"+score
    plot_df = get_morph_results_graph(df, hints, vae, score).rename(columns={'Split':'Set'})
    g = sns.catplot(
        data=plot_df,
        x="Set",
        y=score,
        col="Language",
        hue="Model",
        col_order=[average_label, "spanish","turkish","swahili"],
        hue_order=["baseline","geca","0proto","1proto","2proto"],
        kind="bar",
        ci='sd',
        legend_out=True,
        palette=palette,
    )

    # Annotate the bars, then fix titles, limits and tick labels.
    axes = g.axes.flatten()
    show_values_on_bars(axes)
    panel_titles = (average_label, "Spanish", "Turkish", "Swahili")
    for position, title in enumerate(panel_titles):
        axes[position].set_title(title)
    axes[0].set_ylim(ylim)
    g.set_xticklabels(rotation=15)
    g._legend.set_title(f"hints: {hints}\nvae: {vae}")
    return g
    

In [None]:
# One figure per (hints, vae, score) combination of the "+rare" scores.
for hints in (4,8,16):
    for vae in ("true","false"):
        for score in ("Acc","F1"):
            # sns.catplot creates its own figure, so no plt.figure() call is needed here
            # (it only produced an extra empty figure per iteration).
            #print("morph_results_{}_hints_{}_vae_{}_{}.pdf".format(score,hints,vae,score))
            g = get_morph_graph(dfrare, hints=hints, vae=vae, score=score)
            #g.savefig("morph_results_{}_hints_{}_vae_{}_{}.pdf".format(score,hints,vae,score), dpi=300, verbose=True)
            plt.show()
            # Release the figure so 12 figures do not pile up in memory.
            plt.close("all")

## SCAN 
### Scores & Tables

In [None]:
def get_scan_scores(df=None,
                    tasks=("jump","around_right"),
                    seeds=(0,1,2,3,4),
                    vaes =("true","false"),
                    models=("0proto","1proto","2proto"),
                    exppath="./checkpoints",
                    reduced=True,
                   ):
    """Score every SCAN ``*.cond.log`` of the requested configurations and append
    the results to ``df`` in place.

    For each (task, seed, vae, model) combination the log
    ``{exppath}/SCANDataSet/logs/{model}.vae.{vae}.{task}.seed.{seed}.cond.log``
    is read and every split in it is scored with ``calculate_scores``.

    Args:
        df: DataFrame receiving one row per (configuration, split) in the order
            (task, split, seed, vae, model, acc, acc_std, f1, f1_std).
            Must be provided; rows are appended with ``df.loc``.
        tasks, seeds, vaes, models: values to iterate over.
        exppath: root directory of the experiment outputs.
        reduced: forwarded to ``calculate_scores``.

    Missing files are reported on stdout and skipped.
    """
    taskpath = os.path.join(exppath, "SCANDataSet")
    for task in tasks:
        for seed in seeds:
            for vae in vaes:
                for model in models:
                    identifier = "{}.vae.{}.{}.seed.{}".format(model, vae, task, seed)
                    condfile = os.path.join(taskpath, "logs", identifier + ".cond.log")
                    if not os.path.exists(condfile):
                        print(f"file doesnot exist: {condfile}")
                        continue
                    # Context manager so each log's handle is closed right away.
                    with open(condfile, 'r') as logfile:
                        lines = logfile.readlines()
                    for (s, r) in get_splits(condfile).items():
                        acc, accstd, f1, f1std = calculate_scores(lines[r], reduced=reduced)
                        df.loc[len(df.index)] = (task, s, seed, vae, model, acc, accstd, f1, f1std)

In [None]:
def get_scan_results_table(df, markdown=False):
    """Build the per-model SCAN accuracy table.

    Keeps the 'TEST EVALS' rows with Vae == "false", reduced to the columns
    Task, Seed, Model and Acc, appends the rows read from
    ``stats/scan-geca-baseline.csv`` (first four fields of each row are taken
    as task, seed, model, value), aggregates 'Acc' per (Model, Task) with
    ``mean_std`` and pivots the tasks into columns.

    ``markdown`` is not used by this function.

    Returns a DataFrame with 'Model' as a regular column and one title-cased
    column per task.
    """
    splits_s = ["TEST EVALS"]
    score_s = "Acc"
    cols_s = ["Task", "Seed", "Model", "Split", score_s]
    vae_s = "false"
    
    # Filter rows, then drop the helper 'index' column and the now-constant 'Split'.
    df = df.loc[(df['Split'].isin(splits_s)) & (df['Vae'] == vae_s), cols_s].\
                    reset_index().\
                    drop(columns=['index', 'Split'])                  
#    df.head()
#    print("Len: ", len(df))
    # Externally computed scores, read from a headerless CSV.
    # NOTE(review): the comment in the original says these are the geca and baseline scores -- verify against the file.
    geca_baseline_s = pd.read_csv("stats/scan-geca-baseline.csv", header=None)
    # Append one row per CSV line, in the column order Task, Seed, Model, Acc.
    for index,row in geca_baseline_s.iterrows():
        task, seed, model, val = row[0:4]
        df.loc[len(df)] = [task, seed, model, float(val)]

    # Aggregate per (Model, Task) and spread the tasks over the columns.
    df= df.groupby(by=["Model","Task"]).\
                    agg({"Acc":mean_std}).\
                    reset_index("Task").\
                    pivot(columns="Task").\
                    rename(columns={"around_right":"AROUND RIGHT", "jump":"JUMP"})
    
    # Flatten the two-level column index into single title-cased names.
    df.columns = [' '.join(col).strip().title() for col in  df.columns.values]
    df = df.reset_index('Model')
    
    return df

In [None]:
#create SCAN table
dfscannorare = pd.DataFrame(columns=("Task",'Split', 'Seed', 'Vae','Model','Acc','Acc_std','F1','F1_std',))
get_scan_scores(df=dfscannorare, exppath="./checkpoints.bak",vaes=("false",))
dfscan_other_norare = pd.DataFrame(columns=("Task",'Split', 'Seed', 'Vae','Model','Acc','Acc_std','F1','F1_std',))
get_scan_scores(df=dfscan_other_norare,
                 models=("2proto",),
                 tasks=("jump",),
                 exppath="./checkpoints_dgx_scan/",
                 vaes=("false",),
                )
#results with seed 5-9
dfscan_other_norare['Seed'] = dfscan_other_norare['Seed'] + 5
dfscannorare=dfscannorare.append(dfscan_other_norare, ignore_index=True)
dfscannorare

In [None]:
#create SCAN table
dfscan = pd.DataFrame(columns=("Task",'Split', 'Seed', 'Vae','Model','Acc','Acc_std','F1','F1_std',))
get_scan_scores(df=dfscan, exppath="./checkpoints_scan_rare2",vaes=("false",))
dfscan_other = pd.DataFrame(columns=("Task",'Split', 'Seed', 'Vae','Model','Acc','Acc_std','F1','F1_std',))
get_scan_scores(df=dfscan_other,
                 models=("2proto",),
                 tasks=("jump",),
                 exppath="./checkpoints_scan_rare3/",
                 vaes=("false",),
                )
#results with seed 5-9
dfscan_other['Seed'] = dfscan_other['Seed'] + 5
dfscan=dfscan.append(dfscan_other, ignore_index=True)
dfscan

In [None]:
table_scannorare = get_scan_results_table(dfscannorare)

In [None]:
table_scannorare

In [None]:
print(table_scannorare.to_latex(index=False, float_format="{:0.2f}", caption="SCAN Experiments with no rare filtering"))

In [None]:
# Summary table for the rare-filtered SCAN scores.
table_scanrare = get_scan_results_table(dfscan)
# Add a placeholder "0proto" row whose two score cells are the string "NaN".
table_scanrare.loc[len(table_scanrare)] = ("0proto","NaN","NaN")
# Reorder the rows so the placeholder (position 4) comes third.
table_scanrare = table_scanrare.iloc[[0,1,4,2,3],:]

In [None]:
table_scanrare

In [None]:
print(table_scanrare.to_latex(index=False, float_format="{:0.2f}", caption="SCAN Experiments with rare filtering"))

## Significance Analyses

In [None]:
def merge_test_val(df):
    """Combine the 'TEST EVALS' and 'VAL EVALS' rows of each configuration.

    Rows are keyed by (Language, Hints, Seed, Vae, Model). For every key present
    in the 'TEST EVALS' rows, the 'Acc_std' and 'F1_std' cells of the test row
    and of the matching validation row are combined with ``+``.

    Returns a frame with the key columns followed by 'Acc_std' and 'F1_std'.
    """
    keys = ["Language", "Hints", "Seed", "Vae", "Model"]
    indexed = df.set_index(keys)
    test_rows = indexed.loc[indexed['Split'] == 'TEST EVALS']
    val_rows = indexed.loc[indexed['Split'] == 'VAL EVALS']
    # Left join on the shared index: one output row per test row.
    joined = test_rows.join(val_rows, lsuffix='_test', rsuffix='_val')
    combined = joined.assign(
        Acc_std=joined['Acc_std_test'] + joined['Acc_std_val'],
        F1_std=joined['F1_std_test'] + joined['F1_std_val'],
    )
    return combined[['Acc_std', 'F1_std']].reset_index()

def get_others(df):
    """Return the 'Acc_std' and 'F1_std' cells of the 'TEST EASY' rows, together
    with their (Language, Hints, Seed, Vae, Model) key columns."""
    keys = ["Language", "Hints", "Seed", "Vae", "Model"]
    indexed = df.set_index(keys)
    easy_rows = indexed.loc[indexed['Split'] == 'TEST EASY', ['Acc_std', 'F1_std']]
    return easy_rows.reset_index()

In [None]:
# Per-example (reduced=False) scores for the "+rare" runs; in this mode the 'Acc_std' and
# 'F1_std' cells hold the per-example lists returned by calculate_scores.
dfunreduced_rare = pd.DataFrame(columns=('Language', 'Hints', 'Seed', 'Vae','Model','Split','Acc','Acc_std','F1','F1_std',))
get_lang_scores(df=dfunreduced_rare,exppath="./checkpoints_seperate_large",datapath="./data/SIGDataSet.large/",  models=("0proto","1proto","2proto"), vaes=("false",), novel=False, reduced=False)
get_lang_scores(df=dfunreduced_rare,exppath="./checkpoints_large_test_geca",datapath="./data/SIGDataSet.large/",models=("geca",), novel=False, reduced=False)
# Tag the model names so they stay distinct from the no-rare runs once both frames are combined.
dfunreduced_rare['Model'] = dfunreduced_rare['Model'].astype(str) + ' +rare'
dfunreduced_rare

In [None]:
dfunreduced_rare.head()

In [None]:
# Per-example (reduced=False) scores for the no-rare runs, the baseline and geca.
# NOTE(review): the baseline and geca calls leave `vaes` at its default ("true","false"), so
# their files are scored once per vae value -- confirm intended.
dfunreduced_norare = pd.DataFrame(columns=('Language', 'Hints', 'Seed', 'Vae','Model','Split','Acc','Acc_std','F1','F1_std',))
get_lang_scores(df=dfunreduced_norare,exppath="./checkpoints_morph_norare",datapath="./data/SIGDataSet.large/",  models=("0proto","1proto","2proto"), vaes=("false",), novel=False, reduced=False)
get_lang_scores(df=dfunreduced_norare,exppath="./checkpoints_large_test",datapath="./data/SIGDataSet.large/",  models=("baseline",), novel=False,reduced=False)
get_lang_scores(df=dfunreduced_norare,exppath="./checkpoints_large_test_geca_norare",datapath="./data/SIGDataSet.large/",models=("geca",), novel=False, reduced=False)
dfunreduced_norare.head()

In [None]:
len(dfunreduced_norare)

In [None]:
# Stack the "+rare" and no-rare per-example scores (pd.concat replaces DataFrame.append,
# which no longer exists in pandas 2).
dfunreduced = pd.concat([dfunreduced_rare, dfunreduced_norare], ignore_index=True)

In [None]:
dfunreduced

In [None]:
def filter_for_eval(df, hints=8, vae="false"):
    """Keep only the rows whose 'Hints' equals ``hints`` and whose 'Vae' equals ``vae``."""
    hints_match = df["Hints"] == hints
    vae_match = df['Vae'] == vae
    return df.loc[hints_match & vae_match]

In [None]:
df_merged = filter_for_eval(merge_test_val(dfunreduced))

In [None]:
df_merged

In [None]:
df_others = filter_for_eval(get_others(dfunreduced))

In [None]:
df_others 

In [None]:
# Settings for the significance analysis.
# `models` is also read as a module-level global by get_pvals.
models=("baseline","geca","0proto","1proto","2proto","geca +rare","0proto +rare","1proto +rare","2proto +rare")
langs = ("spanish", "turkish", "swahili")
# hints, vae and cols are passed to get_significance_data, whose body does not read them.
hints=8
vae="false"
cols = ["Language", "Seed", "Model", "Acc_std", "F1_std"]
def get_significance_data(df, models,langs,hints,vae,cols):
    """Collect the per-example values of every (language, model) pair.

    For each language and model, the list-valued 'Acc_std' and 'F1_std' cells
    of seeds 0-4 are concatenated in seed order; 'Acc_std' entries are
    converted to ``int``. Only the first matching row per seed is used, and a
    missing (model, language, seed) combination raises ``IndexError``.

    ``hints``, ``vae`` and ``cols`` are accepted but not used.

    Returns:
        ``data[lang][model][column]`` -> flat list of values.
    """
    def _seed_chunks(lang, model, column):
        # Yield the list stored in `column` for each of the five seeds, in order.
        for seed in range(5):
            selected = (df["Model"]==model) & (df["Language"]==lang) & (df["Seed"]==seed)
            yield df.loc[selected, column].tolist()[0]

    data = {}
    for lang in langs:
        data[lang] = {}
        for model in models:
            acc_values = [int(el) for chunk in _seed_chunks(lang, model, "Acc_std") for el in chunk]
            f1_values = [el for chunk in _seed_chunks(lang, model, "F1_std") for el in chunk]
            data[lang][model] = {"Acc_std": acc_values, "F1_std": f1_values}
    return data

In [None]:
pval_pstfut = get_significance_data(df_merged, models,langs,hints,vae,cols)

In [None]:
np.mean(pval_pstfut["turkish"]['baseline']['Acc_std'])

In [None]:
len(pval_pstfut["swahili"]["baseline"]["Acc_std"])

In [None]:
def get_d_avg(pvals, models, langs):
    """Average the per-example values collected by ``get_significance_data``.

    Args:
        pvals: nested mapping ``pvals[lang][model][column]`` -> list of values.
        models: model names to include.
        langs: language names to include.

    Returns:
        ``d_avg[lang][model][column]`` -> mean of the corresponding list, for
        the columns 'Acc_std' and 'F1_std'. (The function previously built this
        mapping but never returned it.)
    """
    d_avg = {}
    for l in langs:
        d_avg[l] = {}
        for m in models:
            d_avg[l][m] = {t: np.mean(pvals[l][m][t]) for t in ("Acc_std", "F1_std")}
    return d_avg

In [None]:
d_avg_pstfut = get_d_avg(pval_pstfut,models,langs)

In [None]:
# `d_avg` is never defined in this notebook; the averages are stored in `d_avg_pstfut`.
pd.DataFrame(d_avg_pstfut["turkish"])

In [None]:
# `d_avg` is never defined in this notebook; the averages are stored in `d_avg_pstfut`.
pd.DataFrame(d_avg_pstfut["spanish"])

In [None]:
# `d_avg` is never defined in this notebook; the averages are stored in `d_avg_pstfut`.
pd.DataFrame(d_avg_pstfut["swahili"])

In [None]:
def t_test(d, model1, model2, t):
    """Return the p-value of a paired t-test (``scipy.stats.ttest_rel``) between
    the ``t`` samples of ``model1`` and ``model2`` stored in ``d``."""
    first_sample = d[model1][t]
    second_sample = d[model2][t]
    outcome = stats.ttest_rel(first_sample, second_sample)
    return outcome.pvalue

def get_pvals(d, t):
    """Build the lower-triangular table of pairwise paired-t-test p-values.

    Every pair of entries of the module-level ``models`` is compared on the
    ``t`` samples stored in ``d``. Cells above the diagonal, and any NaN
    p-values, are shown as empty strings.
    """
    pairwise = {
        column_model: {
            row_model: t_test(d, column_model, row_model, t) for row_model in models
        }
        for column_model in models
    }
    lower_triangle = remove_upper_diagonal(pd.DataFrame(pairwise))
    return lower_triangle.replace(np.nan, '', regex=True)

def remove_upper_diagonal(df):
    """Return ``df`` with every cell strictly above the main diagonal set to NaN.

    The builtin ``bool`` is used for the mask dtype because the ``np.bool``
    alias has been removed from NumPy.
    """
    keep_mask = np.tril(np.ones(df.shape, dtype=bool))
    return df.where(keep_mask)

In [None]:
print("acc")       
display(get_pvals(pval_pstfut["turkish"],"Acc_std"))
print("f1")
display(get_pvals(pval_pstfut["turkish"],"F1_std"))

In [None]:
print("acc")       
display(get_pvals(pval_pstfut["spanish"],"Acc_std"))
print("f1")
display(get_pvals(pval_pstfut["spanish"],"F1_std"))

In [None]:
print("acc")       
display(get_pvals(pval_pstfut["swahili"],"Acc_std"))
print("f1")
display(get_pvals(pval_pstfut["swahili"],"F1_std"))

In [None]:
print(get_pvals(pval_pstfut["turkish"],"F1_std").to_latex(caption="Turkish F1 Significance"))

In [None]:
print(get_pvals(pval_pstfut["spanish"],"F1_std").to_latex(caption="Spanish F1 Significance"))

In [None]:
print(get_pvals(pval_pstfut["swahili"],"F1_std").to_latex(caption="Swahili F1 Significance"))

In [None]:
pval_prs =  get_significance_data(df_others, models,langs,hints,vae,cols)

In [None]:
print(get_pvals(pval_prs["turkish"],"F1_std").to_latex(caption="Turkish F1 Significance"))

In [None]:
print(get_pvals(pval_prs["spanish"],"F1_std").to_latex(caption="Spanish F1 Significance"))

In [None]:
print(get_pvals(pval_prs["swahili"],"F1_std").to_latex(caption="Swahili F1 Significance"))

## Ablations

In [None]:
# SCAN scores for the ablation models stored under ./checkpoints_ablations/
# (default tasks, seeds and vae values of get_scan_scores).
dfscan_ablations = pd.DataFrame(columns=("Task",'Split', 'Seed', 'Vae','Model','Acc','Acc_std','F1','F1_std',))
get_scan_scores(df=dfscan_ablations,
                models=("ID.1proto","nocopy.1proto", "nocopy.2proto"),
                exppath="./checkpoints_ablations/",
               )

In [None]:
dfscan_ablations

In [None]:
# def mean_std(x):
#     return(str(round(np.mean(x),2))+" (${+-}"+str(round(np.std(x),2))+"$)")

In [None]:
# def get_scan_scores_unreduced(df=None,
#                     tasks=("jump","around_right"),
#                     seeds=(0,1,2,3,4),
#                     vaes =("true","false"),
#                     models=("0proto","1proto","2proto"),
#                     exppath="./checkpoints",
#                    ):
#     for task in tasks:
#             for seed in seeds:
#                 for vae in vaes:
#                     for model in  models:
#                         taskpath=os.path.join(exppath,"SCANDataSet")
#                         identifier ="{}.vae.{}.{}.seed.{}".format(model,vae,task,seed)
#                         condfile=os.path.join(taskpath,"logs",identifier+".cond.log") 
#                         if os.path.exists(condfile):
#                             lines  = open(condfile,'r').readlines()
#                             print("processing: "+condfile)
#                             for (s,r) in get_splits(condfile).items():
#                                 acc, accstd, f1, f1std = calculate_scores_unreduced(lines[r])  
#                                 df.loc[len(df.index)] = (task,s,seed,vae,model,acc,accstd,f1,f1std)
#                         else:
#                             print(f"file doesnot exist: {condfile}")

In [None]:
# def get_morph_results_table_alt_std_deprecated(df, hints=4, vae="false", lang="average", markdown=False):
#     splits_s = ["TEST EVALS", "VAL EVALS", "TEST EASY"]
#     splits_s_alt = ["Future Tense", "Past Tense", "Present Tense"]
#     score_s = ["Acc", "F1"]
#     cols_s = ["Language", "Seed", "Model", "Split"] + score_s

#     df = df.replace(splits_s, splits_s_alt).\
#                     loc[(df['Split'].isin(splits_s)) & (df['Vae'] == vae) & (df['Hints'] == hints), cols_s].\
#                     reset_index().\
#                     drop(columns=['index'])

#     func = my_mean if markdown else mean_std
#     if lang == "average":
#         df = df.groupby(by=["Model","Split", "Seed"]).\
#                     agg({"Acc":"mean", "F1":"mean"}).\
#                     reset_index().\
#                     groupby(by=["Model","Split"]).\
#                     agg({'Acc':func, 'F1':func}).\
#                     reset_index("Split").pivot(columns="Split")
#     else:
#         df= df[df["Language"]==lang].groupby(by=["Model","Split", "Seed"]).\
#             agg({"Acc":"mean", "F1":"mean"}).\
#             reset_index().\
#             groupby(by=["Model","Split"]).\
#             agg({'Acc':func, 'F1':func}).\
#             reset_index("Split").pivot(columns="Split")
        

#     df.columns = df.columns.swaplevel(1,0)
#     #return(df_mean_std.iloc[[3,4,0,1,2,],[0,3,1,4,2,5]])
#     return(df.iloc[[0,1,2],[0,3,1,4,2,5]])

In [None]:
# #TODO: implement novel_tags
# def calculate_scores_unreduced_deprecated(lines):
#     start=0 
#     finished = False
#     preds, tps, fps, fns, f1s, novel, sorter = [],[],[],[],[],[],[]
#     while not finished:
#         data, start = find_next_matching_block(lines,start)
#         if data is not None:
#             inp, ref, pred_here = data
#             tp = (len([p for p in pred_here if p in ref]))
#             fp = (len([p for p in pred_here if p not in ref]))
#             fn = (len([p for p in ref if p not in pred_here]))
#             prec = tp / (tp + fp)
#             rec = tp / (tp + fn)
#             if prec == 0 or rec == 0:
#                 f1 = 0
#             else:
#                 f1 = 2 * prec * rec / (prec + rec)
#             sorter.append(inp+"\t".join(ref))
#             f1s.append(f1)
#             tps.append(tp)
#             fps.append(fp)
#             fns.append(fn)
#             preds.append(pred_here == ref)
#         else:
#             finished = True
#     f1s = sort_by(f1s,sorter)
#     tps = sort_by(tps,sorter)
#     fps = sort_by(fps,sorter)
#     fns = sort_by(fns,sorter)
#     preds = sort_by(preds,sorter)
#     tp, fp, fn = np.sum(tps), np.sum(fps), np.sum(fns)
#     prec = tp / (tp + fp)
#     rec = tp / (tp + fn)
#     if prec == 0 or rec == 0:
#         f1 = 0
#     else:
#         f1 = 2 * prec * rec / (prec + rec)
#     return np.mean(preds), preds, f1, f1s

In [None]:
# def get_lang_scores_unreduced(df=None,
#                     langs=("spanish","turkish","swahili"),
#                     hintss=(4,8,16),
#                     seeds=(0,1,2,3,4),
#                     vaes =("true","false"),
#                     models=("baseline","0proto","1proto","2proto"),
#                     exppath="./checkpoints",
#                     datapath="data/SIGDataSet.large",
#                     novel=False,
#                    ):
#     train_tags=None
#     for lang in langs:
#         for hints in hintss:
#             for seed in seeds:
#                 if novel:
#                     train_tags = get_tags(datapath + lang + '/', hints=hints,seed=seed)
#                 for vae in vaes:
#                     for model in  models:
#                         langpath=os.path.join(exppath,"SIGDataSet",lang)
#                         if model == "baseline" or model == "geca":
#                             identifier ="{}.hints.{}.seed.{}".format(model,hints,seed)
#                         else:
#                             identifier ="{}.vae.{}.hints.{}.seed.{}".format(model,vae,hints,seed)
#                         condfile=os.path.join(langpath,"logs",identifier+".cond.log") 
#                         if os.path.exists(condfile):
#                             lines  = open(condfile,'r').readlines()
#                             if len(lines) < 2142:
#                                 print("format broken in "+condfile)
#                                 continue
# #                             print("processing: "+condfile)
#                             for (s,r) in get_splits(condfile).items():
#                                 acc, accstd, f1, f1std = calculate_scores(lines[r], train_tags=train_tags, reduced=False)  
#                                 df.loc[len(df.index)] = (lang,hints,seed,vae,model,s,acc,accstd,f1,f1std)
#                         else:
#                             print(f"file doesnot exist: {condfile}")