In [1]:
import pandas as pd
from glob import glob
import numpy as np
from lr.text_processing.util import simple_pre_process_text_df
from lr.training.util import get_ternary_label, filter_df_by_label
from multiprocessing import Pool
from functools import reduce
from collections import Counter

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)


In [2]:
# Noun substitution table: one row per (key, value) pair read from the CSV.
syn_noum = pd.read_csv("data/mnli/syn_noun.csv")
keys = syn_noum["key"].to_list()
values = syn_noum["value"].to_list()
# Forward (key -> value) and inverse (value -> key) lookups; when an entry is
# repeated, the last row wins in both directions.
syn_noum_dict = dict(zip(keys, values))
syn_noum_dict_inv = dict(zip(values, keys))

def check_key(s):
    """
    Return the words of `s` that are keys of the synonym table.

    `s` is split on single spaces; matching words are returned in their
    original order, with repetitions kept.
    """
    # Membership is tested against `syn_noum_dict` (same key set as `keys`)
    # so each lookup is a hash probe instead of a scan over the whole list.
    return [w for w in s.split(" ") if w in syn_noum_dict]

def check_key_df(df):
    """
    Annotate `df` in place with the synonym-table words found in each text.

    Adds two columns and returns the same frame:
    - "w_trans": list produced by `check_key` for the row's "text"
    - "n_trans": length of that list
    """
    matched_words = df["text"].map(check_key)
    df.loc[:, "w_trans"] = matched_words
    df.loc[:, "n_trans"] = df["w_trans"].map(len)
    return df
    
def parallelize(df, func, n_cores):
    """
    General function to parallelize a function applied to a df.

    The frame is cut row-wise into `n_cores` contiguous chunks, `func` is
    applied to each chunk in a separate worker process, and the per-chunk
    results are concatenated in the original chunk order.

    :param df: DataFrame to process
    :param func: picklable callable taking a DataFrame chunk and returning
                 something `pd.concat` accepts
    :param n_cores: number of chunks / worker processes
    :return: concatenation of the per-chunk results
    """
    # Split row positions rather than the frame itself: np.array_split on a
    # DataFrame goes through the deprecated DataFrame.swapaxes.
    positions = np.array_split(np.arange(len(df)), n_cores)
    df_split = [df.iloc[chunk] for chunk in positions]
    # The context manager releases the workers even if `func` raises.
    with Pool(n_cores) as pool:
        results = pool.map(func, df_split)
    return pd.concat(results)

In [3]:
# Load the MNLI test file and its noun-synonym counterpart, applying the same
# dropna + label filter to both so that rows stay aligned by position.
df = pd.read_csv("data/mnli/test.csv")
test = filter_df_by_label(df.dropna()).reset_index(drop=True)
df_t = pd.read_csv("data/mnli/test_p_h_syn_noun.csv")
test_t = filter_df_by_label(df_t.dropna()).reset_index(drop=True)
# Keep only the transformed texts, suffixed so they can sit next to the originals.
test_t = test_t[["premise", "hypothesis"]].add_suffix("_t")

models = ["xgb", "albert_base", "bert_base", "roberta_base", "roberta_large", "xlnet_base"]

all_summaries = {}
all_res = {}

for model in models:

    all_combs = []
    # glob order is arbitrary; sorting makes the row kept by drop_duplicates
    # below the same on every run.
    paths = sorted(glob("raw_results/mnli/{}/syn_p_h/batch1/*.csv".format(model)))
    if not paths:
        raise FileNotFoundError("no result files found for model {}".format(model))
    for path in paths:
        # The result file's own `label` is renamed so it does not clash with
        # the `label` column coming from `test`.
        df1 = pd.read_csv(path).rename(columns={"label": "label_code"})
        assert test.shape[0] == test_t.shape[0] == df1.shape[0]
        # `axis` must be passed by keyword: positional use was removed in pandas 2.0.
        comb = pd.concat([test, test_t, df1], axis=1)
        # NOTE(review): presumably A/B flag correctness on the original and the
        # transformed input respectively — confirm against the result writer.
        comb_cut = comb.query("A==1 & B==0")
        all_combs.append(comb_cut)

    # The positional row index (shared by test, test_t and df1) becomes the
    # example id; each id is kept once even if it was selected in several files.
    res = pd.concat(all_combs)
    res.index.name = "id"
    res = res.reset_index()
    res = res.drop_duplicates(subset="id")
    # Percentage of each `label` value among the selected rows, one decimal.
    summary = np.round((res["label"].value_counts() / res.shape[0]) * 100, 1)
    all_res[model] = res
    all_summaries[model] = summary

In [4]:
# Print each model's name followed by its label-percentage summary and a blank line.
for model in models:
    print(model, all_summaries[model], "", sep="\n")

xgb
entailment       39.3
neutral          38.1
contradiction    22.6
Name: label, dtype: float64

albert_base
entailment       38.9
neutral          31.7
contradiction    29.4
Name: label, dtype: float64

bert_base
entailment       41.4
neutral          32.0
contradiction    26.5
Name: label, dtype: float64

roberta_base
entailment       40.5
neutral          31.9
contradiction    27.6
Name: label, dtype: float64

roberta_large
entailment       39.0
neutral          33.7
contradiction    27.3
Name: label, dtype: float64

xlnet_base
entailment       39.9
neutral          32.2
contradiction    27.9
Name: label, dtype: float64



### Examples difficult for the deep learning models

In [5]:
# Count, per example id, how many of the deep learning models selected it,
# then keep the 10 most frequently selected ids.
DL = ["albert_base", "bert_base", "roberta_base", "roberta_large", "xlnet_base"]
count = Counter()
for model in DL:
    count.update(all_res[model].id.to_list())

ids = [example_id for example_id, _ in count.most_common(10)]
ids

[63, 111, 274, 288, 700, 721, 723, 845, 858, 946]

In [6]:
# Show full cell contents so long premises/hypotheses are not truncated.
pd.set_option("max_colwidth", None)
# Rows of the bert_base results whose id is among the selected ids.
bert_res = all_res["bert_base"]
intesection_error = bert_res.loc[bert_res["id"].isin(ids)]
intesection_error

Unnamed: 0,id,premise,hypothesis,label,premise_t,hypothesis_t,label_code,A,B
5,111,The sacred is not mysterious to her.,The woman is familiar with the sacred.,entailment,the sacred is not mysterious to her,the adult female is familiar with the sacred,1,1,0
18,274,"When the next modernist revolution comes around, he'll be ready.",The man will be prepared.,entailment,when the next modernist revolution comes around inferno be ready,the adult male will be prepared,1,1,0
19,288,The best beach in Europe ' at least that's the verdict of its regulars.,Regular beachgoers say that it is the best in Europe.,entailment,the best beach in europe at least thats the finding of fact of its fixture,regular beachgoers say that it is the best in europe,1,1,0
44,700,so you know well a lot of the stuff you hear coming from South Africa now and from West Africa that's considered world music because it's not particularly using certain types of folk styles,They consider the West African music to be worldly since they do not rely on folk styles.,entailment,so you know well a lot of the material you hear coming from south africa now and from west africa thats considered creation music because its not particularly using certain types of folk way,they consider the west african music to be worldly since they do not trust on folk way,1,1,0
46,721,The last 12 years of his life are a blank.,He can't remember the last 12 years of his life,entailment,the last 12 years of his life are a space,he cant think the last 12 years of his life,1,1,0
47,723,"A funny place for a piece of brown paper, I mused.",I was thinking about strangeness of a piece of brown paper being in that spot.,entailment,a funny story place for a piece of brown paper i mused,i was thinking about unfamiliarity of a piece of brown paper being in that place,1,1,0
60,845,well do you know you have a ten limit a ten minute time limit well that's okay and then they come on and tell you and they tell you got five seconds to say good-bye,"Usually there's a 10 minute time limit, but they'll say you have a few seconds to go!",entailment,well do you know you have a decade boundary a decade min time boundary well thats okey and then they come on and tell you and they tell you get five s to say bye,usually theres a 10 min time boundary but theyll say you have a few s to go,1,1,0
61,858,"Good-bye."" Julius was bending over the car.",Julius said good bye at the car.,entailment,bye julius was bending over the car,julius state good pass at the car,1,1,0
67,946,oh like if they say i i we just type it in like that,Change it before typing it in.,contradiction,oh like if they say i i we just type it in like that,modification it earlier typing it in,0,1,0
4182,63,the net cost of operations.,The gross cost.,contradiction,the net cost of trading operations,the 144 cost,0,1,0
