In [55]:
import pandas as pd
from glob import glob
import numpy as np
from lr.text_processing.util import simple_pre_process_text_df
from lr.training.util import get_ternary_label, filter_df_by_label
from multiprocessing import Pool
from functools import reduce
from collections import Counter

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)


In [2]:
# Load the key/value table stored in syn_noun.csv and build lookups
# in both directions.
syn_noum = pd.read_csv("data/snli/syn_noun.csv")
keys = syn_noum["key"].to_list()
values = syn_noum["value"].to_list()
# key -> value (a later row wins when a key is repeated)
syn_noum_dict = dict(zip(syn_noum["key"], syn_noum["value"]))
# value -> key (a later row wins when a value is repeated)
syn_noum_dict_inv = dict(zip(syn_noum["value"], syn_noum["key"]))

def check_key(s):
    """
    return the words of `s` that are keys of the substitution table

    `s` is split on single spaces; the matching words are returned in
    order of appearance, repetitions included.
    """
    # `syn_noum_dict` is built from the same column as the `keys` list,
    # so membership is identical, but a dict lookup is a hash probe
    # instead of a linear scan of the list for every word.
    return [w for w in s.split(" ") if w in syn_noum_dict]

def check_key_df(df):
    """
    add two columns to `df` (in place) and return the same frame:

    - "w_trans": result of `check_key` on each entry of "text"
    - "n_trans": length of each "w_trans" entry
    """
    matched_words = df["text"].map(check_key)
    df.loc[:, "w_trans"] = matched_words
    df.loc[:, "n_trans"] = df["w_trans"].map(len)
    return df
    
def parallelize(df, func, n_cores):
    """
    general function to parallelize a function applied to
    a df

    The frame is cut row-wise into `n_cores` nearly equal chunks, `func`
    is applied to every chunk in a worker pool, and the outputs are
    concatenated in chunk order.

    :param df: DataFrame to process
    :param func: callable applied to each chunk; it must be usable by
                 `Pool.map` and return something `pd.concat` accepts
    :param n_cores: number of worker processes (also the number of chunks)
    :return: `pd.concat` of the per-chunk results
    """
    # Split row positions rather than the frame itself: np.array_split on a
    # DataFrame goes through DataFrame.swapaxes, which recent pandas
    # deprecates. The chunk sizes are the same either way.
    row_chunks = np.array_split(np.arange(len(df)), n_cores)
    df_split = [df.iloc[rows] for rows in row_chunks]
    # The context manager releases the workers even when `func` raises;
    # by the time `map` returns, every result has already been collected.
    with Pool(n_cores) as pool:
        results = pool.map(func, df_split)
    return pd.concat(results)

In [95]:
# Load the original test split and its transformed counterpart. Both go
# through the same dropna/label filter and get a fresh 0..n-1 index, because
# the frames are glued together by position further down.
df = pd.read_csv("data/snli/test.csv")
test = filter_df_by_label(df.dropna()).reset_index(drop=True)
df_t = pd.read_csv("data/snli/test_p_h_syn_noun.csv")
test_t = filter_df_by_label(df_t.dropna()).reset_index(drop=True)
test_t = test_t[["premise", "hypothesis"]].add_suffix("_t")

models = ["xgb", "albert_base", "bert_base", "roberta_base", "roberta_large", "xlnet_base"]

# model name -> percentage of each gold label among the selected rows
all_summaries = {}
# model name -> selected rows (one row per test-set id)
all_res = {}

for model in models:

    all_combs = []
    # Sorted so the run is reproducible: glob order depends on the filesystem
    # and drop_duplicates below keeps the first row seen for each id.
    paths = sorted(glob("raw_results/snli/{}/syn_p_h/batch1/*.csv".format(model)))
    if not paths:
        # pd.concat([]) would raise a ValueError as well, just a less helpful one
        raise ValueError("no result files found for model '{}'".format(model))
    for path in paths:
        df1 = pd.read_csv(path).rename(columns={"label": "label_code"})
        # the column-wise concat is positional, so all three frames must have
        # the same number of rows
        assert test.shape[0] == test_t.shape[0] == df1.shape[0], \
            "row count mismatch for {}".format(path)
        # `axis` has to be passed by keyword: pandas 2.x no longer accepts it
        # positionally
        comb = pd.concat([test, test_t, df1], axis=1)
        # keep rows with A == 1 and B == 0 (presumably: correct on the
        # original pair, wrong on the transformed one - TODO confirm)
        comb_cut = comb.query("A==1 & B==0")
        all_combs.append(comb_cut)

    # stack the selections of every result file, then keep one row per
    # test-set position ("id")
    res = pd.concat(all_combs)
    res.index.name = "id"
    res = res.reset_index()
    res = res.drop_duplicates(subset="id")
    # share of each gold label among the selected rows, in percent
    summary = np.round((res["label"].value_counts() / res.shape[0]) * 100, 1)
    all_res[model] = res
    all_summaries[model] = summary

In [96]:
# One block per model: its name, its label distribution, then a blank line.
for model in models:
    print(model, all_summaries[model], "", sep="\n")

xgb
entailment       34.9
neutral          34.2
contradiction    30.9
Name: label, dtype: float64

albert_base
contradiction    35.1
neutral          34.6
entailment       30.4
Name: label, dtype: float64

bert_base
neutral          41.8
entailment       32.9
contradiction    25.3
Name: label, dtype: float64

roberta_base
neutral          44.7
entailment       29.7
contradiction    25.6
Name: label, dtype: float64

roberta_large
neutral          44.2
entailment       32.2
contradiction    23.6
Name: label, dtype: float64

xlnet_base
neutral          40.0
entailment       31.3
contradiction    28.7
Name: label, dtype: float64



### Examples that are difficult for the deep learning models

In [130]:
DL = ["albert_base", "bert_base", "roberta_base", "roberta_large", "xlnet_base"]

# Tally, over the models listed in DL, how often each id shows up
# in that model's selected rows.
count = Counter()
for model in DL:
    count.update(all_res[model].id.to_list())

# the ten most frequent ids (ties keep first-seen order)
ids = [example_id for example_id, _ in count.most_common(10)]
ids

[31, 32, 44, 73, 78, 232, 556, 677, 712, 820]

In [137]:
# show full cell contents instead of truncating long text
pd.set_option("max_colwidth", None)
# rows of the bert_base selection whose id is one of `ids`
intesection_error = all_res["bert_base"].loc[lambda frame: frame["id"].isin(ids)]
intesection_error

Unnamed: 0,id,premise,hypothesis,label,premise_t,hypothesis_t,label_code,A,B
1,31,3 young man in hoods standing in the middle of a quiet street facing the camera.,Three hood wearing people pose for a picture.,entailment,3 young adult male in hoods standing in the center of a quiet street facing the photographic camera,three punk wearing people pose for a image,1,1,0
2,32,3 young man in hoods standing in the middle of a quiet street facing the camera.,Three hood wearing people stand in a street.,entailment,3 young adult male in hoods standing in the center of a quiet street facing the photographic camera,three punk wearing people base in a street,1,1,0
3,44,Male in a blue jacket decides to lay in the grass.,The guy wearing a blue jacket is laying on the green grass taking a nap.,neutral,male in a blue jacket decides to ballad in the grass,the guy wearing a blue jacket is laying on the green grass taking a sleep,2,1,0
7,73,A man in a black shirt overlooking bike maintenance.,A man learns bike maintenance.,neutral,a adult male in a black shirt overlooking motorcycle care,a adult male acquire motorcycle care,2,1,0
8,78,A man looking over a bicycle's rear wheel in the maintenance garage with various tools visible in the background.,A man repairs bicycles.,entailment,a adult male looking over a bike rear wheel in the care garage with various tools visible in the background,a adult male repairs bike,1,1,0
43,556,Boys with their backs against an incoming wave.,A group of people play in the ocean.,neutral,boys with their backs against an incoming moving ridge,a group of people play in the ocean,2,1,0
58,677,Five children playing soccer chase after a ball.,They are playing football.,contradiction,five child playing association football pursuit after a ball,they are playing football game,0,1,0
64,712,A mother and daughter walk along the side of a bridge.,A mother and daughter are walking home.,neutral,a female parent and girl walk along the side of a span,a female parent and girl are walking place,2,1,0
70,820,A man with a long white beard is examining a camera and another man with a black shirt is in the background.,A man is with a cowboy,neutral,a adult male with a hanker white whiskers is examining a photographic camera and another adult male with a black shirt is in the background,a adult male is with a puncher,2,1,0
672,232,Several younger people sitting in front of a statue.,several young people sitting in a school courtyard,neutral,several younger people sitting in front of a statue,several young people sitting in a school court,2,1,0
