In [1]:
import pandas as pd
import numpy as np
from lr.text_processing.util import simple_pre_process_text_df
from multiprocessing import Pool
from functools import reduce
from collections import Counter

In [47]:
# Load the noun-synonym table; only its `key` and `value` columns are used.
syn_noum = pd.read_csv("data/snli/syn_noun.csv")
keys = syn_noum["key"].to_list()
values = syn_noum["value"].to_list()
# Forward (key -> value) and reverse (value -> key) lookups. When a key or a
# value appears in several rows, the last row wins in the respective dict.
syn_noum_dict = dict(zip(keys, values))
syn_noum_dict_inv = dict(zip(values, keys))

def check_key(s, vocab=None):
    """
    Return the tokens of ``s`` that are keys of the synonym table.

    Parameters
    ----------
    s : str
        Text to scan; it is tokenised by splitting on single spaces.
    vocab : container of str, optional
        Words to look for. Defaults to ``syn_noum_dict``, whose keys are the
        same words as the ``keys`` list but support constant-time membership
        tests instead of a linear scan per token.

    Returns
    -------
    list of str
        Matching tokens in order of appearance, duplicates included.
    """
    if vocab is None:
        vocab = syn_noum_dict
    return [w for w in s.split(" ") if w in vocab]

def check_key_df(df):
    """
    Add the synonym-key matches of each row's ``text`` to a copy of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``text`` column of strings.

    Returns
    -------
    pandas.DataFrame
        A new frame with two extra columns: ``w_trans`` (list of tokens
        returned by ``check_key``) and ``n_trans`` (length of that list).
        The input frame is left untouched, so this is safe to call on
        slices/chunks of a larger frame.
    """
    w_trans = df["text"].map(check_key)
    return df.assign(w_trans=w_trans, n_trans=w_trans.map(len))
    
def parallelize(df, func, n_cores):
    """
    Apply ``func`` to row-wise chunks of ``df`` in parallel and concatenate
    the results.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to process.
    func : callable
        Takes a DataFrame chunk and returns a DataFrame. It is sent to worker
        processes, so it must be picklable (e.g. a module-level function).
    n_cores : int
        Number of chunks and of worker processes.

    Returns
    -------
    pandas.DataFrame
        The per-chunk results concatenated in the original row order.
    """
    # Split row positions rather than the frame itself: handing a DataFrame
    # to np.array_split goes through DataFrame.swapaxes, which recent pandas
    # versions deprecate.
    row_chunks = np.array_split(np.arange(len(df)), n_cores)
    df_split = [df.iloc[rows] for rows in row_chunks]
    # The context manager shuts the workers down even when `func` raises,
    # so a failing run does not leave idle processes behind.
    with Pool(n_cores) as pool:
        return pd.concat(pool.map(func, df_split))

In [3]:
# Original SNLI training pairs; rows with any missing field are dropped.
df = pd.read_csv("data/snli/train.csv").dropna()
# The return values are discarded, so these calls presumably clean the given
# column of `df` in place -- confirm in lr.text_processing.util.
simple_pre_process_text_df(df, "premise")
simple_pre_process_text_df(df, "hypothesis")
# `text` is premise and hypothesis joined by a space; `size` is its number of
# space-separated tokens.
df.loc[:, "text"] = df.premise + " " + df.hypothesis
df.loc[:, "size"] = df["text"].map(lambda x: len(x.split(" ")))
# Adds the `w_trans` / `n_trans` columns (see check_key_df).
df = parallelize(df, func=check_key_df, n_cores=20)

# Per-label subsets. They are explicit copies because later cells add columns
# to them; assigning into a bare query result raises SettingWithCopyWarning.
df_neu  = df.query("label=='neutral'").copy()
df_con  = df.query("label=='contradiction'").copy()
df_ent  = df.query("label=='entailment'").copy()

# Transformed training pairs (file name suggests noun-synonym substitution on
# premise and hypothesis). No text pre-processing is applied here --
# NOTE(review): presumably the file was saved already pre-processed; confirm.
df_p = pd.read_csv("data/snli/train_p_h_syn_noun.csv").dropna()
# Same derived columns as for the original data: joined text and token count.
df_p.loc[:, "text"] = df_p.premise + " " + df_p.hypothesis
df_p.loc[:, "size"] = df_p["text"].map(lambda x: len(x.split(" ")))
df_p = parallelize(df_p, func=check_key_df, n_cores=20)

# Per-label subsets. They are explicit copies because later cells add columns
# to them; assigning into a bare query result raises SettingWithCopyWarning.
df_p_neu  = df_p.query("label=='neutral'").copy()
df_p_con  = df_p.query("label=='contradiction'").copy()
df_p_ent  = df_p.query("label=='entailment'").copy()

## Sizes (Original)

In [11]:
# Mean token count of premise + hypothesis in the original data, overall and
# per label. The mean is taken directly instead of running a full describe()
# (count, std, quantiles) just to read one row of it.
avg_total = df["size"].mean()
avg_con = df_con["size"].mean()
avg_neu = df_neu["size"].mean()
avg_ent = df_ent["size"].mean()
results = [avg_total, avg_con, avg_neu, avg_ent]
results = [np.round(i, 2) for i in results]
columns = ["total", "contradiction", "neutral", "entailment"]
# One-row summary table, reused below for the comparison with the
# transformed data.
original = pd.DataFrame([results], columns=columns)
original

Unnamed: 0,total,contradiction,neutral,entailment
0,20.27,20.21,21.11,19.48


## Sizes (Transformed)

In [12]:
# Mean token count of premise + hypothesis in the transformed data, overall
# and per label. The mean is taken directly instead of running a full
# describe() (count, std, quantiles) just to read one row of it.
avg_total = df_p["size"].mean()
avg_con = df_p_con["size"].mean()
avg_neu = df_p_neu["size"].mean()
avg_ent = df_p_ent["size"].mean()
results = [avg_total, avg_con, avg_neu, avg_ent]
results = [np.round(i, 2) for i in results]
columns = ["total", "contradiction", "neutral", "entailment"]
# One-row summary table with the same layout as the one for the original data.
transformed = pd.DataFrame([results], columns=columns)
transformed

Unnamed: 0,total,contradiction,neutral,entailment
0,23.61,23.59,24.49,22.77


In [13]:
# Stack the two one-row summaries (original first, transformed second); the
# second row of pct_change() is then the relative growth in mean size, in %.
# The first row has no predecessor and is NaN.
size_comparison = pd.concat([original, transformed])
size_comparison.pct_change() * 100

Unnamed: 0,total,contradiction,neutral,entailment
0,,,,
0,16.477553,16.724394,16.011369,16.889117


In [None]:
# Peek at the first three rows of the original data, including derived columns.
df.head(n=3)

## Count transformations

In [22]:
# Average number of tokens per example that are synonym-table keys
# (`n_trans`), overall and per label, for the original data.
avg_total, avg_con, avg_neu, avg_ent = (
    frame["n_trans"].mean() for frame in (df, df_con, df_neu, df_ent)
)
columns = ["total", "contradiction", "neutral", "entailment"]
results = [np.round(avg, 2) for avg in (avg_total, avg_con, avg_neu, avg_ent)]
n_tranforms = pd.DataFrame([results], columns=columns)
n_tranforms

Unnamed: 0,total,contradiction,neutral,entailment
0,3.25,3.25,3.39,3.11


## Check transformations

In [23]:
def lists2list(arr):
    """
    Flatten a sequence of lists into a single list.

    Parameters
    ----------
    arr : iterable of list
        The lists to concatenate, e.g. a chunk of a column of token lists.

    Returns
    -------
    list
        All items in order. An empty ``arr`` yields ``[]``.

    Notes
    -----
    A single pass is used instead of ``reduce`` with ``+``: that copies the
    growing result at every step (quadratic time) and raises ``TypeError``
    when ``arr`` is empty.
    """
    return [item for sub in arr for item in sub]

def get_counter(arrays, n_cores):
    """
    Count how often each item occurs across a collection of lists.

    Parameters
    ----------
    arrays : sequence of list
        One list of items per row, e.g. ``df["w_trans"].values``.
    n_cores : int
        Upper bound on the number of chunks / worker processes.

    Returns
    -------
    collections.Counter
        Item -> number of occurrences over all lists. Empty input gives an
        empty Counter.
    """
    if len(arrays) == 0:
        return Counter()
    # Never use more chunks than rows: an empty chunk carries no work and
    # used to make the flattening step fail.
    n_chunks = max(1, min(n_cores, len(arrays)))
    splits = np.array_split(arrays, n_chunks)
    # The context manager shuts the workers down even if a worker raises.
    with Pool(n_chunks) as pool:
        partial_lists = pool.map(lists2list, splits)
    # Feed the chunks in order so first-seen order (and thus tie order in
    # most_common) matches counting one concatenated list.
    key_count = Counter()
    for words in partial_lists:
        key_count.update(words)
    return key_count

# Frequency of each matched synonym-table key over the whole original data;
# show the ten most frequent.
count_total = get_counter(df["w_trans"].to_numpy(), n_cores=20)
count_total.most_common(10)

[('man', 264898),
 ('woman', 136681),
 ('boy', 58015),
 ('dog', 53772),
 ('person', 35833),
 ('child', 32229),
 ('children', 26210),
 ('bike', 15051),
 ('park', 13056),
 ('sidewalk', 12874)]

In [24]:
# Same count restricted to the contradiction examples.
count_con = get_counter(df_con["w_trans"].to_numpy(), n_cores=20)
count_con.most_common(10)

[('man', 89172),
 ('woman', 47644),
 ('boy', 19657),
 ('dog', 18301),
 ('child', 10148),
 ('person', 10078),
 ('children', 8546),
 ('park', 4789),
 ('bike', 4738),
 ('sidewalk', 4332)]

In [25]:
# Same count restricted to the neutral examples.
count_neu = get_counter(df_neu["w_trans"].to_numpy(), n_cores=20)
count_neu.most_common(10)

[('man', 87705),
 ('woman', 44446),
 ('boy', 19547),
 ('dog', 18061),
 ('child', 10586),
 ('person', 10215),
 ('children', 8713),
 ('bike', 5079),
 ('park', 4953),
 ('couple', 4347)]

In [26]:
# Same count restricted to the entailment examples.
count_ent = get_counter(df_ent["w_trans"].to_numpy(), n_cores=20)
count_ent.most_common(10)

[('man', 87632),
 ('woman', 44418),
 ('boy', 18731),
 ('dog', 17341),
 ('person', 15497),
 ('child', 11433),
 ('children', 8920),
 ('outdoors', 5967),
 ('bike', 5217),
 ('picture', 4748)]

## Artifacts analysis

In [114]:
def count_gen(w):
    """Return a function mapping a text to 1 if `w` is one of its
    space-separated tokens, else 0."""
    def contains_word(s):
        return int(w in s.split(" "))
    return contains_word

# Words examined per label in the cells below.
# NOTE(review): the origin of these word lists is not shown in the notebook.
con_art = ["nobody", "sleeping", "no", "tv", "cat"]
neu_art = ["tall", "first", "competition", "sad", "favorite"]
ent_art = ["outdoors", "least", "instrument", "outside", "animal"]
# For each list, the (word, synonym) pairs of the words that are keys of the
# synonym table.
con_art_t, neu_art_t, ent_art_t = (
    [(word, syn_noum_dict[word]) for word in art if word in keys]
    for art in (con_art, neu_art, ent_art)
)

### Contradiction

In [115]:
# Left: (word, synonym) pairs for contradiction words that are table keys.
# Right: (key, word) pairs for contradiction words that are table values.
con_art_inv = [(syn_noum_dict_inv[word], word) for word in con_art if word in values]
con_art_t, con_art_inv

([('tv', 'video'), ('cat', 'true cat')], [])

In [116]:
# For each contradiction word, print the percentage of contradiction examples
# whose text contains it: original data first, transformed data second.
for w in con_art:
    col = "{}_count".format(w)
    f_count = count_gen(w)
    df_con.loc[:, col] = df_con["text"].map(f_count)
    df_p_con.loc[:, col] = df_p_con["text"].map(f_count)
    # Mean of a 0/1 indicator times 100 is the share of rows, in percent.
    pct_original = np.round(df_con[col].mean() * 100, 2)
    pct_transformed = np.round(df_p_con[col].mean() * 100, 2)
    print(w, pct_original, pct_transformed)

nobody 1.29 1.29
sleeping 3.59 3.59
no 1.75 1.75
tv 0.88 0.0
cat 1.4 1.4


### Neutral

In [117]:
# Left: (word, synonym) pairs for neutral words that are table keys.
# Right: (key, word) pairs for neutral words that are table values.
neu_art_inv = [(syn_noum_dict_inv[word], word) for word in neu_art if word in values]
neu_art_t, neu_art_inv

([('favorite', 'favourite')], [('1st', 'first'), ('rivals', 'competition')])

In [118]:
# For each neutral word, print the percentage of neutral examples whose text
# contains it: original data first, transformed data second.
for w in neu_art:
    col = "{}_count".format(w)
    f_count = count_gen(w)
    df_neu.loc[:, col] = df_neu["text"].map(f_count)
    df_p_neu.loc[:, col] = df_p_neu["text"].map(f_count)
    # Mean of a 0/1 indicator times 100 is the share of rows, in percent.
    pct_original = np.round(df_neu[col].mean() * 100, 2)
    pct_transformed = np.round(df_p_neu[col].mean() * 100, 2)
    print(w, pct_original, pct_transformed)

tall 1.29 1.29
first 0.66 0.67
competition 0.85 1.12
sad 0.6 0.6
favorite 0.37 0.0


### Entailment

In [119]:
# Left: (word, synonym) pairs for entailment words that are table keys.
# Right: (key, word) pairs for entailment words that are table values.
ent_art_inv = [(syn_noum_dict_inv[word], word) for word in ent_art if word in values]
ent_art_t, ent_art_inv

([('outdoors', 'open'), ('animal', 'creature')],
 [('outdoor', 'outside'), ('creatures', 'animal')])

In [120]:
# For each entailment word, print the percentage of entailment examples whose
# text contains it: original data first, transformed data second.
for w in ent_art:
    col = "{}_count".format(w)
    f_count = count_gen(w)
    df_ent.loc[:, col] = df_ent["text"].map(f_count)
    df_p_ent.loc[:, col] = df_p_ent["text"].map(f_count)
    # Mean of a 0/1 indicator times 100 is the share of rows, in percent.
    pct_original = np.round(df_ent[col].mean() * 100, 2)
    pct_transformed = np.round(df_p_ent[col].mean() * 100, 2)
    print(w, pct_original, pct_transformed)

outdoors 3.17 0.0
least 0.21 0.21
instrument 0.67 0.67
outside 9.81 10.41
animal 0.85 0.05
