In [1]:
import pandas as pd
import numpy as np
from lr.text_processing.util import simple_pre_process_text_df
from multiprocessing import Pool
from functools import reduce
from collections import Counter

In [2]:
syn_noum = pd.read_csv("data/mnli/syn_noun.csv")
keys = syn_noum.key.to_list()
values = syn_noum.value.to_list()
syn_noum_dict = {k:v for k,v in zip(syn_noum.key, syn_noum.value)}
syn_noum_dict_inv = {v:k for k,v in zip(syn_noum.key, syn_noum.value)}

def check_key(s):
    return [w for w in s.split(" ") if w in keys] 

def check_key_df(df):    
    df.loc[:, "w_trans"] = df["text"].map(check_key)
    df.loc[:, "n_trans"] = df["w_trans"].map(lambda x : len(x))
    return df
    
def parallelize(df, func, n_cores):
    """
    general fucntion to parallelize a function applied to
    a df
    """
    df_split = np.array_split(df, n_cores)
    pool = Pool(n_cores)
    df = pd.concat(pool.map(func, df_split))
    pool.close()
    pool.join()
    return df

In [3]:
df = pd.read_csv("data/mnli/train.csv").dropna()
simple_pre_process_text_df(df, "premise")
simple_pre_process_text_df(df, "hypothesis")
df.loc[:, "text"] = df.premise + " " + df.hypothesis
df.loc[:, "size"] = df["text"].map(lambda x: len(x.split(" ")))
df = parallelize(df, func=check_key_df, n_cores=20)

df_neu  = df.query("label=='neutral'")
df_con  = df.query("label=='contradiction'")
df_ent  = df.query("label=='entailment'")

df_p = pd.read_csv("data/mnli/train_p_h_syn_noun.csv").dropna()
df_p.loc[:, "text"] = df_p.premise + " " + df_p.hypothesis
df_p.loc[:, "size"] = df_p["text"].map(lambda x: len(x.split(" ")))
df_p = parallelize(df_p, func=check_key_df, n_cores=20)

df_p_neu  = df_p.query("label=='neutral'")
df_p_con  = df_p.query("label=='contradiction'")
df_p_ent  = df_p.query("label=='entailment'")

In [24]:
df.shape[0], df_p.shape[0]

(392662, 392662)

In [27]:
(df["n_trans"]>0).mean()*100

91.30040594709953

In [30]:
df["n_trans"].mean()

4.232604631973555

## Sizes (Original)

In [4]:
avg_total = df[["size"]].describe().loc["mean"].values[0]
avg_con = df_con[["size"]].describe().loc["mean"].values[0]
avg_neu = df_neu[["size"]].describe().loc["mean"].values[0]
avg_ent = df_ent[["size"]].describe().loc["mean"].values[0]
results = [avg_total, avg_con, avg_neu, avg_ent]
results = [np.round(i,2) for i in results]
columns = ["total", "contradiction", "neutral", "entailment"]
original = pd.DataFrame([results], columns=columns)
original

Unnamed: 0,total,contradiction,neutral,entailment
0,29.93,29.3,30.66,29.82


## Sizes (Transformed)

In [5]:
avg_total = df_p[["size"]].describe().loc["mean"].values[0]
avg_con = df_p_con[["size"]].describe().loc["mean"].values[0]
avg_neu = df_p_neu[["size"]].describe().loc["mean"].values[0]
avg_ent = df_p_ent[["size"]].describe().loc["mean"].values[0]
results = [avg_total, avg_con, avg_neu, avg_ent]
results = [np.round(i,2) for i in results]
columns = ["total", "contradiction", "neutral", "entailment"]
transformed = pd.DataFrame([results], columns=columns)
transformed

Unnamed: 0,total,contradiction,neutral,entailment
0,32.29,31.64,33.04,32.19


In [6]:
pd.concat([original, transformed]).pct_change()* 100

Unnamed: 0,total,contradiction,neutral,entailment
0,,,,
0,7.885065,7.986348,7.762557,7.947686


In [7]:
df.head(3)

Unnamed: 0,premise,hypothesis,label,text,size,w_trans,n_trans
0,conceptually cream skimming has two basic dime...,product and geography are what make cream skim...,neutral,conceptually cream skimming has two basic dime...,20,"[cream, product, geography, product, geography...",6
1,you know during the season and i guess at at y...,you lose the things to the following level if ...,entailment,you know during the season and i guess at at y...,74,"[level, level, decide, recall, team, decide, c...",13
2,one of our number will carry out your instruct...,a member of my team will execute your orders w...,entailment,one of our number will carry out your instruct...,22,"[instructions, member, team]",3


## Count transformations

In [8]:
avg_total = df["n_trans"].mean()
avg_con = df_con["n_trans"].mean()
avg_neu = df_neu["n_trans"].mean()
avg_ent = df_ent["n_trans"].mean()
results = [avg_total, avg_con, avg_neu, avg_ent]
results = [np.round(i,2) for i in results]
columns = ["total", "contradiction", "neutral", "entailment"]
n_tranforms = pd.DataFrame([results], columns=columns)
n_tranforms

Unnamed: 0,total,contradiction,neutral,entailment
0,4.23,4.12,4.34,4.24


## Check transformations

In [9]:
def lists2list(arr):
    return reduce(lambda x,y: x+y, arr)

def get_counter(arrays, n_cores):
    splits = np.array_split(arrays, n_cores)
    pool = Pool(n_cores)
    final_list = lists2list(pool.map(lists2list, splits))
    pool.close()
    pool.join()
    key_count = Counter(final_list)
    return key_count

count_total = get_counter(df["w_trans"].values, n_cores=20)
count_total.most_common(10)

[('been', 23306),
 ('said', 16356),
 ('also', 12919),
 ('got', 10534),
 ('information', 10260),
 ('year', 10163),
 ('made', 9838),
 ('before', 9331),
 ('need', 8519),
 ('man', 8140)]

In [10]:
count_con = get_counter(df_con["w_trans"].values, n_cores=20)
count_con.most_common(10)

[('been', 7720),
 ('said', 5398),
 ('also', 3579),
 ('got', 3399),
 ('information', 3329),
 ('year', 3142),
 ('made', 3069),
 ('need', 3027),
 ('before', 2957),
 ('man', 2649)]

In [11]:
count_neu = get_counter(df_neu["w_trans"].values, n_cores=20)
count_neu.most_common(10)

[('been', 8280),
 ('said', 5464),
 ('also', 5244),
 ('year', 3796),
 ('got', 3593),
 ('made', 3528),
 ('before', 3457),
 ('information', 3419),
 ('need', 2888),
 ('man', 2817)]

In [12]:
count_ent = get_counter(df_ent["w_trans"].values, n_cores=20)
count_ent.most_common(10)

[('been', 7306),
 ('said', 5494),
 ('also', 4096),
 ('got', 3542),
 ('information', 3512),
 ('made', 3241),
 ('year', 3225),
 ('before', 2917),
 ('man', 2674),
 ('need', 2604)]

## Artifacts analysis

In [13]:
count_gen = lambda w: lambda s: int(w in s.split(" "))
con_art = ["never", "no", "nothing", "any", "none"]
neu_art = ["also", "because", "popular", "many", "most"]
ent_art = ["some", "yes", "something", "sometimes", "various"]
con_art_t = [(w, syn_noum_dict[w]) for w in con_art if w in keys]
neu_art_t = [(w, syn_noum_dict[w]) for w in neu_art if w in keys]
ent_art_t = [(w, syn_noum_dict[w]) for w in ent_art if w in keys]

### Countradition

In [14]:
con_art_t, [(syn_noum_dict_inv[w], w) for w in con_art if w in values]

([], [])

In [15]:
df_con.head(1)

Unnamed: 0,premise,hypothesis,label,text,size,w_trans,n_trans
8,gays and lesbians,heterosexuals,contradiction,gays and lesbians heterosexuals,4,[heterosexuals],1


In [16]:
df_p_con.head(1)

Unnamed: 0,premise,hypothesis,label,text,size,w_trans,n_trans
8,gays and lesbians,straight,contradiction,gays and lesbians straight,6,[],0


In [17]:
for w in con_art:
    f_count = count_gen(w)
    df_con.loc[:, "{}_count".format(w)] = df_con["text"].map(f_count)
    df_p_con.loc[:, "{}_count".format(w)] = df_p_con["text"].map(f_count)
    print(w, np.round(df_con["{}_count".format(w)].mean()*100,2), np.round(df_p_con["{}_count".format(w)].mean()*100,2))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value


never 6.04 6.04
no 12.5 12.5
nothing 2.08 2.08
any 5.81 5.81
none 0.56 0.56


### Neutral

In [18]:
neu_art_t, [(syn_noum_dict_inv[w], w) for w in neu_art if w in values]

([('also', 'too')], [])

In [19]:
for w in neu_art:
    f_count = count_gen(w)
    df_neu.loc[:, "{}_count".format(w)] = df_neu["text"].map(f_count)
    df_p_neu.loc[:, "{}_count".format(w)] = df_p_neu["text"].map(f_count)
    print(w, np.round(df_neu["{}_count".format(w)].mean()*100,2), np.round(df_p_neu["{}_count".format(w)].mean()*100,2))

also 3.87 0.0
because 7.08 7.08
popular 1.05 1.05
many 4.51 4.51
most 4.66 4.66


### Entailment

In [20]:
ent_art_t, [(syn_noum_dict_inv[w], w) for w in ent_art if w in values]

([('various', 'assorted')], [('respective', 'various')])

In [21]:
for w in ent_art:
    f_count = count_gen(w)
    df_ent.loc[:, "{}_count".format(w)] = df_ent["text"].map(f_count)
    df_p_ent.loc[:, "{}_count".format(w)] = df_p_ent["text"].map(f_count)
    print(w, np.round(df_ent["{}_count".format(w)].mean()*100,2), np.round(df_p_ent["{}_count".format(w)].mean()*100,2))

some 5.07 5.07
yes 1.12 1.12
something 2.25 2.25
sometimes 0.66 0.66
various 0.39 0.03
