In [3]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
from relation_modeling_utils import load_data

# Load the three ATOMIC-2020 splits from their TSV files (dev.tsv serves as the validation split).
# multi_label=True is forwarded to load_data; judging by the head() output, each row then carries
# a 3-element 0/1 list in `label` — confirm against relation_modeling_utils.load_data.
train_df = load_data("data/atomic2020_data-feb2021/train.tsv", multi_label=True)
val_df = load_data("data/atomic2020_data-feb2021/dev.tsv", multi_label=True)
test_df = load_data("data/atomic2020_data-feb2021/test.tsv", multi_label=True)

In [5]:
len(train_df), len(val_df), len(test_df)

(36940, 2962, 6569)

In [6]:
train_df.head()

Unnamed: 0,text,label
0,PersonX abandons ___ altogether,"[0, 0, 1]"
1,PersonX abandons the ___ altogether,"[0, 1, 1]"
2,PersonX abolishes ___ altogether,"[0, 1, 1]"
3,PersonX abolishes ___ in the states,"[0, 1, 1]"
4,PersonX abolishes the ___ altogether,"[0, 1, 1]"


In [91]:
from relation_modeling_utils import explode_labels
# Run explode_labels over the train and test frames; the following cell reads the
# label_0 / label_1 / label_2 columns from the result.
train_df, test_df = (explode_labels(split_df) for split_df in (train_df, test_df))

In [92]:
test_df.label_0.value_counts(), test_df.label_1.value_counts(), test_df.label_2.value_counts()

(0    4668
 1    1901
 Name: label_0, dtype: int64,
 1    4419
 0    2150
 Name: label_1, dtype: int64,
 0    3996
 1    2573
 Name: label_2, dtype: int64)

## Original data lexical overlap

In [69]:
from relation_modeling_utils import create_vocab
# One vocabulary per split, built with create_vocab's default settings (train, val, test order).
train_vocab, val_vocab, test_vocab = map(create_vocab, (train_df, val_df, test_df))

100%|██████████| 36940/36940 [00:00<00:00, 172259.68it/s]
100%|██████████| 2962/2962 [00:00<00:00, 168591.78it/s]
100%|██████████| 6569/6569 [00:00<00:00, 131311.25it/s]


### Lexical overlap with stopwords

In [70]:
# Entries present in both vocabularies, then the fraction of each vocabulary
# they account for, displayed as (share of train, share of test).
train_test_overlap = set(train_vocab) & set(test_vocab)
tuple(len(train_test_overlap) / len(vocab) for vocab in (train_vocab, test_vocab))

(0.27977483705313055, 0.8140804597701149)

In [71]:
# Entries present in both vocabularies, then the fraction of each vocabulary
# they account for, displayed as (share of train, share of val).
train_val_overlap = set(train_vocab) & set(val_vocab)
tuple(len(train_val_overlap) / len(vocab) for vocab in (train_vocab, val_vocab))

(0.1602804661268023, 0.8868852459016393)

### Lexical overlap without stopwords

In [72]:
from spacy.lang.en.stop_words import STOP_WORDS

# Rebuild the per-split vocabularies with include_stopwords=False (train, val, test order).
train_vocab_nostp, val_vocab_nostp, test_vocab_nostp = (
    create_vocab(split_df, include_stopwords=False)
    for split_df in (train_df, val_df, test_df)
)

100%|██████████| 36940/36940 [00:00<00:00, 128906.58it/s]
100%|██████████| 2962/2962 [00:00<00:00, 156637.27it/s]
100%|██████████| 6569/6569 [00:00<00:00, 155559.47it/s]


In [73]:
# Same overlap measurement on the stop-word-free vocabularies: (share of train, share of test).
train_test_overlap_nostp = set(train_vocab_nostp) & set(test_vocab_nostp)
tuple(
    len(train_test_overlap_nostp) / len(vocab)
    for vocab in (train_vocab_nostp, test_vocab_nostp)
)

(0.27056102955962197, 0.8061713600958658)

In [74]:
# Same overlap measurement on the stop-word-free vocabularies: (share of train, share of val).
train_val_overlap_nostp = set(train_vocab_nostp) & set(val_vocab_nostp)
tuple(
    len(train_val_overlap_nostp) / len(vocab)
    for vocab in (train_vocab_nostp, val_vocab_nostp)
)

(0.15041222602051077, 0.8789659224441834)

## Create new ATOMIC datasets

In [13]:
import pandas as pd

# Pool the train and test splits into a single frame that will be re-split below.
# NOTE(review): val_df is not part of the pool — confirm that dropping the dev split is intended.
# pd.concat is called without ignore_index, so each split keeps its own row labels and the
# index of atomic_df is not unique (a later example shows 5256 from train next to 3591 from test).
atomic_df = pd.concat([train_df, test_df])

In [14]:
len(atomic_df)

43509

### Handle duplicates

In [15]:
atomic_df.duplicated(subset=["text"]).sum()

1922

In [16]:
from relation_modeling_utils import explode_labels
# After this call the frame shows label_0 / label_1 / label_2 columns next to `label`
# (see the head() output that follows).
# NOTE(review): in top-to-bottom order train_df/test_df were already passed through
# explode_labels earlier in the notebook; presumably re-applying it is harmless — confirm.
atomic_df = explode_labels(atomic_df)

In [17]:
atomic_df.head()

Unnamed: 0,text,label,label_0,label_1,label_2
0,PersonX abandons ___ altogether,"[0, 0, 1]",0,0,1
1,PersonX abandons the ___ altogether,"[0, 1, 1]",0,1,1
2,PersonX abolishes ___ altogether,"[0, 1, 1]",0,1,1
3,PersonX abolishes ___ in the states,"[0, 1, 1]",0,1,1
4,PersonX abolishes the ___ altogether,"[0, 1, 1]",0,1,1


In [18]:
atomic_df.duplicated(subset=["text", "label_0", "label_1", "label_2"]).sum()

165

In [19]:
# Remove rows that repeat both the head text and all three per-class labels.
# drop_duplicates defaults to keep="first", so one copy of each exact repeat is retained.
atomic_df = atomic_df.drop_duplicates(subset=["text", "label_0", "label_1", "label_2"])

In [20]:
# Rows whose text repeats an earlier row (the first occurrence itself is not flagged).
# Exact text+label repeats were dropped just before, so each of these rows carries a
# different label set than the earlier row with the same text.
duplicated_df = atomic_df[atomic_df.duplicated(subset=["text"])]

In [21]:
len(duplicated_df)

1757

In [22]:
atomic_df[atomic_df.text == "PersonX forgets PersonX's lines"]

Unnamed: 0,text,label,label_0,label_1,label_2
5256,PersonX forgets PersonX's lines,"[0, 0, 1]",0,0,1
3591,PersonX forgets PersonX's lines,"[0, 1, 0]",0,1,0


In [23]:
train_df[train_df.text == "PersonX forgets PersonX's lines"]

Unnamed: 0,text,label
5256,PersonX forgets PersonX's lines,"[0, 0, 1]"


In [24]:
test_df[test_df.text == "PersonX forgets PersonX's lines"]

Unnamed: 0,text,label
3591,PersonX forgets PersonX's lines,"[0, 1, 0]"


In [25]:
atomic_df.duplicated(subset=["text", "label_0"]).sum()

1757

In [26]:
# keep=False flags every member of a repeated-text group (not just the later ones),
# so all conflicting rows for a head are collected here for merging.
all_duplicate_df = atomic_df[atomic_df.duplicated(subset=["text"], keep=False)]


In [27]:
len(all_duplicate_df)

3514

In [28]:
all_duplicate_df.head()

Unnamed: 0,text,label,label_0,label_1,label_2
0,PersonX abandons ___ altogether,"[0, 0, 1]",0,0,1
5,PersonX about to get married,"[0, 0, 1]",0,0,1
11,PersonX accepts PersonY thanks,"[0, 0, 1]",0,0,1
25,PersonX accidentally burned,"[0, 0, 1]",0,0,1
27,PersonX accidentally cut,"[0, 0, 1]",0,0,1


In [29]:
import numpy as np

def group_duplicate_heads(subdf):
    """Merge all rows that share one head text into a single multi-label row.

    The labels are combined with an element-wise logical OR, so a class is set in
    the result when it is set in at least one of the rows.

    Args:
        subdf: the rows of one ``groupby("text")`` group. Must provide a ``label``
            column of equal-length 0/1 sequences and scalar 0/1 columns
            ``label_0``, ``label_1`` and ``label_2``.

    Returns:
        pd.Series with keys ``label`` (list of ints) and ``label_0``, ``label_1``,
        ``label_2`` (integer scalars).
    """
    def _any_set(values):
        # np.logical_or is a *binary* ufunc: unpacking a group into it is only
        # correct for exactly two rows (a third positional argument is taken as
        # the `out` buffer, a single one raises). reduce() handles any group size.
        return np.logical_or.reduce(np.asarray(values), axis=0).astype(int)

    label_s = _any_set([np.array(l) for l in subdf.label]).tolist()
    label0_s = _any_set(subdf.label_0.to_list())
    label1_s = _any_set(subdf.label_1.to_list())
    label2_s = _any_set(subdf.label_2.to_list())
    return pd.Series({"label": label_s, "label_0": label0_s, "label_1": label1_s, "label_2": label2_s})

# Collapse every repeated head into one row: group_duplicate_heads ORs the labels of the rows
# in each text group, and reset_index turns the group key back into a regular `text` column.
handled_dup_df = all_duplicate_df.groupby("text").apply(group_duplicate_heads).reset_index()

In [30]:
handled_dup_df.head()

Unnamed: 0,text,label,label_0,label_1,label_2
0,PersonX abandons ___ altogether,"[0, 1, 1]",0,1,1
1,PersonX about to get married,"[0, 1, 1]",0,1,1
2,PersonX accepts PersonY thanks,"[0, 1, 1]",0,1,1
3,PersonX accidentally burned,"[0, 1, 1]",0,1,1
4,PersonX accidentally cut,"[0, 1, 1]",0,1,1


In [31]:
len(atomic_df)

43344

In [32]:
# Remove every row whose text occurs more than once (keep=False drops all copies, not just the
# later ones); a later cell concatenates handled_dup_df onto the remaining rows.
atomic_df = atomic_df.drop_duplicates(subset=["text"], keep=False)

In [33]:
len(atomic_df)

39830

In [34]:
# Add the merged rows to the heads that were unique all along.
# NOTE(review): this cell is not idempotent — running it a second time appends handled_dup_df
# again and reintroduces duplicate texts. The index is also not reset (no ignore_index).
atomic_df = pd.concat([atomic_df, handled_dup_df])

In [35]:
len(atomic_df)

41587

In [36]:
atomic_df.duplicated(subset=["text"]).sum()

0

### Create docs out of heads

In [37]:
import spacy
from tqdm import tqdm
from relation_modeling_utils import IGNORE_WORDS, create_vocab, get_doc
from spacy.lang.en.stop_words import STOP_WORDS

def make_docs(data, vocab, include_stopwords=True):
    """Turn every row of ``data`` into a doc annotated with its words, label and rarity score.

    Args:
        data: frame whose rows expose ``text`` and ``label`` attributes via ``itertuples()``.
        vocab: mapping from word to count; it is only queried through ``.get``.
        include_stopwords: when False, tokens whose text is in spaCy's ``STOP_WORDS``
            are left out of a doc's word set.

    Returns:
        List of docs (one per row, in row order) as returned by ``get_doc``. Each doc's
        ``user_data`` holds:
            ``words``          set of lemmas of the kept tokens,
            ``label``          the row's label,
            ``relative_freq``  sum over ``words`` of ``max(vocab count - 1, 0)``.

    NOTE(review): filtering is done on ``token.text`` while the stored words are
    ``token.lemma_``; whether ``vocab`` is keyed by lemmas depends on ``create_vocab`` —
    confirm the two agree.
    """
    pipeline = spacy.load("en_core_web_sm", exclude=["ner"])

    def is_kept(token):
        # Ignored words are always dropped; stop words only when requested.
        if token.text in IGNORE_WORDS:
            return False
        return include_stopwords or token.text not in STOP_WORDS

    annotated = []
    for record in tqdm(data.itertuples(), total=len(data)):
        parsed = get_doc(pipeline, record.text)
        lemmas = {token.lemma_ for token in parsed if is_kept(token)}

        parsed.user_data['words'] = lemmas
        parsed.user_data['label'] = record.label
        # Subtract one per word so that a word's occurrence in this very doc does not count;
        # words missing from vocab contribute 0.
        parsed.user_data['relative_freq'] = sum(
            max(vocab.get(lemma, 0) - 1, 0) for lemma in lemmas
        )
        annotated.append(parsed)

    return annotated

In [75]:
atomic_vocab = create_vocab(atomic_df)

100%|██████████| 41587/41587 [00:00<00:00, 166489.15it/s]


In [76]:
atomic_docs = make_docs(atomic_df, atomic_vocab, include_stopwords=False)

100%|██████████| 41587/41587 [00:00<00:00, 105698.05it/s]


In [77]:
sorted(atomic_vocab.items(), key=lambda i: i[1])[-5:]

[('in', 1440), ('a', 2819), ('to', 2948), ('the', 4474), ("'s", 6003)]

In [78]:
# Bucket the docs by label position: classN_docs holds the docs whose label has a 1 at
# index N-1. A doc with several positive labels lands in several buckets.
class1_docs, class2_docs, class3_docs = (
    [doc for doc in atomic_docs if doc.user_data['label'][position] == 1]
    for position in range(3)
)

In [116]:
# A doc is a candidate for the new test split when its relative_freq is below the cut-off.
FREQUENCY_THRESHOLD = 5
# NOTE(review): class 1 has always been filtered with a literal `< 1` instead of
# FREQUENCY_THRESHOLD. The value is kept (so the resulting split is unchanged) but is now
# named, making the asymmetry visible — confirm it is intentional or set it to
# FREQUENCY_THRESHOLD.
CLASS1_FREQUENCY_THRESHOLD = 1
# At most this many candidate docs are taken per class (first matches in atomic_docs order).
MAX_DOCS_PER_CLASS = 500

class1_freq1_docs = [
    doc for doc in class1_docs
    if doc.user_data['relative_freq'] < CLASS1_FREQUENCY_THRESHOLD
][:MAX_DOCS_PER_CLASS]
class2_freq1_docs = [
    doc for doc in class2_docs
    if doc.user_data['relative_freq'] < FREQUENCY_THRESHOLD
][:MAX_DOCS_PER_CLASS]
class3_freq1_docs = [
    doc for doc in class3_docs
    if doc.user_data['relative_freq'] < FREQUENCY_THRESHOLD
][:MAX_DOCS_PER_CLASS]

In [117]:
len(class1_freq1_docs), len(class2_freq1_docs), len(class3_freq1_docs)

(500, 500, 500)

In [118]:
# Texts of all selected docs, class 1 first, then class 2, then class 3. A doc picked for
# more than one class contributes its text once per class, so entries may repeat.
test_samples = [
    doc.text
    for selected_docs in (class1_freq1_docs, class2_freq1_docs, class3_freq1_docs)
    for doc in selected_docs
]

In [119]:
len(test_samples)

1500

In [120]:
# Route every head to the new test split if its text was selected, otherwise to train.
# The membership test runs once per row of atomic_df, so look texts up in a set instead of
# scanning the test_samples list each time (same result, no quadratic scan).
test_sample_lookup = set(test_samples)
new_train_data, new_test_data = [], []

for row in atomic_df.itertuples():
    destination = new_test_data if row.text in test_sample_lookup else new_train_data
    destination.append((row.text, row.label))

In [121]:
# Wrap the (text, label) tuples of each new split in a two-column frame.
new_train_df, new_test_df = (
    pd.DataFrame(rows, columns=["text", "label"])
    for rows in (new_train_data, new_test_data)
)

In [122]:
len(new_train_df), len(new_test_df)

(40395, 1192)

In [123]:
from relation_modeling_utils import create_vocab
# Vocabularies of the re-split data, built with create_vocab's default settings.
new_train_vocab, new_test_vocab = map(create_vocab, (new_train_df, new_test_df))

100%|██████████| 40395/40395 [00:00<00:00, 177888.02it/s]
100%|██████████| 1192/1192 [00:00<00:00, 238827.28it/s]


### New lexical overlap with stopwords

In [124]:
# Overlap of the new split's vocabularies: (share of new train, share of new test).
new_train_test_overlap = set(new_train_vocab) & set(new_test_vocab)
tuple(
    len(new_train_test_overlap) / len(vocab)
    for vocab in (new_train_vocab, new_test_vocab)
)

(0.04423538831064852, 0.36140637775960754)

### New lexical overlap without stopwords

In [125]:
# Vocabularies of the re-split data with include_stopwords=False (train first, then test).
new_train_vocab_nostp, new_test_vocab_nostp = (
    create_vocab(split_df, include_stopwords=False)
    for split_df in (new_train_df, new_test_df)
)


100%|██████████| 40395/40395 [00:00<00:00, 116597.84it/s]
100%|██████████| 1192/1192 [00:00<00:00, 225359.94it/s]


In [126]:
# Stop-word-free overlap of the new split: (share of new train, share of new test).
new_train_test_overlap_nostp = set(new_train_vocab_nostp) & set(new_test_vocab_nostp)
tuple(
    len(new_train_test_overlap_nostp) / len(vocab)
    for vocab in (new_train_vocab_nostp, new_test_vocab_nostp)
)

(0.034236804564907276, 0.30134529147982064)

### Class distributions

In [127]:
from relation_modeling_utils import explode_labels
# Run explode_labels over both new splits; the next cell reads label_0 / label_1 / label_2
# from the result.
new_train_df, new_test_df = (
    explode_labels(split_df) for split_df in (new_train_df, new_test_df)
)

In [128]:
new_test_df.label_0.value_counts(), new_test_df.label_1.value_counts(), new_test_df.label_2.value_counts()

(0    623
 1    569
 Name: label_0, dtype: int64,
 0    692
 1    500
 Name: label_1, dtype: int64,
 0    654
 1    538
 Name: label_2, dtype: int64)

In [68]:
from relation_modeling_utils import get_class_dist_report

get_class_dist_report(new_test_df)

{('class_0', 0): 0.48237476808905383,
 ('class_0', 'class_0', 0, 0): 0.48237476808905383,
 ('class_0', 'class_0', 0, 1): 0.0,
 ('class_0', 1): 0.5176252319109462,
 ('class_0', 'class_0', 1, 0): 0.0,
 ('class_0', 'class_0', 1, 1): 0.5176252319109462,
 ('class_0', 'class_1', 0, 0): 0.19851576994434136,
 ('class_0', 'class_1', 0, 1): 0.28385899814471244,
 ('class_0', 'class_1', 1, 0): 0.45732838589981445,
 ('class_0', 'class_1', 1, 1): 0.06029684601113173,
 ('class_0', 'class_2', 0, 0): 0.07792207792207792,
 ('class_0', 'class_2', 0, 1): 0.4044526901669759,
 ('class_0', 'class_2', 1, 0): 0.5111317254174397,
 ('class_0', 'class_2', 1, 1): 0.006493506493506494,
 ('class_1', 0): 0.6558441558441559,
 ('class_1', 'class_0', 0, 0): 0.19851576994434136,
 ('class_1', 'class_0', 0, 1): 0.45732838589981445,
 ('class_1', 1): 0.34415584415584416,
 ('class_1', 'class_0', 1, 0): 0.28385899814471244,
 ('class_1', 'class_0', 1, 1): 0.06029684601113173,
 ('class_1', 'class_1', 0, 0): 0.6558441558441559,
 

### Vocabulary info

In [115]:
# Persist the new split. to_csv is called with its defaults, so the frame index is written as
# an unnamed first column.
# NOTE(review): the "n5" in the paths is hard-coded and has to be kept in sync by hand with
# FREQUENCY_THRESHOLD in the selection cell; the target directory is presumably expected to
# exist already — confirm. This cell's execution count (115) precedes the selection cells
# (116+), so the files on disk may not correspond to the state shown in this notebook.
new_train_df.to_csv("data/atomic_ood2/n5/train_n5.csv")
new_test_df.to_csv("data/atomic_ood2/n5/test_n5.csv")

In [100]:
from relation_modeling_utils import load_fdata, create_vocab
from spacy.lang.en.stop_words import STOP_WORDS

def get_vocab_info(dataset_type):
    """Report vocabulary sizes for the train and test CSVs of one dataset variant.

    Args:
        dataset_type: variant name such as ``"n1"``; the files are read from
            ``data/atomic_ood2/<dataset_type>/{train,test}_<dataset_type>.csv``.

    Returns:
        dict with the number of vocabulary entries under ``train`` and ``test``
        (create_vocab defaults) and under ``train_nostp`` and ``test_nostp``
        (``include_stopwords=False``).
    """
    train_frame = load_fdata(f"data/atomic_ood2/{dataset_type}/train_{dataset_type}.csv")
    test_frame = load_fdata(f"data/atomic_ood2/{dataset_type}/test_{dataset_type}.csv")

    # Fill in the report in the same order the vocabularies are built:
    # default vocabularies first, then the stop-word-free ones.
    sizes = {}
    sizes['train'] = len(create_vocab(train_frame))
    sizes['test'] = len(create_vocab(test_frame))
    sizes['train_nostp'] = len(create_vocab(train_frame, include_stopwords=False))
    sizes['test_nostp'] = len(create_vocab(test_frame, include_stopwords=False))
    return sizes

In [101]:
get_vocab_info("n1")

100%|██████████| 40777/40777 [00:00<00:00, 193758.42it/s]
100%|██████████| 810/810 [00:00<00:00, 206779.44it/s]
100%|██████████| 40777/40777 [00:00<00:00, 164638.55it/s]
100%|██████████| 810/810 [00:00<00:00, 244662.70it/s]


{'train': 10036, 'test': 825, 'train_nostp': 9857, 'test_nostp': 736}

In [6]:
# NOTE(review): the output recorded for this cell (execution count 6) contains 'val' and
# 'val_nostp' keys, which the get_vocab_info defined in this notebook never returns — it
# comes from a different version of the function. Re-run to refresh it.
get_vocab_info("n3")

100%|██████████| 150325/150325 [00:00<00:00, 702886.36it/s]
100%|██████████| 16539/16539 [00:00<00:00, 972093.91it/s]
100%|██████████| 2561/2561 [00:00<00:00, 785263.00it/s]


{'train': 9706,
 'val': 2933,
 'test': 1096,
 'train_nostp': 9488,
 'val_nostp': 2775,
 'test_nostp': 996}

In [None]:
get_vocab_info("n5")