#### These scripts create token mappings that mismatch the original BERT vocab file

In [None]:
from vocab_mismatch_utils import *
from data_formatter_utils import *
from datasets import DatasetDict
from datasets import Dataset
from datasets import load_dataset
import transformers
import pandas as pd
import operator
from collections import OrderedDict
from tqdm import tqdm, trange

import collections
import os
import unicodedata
from typing import List, Optional, Tuple

from transformers.tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
from transformers.utils import logging
import torch
logger = logging.get_logger(__name__)
import numpy as np
import copy
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer() 
from word_forms.word_forms import get_word_forms

# Seed PyTorch, NumPy and Python's `random` (in that order) with one shared
# value at the start of the notebook.
seed = 42
for seed_everything in (torch.manual_seed, np.random.seed, random.seed):
    seed_everything(seed)

In [251]:
# Tokenizer built from the BERT vocab file; per the original note it is used
# to get the word-piece length of each token.
# NOTE(review): ModifiedBertTokenizer / ModifiedBasicTokenizer are not defined
# in this notebook -- presumably they come from the star imports; confirm.
modified_tokenizer = ModifiedBertTokenizer(
    vocab_file="../data-files/bert_vocab.txt")
# Basic tokenizer created with its default arguments; this is the tokenizer
# handed to corrupt_translator in the cells below.
modified_basic_tokenizer = ModifiedBasicTokenizer()

# translator to try it out!
def corrupt_translator(in_string, tokenizer, vocab_match):
    """Translate a string token by token through a vocab mismatch table.

    Args:
        in_string: Text to translate.
        tokenizer: Object whose ``tokenize(in_string)`` returns the tokens.
        vocab_match: Mapping from each token to its replacement; a token
            without an entry raises ``KeyError``.

    Returns:
        The replacement tokens joined by single spaces, with every " ##"
        removed (so "##" continuation pieces attach to the previous token)
        and surrounding whitespace stripped.
    """
    replaced = []
    for token in tokenizer.tokenize(in_string):
        replaced.append(vocab_match[token])
    joined = " ".join(replaced)
    return joined.replace(" ##", "").strip()

### Get mismatched vocab for each dataset first!

**SST-3**: Random Corrupt with token piece length control only

In [None]:
# Load the three SST splits with ternary labels. Only the training call leaves
# `include_subtrees` at its default; dev and test pass include_subtrees=False.
sst_train_ternary = process_sst(
    os.path.join(sst_dirname, "train.txt"), full_ternary_class_func)
sst_dev_ternary = process_sst(
    os.path.join(sst_dirname, "dev.txt"), full_ternary_class_func, include_subtrees=False)
sst_test_ternary = process_sst(
    os.path.join(sst_dirname, "test.txt"), full_ternary_class_func, include_subtrees=False)

# Persist the uncorrupted ternary splits as TSV files (train, dev, test in that order).
sst_ternary_dir = os.path.join(external_output_dirname, "sst-tenary")
for split_name, split_data in (("train", sst_train_ternary),
                               ("dev", sst_dev_ternary),
                               ("test", sst_test_ternary)):
    write_tsv(split_data,
              output_filename=os.path.join(sst_ternary_dir, "sst-tenary-" + split_name + ".tsv"))

In [None]:
# Reload the ternary SST TSV files and wrap each split as a `Dataset`.
sst_tsv_dir = os.path.join(external_output_dirname, "sst-tenary")
train_df, eval_df, test_df = (
    Dataset.from_pandas(
        pd.read_csv(os.path.join(sst_tsv_dir, "sst-tenary-" + split + ".tsv"), delimiter="\t"))
    for split in ("train", "dev", "test")
)

# Reset the accumulator before mapping; it must be overwritten for every new dataset.
token_by_length = {}
train_df = train_df.map(token_length_mapping)
eval_df = eval_df.map(token_length_mapping)
test_df = test_df.map(token_length_mapping)

# Build the token mismatch table from `token_by_length`.
# The result can also be saved to avoid reprocessing it.
vocab_match_by_piece_length = generate_vocab_match(token_by_length)

In [239]:
# Quality control for the length-controlled mapping.
# 1) Share of tokens that map to themselves. This ratio used to be a bare
#    expression in the middle of the cell, so its value was computed and then
#    discarded without ever being displayed; print it explicitly instead.
count = sum(1 for k, v in vocab_match_by_piece_length.items() if k == v)
print("identity-mapping ratio:", count / len(vocab_match_by_piece_length))
# 2) The mapping must have as many distinct targets as it has (unique) keys.
assert len(vocab_match_by_piece_length) == \
            len(set(vocab_match_by_piece_length.values()))

In [246]:
# Try the length-controlled mapping on a sample sentence; as the last
# expression of the cell, the returned string is displayed.
corrupt_translator("this movie is great!", modified_basic_tokenizer, vocab_match_by_piece_length)

'breed fashion toast decidedly received'

**SST-3**: Random Corrupt with token piece length, frequency, and lemma control

In [None]:
# Collect token frequencies over all three splits. The counter is reset first
# and must be overwritten for every new dataset.
token_frequency_map = {}
train_df = train_df.map(token_frequency_mapping)
eval_df = eval_df.map(token_frequency_mapping)
test_df = test_df.map(token_frequency_mapping)

# Re-order the entries by value, largest first, and keep that order in an OrderedDict.
token_frequency_map = OrderedDict(
    sorted(token_frequency_map.items(), key=operator.itemgetter(1), reverse=True))

# Lemma lookup, used to prevent matching tokens that share the same lemma.
token_lemma_map = token_lemma_mapping(vocab_match_by_piece_length)

vocab_match_by_piece_length_frequency_lemma = generate_vocab_match(
    token_by_length, token_frequency_map, token_lemma_map)

In [237]:
# Quality control for the length + frequency + lemma mapping.
# 1) Share of tokens that map to themselves. This ratio used to be a bare
#    expression in the middle of the cell, so its value was computed and then
#    discarded without ever being displayed; print it explicitly instead.
count = sum(1 for k, v in vocab_match_by_piece_length_frequency_lemma.items() if k == v)
print("identity-mapping ratio:", count / len(vocab_match_by_piece_length_frequency_lemma))
# 2) The mapping must have as many distinct targets as it has (unique) keys.
assert len(vocab_match_by_piece_length_frequency_lemma) == \
            len(set(vocab_match_by_piece_length_frequency_lemma.values()))

In [250]:
# Try the length + frequency + lemma mapping on the same sample sentence; the
# returned string is displayed as the cell result.
corrupt_translator("this movie is great!", modified_basic_tokenizer, vocab_match_by_piece_length_frequency_lemma)

't you film re cold'

**MNLI**: Random Corrupt with token piece length control only

In [238]:
# TODOs

### Now, let us corrupt all datasets!

#### Corrupted SST-3 
by token piece length only

In [252]:
def random_corrupt(example):
    """Overwrite ``example['text']`` with its length-controlled corruption.

    The sentence is passed through ``corrupt_translator`` together with the
    notebook-level ``modified_basic_tokenizer`` and
    ``vocab_match_by_piece_length``. The same ``example`` object is returned.
    """
    example['text'] = corrupt_translator(
        example['text'], modified_basic_tokenizer, vocab_match_by_piece_length)
    return example

# Apply the length-only corruption to train, dev and test (in that order),
# then bundle the three splits and save them to disk.
corrupted_train_dataset, corrupted_validation_dataset, corrupted_test_dataset = (
    split.map(random_corrupt) for split in (train_df, eval_df, test_df)
)

corrupted_datasets = DatasetDict({
    "train": corrupted_train_dataset,
    "validation": corrupted_validation_dataset,
    "test": corrupted_test_dataset,
})
corrupted_datasets.save_to_disk("../data-files/sst-tenary-corrupted-length")

HBox(children=(FloatProgress(value=0.0, max=159274.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1100.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2210.0), HTML(value='')))




#### Corrupted SST-3 
by frequency as well

In [253]:
def random_corrupt_new(example):
    """Overwrite ``example['text']`` using the frequency/lemma-aware mapping.

    The sentence is passed through ``corrupt_translator`` together with the
    notebook-level ``modified_basic_tokenizer`` and
    ``vocab_match_by_piece_length_frequency_lemma``. The same ``example``
    object is returned.
    """
    example['text'] = corrupt_translator(
        example['text'],
        modified_basic_tokenizer,
        vocab_match_by_piece_length_frequency_lemma)
    return example

# Apply the frequency/lemma-aware corruption to train, dev and test (in that
# order), then bundle the three splits and save them to disk.
corrupted_train_dataset, corrupted_validation_dataset, corrupted_test_dataset = (
    split.map(random_corrupt_new) for split in (train_df, eval_df, test_df)
)

corrupted_datasets = DatasetDict({
    "train": corrupted_train_dataset,
    "validation": corrupted_validation_dataset,
    "test": corrupted_test_dataset,
})
corrupted_datasets.save_to_disk("../data-files/sst-tenary-corrupted-freq")

HBox(children=(FloatProgress(value=0.0, max=159274.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1100.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2210.0), HTML(value='')))




In [254]:
# TODO: update the rest!

#### Corrupted MNLI

In [None]:
# Label-name to label-id table; not referenced by process_mnli itself.
mnli_map = {'entailment': 0, 'neutral' : 1, 'contradiction' : 2}
def process_mnli(dataset, split):
    """Collect the labelled MNLI examples of one split as plain dicts.

    Args:
        dataset: Mapping from split name to an iterable of examples, each
            providing 'premise', 'hypothesis' and 'label'.
        split: Name of the split to read.

    Returns:
        A list of ``{"premise", "hypothesis", "label"}`` dicts containing only
        the examples whose label is 0, 1 or 2.
    """
    records = (
        {"premise": example['premise'],
         "hypothesis": example['hypothesis'],
         "label": example['label']}
        for example in dataset[split]
    )
    # The append used to sit outside the label check, so an example with any
    # other label re-appended the previous record (or raised NameError when it
    # came first). Such examples are now skipped, as in process_snli.
    return [record for record in records if record["label"] in (0, 1, 2)]

def mnli_write_tsv(*datasets, output_filename):
    """Merge MNLI example lists, shuffle them and write one TSV file.

    Args:
        *datasets: Iterables of dicts with the keys 'premise', 'hypothesis'
            and 'label'.
        output_filename: Path of the TSV file to create or overwrite.
    """
    all_data = [row for dataset in datasets for row in dataset]
    # Shuffling draws from the global `random` state.
    random.shuffle(all_data)
    # newline="" is what the csv module requires of files it writes to
    # (otherwise platforms that translate "\n" emit "\r\r\n"); utf-8 matches
    # the default encoding pandas.read_csv uses when the file is read back.
    with open(output_filename, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, delimiter="\t", fieldnames=['premise', 'hypothesis', 'label'])
        writer.writeheader()
        writer.writerows(all_data)
        
def mnli_random_corrupt(example):
    """Swap the tokens of an MNLI premise/hypothesis pair via ``token_mapping``.

    Both texts are tokenized with ``original_tokenizer``. Every token that has
    an entry in ``token_mapping`` is replaced by its mapped token, every other
    token is kept, and the token lists are turned back into strings with
    ``convert_tokens_to_string``.

    NOTE(review): neither ``original_tokenizer`` nor ``token_mapping`` is
    defined in the visible part of this notebook -- confirm both exist before
    running this cell.

    Args:
        example: Mapping with 'premise' and 'hypothesis' entries. A hypothesis
            of None is treated as an empty string.

    Returns:
        The same ``example`` object with both text fields overwritten.

    Raises:
        Exception: Whatever the tokenizer raised, re-raised after the
            offending texts have been printed.
    """
    premise_text = example['premise']
    hypothesis_text = example['hypothesis']
    if hypothesis_text is None:
        hypothesis_text = ""
    try:
        premise_tokens = original_tokenizer.tokenize(premise_text)
        hypothesis_tokens = original_tokenizer.tokenize(hypothesis_text)
    except Exception:
        print("Please debug these sequence...")
        print(premise_text)
        print(hypothesis_text)
        # A bare `except` used to swallow the error here and execution then
        # failed further down on the unbound token lists; re-raise so the
        # tokenizer's own error is the one that surfaces.
        raise

    # Tokens without a mapping entry pass through unchanged.
    corrupted_premise_tokens = [token_mapping.get(tok, tok) for tok in premise_tokens]
    corrupted_hypothesis_tokens = [token_mapping.get(tok, tok) for tok in hypothesis_tokens]

    example['premise'] = original_tokenizer.convert_tokens_to_string(corrupted_premise_tokens)
    example['hypothesis'] = original_tokenizer.convert_tokens_to_string(corrupted_hypothesis_tokens)
    return example

In [None]:
# Load MNLI via `load_dataset` with the 'glue' / 'mnli' arguments.
dataset = load_dataset('glue', 'mnli')

In [None]:
# Convert the "train" and "validation_matched" splits into lists of plain dicts.
mnli_train = process_mnli(dataset, "train")
mnli_validation_matched = process_mnli(dataset, "validation_matched")

In [None]:
# Write each processed MNLI split to its own TSV file inside the "mnli" folder.
mnli_out_dir = os.path.join(external_output_dirname, "mnli")
mnli_write_tsv(mnli_train,
               output_filename=os.path.join(mnli_out_dir, "mnli-train.tsv"))
mnli_write_tsv(mnli_validation_matched,
               output_filename=os.path.join(mnli_out_dir, "mnli-dev.tsv"))

In [None]:
# Reload the MNLI TSV files and wrap both splits as `Dataset` objects.
mnli_tsv_dir = os.path.join(external_output_dirname, "mnli")
train_df, eval_df = (
    Dataset.from_pandas(
        pd.read_csv(os.path.join(mnli_tsv_dir, "mnli-" + split + ".tsv"), delimiter="\t"))
    for split in ("train", "dev")
)

# Corrupt every example, then bundle both splits and save them to disk.
corrupted_train_dataset = train_df.map(mnli_random_corrupt)
corrupted_validation_dataset = eval_df.map(mnli_random_corrupt)

corrupted_datasets = DatasetDict({
    "train": corrupted_train_dataset,
    "validation": corrupted_validation_dataset,
})
corrupted_datasets.save_to_disk("../data-files/mnli-corrupted")

In [None]:
# Display the corrupted validation example at index 1 as a quick check.
corrupted_validation_dataset[1]

#### Corrupted CoLA

In [None]:
def process_cola(dataset, split):
    """Return every example of one CoLA split as a ``{"sentence", "label"}`` dict.

    Args:
        dataset: Mapping from split name to an iterable of examples, each
            providing 'sentence' and 'label'.
        split: Name of the split to read.

    Returns:
        A list with one dict per example, in the original order. No example
        is filtered out.
    """
    return [
        {"sentence": row['sentence'], "label": row['label']}
        for row in dataset[split]
    ]

def cola_write_tsv(*datasets, output_filename):
    """Merge CoLA example lists, shuffle them and write one TSV file.

    Args:
        *datasets: Iterables of dicts with the keys 'sentence' and 'label'.
        output_filename: Path of the TSV file to create or overwrite.
    """
    all_data = [row for dataset in datasets for row in dataset]
    # Shuffling draws from the global `random` state.
    random.shuffle(all_data)
    # newline="" is what the csv module requires of files it writes to
    # (otherwise platforms that translate "\n" emit "\r\r\n"); utf-8 matches
    # the default encoding pandas.read_csv uses when the file is read back.
    with open(output_filename, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, delimiter="\t", fieldnames=['sentence', 'label'])
        writer.writeheader()
        writer.writerows(all_data)
        
def cola_random_corrupt(example):
    """Swap the tokens of a CoLA sentence via ``token_mapping``.

    The sentence is tokenized with ``original_tokenizer``; tokens that have
    an entry in ``token_mapping`` are replaced, all others are kept, and the
    result is turned back into a string with ``convert_tokens_to_string``.

    NOTE(review): neither ``original_tokenizer`` nor ``token_mapping`` is
    defined in the visible part of this notebook -- confirm both exist.

    Returns:
        The same ``example`` object with 'sentence' overwritten.
    """
    swapped = [
        token_mapping[token] if token in token_mapping.keys() else token
        for token in original_tokenizer.tokenize(example['sentence'])
    ]
    example['sentence'] = original_tokenizer.convert_tokens_to_string(swapped)
    return example

In [None]:
# Load CoLA via `load_dataset` with the 'glue' / 'cola' arguments
# (this rebinds the `dataset` name used for MNLI above).
dataset = load_dataset('glue', 'cola')

In [None]:
# Convert the train / validation / test splits into lists of plain dicts.
cola_train = process_cola(dataset, "train")
cola_validation = process_cola(dataset, "validation")
cola_test = process_cola(dataset, "test")

In [None]:
# Write each processed CoLA split to its own TSV file inside the "cola" folder
# (train, dev, test in that order).
cola_out_dir = os.path.join(external_output_dirname, "cola")
for split_rows, file_name in ((cola_train, "cola-train.tsv"),
                              (cola_validation, "cola-dev.tsv"),
                              (cola_test, "cola-test.tsv")):
    cola_write_tsv(split_rows, output_filename=os.path.join(cola_out_dir, file_name))

In [None]:
# Reload the CoLA TSV files and wrap each split as a `Dataset`.
cola_tsv_dir = os.path.join(external_output_dirname, "cola")
train_df, eval_df, test_df = (
    Dataset.from_pandas(
        pd.read_csv(os.path.join(cola_tsv_dir, "cola-" + split + ".tsv"), delimiter="\t"))
    for split in ("train", "dev", "test")
)

# Corrupt every example (train, dev, test in that order), then bundle and save.
corrupted_train_dataset, corrupted_validation_dataset, corrupted_test_dataset = (
    split.map(cola_random_corrupt) for split in (train_df, eval_df, test_df)
)

corrupted_datasets = DatasetDict({
    "train": corrupted_train_dataset,
    "validation": corrupted_validation_dataset,
    "test": corrupted_test_dataset,
})
corrupted_datasets.save_to_disk("../data-files/cola-corrupted")

#### Corrupted SNLI

In [None]:
# Label-name to label-id table; not referenced by process_snli itself.
mnli_map = {'entailment': 0, 'neutral' : 1, 'contradiction' : 2}
def process_snli(dataset, split):
    """Collect the labelled SNLI examples of one split as plain dicts.

    Args:
        dataset: Mapping from split name to an iterable of examples, each
            providing 'premise', 'hypothesis' and 'label'.
        split: Name of the split to read.

    Returns:
        A list of ``{"premise", "hypothesis", "label"}`` dicts containing only
        the examples whose label is 0, 1 or 2.
    """
    kept = []
    for row in dataset[split]:
        record = {"premise": row['premise'],
                  "hypothesis": row['hypothesis'],
                  "label": row['label']}
        if record["label"] not in (0, 1, 2):
            continue
        kept.append(record)
    return kept

def snli_write_tsv(*datasets, output_filename):
    """Merge SNLI example lists, shuffle them and write one TSV file.

    Args:
        *datasets: Iterables of dicts with the keys 'premise', 'hypothesis'
            and 'label'.
        output_filename: Path of the TSV file to create or overwrite.
    """
    all_data = [row for dataset in datasets for row in dataset]
    # Shuffling draws from the global `random` state.
    random.shuffle(all_data)
    # newline="" is what the csv module requires of files it writes to
    # (otherwise platforms that translate "\n" emit "\r\r\n"); utf-8 matches
    # the default encoding pandas.read_csv uses when the file is read back.
    with open(output_filename, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, delimiter="\t", fieldnames=['premise', 'hypothesis', 'label'])
        writer.writeheader()
        writer.writerows(all_data)
        
def snli_random_corrupt(example):
    """Swap the tokens of an SNLI premise/hypothesis pair via ``token_mapping``.

    Both texts are tokenized with ``original_tokenizer``. Every token that has
    an entry in ``token_mapping`` is replaced by its mapped token, every other
    token is kept, and the token lists are turned back into strings with
    ``convert_tokens_to_string``.

    NOTE(review): neither ``original_tokenizer`` nor ``token_mapping`` is
    defined in the visible part of this notebook -- confirm both exist before
    running this cell.

    Args:
        example: Mapping with 'premise' and 'hypothesis' entries. A hypothesis
            of None is treated as an empty string.

    Returns:
        The same ``example`` object with both text fields overwritten.

    Raises:
        Exception: Whatever the tokenizer raised, re-raised after the
            offending texts have been printed.
    """
    premise_text = example['premise']
    hypothesis_text = example['hypothesis']
    if hypothesis_text is None:
        hypothesis_text = ""
    try:
        premise_tokens = original_tokenizer.tokenize(premise_text)
        hypothesis_tokens = original_tokenizer.tokenize(hypothesis_text)
    except Exception:
        print("Please debug these sequence...")
        print(premise_text)
        print(hypothesis_text)
        # A bare `except` used to swallow the error here and execution then
        # failed further down on the unbound token lists; re-raise so the
        # tokenizer's own error is the one that surfaces.
        raise

    # Tokens without a mapping entry pass through unchanged.
    corrupted_premise_tokens = [token_mapping.get(tok, tok) for tok in premise_tokens]
    corrupted_hypothesis_tokens = [token_mapping.get(tok, tok) for tok in hypothesis_tokens]

    example['premise'] = original_tokenizer.convert_tokens_to_string(corrupted_premise_tokens)
    example['hypothesis'] = original_tokenizer.convert_tokens_to_string(corrupted_hypothesis_tokens)
    return example

In [None]:
# Load SNLI via `load_dataset` (this rebinds the `dataset` name used above).
dataset = load_dataset('snli')

In [None]:
# Convert the train / validation / test splits into lists of plain dicts,
# keeping only the examples that process_snli accepts.
snli_train = process_snli(dataset, "train")
snli_validation = process_snli(dataset, "validation")
snli_test = process_snli(dataset, "test")

In [None]:
# Write each processed SNLI split to its own TSV file inside the "snli" folder
# (train, dev, test in that order).
snli_out_dir = os.path.join(external_output_dirname, "snli")
for split_rows, file_name in ((snli_train, "snli-train.tsv"),
                              (snli_validation, "snli-dev.tsv"),
                              (snli_test, "snli-test.tsv")):
    snli_write_tsv(split_rows, output_filename=os.path.join(snli_out_dir, file_name))

In [None]:
# Reload the SNLI TSV files and wrap each split as a `Dataset`.
snli_tsv_dir = os.path.join(external_output_dirname, "snli")
train_df, eval_df, test_df = (
    Dataset.from_pandas(
        pd.read_csv(os.path.join(snli_tsv_dir, "snli-" + split + ".tsv"), delimiter="\t"))
    for split in ("train", "dev", "test")
)

# Corrupt every example (train, dev, test in that order), then bundle and save.
corrupted_train_dataset, corrupted_validation_dataset, corrupted_test_dataset = (
    split.map(snli_random_corrupt) for split in (train_df, eval_df, test_df)
)

corrupted_datasets = DatasetDict({
    "train": corrupted_train_dataset,
    "validation": corrupted_validation_dataset,
    "test": corrupted_test_dataset,
})
corrupted_datasets.save_to_disk("../data-files/snli-corrupted")

#### Corrupted MRPC

In [None]:
def process_mrpc(dataset, split):
    """Collect the labelled MRPC examples of one split as plain dicts.

    Args:
        dataset: Mapping from split name to an iterable of examples, each
            providing 'sentence1', 'sentence2' and 'label'.
        split: Name of the split to read.

    Returns:
        A list of ``{"sentence1", "sentence2", "label"}`` dicts containing
        only the examples whose label is 0 or 1; all others are skipped.
    """
    records = (
        {"sentence1": row['sentence1'],
         "sentence2": row['sentence2'],
         "label": row['label']}
        for row in dataset[split]
    )
    return [record for record in records if record["label"] in (0, 1)]

def mrpc_write_tsv(*datasets, output_filename):
    """Merge MRPC example lists, shuffle them and write one TSV file.

    Args:
        *datasets: Iterables of dicts with the keys 'sentence1', 'sentence2'
            and 'label'.
        output_filename: Path of the TSV file to create or overwrite.
    """
    all_data = [row for dataset in datasets for row in dataset]
    # Shuffling draws from the global `random` state.
    random.shuffle(all_data)
    # newline="" is what the csv module requires of files it writes to
    # (otherwise platforms that translate "\n" emit "\r\r\n"); utf-8 matches
    # the default encoding pandas.read_csv uses when the file is read back.
    with open(output_filename, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, delimiter="\t", fieldnames=['sentence1', 'sentence2', 'label'])
        writer.writeheader()
        writer.writerows(all_data)
        
def mrpc_random_corrupt(example):
    """Swap the tokens of an MRPC sentence pair via ``token_mapping``.

    Both sentences are tokenized with ``original_tokenizer``. Every token that
    has an entry in ``token_mapping`` is replaced by its mapped token, every
    other token is kept, and the token lists are turned back into strings
    with ``convert_tokens_to_string``.

    NOTE(review): neither ``original_tokenizer`` nor ``token_mapping`` is
    defined in the visible part of this notebook -- confirm both exist before
    running this cell.

    Args:
        example: Mapping with 'sentence1' and 'sentence2' entries. A
            'sentence2' of None is treated as an empty string.

    Returns:
        The same ``example`` object with both text fields overwritten.

    Raises:
        Exception: Whatever the tokenizer raised, re-raised after the
            offending texts have been printed.
    """
    first_text = example['sentence1']
    second_text = example['sentence2']
    if second_text is None:
        second_text = ""
    try:
        first_tokens = original_tokenizer.tokenize(first_text)
        second_tokens = original_tokenizer.tokenize(second_text)
    except Exception:
        print("Please debug these sequence...")
        print(first_text)
        print(second_text)
        # A bare `except` used to swallow the error here and execution then
        # failed further down on the unbound token lists; re-raise so the
        # tokenizer's own error is the one that surfaces.
        raise

    # Tokens without a mapping entry pass through unchanged.
    corrupted_first_tokens = [token_mapping.get(tok, tok) for tok in first_tokens]
    corrupted_second_tokens = [token_mapping.get(tok, tok) for tok in second_tokens]

    example['sentence1'] = original_tokenizer.convert_tokens_to_string(corrupted_first_tokens)
    example['sentence2'] = original_tokenizer.convert_tokens_to_string(corrupted_second_tokens)
    return example

In [None]:
# Load MRPC via `load_dataset` with the 'glue' / 'mrpc' arguments.
mrpc_dataset = load_dataset('glue', 'mrpc')

In [None]:
# Convert the train / validation / test splits into lists of plain dicts,
# keeping only the examples that process_mrpc accepts.
mrpc_train = process_mrpc(mrpc_dataset, "train")
mrpc_validation = process_mrpc(mrpc_dataset, "validation")
mrpc_test = process_mrpc(mrpc_dataset, "test")

In [None]:
# Write each processed MRPC split to its own TSV file inside the "mrpc" folder
# (train, dev, test in that order).
mrpc_out_dir = os.path.join(external_output_dirname, "mrpc")
for split_rows, file_name in ((mrpc_train, "mrpc-train.tsv"),
                              (mrpc_validation, "mrpc-dev.tsv"),
                              (mrpc_test, "mrpc-test.tsv")):
    mrpc_write_tsv(split_rows, output_filename=os.path.join(mrpc_out_dir, file_name))

In [None]:
# Reload the MRPC TSV files and wrap each split as a `Dataset`.
mrpc_tsv_dir = os.path.join(external_output_dirname, "mrpc")
train_df, eval_df, test_df = (
    Dataset.from_pandas(
        pd.read_csv(os.path.join(mrpc_tsv_dir, "mrpc-" + split + ".tsv"), delimiter="\t"))
    for split in ("train", "dev", "test")
)

# Corrupt every example (train, dev, test in that order), then bundle and save.
corrupted_train_dataset, corrupted_validation_dataset, corrupted_test_dataset = (
    split.map(mrpc_random_corrupt) for split in (train_df, eval_df, test_df)
)

corrupted_datasets = DatasetDict({
    "train": corrupted_train_dataset,
    "validation": corrupted_validation_dataset,
    "test": corrupted_test_dataset,
})
corrupted_datasets.save_to_disk("../data-files/mrpc-corrupted")

#### Corrupted QNLI

In [None]:
def process_qnli(dataset, split):
    """Collect the labelled QNLI examples of one split as plain dicts.

    Args:
        dataset: Mapping from split name to an iterable of examples, each
            providing 'question', 'sentence' and 'label'.
        split: Name of the split to read.

    Returns:
        A list of ``{"question", "sentence", "label"}`` dicts containing only
        the examples whose label is 0 or 1; all others are skipped.
    """
    kept = []
    for row in dataset[split]:
        record = {"question": row['question'],
                  "sentence": row['sentence'],
                  "label": row['label']}
        if record["label"] in (0, 1):
            kept.append(record)
    return kept

def qnli_write_tsv(*datasets, output_filename):
    """Merge QNLI example lists, shuffle them and write one TSV file.

    Args:
        *datasets: Iterables of dicts with the keys 'question', 'sentence'
            and 'label'.
        output_filename: Path of the TSV file to create or overwrite.
    """
    all_data = [row for dataset in datasets for row in dataset]
    # Shuffling draws from the global `random` state.
    random.shuffle(all_data)
    # newline="" is what the csv module requires of files it writes to
    # (otherwise platforms that translate "\n" emit "\r\r\n"); utf-8 matches
    # the default encoding pandas.read_csv uses when the file is read back.
    with open(output_filename, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, delimiter="\t", fieldnames=['question', 'sentence', 'label'])
        writer.writeheader()
        writer.writerows(all_data)
        
def qnli_random_corrupt(example):
    """Swap the tokens of a QNLI question/sentence pair via ``token_mapping``.

    Both texts are tokenized with ``original_tokenizer``. Every token that has
    an entry in ``token_mapping`` is replaced by its mapped token, every other
    token is kept, and the token lists are turned back into strings with
    ``convert_tokens_to_string``.

    NOTE(review): neither ``original_tokenizer`` nor ``token_mapping`` is
    defined in the visible part of this notebook -- confirm both exist before
    running this cell.

    Args:
        example: Mapping with 'question' and 'sentence' entries. A 'sentence'
            of None is treated as an empty string.

    Returns:
        The same ``example`` object with both text fields overwritten.

    Raises:
        Exception: Whatever the tokenizer raised, re-raised after the
            offending texts have been printed.
    """
    question_text = example['question']
    sentence_text = example['sentence']
    if sentence_text is None:
        sentence_text = ""
    try:
        question_tokens = original_tokenizer.tokenize(question_text)
        sentence_tokens = original_tokenizer.tokenize(sentence_text)
    except Exception:
        print("Please debug these sequence...")
        print(question_text)
        print(sentence_text)
        # A bare `except` used to swallow the error here and execution then
        # failed further down on the unbound token lists; re-raise so the
        # tokenizer's own error is the one that surfaces.
        raise

    # Tokens without a mapping entry pass through unchanged.
    corrupted_question_tokens = [token_mapping.get(tok, tok) for tok in question_tokens]
    corrupted_sentence_tokens = [token_mapping.get(tok, tok) for tok in sentence_tokens]

    example['question'] = original_tokenizer.convert_tokens_to_string(corrupted_question_tokens)
    example['sentence'] = original_tokenizer.convert_tokens_to_string(corrupted_sentence_tokens)
    return example

In [None]:
# Load QNLI via `load_dataset` with the 'glue' / 'qnli' arguments.
qnli_dataset = load_dataset('glue', 'qnli')

In [None]:
# Convert the train / validation / test splits into lists of plain dicts,
# keeping only the examples that process_qnli accepts.
qnli_train = process_qnli(qnli_dataset, "train")
qnli_validation = process_qnli(qnli_dataset, "validation")
qnli_test = process_qnli(qnli_dataset, "test")

In [None]:
# Write each processed QNLI split to its own TSV file inside the "qnli" folder
# (train, dev, test in that order).
qnli_out_dir = os.path.join(external_output_dirname, "qnli")
for split_rows, file_name in ((qnli_train, "qnli-train.tsv"),
                              (qnli_validation, "qnli-dev.tsv"),
                              (qnli_test, "qnli-test.tsv")):
    qnli_write_tsv(split_rows, output_filename=os.path.join(qnli_out_dir, file_name))

In [None]:
# Reload the QNLI TSV files and wrap each split as a `Dataset`.
qnli_tsv_dir = os.path.join(external_output_dirname, "qnli")
train_df, eval_df, test_df = (
    Dataset.from_pandas(
        pd.read_csv(os.path.join(qnli_tsv_dir, "qnli-" + split + ".tsv"), delimiter="\t"))
    for split in ("train", "dev", "test")
)

# Corrupt every example (train, dev, test in that order), then bundle and save.
corrupted_train_dataset, corrupted_validation_dataset, corrupted_test_dataset = (
    split.map(qnli_random_corrupt) for split in (train_df, eval_df, test_df)
)

corrupted_datasets = DatasetDict({
    "train": corrupted_train_dataset,
    "validation": corrupted_validation_dataset,
    "test": corrupted_test_dataset,
})
corrupted_datasets.save_to_disk("../data-files/qnli-corrupted")