#### these scripts will mismatch original BERT vocab file

In [2]:
from vocab_mismatch_utils import *
from data_formatter_utils import *
from datasets import DatasetDict
from datasets import Dataset
from datasets import load_dataset
import transformers
import pandas as pd

In [26]:
original_vocab = load_bert_vocab("../data-files/bert_vocab.txt")
corrupted_vocab = load_bert_vocab("../data-files/bert_vocab_mismatch.txt")

original_tokens = []
corrupted_tokens = []
for k, v in original_vocab.items():
    original_tokens.append(k)
for k, v in corrupted_vocab.items():
    corrupted_tokens.append(k)

token_mapping = dict(zip(original_tokens, corrupted_tokens))

In [27]:
original_tokenizer = transformers.BertTokenizer(
    vocab_file="../data-files/bert_vocab.txt")

def random_corrupt(example):
    original_sentence = example['text']
    original_tokens = original_tokenizer.tokenize(original_sentence)
    corrupted_tokens = []
    for ori_t in original_tokens:
        if ori_t in token_mapping.keys():
            cor_t = token_mapping[ori_t]
        else:
            cor_t = ori_t
        corrupted_tokens.append(cor_t)
    example['text'] = original_tokenizer.convert_tokens_to_string(corrupted_tokens)
    return example

#### Corrupted WikiText-103

In [None]:
datasets = DatasetDict.load_from_disk("../data-files/wikitext-15M")
train_dataset = datasets["train"]
validation_dataset = datasets["validation"]
test_dataset = datasets["test"]

the following lines will corrupt the actual dataset, which takes long

In [None]:
corrupted_train_dataset = train_dataset.map(random_corrupt)
corrupted_validation_dataset = validation_dataset.map(random_corrupt)
corrupted_test_dataset = test_dataset.map(random_corrupt)

In [None]:
# save the corrupted datasets for pretraining!
datasets["train"] = corrupted_train_dataset
datasets["validation"] = corrupted_validation_dataset
datasets["test"] = corrupted_test_dataset
datasets.save_to_disk("../data-files/wikitext-15M-corrupted")

#### Corrupted SST3

In [4]:
sst_train_ternary = process_sst(os.path.join(sst_dirname, "train.txt"),
                                full_ternary_class_func)
sst_dev_ternary = process_sst(os.path.join(sst_dirname, "dev.txt"),
                              full_ternary_class_func,
                              include_subtrees=False)
sst_test_ternary = process_sst(os.path.join(sst_dirname, "test.txt"),
                               full_ternary_class_func,
                               include_subtrees=False)

In [26]:
# these are the original processed sst-tenary data
write_tsv(sst_train_ternary, output_filename=os.path.join(external_output_dirname, "sst-tenary", "sst-tenary-train.tsv"))
write_tsv(sst_dev_ternary, output_filename=os.path.join(external_output_dirname, "sst-tenary", "sst-tenary-dev.tsv"))
write_tsv(sst_test_ternary, output_filename=os.path.join(external_output_dirname, "sst-tenary", "sst-tenary-test.tsv"))

In [6]:
# let us corrupt SST3 in the same way as before
train_df = pd.read_csv(os.path.join(external_output_dirname, "sst-tenary", "sst-tenary-train.tsv"), delimiter="\t")
eval_df = pd.read_csv(os.path.join(external_output_dirname, "sst-tenary", "sst-tenary-dev.tsv"), delimiter="\t")
test_df = pd.read_csv(os.path.join(external_output_dirname, "sst-tenary", "sst-tenary-test.tsv"), delimiter="\t")

train_df = Dataset.from_pandas(train_df)
eval_df = Dataset.from_pandas(eval_df)
test_df = Dataset.from_pandas(test_df)

corrupted_train_dataset = train_df.map(random_corrupt)
corrupted_validation_dataset = eval_df.map(random_corrupt)
corrupted_test_dataset = test_df.map(random_corrupt)

corrupted_datasets = DatasetDict({"train":corrupted_train_dataset, 
                                  "validation":corrupted_validation_dataset, 
                                  "test":corrupted_test_dataset})
corrupted_datasets.save_to_disk("../data-files/sst-tenary-corrupted")

HBox(children=(FloatProgress(value=0.0, max=159274.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1100.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2210.0), HTML(value='')))




#### Corrupted MNLI

In [36]:
mnli_map = {'entailment': 0, 'neutral' : 1, 'contradiction' : 2}
def process_mnli(dataset, split):
    data_all = []
    for example in dataset[split]:
        premise = example['premise']
        hypothesis = example['hypothesis']
        label = example['label']
        if label in [0,1,2]:
            data = {"premise" : premise, 
                    "hypothesis" : hypothesis, 
                    "label" : label}
        data_all.append(data)
    return data_all

def mnli_write_tsv(*datasets, output_filename):
    all_data = []
    for dataset in datasets:
        all_data += dataset
    random.shuffle(all_data)
    with open(output_filename, "wt") as f:
        writer = csv.DictWriter(f, delimiter="\t", fieldnames=['premise', 'hypothesis', 'label'])
        writer.writeheader()
        writer.writerows(all_data)
        
def mnli_random_corrupt(example):
    original_premise = example['premise']
    original_hypothesis = example['hypothesis']
    if original_hypothesis == None:
        original_hypothesis = ""
    try:
        original_premise_tokens = original_tokenizer.tokenize(original_premise)
        original_hypothesis_tokens = original_tokenizer.tokenize(original_hypothesis)
    except:
        print("Please debug these sequence...")
        print(original_premise)
        print(original_hypothesis)
    corrupted_premise_tokens = []
    corrupted_hypothesis_tokens = []
    for ori_t in original_premise_tokens:
        if ori_t in token_mapping.keys():
            cor_t = token_mapping[ori_t]
        else:
            cor_t = ori_t
        corrupted_premise_tokens.append(cor_t)
    for ori_t in original_hypothesis_tokens:
        if ori_t in token_mapping.keys():
            cor_t = token_mapping[ori_t]
        else:
            cor_t = ori_t
        corrupted_hypothesis_tokens.append(cor_t)

    example['premise'] = original_tokenizer.convert_tokens_to_string(corrupted_premise_tokens)
    example['hypothesis'] = original_tokenizer.convert_tokens_to_string(corrupted_hypothesis_tokens)
    return example

In [3]:
dataset = load_dataset('glue', 'mnli')

Reusing dataset glue (/afs/cs.stanford.edu/u/wuzhengx/.cache/huggingface/datasets/glue/mnli/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)


In [21]:
mnli_train = process_mnli(dataset, "train")
mnli_validation_matched = process_mnli(dataset, "validation_matched")

In [22]:
mnli_write_tsv(mnli_train, output_filename=os.path.join(external_output_dirname, "mnli", "mnli-train.tsv"))
mnli_write_tsv(mnli_validation_matched, output_filename=os.path.join(external_output_dirname, "mnli", "mnli-dev.tsv"))

In [37]:
# let us corrupt SST3 in the same way as before
train_df = pd.read_csv(os.path.join(external_output_dirname, "mnli", "mnli-train.tsv"), delimiter="\t")
eval_df = pd.read_csv(os.path.join(external_output_dirname, "mnli", "mnli-dev.tsv"), delimiter="\t")

train_df = Dataset.from_pandas(train_df)
eval_df = Dataset.from_pandas(eval_df)

corrupted_train_dataset = train_df.map(mnli_random_corrupt)
corrupted_validation_dataset = eval_df.map(mnli_random_corrupt)

corrupted_datasets = DatasetDict({"train":corrupted_train_dataset, 
                                  "validation":corrupted_validation_dataset})
corrupted_datasets.save_to_disk("../data-files/mnli-corrupted")

HBox(children=(FloatProgress(value=0.0, max=392702.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=9815.0), HTML(value='')))




In [38]:
corrupted_validation_dataset[1]

{'hypothesis': 'a manga adf finish fry 99wartax 875 nazi 53 crops "',
 'label': 0,
 'premise': 'a $ c gee defines vu 99 bopax mlsax 875 nazi 53 crops 99df reaching lila 1724 bkwart 341 pitt'}

#### Corrupted CoLA

In [48]:
def process_cola(dataset, split):
    data_all = []
    for example in dataset[split]:
        sentence = example['sentence']
        label = example['label']
        data = {"sentence" : sentence,
                "label" : label}
        data_all.append(data)
    return data_all

def cola_write_tsv(*datasets, output_filename):
    all_data = []
    for dataset in datasets:
        all_data += dataset
    random.shuffle(all_data)
    with open(output_filename, "wt") as f:
        writer = csv.DictWriter(f, delimiter="\t", fieldnames=['sentence', 'label'])
        writer.writeheader()
        writer.writerows(all_data)
        
def cola_random_corrupt(example):
    original_sentence = example['sentence']
    original_tokens = original_tokenizer.tokenize(original_sentence)
    corrupted_tokens = []
    for ori_t in original_tokens:
        if ori_t in token_mapping.keys():
            cor_t = token_mapping[ori_t]
        else:
            cor_t = ori_t
        corrupted_tokens.append(cor_t)
    example['sentence'] = original_tokenizer.convert_tokens_to_string(corrupted_tokens)
    return example

In [39]:
dataset = load_dataset('glue', 'cola')

Reusing dataset glue (/afs/cs.stanford.edu/u/wuzhengx/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)


In [49]:
cola_train = process_cola(dataset, "train")
cola_validation = process_cola(dataset, "validation")
cola_test = process_cola(dataset, "test")

In [50]:
cola_write_tsv(cola_train, output_filename=os.path.join(external_output_dirname, "cola", "cola-train.tsv"))
cola_write_tsv(cola_validation, output_filename=os.path.join(external_output_dirname, "cola", "cola-dev.tsv"))
cola_write_tsv(cola_test, output_filename=os.path.join(external_output_dirname, "cola", "cola-test.tsv"))

In [51]:
# let us corrupt SST3 in the same way as before
train_df = pd.read_csv(os.path.join(external_output_dirname, "cola", "cola-train.tsv"), delimiter="\t")
eval_df = pd.read_csv(os.path.join(external_output_dirname, "cola", "cola-dev.tsv"), delimiter="\t")
test_df = pd.read_csv(os.path.join(external_output_dirname, "cola", "cola-test.tsv"), delimiter="\t")

train_df = Dataset.from_pandas(train_df)
eval_df = Dataset.from_pandas(eval_df)
test_df = Dataset.from_pandas(test_df)

corrupted_train_dataset = train_df.map(cola_random_corrupt)
corrupted_validation_dataset = eval_df.map(cola_random_corrupt)
corrupted_test_dataset = test_df.map(cola_random_corrupt)

corrupted_datasets = DatasetDict({"train":corrupted_train_dataset, 
                                  "validation":corrupted_validation_dataset, 
                                  "test":corrupted_test_dataset})
corrupted_datasets.save_to_disk("../data-files/cola-corrupted")

HBox(children=(FloatProgress(value=0.0, max=8551.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1043.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1063.0), HTML(value='')))




#### Corrupted SNLI

In [71]:
mnli_map = {'entailment': 0, 'neutral' : 1, 'contradiction' : 2}
def process_snli(dataset, split):
    data_all = []
    for example in dataset[split]:
        premise = example['premise']
        hypothesis = example['hypothesis']
        label = example['label']
        if label in [0,1,2]:
            data = {"premise" : premise, 
                    "hypothesis" : hypothesis, 
                    "label" : label}
        data_all.append(data)
    return data_all

def snli_write_tsv(*datasets, output_filename):
    all_data = []
    for dataset in datasets:
        all_data += dataset
    random.shuffle(all_data)
    with open(output_filename, "wt") as f:
        writer = csv.DictWriter(f, delimiter="\t", fieldnames=['premise', 'hypothesis', 'label'])
        writer.writeheader()
        writer.writerows(all_data)
        
def snli_random_corrupt(example):
    original_premise = example['premise']
    original_hypothesis = example['hypothesis']
    if original_hypothesis == None:
        original_hypothesis = ""
    try:
        original_premise_tokens = original_tokenizer.tokenize(original_premise)
        original_hypothesis_tokens = original_tokenizer.tokenize(original_hypothesis)
    except:
        print("Please debug these sequence...")
        print(original_premise)
        print(original_hypothesis)
    corrupted_premise_tokens = []
    corrupted_hypothesis_tokens = []
    for ori_t in original_premise_tokens:
        if ori_t in token_mapping.keys():
            cor_t = token_mapping[ori_t]
        else:
            cor_t = ori_t
        corrupted_premise_tokens.append(cor_t)
    for ori_t in original_hypothesis_tokens:
        if ori_t in token_mapping.keys():
            cor_t = token_mapping[ori_t]
        else:
            cor_t = ori_t
        corrupted_hypothesis_tokens.append(cor_t)

    example['premise'] = original_tokenizer.convert_tokens_to_string(corrupted_premise_tokens)
    example['hypothesis'] = original_tokenizer.convert_tokens_to_string(corrupted_hypothesis_tokens)
    return example

In [72]:
dataset = load_dataset('snli')

Reusing dataset snli (/afs/cs.stanford.edu/u/wuzhengx/.cache/huggingface/datasets/snli/plain_text/1.0.0/bb1102591c6230bd78813e229d5dd4c7fbf4fc478cec28f298761eb69e5b537c)


In [73]:
snli_train = process_snli(dataset, "train")
snli_validation = process_snli(dataset, "validation")
snli_test = process_snli(dataset, "test")

In [74]:
snli_write_tsv(snli_train, output_filename=os.path.join(external_output_dirname, "snli", "snli-train.tsv"))
snli_write_tsv(snli_validation, output_filename=os.path.join(external_output_dirname, "snli", "snli-dev.tsv"))
snli_write_tsv(snli_test, output_filename=os.path.join(external_output_dirname, "snli", "snli-test.tsv"))

In [75]:
# let us corrupt SST3 in the same way as before
train_df = pd.read_csv(os.path.join(external_output_dirname, "snli", "snli-train.tsv"), delimiter="\t")
eval_df = pd.read_csv(os.path.join(external_output_dirname, "snli", "snli-dev.tsv"), delimiter="\t")
test_df = pd.read_csv(os.path.join(external_output_dirname, "snli", "snli-test.tsv"), delimiter="\t")

train_df = Dataset.from_pandas(train_df)
eval_df = Dataset.from_pandas(eval_df)
test_df = Dataset.from_pandas(test_df)

corrupted_train_dataset = train_df.map(snli_random_corrupt)
corrupted_validation_dataset = eval_df.map(snli_random_corrupt)
corrupted_test_dataset = test_df.map(snli_random_corrupt)

corrupted_datasets = DatasetDict({"train":corrupted_train_dataset, 
                                  "validation":corrupted_validation_dataset,
                                  "test":corrupted_test_dataset})
corrupted_datasets.save_to_disk("../data-files/snli-corrupted")

HBox(children=(FloatProgress(value=0.0, max=550152.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


