#### These scripts build a mismatched (token-swapped) vocabulary from the original BERT vocab file

In [None]:
from vocab_mismatch_utils import *
from data_formatter_utils import *
from datasets import DatasetDict
from datasets import Dataset
from datasets import load_dataset
import transformers
import pandas as pd
import operator
from collections import OrderedDict
from tqdm import tqdm, trange

import collections
import os
import unicodedata
from typing import List, Optional, Tuple

from transformers.tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
from transformers.utils import logging
import torch
logger = logging.get_logger(__name__)
import numpy as np
import copy
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer() 
from word_forms.word_forms import get_word_forms

seed = 42
# Seed torch, NumPy and `random` with the same value; presumably this keeps the
# random vocabulary matching below repeatable between runs.
# NOTE(review): `random` is not imported explicitly in this cell -- it is
# presumably brought in by one of the wildcard imports above; confirm.
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)

In [251]:
# These tokenizers are used to get the word-piece length of each token.
# NOTE(review): ModifiedBertTokenizer / ModifiedBasicTokenizer are not defined
# in this notebook; presumably they come from the wildcard imports -- confirm.
modified_tokenizer = ModifiedBertTokenizer(
    vocab_file="../data-files/bert_vocab.txt")
modified_basic_tokenizer = ModifiedBasicTokenizer()

# translator to try it out!
def corrupt_translator(in_string, tokenizer, vocab_match):
    """Translate a sentence token by token through a vocabulary mapping.

    Args:
        in_string: Sentence to translate.
        tokenizer: Object exposing ``tokenize(str)`` that returns the tokens.
        vocab_match: Mapping from each token to its replacement token; a
            token missing from the mapping raises ``KeyError``.

    Returns:
        The replacement tokens joined by single spaces, with every " ##"
        continuation marker removed and surrounding whitespace stripped.
    """
    swapped = []
    for piece in tokenizer.tokenize(in_string):
        swapped.append(vocab_match[piece])
    joined = " ".join(swapped)
    # Glue word-piece continuations ("##xyz") back onto the preceding piece.
    return joined.replace(" ##", "").strip()

### Get mismatched vocab for each dataset first!

**SST-3**: Random Corrupt with token piece length control only

In [None]:
# Load the original (uncorrupted) SST splits with ternary (3-class) labels.
# Only the train call leaves `include_subtrees` at its default; dev and test
# pass include_subtrees=False.
sst_train_ternary = process_sst(os.path.join(sst_dirname, "train.txt"),
                                full_ternary_class_func)
sst_dev_ternary = process_sst(os.path.join(sst_dirname, "dev.txt"),
                              full_ternary_class_func,
                              include_subtrees=False)
sst_test_ternary = process_sst(os.path.join(sst_dirname, "test.txt"),
                               full_ternary_class_func,
                               include_subtrees=False)
# Write the processed ternary SST splits to TSV files.
# NOTE: the "sst-tenary" spelling in the paths is kept as-is because the
# following cell reads the files back from exactly these paths.
write_tsv(sst_train_ternary, output_filename=os.path.join(external_output_dirname, "sst-tenary", "sst-tenary-train.tsv"))
write_tsv(sst_dev_ternary, output_filename=os.path.join(external_output_dirname, "sst-tenary", "sst-tenary-dev.tsv"))
write_tsv(sst_test_ternary, output_filename=os.path.join(external_output_dirname, "sst-tenary", "sst-tenary-test.tsv"))

In [None]:
# Reload the ternary SST TSV files written above.
train_df = pd.read_csv(os.path.join(external_output_dirname, "sst-tenary", "sst-tenary-train.tsv"), delimiter="\t")
eval_df = pd.read_csv(os.path.join(external_output_dirname, "sst-tenary", "sst-tenary-dev.tsv"), delimiter="\t")
test_df = pd.read_csv(os.path.join(external_output_dirname, "sst-tenary", "sst-tenary-test.tsv"), delimiter="\t")

# Wrap the DataFrames as `datasets.Dataset` objects (the *_df names are kept).
train_df = Dataset.from_pandas(train_df)
eval_df = Dataset.from_pandas(eval_df)
test_df = Dataset.from_pandas(test_df)

# NOTE(review): `token_by_length` is presumably filled as a side effect of
# `token_length_mapping` during the map calls below -- confirm where that
# function is defined and which dict it writes to.
token_by_length = {} # overwrite this everytime for a new dataset
train_df = train_df.map(token_length_mapping)
eval_df = eval_df.map(token_length_mapping)
test_df = test_df.map(token_length_mapping)

# Build the token -> replacement-token mapping used by corrupt_translator.
vocab_match_by_piece_length = generate_vocab_match(token_by_length) # you can also save this to avoid reprocess it again

In [239]:
# A little quality control on the length-only mapping.
# Count the tokens that are mapped onto themselves (i.e. left unchanged).
count = sum(1 for k, v in vocab_match_by_piece_length.items() if k == v)
# Print the fraction explicitly: a bare expression in the middle of a cell is
# evaluated and then discarded, so the ratio was never shown.
print(count / len(vocab_match_by_piece_length))
# The mapping must be one-to-one: as many distinct targets as distinct sources.
assert len(set(vocab_match_by_piece_length.keys())) == \
    len(set(vocab_match_by_piece_length.values()))

In [246]:
# Try the length-only mapping on a sample sentence; the returned string is
# displayed as the cell output.
corrupt_translator("this movie is great!", modified_basic_tokenizer, vocab_match_by_piece_length)

'breed fashion toast decidedly received'

**SST-3**: Random Corrupt with token piece length, frequency, and lemma control

In [264]:
# setups
# NOTE(review): `token_frequency_map` is presumably filled as a side effect of
# `token_frequency_mapping` during the map calls below -- confirm.
token_frequency_map = {} # overwrite this everytime for a new dataset
train_df = train_df.map(token_frequency_mapping)
eval_df = eval_df.map(token_frequency_mapping)
test_df = test_df.map(token_frequency_mapping)
# Sort the entries by value (presumably the occurrence count), largest first,
# and keep that order in an OrderedDict.
token_frequency_map = sorted(token_frequency_map.items(), key=operator.itemgetter(1), reverse=True)
token_frequency_map = OrderedDict(token_frequency_map)

# also get lemma matching to prevent same lemma matching
token_lemma_map = token_lemma_mapping(vocab_match_by_piece_length)

# Same generator as for the length-only mapping, now also given the frequency
# and lemma maps.
vocab_match_by_piece_length_frequency_lemma = generate_vocab_match(token_by_length, token_frequency_map, token_lemma_map)

HBox(children=(FloatProgress(value=0.0, max=159274.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1100.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2210.0), HTML(value='')))


examining length = 1, candidates length = 6924
examining length = 3, candidates length = 1097
examining length = 2, candidates length = 2114
examining length = 4, candidates length = 328
examining length = 5, candidates length = 45
examining length = 6, candidates length = 4


In [237]:
# A little quality control on the length + frequency + lemma mapping.
# Count the tokens that are mapped onto themselves (i.e. left unchanged).
count = sum(
    1 for k, v in vocab_match_by_piece_length_frequency_lemma.items() if k == v
)
# Print the fraction explicitly: a bare expression in the middle of a cell is
# evaluated and then discarded, so the ratio was never shown.
print(count / len(vocab_match_by_piece_length_frequency_lemma))
# The mapping must be one-to-one: as many distinct targets as distinct sources.
assert len(set(vocab_match_by_piece_length_frequency_lemma.keys())) == \
    len(set(vocab_match_by_piece_length_frequency_lemma.values()))

In [250]:
# Try the length + frequency + lemma mapping on the same sample sentence; the
# returned string is displayed as the cell output.
corrupt_translator("this movie is great!", modified_basic_tokenizer, vocab_match_by_piece_length_frequency_lemma)

't you film re cold'

**MNLI**: Random Corrupt with token piece length control only

In [238]:
# TODOs

### Now, let us corrupt all datasets!

#### Corrupted SST-3 
by token piece length only

In [252]:
def random_corrupt(example):
    """Corrupt one example's 'text' field with the length-only mapping.

    The sentence is translated by ``corrupt_translator`` using
    ``modified_basic_tokenizer`` and ``vocab_match_by_piece_length``; the
    example is updated in place and returned (the shape ``Dataset.map``
    is given below).
    """
    example['text'] = corrupt_translator(
        example['text'],
        modified_basic_tokenizer,
        vocab_match_by_piece_length,
    )
    return example

# Apply the length-only corruption to every split.
corrupted_train_dataset = train_df.map(random_corrupt)
corrupted_validation_dataset = eval_df.map(random_corrupt)
corrupted_test_dataset = test_df.map(random_corrupt)

# Bundle the three corrupted splits and save them to disk.
corrupted_datasets = DatasetDict({"train":corrupted_train_dataset, 
                                  "validation":corrupted_validation_dataset, 
                                  "test":corrupted_test_dataset})
corrupted_datasets.save_to_disk("../data-files/sst-tenary-corrupted-length")

HBox(children=(FloatProgress(value=0.0, max=159274.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1100.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2210.0), HTML(value='')))




#### Corrupted SST-3 
by frequency as well

In [253]:
def random_corrupt_new(example):
    """Corrupt one example's 'text' field with the length/frequency/lemma mapping.

    Same as ``random_corrupt`` except that the translation uses
    ``vocab_match_by_piece_length_frequency_lemma``. The example is updated
    in place and returned.
    """
    example['text'] = corrupt_translator(
        example['text'],
        modified_basic_tokenizer,
        vocab_match_by_piece_length_frequency_lemma,
    )
    return example

# Apply the length + frequency + lemma corruption to every split.
corrupted_train_dataset = train_df.map(random_corrupt_new)
corrupted_validation_dataset = eval_df.map(random_corrupt_new)
corrupted_test_dataset = test_df.map(random_corrupt_new)

# Bundle the three corrupted splits and save them to disk.
corrupted_datasets = DatasetDict({"train":corrupted_train_dataset, 
                                  "validation":corrupted_validation_dataset, 
                                  "test":corrupted_test_dataset})
corrupted_datasets.save_to_disk("../data-files/sst-tenary-corrupted-freq")

HBox(children=(FloatProgress(value=0.0, max=159274.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1100.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2210.0), HTML(value='')))




In [266]:
# Print each (token, replacement) pair, followed by the two tokens' entries in
# token_frequency_map, to eyeball how close the matched frequencies are.
for k, v in vocab_match_by_piece_length_frequency_lemma.items():
    print(k, v)
    print(token_frequency_map[k], token_frequency_map[v])

ki wong
3 3
hovering trading
3 3
otto mastering
2 3
glazed tentative
3 3
nursery blushing
3 3
mana carey
2 3
wong ki
3 3
trading hovering
3 3
mastering otto
3 2
tentative glazed
3 3
blushing nursery
3 3
carey mana
3 2
smackdown courtney
4 4
fairness launch
4 4
aaa victor
4 4
ritual trusts
4 4
hooked summary
4 4
compete hairy
4 4
balkans rejection
4 4
bergman alain
4 4
javier ken
4 4
babies buyers
4 4
supplied alt
4 4
dominates joshua
4 4
implication counting
4 4
fertility dodge
4 4
causes reward
4 4
courtney smackdown
4 4
launch fairness
4 4
victor aaa
4 4
trusts ritual
4 4
summary hooked
4 4
hairy compete
4 4
rejection balkans
4 4
alain bergman
4 4
ken javier
4 4
buyers babies
4 4
alt supplied
4 4
joshua dominates
4 4
counting implication
4 4
dodge fertility
4 4
reward causes
4 4
gaunt exclusively
5 5
lux whistle
5 5
demographics persistence
5 5
tenderly blinding
5 5
entitled catholics
5 5
helpful comprise
5 5
deposited coarse
5 5
attractions ivy
5 5
disgusted tres
5 5
wealth fools
5 

collar integrating
10 10
127 faux
10 10
chatter smuggling
10 10
elves secular
10 10
skeleton brady
10 10
needing saddam
10 10
creator mckay
10 10
perceptions unrelated
10 10
surge mrs
10 10
admiration spicy
10 10
precedent pamela
10 10
absent gathering
10 10
dire organizing
10 10
position partnership
10 10
patriot rode
10 10
muttering avon
10 10
gears berkeley
10 10
posthumously occupation
10 10
pardon rake
10 10
hussein debating
10 10
plea reese
10 10
doors tango
10 10
attitudes weddings
10 10
heavyweight batting
10 10
counterpart bastard
10 10
prints landing
10 10
plan avalanche
10 10
proceed wrapping
10 10
glued welcomed
10 10
fifth ferris
10 10
assign dolly
10 10
morris astronauts
10 10
stereo bordering
10 10
onion geek
10 10
developers limerick
10 10
simulation reworked
10 10
confuse spill
10 10
buying therapeutic
10 10
vile trajectory
10 10
reassure hottest
10 10
experiments link
10 10
sally ecstasy
10 10
hammering arrogance
10 10
clicks specialized
10 10
runaway nighttime
10 10


thorough wasp
14 14
backed quit
14 14
dismay indicative
14 14
resolved pains
14 14
inhuman scared
14 14
transporting peas
14 14
underwear efficiently
14 14
appearing pawn
14 14
chips shudder
14 14
ye borg
14 14
hispanic clements
14 14
recognition prescription
14 14
26 ignoring
14 14
rebel laurence
14 14
address pizza
14 14
flashing sunshine
14 14
dip tightened
14 14
clocks fluids
14 14
delhi undo
14 14
ace recruiting
14 14
godzilla tempered
14 14
considerably sufficient
14 14
fringe hears
14 14
exceeds sam
14 14
updated documenting
14 14
encompasses resident
14 14
participate frenzy
14 14
boost photographer
14 14
retiring kidnapping
14 14
founding revolutionary
14 14
activity amir
14 14
blowing founders
14 14
diplomat flashback
14 14
portuguese norm
14 14
depicts millennium
14 14
windows discontent
14 14
secondary abbott
14 14
thoughts begging
14 14
corners graham
14 14
populace previously
14 14
magnetic wen
14 14
annex memorial
14 14
tie reductions
14 14
void dungeons
14 14
mouse spee

wander backgrounds
18 18
spectrum accuracy
18 18
advertised owned
18 18
ambitions iq
18 18
journalists architect
18 18
85 batch
18 18
collect presenting
18 18
scribe bender
18 18
salle captured
18 18
shut bubble
18 18
fueled photos
18 18
cushion nationwide
18 18
competence mug
18 18
pipeline proposal
18 18
escaped phillip
18 18
ominous gracefully
18 18
fragmented doc
18 18
objects broader
18 18
butt killers
18 18
nails reduces
18 18
noticeable releasing
18 18
promoter dual
18 18
pollution admitted
18 18
stanley symbolism
18 18
fees seal
18 18
letterman forest
18 18
monday addressing
18 18
mystic arm
18 18
swedish preceded
18 18
elm creeping
18 18
majority mall
18 18
helm hurts
18 18
shapes performer
18 18
110 alexandre
18 18
nationally gaps
18 18
shorts pencil
18 18
bush restrictive
18 18
sensation funeral
18 18
handling tucked
18 18
elevated homicide
18 18
projection facial
18 18
narrated windshield
18 18
lurking supermarket
18 18
endings volcano
18 18
blunt karen
18 18
tax composer
1

23 23
ironically taps
23 23
83 unprecedented
23 23
populated afghan
23 23
determine espn
23 23
reducing closure
23 23
folly transparent
23 23
flame believer
23 23
evolved atop
23 23
ours freddy
23 23
tag 000
23 23
profession canvas
23 23
underneath sung
23 23
privy screwed
23 23
thirteen induced
24 24
steady timed
24 24
challenges nonetheless
24 24
nyc neighbor
24 24
update forgiven
24 24
millions vin
24 24
expressions formal
24 24
enjoying bout
24 24
wary sexually
24 24
civic spread
24 24
fearless dealers
24 24
eats firmly
24 24
golden gaza
24 24
midway mira
24 24
weep durable
24 24
executive mistress
24 24
subcontinent tolkien
24 24
affected scorpion
24 24
steals sons
24 24
4 depends
24 24
informed agency
24 24
wilder abc
24 24
exchange beats
24 24
tall smooth
24 24
nick duration
24 24
immature tailor
24 24
surf resistance
24 24
realities lark
24 24
survivors harbor
24 24
moderately traditionally
24 24
pearl headed
24 24
infinitely discovering
24 24
fires purposes
24 24
painted snatc

stabbing warriors
31 31
research juliette
31 31
solutions pink
31 31
bull motions
31 31
laden rules
31 31
heights sonny
31 31
dover coma
31 31
twin passes
31 31
partner raises
31 31
sinks cipher
31 31
juicy repetition
31 31
bollywood directions
31 31
limp wrestling
31 31
storylines charge
31 31
maze max
31 31
whip suggestion
31 31
bowling undercover
31 31
chelsea steal
31 31
adolescence bands
31 31
proficient avant
31 31
scarcely available
31 31
row naturally
31 31
established achieve
31 31
access characteristic
32 32
sociology drunken
32 32
singer bodily
32 32
catches dim
32 32
composed universe
32 32
darkness seemed
32 32
cube jump
32 32
bracing covers
32 32
sticky flashes
32 32
stronger crosses
32 32
youthful avoids
32 32
fix expressive
32 32
resembles climax
32 32
shout concession
32 32
nair ingredients
32 32
qualities sleeve
32 32
texture julia
32 32
musicians debate
32 32
garde tends
32 32
revealed reduced
32 32
consuming kwan
32 32
temple prefer
32 32
lights dinner
32 32
ease gl

42 42
lord decided
42 42
trees witness
42 42
steve radar
42 42
layered enormous
43 43
kick definitive
43 43
maintains chest
43 43
garbage outcome
43 43
hungry concert
43 43
expectation league
43 43
stylistic disappointed
43 43
diesel severe
43 43
faces hatred
43 43
deserved reaction
43 43
confident stealing
43 43
traditions walked
43 43
moody q
43 43
chuckles primary
43 43
sandra brisk
43 43
photography baked
43 43
hat irresistible
43 43
unconventional edition
43 43
plight named
43 43
succeed simplicity
43 43
danger layered
43 43
enormous kick
43 43
definitive maintains
43 43
chest garbage
43 43
outcome hungry
43 43
concert expectation
43 43
league stylistic
43 43
disappointed diesel
43 43
severe faces
43 43
hatred deserved
43 43
reaction confident
43 43
stealing traditions
43 43
walked moody
43 43
q chuckles
43 43
primary sandra
43 43
brisk photography
43 43
baked hat
43 43
irresistible unconventional
43 43
edition plight
43 43
named succeed
43 43
simplicity danger
43 43
korean broodi

peculiar physical
59 59
heads bone
59 59
area below
59 59
abandon becoming
59 59
generate gangs
60 60
remembered triangle
60 60
window asks
60 60
throws rough
60 60
abuse wrote
60 60
vague comfort
60 60
meat breaks
60 60
pre communal
60 60
lush comedian
60 60
irony aliens
60 60
thematic crap
60 60
12 propaganda
60 60
consequences portrayal
60 60
consider determination
60 60
allow surely
60 60
element souls
60 60
actual vibrant
60 60
scratch bow
60 60
credibility potent
60 60
gangs generate
60 60
triangle remembered
60 60
asks window
60 60
rough throws
60 60
wrote abuse
60 60
comfort vague
60 60
breaks meat
60 60
communal pre
60 60
comedian lush
60 60
aliens irony
60 60
crap thematic
60 60
propaganda 12
60 60
portrayal consequences
60 60
determination consider
60 60
surely allow
60 60
souls element
60 60
vibrant actual
60 60
bow scratch
60 60
potent credibility
60 60
assassin lifetime
61 61
patience amounts
61 61
silver oh
61 61
intricate consciousness
61 61
sub ponder
61 61
portrays sc

98 98
document finish
98 98
intellectual joy
98 98
glimpse bottom
98 98
annoying god
98 98
fish marks
98 98
concerned dimensional
99 99
kevin hoffman
99 99
genuinely whom
99 99
refreshing revolution
99 99
energetic fatal
99 99
blue concerned
99 99
dimensional kevin
99 99
hoffman genuinely
99 99
whom refreshing
99 99
revolution energetic
99 99
fatal blue
99 99
screenwriter ryan
100 100
families turning
100 100
superficial o
100 100
created clarity
100 100
frustrating tells
100 100
generally team
100 100
snow oddly
100 100
desire longer
100 100
group screenwriter
100 100
ryan families
100 100
turning superficial
100 100
o created
100 100
clarity frustrating
100 100
tells generally
100 100
team snow
100 100
oddly desire
100 100
longer group
100 100
realistic source
102 102
missing ordinary
102 101
nasty finding
101 101
connect news
102 101
experiences hits
101 101
present street
102 102
efforts answers
102 101
handed superior
101 101
wedding stunning
102 102
source realistic
102 102
ordin

manages mr
315 313
portrait opera
311 310
death probably
311 310
surprisingly intelligent
314 314
actor despite
311 313
line manages
315 315
mr portrait
313 311
opera death
310 311
probably surprisingly
310 314
intelligent actor
314 311
despite line
313 315
become social
317 322
fresh head
317 322
goes stories
320 318
effort filmmaking
320 318
perfect serious
322 321
m become
321 317
social fresh
322 317
head goes
322 320
stories effort
318 320
filmmaking perfect
318 322
serious m
321 321
past live
327 326
beautiful john
324 328
spirit instead
328 328
classic political
323 328
french wit
324 323
girl romance
327 325
thought past
323 327
live beautiful
326 324
john spirit
328 328
instead classic
328 323
political french
328 324
wit girl
323 327
romance thought
325 323
middle next
330 332
hours tone
335 331
woman camera
333 331
himself obvious
329 332
engaging powerful
335 329
next middle
332 330
tone hours
331 335
camera woman
331 333
obvious himself
332 329
powerful engaging
329 335
co

13 13
profundity marivaux
13 13
nesbitt dramatization
13 13
hennings flakeball
13 13
creaky oftentimes
13 13
dictums stagnation
13 13
begrudge savory
13 13
impudent vardalos
13 13
precarious unfilmable
13 13
phocion expiry
13 13
megaplexes storyteller
13 13
meticulously volletta
13 13
grubbers unappealing
13 13
fantasized fidgeted
13 13
overwhelm entendre
13 13
smutty negligible
13 13
disguising overdoing
13 13
odour discouraging
13 13
undermining reeks
14 14
illogic blaring
14 14
intermezzo captivates
14 14
zingers ricocheting
14 14
gandalf holographic
14 14
armageddon italicized
14 14
sneering detract
14 14
deform mongrel
14 14
mawkish pubescent
14 14
squander congratulate
14 14
echelons fascinates
14 14
languidly exalted
14 14
impassive injects
14 14
shockwaves sputters
14 14
tidings treachery
14 14
stasis strenuously
14 14
bolero boorishness
14 14
celibacy travesty
14 14
switchblade bloodsucker
14 14
debrauwer travail
14 14
clunkiness whirl
14 14
artefact implodes
14 14
instilled r

6 6
vies evaded
6 6
biscuit cosby
6 6
molehill aloft
6 6
blandly locusts
6 6
dispatching ballsy
6 6
lumps shum
6 6
threefold talkiness
6 6
cassel wiel
6 6
wittier sixed
6 6
flurries defuses
6 6
infectiously botching
6 6
togetherness devious
6 6
regimented starship
6 6
crave spittingly
6 6
piecing assures
6 6
clocked affections
6 6
sulking argentinean
6 6
slc englishmen
6 6
leanest revolting
6 6
uncovers putters
6 6
qual unpleasantly
6 6
fruition morose
6 6
abundantly defeatingly
6 6
vulgarities dosage
6 6
gushing thoughtless
6 6
sentimentalist restate
6 6
bungle fearlessly
6 6
digested qutting
6 6
zips lightest
6 6
shag guiltless
6 6
estela agape
6 6
hollowness galled
6 6
plumbed inducingly
6 6
steinis demeo
6 6
cloaked decorous
6 6
roughage barbs
6 6
strikingly competently
6 6
immaculately byplay
6 6
dynamited chesterton
6 6
evaded vies
6 6
cosby biscuit
6 6
aloft molehill
6 6
locusts blandly
6 6
ballsy dispatching
6 6
shum lumps
6 6
talkiness threefold
6 6
wiel cassel
6 6
sixed witti

pageants crushingly
15 15
handsomely fondly
15 15
erects painterly
15 15
dampened bracingly
15 15
gloomy bloodshed
15 15
hankies intelligentsia
15 15
mugs luckiest
15 15
willies exoticism
15 15
personas bores
15 15
millennial sentimentalizing
15 15
dullard serrault
15 15
brio cockney
15 15
strives creepiest
15 15
reworking jackal
15 15
elicit rerun
15 15
zucker quaking
15 15
ichi pauly
15 15
deliciously meander
15 15
outrageousness alienating
15 15
gunplay fanboy
15 15
gratefully patrolman
15 15
laurice misplaced
15 15
lumbering ver
15 15
fatale humanizing
15 15
janine filmgoing
15 15
motherhood lovefest
15 15
awfulness platter
15 15
goodly eccentrics
15 15
impart laconic
15 15
sickly natter
15 15
segal somethings
15 15
scooping thumpingly
15 15
churn blowout
15 15
thumbing coolness
15 15
latches pared
15 15
rea shekhar
15 15
reverence hardass
15 15
kittenish gutless
15 15
undermines collage
15 15
fitfully corniest
15 15
ditched derailed
15 15
crackers miike
15 15
crushingly pageants
1

reaffirms twirls
5 3
schnieder unembarrassing
2 4
blisteringly ineffable
5 4
lethargically hjelje
5 5
confounding unspeakably
3 5
uncommitted misdirected
6 6
lackadaisical unrelieved
6 6
improbabilities horrifyingly
6 6
unreachable snooze
6 6
wewannour disreputable
6 6
suffocate inexpressible
6 6
illuminates uncommitted
6 6
misdirected lackadaisical
6 6
unrelieved improbabilities
6 6
horrifyingly unreachable
6 6
snooze wewannour
6 6
disreputable suffocate
6 6
inexpressible illuminates
6 6
credulity attributable
7 7
unhappiness unremittingly
7 7
obsessively unanswered
7 7
solipsistic heartstrings
7 7
arbitrarily underdramatized
7 7
unparalleled repugnance
7 7
beseechingly degenerating
7 7
blethyn schticky
7 7
serendipity credulity
7 7
attributable unhappiness
7 7
unremittingly obsessively
7 7
unanswered solipsistic
7 7
heartstrings arbitrarily
7 7
underdramatized unparalleled
7 7
repugnance beseechingly
7 7
degenerating blethyn
7 7
schticky serendipity
7 7
ayatollah musketeer
8 8
unshap

In [267]:
# Look up the replacement assigned to the token "the".
vocab_match_by_piece_length_frequency_lemma["the"]

'-'

In [260]:
# TODO: update the rest!

{'text': 'The action is reasonably well-done ...', 'label': 1, 'source': 'sst'}

#### Corrupted MNLI

In [None]:
# Label name -> integer id.
# NOTE(review): not referenced by any code visible in this notebook.
mnli_map = {'entailment': 0, 'neutral' : 1, 'contradiction' : 2}
def process_mnli(dataset, split):
    """Collect the labelled examples of one MNLI split as plain dicts.

    Args:
        dataset: Mapping from split name to an iterable of examples; each
            example is indexed by 'premise', 'hypothesis' and 'label'.
        split: Name of the split to read.

    Returns:
        A list of {"premise", "hypothesis", "label"} dicts in the original
        order, containing only the examples whose label is 0, 1 or 2.
    """
    data_all = []
    for example in dataset[split]:
        premise = example['premise']
        hypothesis = example['hypothesis']
        label = example['label']
        # Skip examples without a valid label. Appending unconditionally would
        # re-emit the previous example (or fail with NameError when the very
        # first example is unlabelled).
        if label not in (0, 1, 2):
            continue
        data_all.append({"premise": premise,
                         "hypothesis": hypothesis,
                         "label": label})
    return data_all

def mnli_write_tsv(*datasets, output_filename):
    """Shuffle the rows of all given datasets together and write them as TSV.

    Args:
        *datasets: Lists of row dicts with the keys 'premise', 'hypothesis'
            and 'label'. The input lists themselves are not modified.
        output_filename: Path of the TSV file to create (overwritten if it
            already exists).

    The rows are shuffled with the module-level ``random`` generator, so the
    order depends on its current state. A header row is written first.
    """
    all_data = [row for dataset in datasets for row in dataset]
    random.shuffle(all_data)
    # newline="" lets the csv module control line endings itself (required by
    # its documentation; otherwise rows end in "\r\r\n" on Windows). UTF-8 is
    # what pandas.read_csv assumes by default when the file is read back.
    with open(output_filename, "wt", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, delimiter="\t", fieldnames=['premise', 'hypothesis', 'label'])
        writer.writeheader()
        writer.writerows(all_data)
        
def mnli_random_corrupt(example):
    """Corrupt the 'premise' and 'hypothesis' fields of one MNLI example.

    Both sentences are tokenized with ``original_tokenizer``; every token
    that has an entry in ``token_mapping`` is replaced by its mapped token,
    all other tokens are kept. The token lists are turned back into strings
    with ``original_tokenizer.convert_tokens_to_string``.

    NOTE(review): ``original_tokenizer`` and ``token_mapping`` are not defined
    in the visible part of this notebook -- confirm they exist before running.

    Args:
        example: Dict-like row with 'premise' and 'hypothesis'; a value of
            None is treated as an empty string.

    Returns:
        The same example, updated in place.

    Raises:
        Whatever the tokenizer raises; the offending pair is printed first.
    """
    original_premise = example['premise']
    original_hypothesis = example['hypothesis']
    # Treat a missing sentence as an empty string (both fields alike).
    if original_premise is None:
        original_premise = ""
    if original_hypothesis is None:
        original_hypothesis = ""
    try:
        original_premise_tokens = original_tokenizer.tokenize(original_premise)
        original_hypothesis_tokens = original_tokenizer.tokenize(original_hypothesis)
    except Exception:
        print("Please debug these sequence...")
        print(original_premise)
        print(original_hypothesis)
        # Re-raise the real error instead of swallowing it and failing a few
        # lines later on an unbound token list.
        raise
    corrupted_premise_tokens = [
        token_mapping.get(ori_t, ori_t) for ori_t in original_premise_tokens
    ]
    corrupted_hypothesis_tokens = [
        token_mapping.get(ori_t, ori_t) for ori_t in original_hypothesis_tokens
    ]

    example['premise'] = original_tokenizer.convert_tokens_to_string(corrupted_premise_tokens)
    example['hypothesis'] = original_tokenizer.convert_tokens_to_string(corrupted_hypothesis_tokens)
    return example

In [None]:
# Load the MNLI configuration of GLUE with `datasets.load_dataset`.
dataset = load_dataset('glue', 'mnli')

In [None]:
# Convert the train and matched-validation splits into lists of row dicts.
mnli_train = process_mnli(dataset, "train")
mnli_validation_matched = process_mnli(dataset, "validation_matched")

In [None]:
# Write each processed split to its own TSV file (rows are shuffled on write).
mnli_write_tsv(mnli_train, output_filename=os.path.join(external_output_dirname, "mnli", "mnli-train.tsv"))
mnli_write_tsv(mnli_validation_matched, output_filename=os.path.join(external_output_dirname, "mnli", "mnli-dev.tsv"))

In [None]:
# Corrupt MNLI: reload the TSV files written above, apply mnli_random_corrupt
# to every row and save the result.
train_df = pd.read_csv(os.path.join(external_output_dirname, "mnli", "mnli-train.tsv"), delimiter="\t")
eval_df = pd.read_csv(os.path.join(external_output_dirname, "mnli", "mnli-dev.tsv"), delimiter="\t")

train_df = Dataset.from_pandas(train_df)
eval_df = Dataset.from_pandas(eval_df)

corrupted_train_dataset = train_df.map(mnli_random_corrupt)
corrupted_validation_dataset = eval_df.map(mnli_random_corrupt)

# Only train and validation are saved for MNLI (no test split is processed).
corrupted_datasets = DatasetDict({"train":corrupted_train_dataset, 
                                  "validation":corrupted_validation_dataset})
corrupted_datasets.save_to_disk("../data-files/mnli-corrupted")

In [None]:
# Display the corrupted validation example at index 1 as a spot check.
corrupted_validation_dataset[1]

#### Corrupted CoLA

In [None]:
def process_cola(dataset, split):
    """Return one CoLA split as a list of {"sentence", "label"} dicts.

    Args:
        dataset: Mapping from split name to an iterable of examples indexed
            by 'sentence' and 'label'.
        split: Name of the split to read.

    Returns:
        One dict per example, in the original order; no example is filtered
        out and any other fields of the example are dropped.
    """
    return [
        {"sentence": row['sentence'], "label": row['label']}
        for row in dataset[split]
    ]

def cola_write_tsv(*datasets, output_filename):
    """Shuffle the rows of all given datasets together and write them as TSV.

    Args:
        *datasets: Lists of row dicts with the keys 'sentence' and 'label'.
            The input lists themselves are not modified.
        output_filename: Path of the TSV file to create (overwritten if it
            already exists).

    The rows are shuffled with the module-level ``random`` generator, so the
    order depends on its current state. A header row is written first.
    """
    all_data = [row for dataset in datasets for row in dataset]
    random.shuffle(all_data)
    # newline="" lets the csv module control line endings itself (required by
    # its documentation; otherwise rows end in "\r\r\n" on Windows). UTF-8 is
    # what pandas.read_csv assumes by default when the file is read back.
    with open(output_filename, "wt", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, delimiter="\t", fieldnames=['sentence', 'label'])
        writer.writeheader()
        writer.writerows(all_data)
        
def cola_random_corrupt(example):
    """Corrupt the 'sentence' field of one CoLA example.

    The sentence is tokenized with ``original_tokenizer``; tokens that have
    an entry in ``token_mapping`` are swapped for their mapped token, the
    rest are kept, and the tokens are converted back to a string. The example
    is updated in place and returned.
    """
    pieces = original_tokenizer.tokenize(example['sentence'])
    swapped = [
        token_mapping[piece] if piece in token_mapping.keys() else piece
        for piece in pieces
    ]
    example['sentence'] = original_tokenizer.convert_tokens_to_string(swapped)
    return example

In [None]:
# Load the CoLA configuration of GLUE with `datasets.load_dataset`.
dataset = load_dataset('glue', 'cola')

In [None]:
# Convert each CoLA split into a list of {"sentence", "label"} dicts.
cola_train = process_cola(dataset, "train")
cola_validation = process_cola(dataset, "validation")
cola_test = process_cola(dataset, "test")

In [None]:
# Write each processed split to its own TSV file (rows are shuffled on write).
cola_write_tsv(cola_train, output_filename=os.path.join(external_output_dirname, "cola", "cola-train.tsv"))
cola_write_tsv(cola_validation, output_filename=os.path.join(external_output_dirname, "cola", "cola-dev.tsv"))
cola_write_tsv(cola_test, output_filename=os.path.join(external_output_dirname, "cola", "cola-test.tsv"))

In [None]:
# Corrupt CoLA: reload the TSV files written above, apply cola_random_corrupt
# to every row and save the result.
train_df = pd.read_csv(os.path.join(external_output_dirname, "cola", "cola-train.tsv"), delimiter="\t")
eval_df = pd.read_csv(os.path.join(external_output_dirname, "cola", "cola-dev.tsv"), delimiter="\t")
test_df = pd.read_csv(os.path.join(external_output_dirname, "cola", "cola-test.tsv"), delimiter="\t")

train_df = Dataset.from_pandas(train_df)
eval_df = Dataset.from_pandas(eval_df)
test_df = Dataset.from_pandas(test_df)

corrupted_train_dataset = train_df.map(cola_random_corrupt)
corrupted_validation_dataset = eval_df.map(cola_random_corrupt)
corrupted_test_dataset = test_df.map(cola_random_corrupt)

# Bundle the three corrupted splits and save them to disk.
corrupted_datasets = DatasetDict({"train":corrupted_train_dataset, 
                                  "validation":corrupted_validation_dataset, 
                                  "test":corrupted_test_dataset})
corrupted_datasets.save_to_disk("../data-files/cola-corrupted")

#### Corrupted SNLI

In [None]:
# Label name -> integer id (same dict as in the MNLI cell).
# NOTE(review): not referenced by any code visible in this notebook.
mnli_map = {'entailment': 0, 'neutral' : 1, 'contradiction' : 2}
def process_snli(dataset, split):
    """Collect the labelled examples of one SNLI split as plain dicts.

    Args:
        dataset: Mapping from split name to an iterable of examples indexed
            by 'premise', 'hypothesis' and 'label'.
        split: Name of the split to read.

    Returns:
        A list of {"premise", "hypothesis", "label"} dicts in the original
        order; examples whose label is not 0, 1 or 2 are dropped.
    """
    kept = []
    for row in dataset[split]:
        record = {"premise": row['premise'],
                  "hypothesis": row['hypothesis'],
                  "label": row['label']}
        # Anything outside the three known classes is skipped.
        if record["label"] not in [0, 1, 2]:
            continue
        kept.append(record)
    return kept

def snli_write_tsv(*datasets, output_filename):
    """Shuffle the rows of all given datasets together and write them as TSV.

    Args:
        *datasets: Lists of row dicts with the keys 'premise', 'hypothesis'
            and 'label'. The input lists themselves are not modified.
        output_filename: Path of the TSV file to create (overwritten if it
            already exists).

    The rows are shuffled with the module-level ``random`` generator, so the
    order depends on its current state. A header row is written first.
    """
    all_data = [row for dataset in datasets for row in dataset]
    random.shuffle(all_data)
    # newline="" lets the csv module control line endings itself (required by
    # its documentation; otherwise rows end in "\r\r\n" on Windows). UTF-8 is
    # what pandas.read_csv assumes by default when the file is read back.
    with open(output_filename, "wt", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, delimiter="\t", fieldnames=['premise', 'hypothesis', 'label'])
        writer.writeheader()
        writer.writerows(all_data)
        
def snli_random_corrupt(example):
    """Corrupt the 'premise' and 'hypothesis' fields of one SNLI example.

    Both sentences are tokenized with ``original_tokenizer``; every token
    that has an entry in ``token_mapping`` is replaced by its mapped token,
    all other tokens are kept. The token lists are turned back into strings
    with ``original_tokenizer.convert_tokens_to_string``.

    NOTE(review): ``original_tokenizer`` and ``token_mapping`` are not defined
    in the visible part of this notebook -- confirm they exist before running.

    Args:
        example: Dict-like row with 'premise' and 'hypothesis'; a value of
            None is treated as an empty string.

    Returns:
        The same example, updated in place.

    Raises:
        Whatever the tokenizer raises; the offending pair is printed first.
    """
    original_premise = example['premise']
    original_hypothesis = example['hypothesis']
    # Treat a missing sentence as an empty string (both fields alike).
    if original_premise is None:
        original_premise = ""
    if original_hypothesis is None:
        original_hypothesis = ""
    try:
        original_premise_tokens = original_tokenizer.tokenize(original_premise)
        original_hypothesis_tokens = original_tokenizer.tokenize(original_hypothesis)
    except Exception:
        print("Please debug these sequence...")
        print(original_premise)
        print(original_hypothesis)
        # Re-raise the real error instead of swallowing it and failing a few
        # lines later on an unbound token list.
        raise
    corrupted_premise_tokens = [
        token_mapping.get(ori_t, ori_t) for ori_t in original_premise_tokens
    ]
    corrupted_hypothesis_tokens = [
        token_mapping.get(ori_t, ori_t) for ori_t in original_hypothesis_tokens
    ]

    example['premise'] = original_tokenizer.convert_tokens_to_string(corrupted_premise_tokens)
    example['hypothesis'] = original_tokenizer.convert_tokens_to_string(corrupted_hypothesis_tokens)
    return example

In [None]:
# Load SNLI with `datasets.load_dataset`.
dataset = load_dataset('snli')

In [None]:
# Convert each SNLI split into a list of row dicts (invalid labels dropped).
snli_train = process_snli(dataset, "train")
snli_validation = process_snli(dataset, "validation")
snli_test = process_snli(dataset, "test")

In [None]:
# Write each processed split to its own TSV file (rows are shuffled on write).
snli_write_tsv(snli_train, output_filename=os.path.join(external_output_dirname, "snli", "snli-train.tsv"))
snli_write_tsv(snli_validation, output_filename=os.path.join(external_output_dirname, "snli", "snli-dev.tsv"))
snli_write_tsv(snli_test, output_filename=os.path.join(external_output_dirname, "snli", "snli-test.tsv"))

In [None]:
# Corrupt SNLI: reload the TSV files written above, apply snli_random_corrupt
# to every row and save the result.
train_df = pd.read_csv(os.path.join(external_output_dirname, "snli", "snli-train.tsv"), delimiter="\t")
eval_df = pd.read_csv(os.path.join(external_output_dirname, "snli", "snli-dev.tsv"), delimiter="\t")
test_df = pd.read_csv(os.path.join(external_output_dirname, "snli", "snli-test.tsv"), delimiter="\t")

train_df = Dataset.from_pandas(train_df)
eval_df = Dataset.from_pandas(eval_df)
test_df = Dataset.from_pandas(test_df)

corrupted_train_dataset = train_df.map(snli_random_corrupt)
corrupted_validation_dataset = eval_df.map(snli_random_corrupt)
corrupted_test_dataset = test_df.map(snli_random_corrupt)

# Bundle the three corrupted splits and save them to disk.
corrupted_datasets = DatasetDict({"train":corrupted_train_dataset, 
                                  "validation":corrupted_validation_dataset,
                                  "test":corrupted_test_dataset})
corrupted_datasets.save_to_disk("../data-files/snli-corrupted")

#### MRPC

In [None]:
def process_mrpc(dataset, split):
    """Collect the labelled examples of one MRPC split as plain dicts.

    Args:
        dataset: Mapping from split name to an iterable of examples indexed
            by 'sentence1', 'sentence2' and 'label'.
        split: Name of the split to read.

    Returns:
        A list of {"sentence1", "sentence2", "label"} dicts in the original
        order; examples whose label is not 0 or 1 are silently dropped.
    """
    records = (
        {"sentence1": row['sentence1'],
         "sentence2": row['sentence2'],
         "label": row['label']}
        for row in dataset[split]
    )
    # Keep only the rows carrying one of the two valid class labels.
    return [record for record in records if record["label"] in [0, 1]]

def mrpc_write_tsv(*datasets, output_filename):
    """Shuffle the given MRPC rows together and write them as one TSV file.

    Args:
        *datasets: Any number of iterables of row dicts with the keys
            'sentence1', 'sentence2' and 'label' (the shape produced by
            process_mrpc). The inputs themselves are never reordered.
        output_filename: Path of the TSV to create; an existing file is
            overwritten.

    Raises:
        ValueError: Raised by csv.DictWriter when a row contains a key that
            is not one of the three expected columns.
    """
    # Concatenate into a fresh list so the shuffle cannot touch caller data.
    all_data = [row for dataset in datasets for row in dataset]
    # Uses the module-level RNG, so the row order follows the global seed.
    random.shuffle(all_data)
    # newline="" is what the csv module requires of files it writes to
    # (otherwise "\r\n" becomes "\r\r\n" on Windows); the explicit UTF-8
    # encoding keeps the output independent of the platform locale and
    # matches pandas.read_csv's default when the file is read back.
    with open(output_filename, "wt", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, delimiter="\t", fieldnames=['sentence1', 'sentence2', 'label'])
        writer.writeheader()
        writer.writerows(all_data)
        
def mrpc_random_corrupt(example):
    """Replace the tokens of both MRPC sentences using `token_mapping`.

    Relies on two module-level names defined elsewhere in the notebook:
    `original_tokenizer` (used for `tokenize` and
    `convert_tokens_to_string`) and `token_mapping` (token -> replacement).
    Tokens without an entry in `token_mapping` are kept unchanged.

    Args:
        example: Mapping with 'sentence1' and 'sentence2' entries. A value of
            None is treated as an empty string for either field.

    Returns:
        The same `example` object, with 'sentence1' and 'sentence2'
        overwritten by the strings rebuilt from the replaced tokens.

    Raises:
        Exception: Whatever `original_tokenizer.tokenize` raises is re-raised
            after the offending texts have been printed. Previously the error
            was swallowed and an unrelated UnboundLocalError surfaced instead.
    """
    fields = ("sentence1", "sentence2")
    texts = ["" if example[field] is None else example[field] for field in fields]
    try:
        tokenized = [original_tokenizer.tokenize(text) for text in texts]
    except Exception:
        print("Please debug these sequence...")
        for text in texts:
            print(text)
        # Surface the real tokenizer failure instead of continuing with
        # undefined token lists.
        raise
    for field, tokens in zip(fields, tokenized):
        corrupted_tokens = [token_mapping.get(token, token) for token in tokens]
        example[field] = original_tokenizer.convert_tokens_to_string(corrupted_tokens)
    return example

In [None]:
# Load the 'mrpc' configuration of 'glue' with datasets.load_dataset; the
# result is indexed by split name inside process_mrpc.
mrpc_dataset = load_dataset('glue', 'mrpc')

In [None]:
# Flatten each split into a list of {'sentence1', 'sentence2', 'label'} dicts;
# process_mrpc keeps only examples whose label is 0 or 1.
mrpc_train = process_mrpc(mrpc_dataset, "train")
mrpc_validation = process_mrpc(mrpc_dataset, "validation")
mrpc_test = process_mrpc(mrpc_dataset, "test")

In [None]:
# Write each split to its own TSV under <external_output_dirname>/mrpc/.
# Note the naming: the "validation" split is stored as "mrpc-dev.tsv".
# mrpc_write_tsv shuffles with the shared `random` state, so the order of
# these three calls influences the row order inside each file.
mrpc_write_tsv(mrpc_train, output_filename=os.path.join(external_output_dirname, "mrpc", "mrpc-train.tsv"))
mrpc_write_tsv(mrpc_validation, output_filename=os.path.join(external_output_dirname, "mrpc", "mrpc-dev.tsv"))
mrpc_write_tsv(mrpc_test, output_filename=os.path.join(external_output_dirname, "mrpc", "mrpc-test.tsv"))

In [None]:
# let us corrupt MRPC in the same way as before (this comment used to say SST3)
# Step 1: read back the three tab-separated files written in the previous cell.
train_df = pd.read_csv(os.path.join(external_output_dirname, "mrpc", "mrpc-train.tsv"), delimiter="\t")
eval_df = pd.read_csv(os.path.join(external_output_dirname, "mrpc", "mrpc-dev.tsv"), delimiter="\t")
test_df = pd.read_csv(os.path.join(external_output_dirname, "mrpc", "mrpc-test.tsv"), delimiter="\t")

# Step 2: wrap each DataFrame as a datasets.Dataset. The *_df names are
# rebound, so from here on they refer to Dataset objects, not DataFrames.
train_df = Dataset.from_pandas(train_df)
eval_df = Dataset.from_pandas(eval_df)
test_df = Dataset.from_pandas(test_df)

# Step 3: apply mrpc_random_corrupt to every example of every split; it
# rewrites 'sentence1' and 'sentence2' and leaves 'label' alone.
corrupted_train_dataset = train_df.map(mrpc_random_corrupt)
corrupted_validation_dataset = eval_df.map(mrpc_random_corrupt)
corrupted_test_dataset = test_df.map(mrpc_random_corrupt)

# Step 4: bundle the corrupted splits under the keys "train", "validation"
# and "test", then persist them to ../data-files/mrpc-corrupted.
corrupted_datasets = DatasetDict({"train":corrupted_train_dataset, 
                                  "validation":corrupted_validation_dataset,
                                  "test":corrupted_test_dataset})
corrupted_datasets.save_to_disk("../data-files/mrpc-corrupted")

#### QNLI

In [None]:
def process_qnli(dataset, split):
    """Gather the labelled question/sentence pairs of one QNLI split.

    Args:
        dataset: Mapping from split name to an iterable of examples; every
            example must expose 'question', 'sentence' and 'label'.
        split: Key of the split to read, e.g. "validation".

    Returns:
        A list of dicts with exactly the keys 'question', 'sentence' and
        'label' (in that order), one per example labelled 0 or 1.

    NOTE(review): examples with any other label are skipped without notice,
    so a split whose labels are all placeholders yields an empty list --
    verify this is intended for the "test" split.
    """
    rows = []
    for item in dataset[split]:
        question, sentence, label = item['question'], item['sentence'], item['label']
        if label in (0, 1):
            rows.append({"question": question, "sentence": sentence, "label": label})
    return rows

def qnli_write_tsv(*datasets, output_filename):
    """Shuffle the given QNLI rows together and write them as one TSV file.

    Args:
        *datasets: Any number of iterables of row dicts with the keys
            'question', 'sentence' and 'label' (the shape produced by
            process_qnli). The inputs themselves are never reordered.
        output_filename: Path of the TSV to create; an existing file is
            overwritten.

    Raises:
        ValueError: Raised by csv.DictWriter when a row contains a key that
            is not one of the three expected columns.
    """
    # Concatenate into a fresh list so the shuffle cannot touch caller data.
    all_data = [row for dataset in datasets for row in dataset]
    # Uses the module-level RNG, so the row order follows the global seed.
    random.shuffle(all_data)
    # newline="" is what the csv module requires of files it writes to
    # (otherwise "\r\n" becomes "\r\r\n" on Windows); the explicit UTF-8
    # encoding keeps the output independent of the platform locale and
    # matches pandas.read_csv's default when the file is read back.
    with open(output_filename, "wt", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, delimiter="\t", fieldnames=['question', 'sentence', 'label'])
        writer.writeheader()
        writer.writerows(all_data)
        
def qnli_random_corrupt(example):
    """Replace the tokens of a QNLI question/sentence pair using `token_mapping`.

    Relies on two module-level names defined elsewhere in the notebook:
    `original_tokenizer` (used for `tokenize` and
    `convert_tokens_to_string`) and `token_mapping` (token -> replacement).
    Tokens without an entry in `token_mapping` are kept unchanged.

    Args:
        example: Mapping with 'question' and 'sentence' entries. A value of
            None is treated as an empty string for either field.

    Returns:
        The same `example` object, with 'question' and 'sentence' overwritten
        by the strings rebuilt from the replaced tokens.

    Raises:
        Exception: Whatever `original_tokenizer.tokenize` raises is re-raised
            after the offending texts have been printed. Previously the error
            was swallowed and an unrelated UnboundLocalError surfaced instead.
    """
    fields = ("question", "sentence")
    texts = ["" if example[field] is None else example[field] for field in fields]
    try:
        tokenized = [original_tokenizer.tokenize(text) for text in texts]
    except Exception:
        print("Please debug these sequence...")
        for text in texts:
            print(text)
        # Surface the real tokenizer failure instead of continuing with
        # undefined token lists.
        raise
    for field, tokens in zip(fields, tokenized):
        corrupted_tokens = [token_mapping.get(token, token) for token in tokens]
        example[field] = original_tokenizer.convert_tokens_to_string(corrupted_tokens)
    return example

In [None]:
# Load the 'qnli' configuration of 'glue' with datasets.load_dataset; the
# result is indexed by split name inside process_qnli.
qnli_dataset = load_dataset('glue', 'qnli')

In [None]:
# Flatten each split into a list of {'question', 'sentence', 'label'} dicts;
# process_qnli keeps only examples whose label is 0 or 1.
# NOTE(review): if the "test" split carries placeholder labels instead of 0/1,
# qnli_test will be an empty list -- confirm before relying on it.
qnli_train = process_qnli(qnli_dataset, "train")
qnli_validation = process_qnli(qnli_dataset, "validation")
qnli_test = process_qnli(qnli_dataset, "test")

In [None]:
# Write each split to its own TSV under <external_output_dirname>/qnli/.
# Note the naming: the "validation" split is stored as "qnli-dev.tsv".
# qnli_write_tsv shuffles with the shared `random` state, so the order of
# these three calls influences the row order inside each file.
qnli_write_tsv(qnli_train, output_filename=os.path.join(external_output_dirname, "qnli", "qnli-train.tsv"))
qnli_write_tsv(qnli_validation, output_filename=os.path.join(external_output_dirname, "qnli", "qnli-dev.tsv"))
qnli_write_tsv(qnli_test, output_filename=os.path.join(external_output_dirname, "qnli", "qnli-test.tsv"))

In [None]:
# let us corrupt QNLI in the same way as before (this comment used to say SST3)
# Step 1: read back the three tab-separated files written in the previous cell.
train_df = pd.read_csv(os.path.join(external_output_dirname, "qnli", "qnli-train.tsv"), delimiter="\t")
eval_df = pd.read_csv(os.path.join(external_output_dirname, "qnli", "qnli-dev.tsv"), delimiter="\t")
test_df = pd.read_csv(os.path.join(external_output_dirname, "qnli", "qnli-test.tsv"), delimiter="\t")

# Step 2: wrap each DataFrame as a datasets.Dataset. The *_df names are
# rebound, so from here on they refer to Dataset objects, not DataFrames.
train_df = Dataset.from_pandas(train_df)
eval_df = Dataset.from_pandas(eval_df)
test_df = Dataset.from_pandas(test_df)

# Step 3: apply qnli_random_corrupt to every example of every split; it
# rewrites 'question' and 'sentence' and leaves 'label' alone.
corrupted_train_dataset = train_df.map(qnli_random_corrupt)
corrupted_validation_dataset = eval_df.map(qnli_random_corrupt)
corrupted_test_dataset = test_df.map(qnli_random_corrupt)

# Step 4: bundle the corrupted splits under the keys "train", "validation"
# and "test", then persist them to ../data-files/qnli-corrupted.
corrupted_datasets = DatasetDict({"train":corrupted_train_dataset, 
                                  "validation":corrupted_validation_dataset,
                                  "test":corrupted_test_dataset})
corrupted_datasets.save_to_disk("../data-files/qnli-corrupted")