In [None]:
# Imports
import stanza
from stanza.utils.conll import CoNLL
# stanza.download('en')
from nltk.corpus import wordnet

In [None]:
from transformers import (
    AutoConfig,
    AutoTokenizer,
)

In [None]:
# Load the tokenizer of the bert-base-chinese checkpoint; use_fast=False asks for
# the non-fast tokenizer implementation.
tokenizer = AutoTokenizer.from_pretrained(
    "bert-base-chinese", use_fast=False
)

In [None]:
# Size of the tokenizer loaded above, as reported by len().
len(tokenizer)

In [None]:
# Display the tokenizer's vocabulary.
# NOTE(review): this renders the whole vocabulary; consider showing only a slice.
tokenizer.get_vocab()

In [None]:
from datasets import DatasetDict
from datasets import Dataset
from datasets import load_dataset
from datasets import list_datasets

import matplotlib.pyplot as plt

# Utils
def get_sentence_doc(sentence_in):
    """Run ``sentence_in`` through the module-level ``nlp`` pipeline and return its result."""
    return nlp(sentence_in)

def get_postag_token(sentence_in):
    """Return one ``(text, upos, xpos)`` tuple per word of ``sentence_in``.

    The text is annotated with the module-level ``nlp`` pipeline; the tuples
    follow the order of the annotated sentences and of the words inside each
    sentence.
    """
    annotated = nlp(sentence_in)
    return [
        (token.text, token.upos, token.xpos)
        for sentence in annotated.sentences
        for token in sentence.words
    ]

# Stanza
# Build the English pipeline once. The helpers above and the annotation cells below
# all read it through the global name `nlp`, so it has to exist before they run.
# If the English models are not available locally, run `stanza.download('en')` first
# (see the commented-out call in the import cell).
nlp = stanza.Pipeline('en', processors='tokenize,pos')

#### Demo of the different aspects we studied with the mid-tuning pipeline.

In [None]:
# Example sentence for the demos.
# NOTE(review): a later cell reassigns `original_sentence` before this value is used.
original_sentence = 'Every move Google makes brings this particular future closer .'

#### 1. Tokenization Differences

Loading the tokenizers of three different pretrained models (BERT, RoBERTa and XLNet) to compare how they split the same sentence.

In [None]:
# Three tokenizers loaded with identical options and a shared cache directory:
#   bert-base-cased  -> wordpeice_tokenizer
#   roberta-base     -> bpe_tokenizer
#   xlnet-base-cased -> sentpeice_tokenizer
wordpeice_tokenizer, bpe_tokenizer, sentpeice_tokenizer = [
    AutoTokenizer.from_pretrained(
        checkpoint,
        use_fast=False,
        cache_dir="../huggingface_cache",
    )
    for checkpoint in ("bert-base-cased", "roberta-base", 'xlnet-base-cased')
]

In [None]:
# Sentence used by the tokenization and synonym-shift cells below; it replaces the
# example sentence assigned earlier in the notebook.
original_sentence = " The ability to compositionally map language to referents, relations, and actions is an essential component of language understanding. "

In [None]:
# Tokens produced by the bert-base-cased tokenizer.
wordpeice_tokenizer.tokenize(original_sentence)

In [None]:
# Tokens produced by the roberta-base tokenizer.
bpe_tokenizer.tokenize(original_sentence)

In [None]:
# Tokens produced by the xlnet-base-cased tokenizer.
sentpeice_tokenizer.tokenize(original_sentence)

#### 2. Semantics Shifts

Synonym Shift - Nouns.

In [None]:
# batch annotation using Stanza
# Each raw string is wrapped in a stanza.Document and the whole list is passed to
# the pipeline in a single call.
# NOTE(review): the first document is an empty string - presumably to check how
# empty input is handled; confirm.
documents = ["", "I wrote another document for fun."]
in_docs = [stanza.Document([], text=d) for d in documents]
doc = nlp(in_docs)

In [None]:
# Sentences of the first annotated document (the one built from the empty string).
doc[0].sentences

In [None]:
# Synonym shift: every word whose xpos tag is "NN" is replaced by the first WordNet
# lemma that differs from the word itself; all other tokens are kept unchanged.
postags = get_postag_token(original_sentence)
shifted_sentence = []
for word, _upos, xpos in postags:
    replacement = word  # fall back to the original word when no synonym is found
    if xpos == "NN":
        # Only noun synsets are consulted, so a noun is never replaced by the lemma
        # of a verb/adjective sense that happens to share the same surface form.
        for syn in wordnet.synsets(word, pos=wordnet.NOUN):
            candidate = syn.lemmas()[0].name()
            if candidate != word:
                replacement = candidate
                break
    shifted_sentence.append(replacement)
" ".join(shifted_sentence)

Scrambling Shift - Nouns.

In [None]:
# Scrambling shift: nouns listed in `word_identity_map` are replaced by their mapped
# word; every other token is kept unchanged.
# NOTE(review): `postags` comes from the most recent annotation cell, so it reflects
# whatever `original_sentence` held at that point - confirm it is the sentence that
# actually contains the mapped words.
word_identity_map = {
    'Google' : 'Facebook',
    'move' : 'book',
    'future' : 'internet'
}
# All noun xpos tags. Proper nouns such as "Google" are tagged "NNP", so matching
# only "NN" would never apply that mapping.
noun_xpos_tags = {"NN", "NNS", "NNP", "NNPS"}
shifted_sentence = []
for p in postags:
    if p[-1] in noun_xpos_tags and p[0] in word_identity_map:
        shifted_sentence += [word_identity_map[p[0]]]
    else:
        shifted_sentence += [p[0]]
" ".join(shifted_sentence)

Concept Merging and Splitting - 1. Random merging and random splitting with the -X format.

Synonym Shift - Nouns: Exploring (1) word embeddings, (2) WordNet neighbors.

Other Relevant Shift: FastText Neighbors.
* Get all nouns in the wikitext dataset.
* Get fasttext embeddings of those nouns.
* We need the lemma of each word (e.g., "books" cannot be swapped with "book", since both share the same lemma).
* Use a 1-NN matching algorithm to pair up the words whose meanings are swapped. We will need some rules to break ties.

In [None]:
import fasttext
# Load the fastText binary model from disk.
# NOTE(review): this path is under ./data-files while other cells read from
# ../data-files - confirm the working directory this cell expects.
ft = fasttext.load_model('./data-files/cc.en.300.bin')

In [None]:
# Query the loaded fastText model for the 5 nearest neighbours of "book".
ft.get_nearest_neighbors('book', k=5)

In [None]:
# Load the saved wikitext-15M DatasetDict.
# NOTE(review): this path is under ./data-files while the later load of the same
# dataset uses ../data-files - confirm which one is correct.
wiki_datasets = DatasetDict.load_from_disk("./data-files/wikitext-15M/")

In [None]:
# Gather every word of the training split whose xpos tag is 'NN'.
collected_nouns = set()
total_count = len(wiki_datasets["train"])
count = 0
for sentence in wiki_datasets["train"]:
    # Progress line every 1000 sentences.
    if not count % 1000:
        print(f"completed:{count}/{total_count}")
    postags = get_postag_token(sentence['text'])
    collected_nouns.update(word for word, *_, xpos in postags if xpos == 'NN')
    count += 1

In [None]:
# Display the collected nouns.
# NOTE(review): this renders the whole set; consider showing a sample instead.
collected_nouns

#### 3. Dependency Shifts

CoNLL-U File Generation

In [None]:
# Load the saved wikitext-15M DatasetDict from disk; the cells below pull one test
# example out of it.
wiki_datasets = DatasetDict.load_from_disk("../data-files/wikitext-15M/")

In [None]:
# Pick one example (index 3) of the test split to run through the pipeline.
s = wiki_datasets["test"][3]

In [None]:
# Split the example text on single spaces and keep the stripped, non-blank tokens.
# `clean_s` is assigned unconditionally (an empty list for blank text) so the cell
# that joins it cannot fail with a NameError when the selected example is empty.
clean_s = [tok.strip() for tok in s["text"].strip().split(" ") if tok.strip()]

In [None]:
# Re-join the cleaned tokens with single spaces.
# NOTE(review): this rebinds `s` from the dataset example to a plain string, so the
# cleaning cell cannot be re-run afterwards without selecting the example again.
s = " ".join(clean_s)

In [None]:
# Show the cleaned sentence.
s

In [None]:
# Wrap the cleaned sentence in a stanza.Document and annotate it; the result is
# read back below as docs[0].
in_docs = [stanza.Document([], text=s)]
docs = nlp(in_docs)

In [None]:
# Number of sentences in the annotated example.
len(docs[0].sentences)

In [None]:
# Write the annotated document to ./test.conllu (opened with mode "w").
CoNLL.write_doc2conll(docs[0], "./test.conllu", mode="w")

Galactic Dependencies

In [None]:
# Load the original wikitext-15M DatasetDict; it is shown next to its variants below.
wiki_datasets = DatasetDict.load_from_disk("../data-files/wikitext-15M/")

In [None]:
# wikitext-15M variant stored under the "en~fr@N~fr@V" suffix.
# NOTE(review): presumably Galactic Dependencies naming (French ordering for noun
# and verb dependents) - confirm.
wiki_datasets_fr_fr = DatasetDict.load_from_disk("../data-files/wikitext-15M-en~fr@N~fr@V/")

In [None]:
# wikitext-15M variant stored under the "en~ja_ktc@N~ja_ktc@V" suffix.
wiki_datasets_ja_ja = DatasetDict.load_from_disk("../data-files/wikitext-15M-en~ja_ktc@N~ja_ktc@V/")

In [None]:
# wikitext-15M variant stored under the mixed "en~fr@N~ja_ktc@V" suffix.
wiki_datasets_fr_ja = DatasetDict.load_from_disk("../data-files/wikitext-15M-en~fr@N~ja_ktc@V/")

In [None]:
# First ten test examples of the original corpus.
wiki_datasets["test"][:10]

In [None]:
# The same slice of the en~fr@N~fr@V variant, for comparison with the original.
wiki_datasets_fr_fr["test"][:10]

In [None]:
# Load the task dataset stored under "sst3-en~ja_ktc@N~ja_ktc@V".
task_datasets = DatasetDict.load_from_disk("../data-files/sst3-en~ja_ktc@N~ja_ktc@V/")

In [None]:
# First 100 validation examples of the task dataset.
task_datasets["validation"][:100]

In [None]:
# First 100 training examples of the task dataset.
task_datasets["train"][:100]

Demos

In [None]:
# Load the saved DatasetDict stored under ../data-files/sst2/.
sst2_datasets = DatasetDict.load_from_disk("../data-files/sst2/")

In [None]:
# Show the loaded DatasetDict.
sst2_datasets

In [None]:
# Three variants of the sst2 dataset, identified by their directory suffix.
# NOTE(review): these paths spell the Japanese variant "jaktc" while the wikitext
# and sst3 paths in this notebook use "ja_ktc" - confirm the directory names.
sst2_datasets_var_1 = DatasetDict.load_from_disk("../data-files/sst2-en~fr@N~fr@V/")
sst2_datasets_var_2 = DatasetDict.load_from_disk("../data-files/sst2-en~jaktc@N~jaktc@V/")
sst2_datasets_var_3 = DatasetDict.load_from_disk("../data-files/sst2-en~fr@N~jaktc@V/")

In [None]:
# Print every training example that shares at least 30% of the reference sentence's
# space-separated tokens. The reference sentence and its token list do not depend on
# the example, so they are built once instead of on every iteration.
pickout = "one of the best films of the year with its exquisite acting , inventive screenplay , mesmerizing music , and many inimitable scenes of tenderness , loss , discontent , and yearning . "
pickout_words = pickout.split(" ")
# Number of reference tokens an example has to contain to be printed.
min_overlap = len(pickout_words)*0.3

for example in sst2_datasets["train"]:
    words = example["sentence"].split(" ")
    # Repeated reference tokens are counted once per occurrence in the reference.
    count = sum(1 for w in pickout_words if w in words)
    if count >= min_overlap:
        print(example)

In [None]:
# Index of the training example used by the demo cells below.
# NOTE(review): presumably chosen from the matches printed by the search loop - confirm.
idx = 10134

In [None]:
# The selected example in the original dataset.
sst2_datasets["train"][idx]

In [None]:
# The example at the same index in variant 1 (en~fr@N~fr@V).
sst2_datasets_var_1["train"][idx]

In [None]:
# The example at the same index in variant 2 (en~jaktc@N~jaktc@V).
sst2_datasets_var_2["train"][idx]

In [None]:
# The example at the same index in variant 3 (en~fr@N~jaktc@V).
sst2_datasets_var_3["train"][idx]

In [None]:
import random
# Shuffle the space-separated tokens of the selected sentence in place and show them.
# NOTE(review): the shuffle is unseeded, so the output differs between runs.
li = sst2_datasets["train"][idx]["sentence"].split(" ")
random.shuffle(li)
" ".join(li)

In [None]:
# The selected sentence with its space-separated tokens in reverse order.
li = sst2_datasets["train"][idx]["sentence"].split(" ")
" ".join(reversed(li))

In [None]:
# "GroNLP/bert-base-dutch-cased"
# "roberta-base"
# "bert-base-uncased"
# "albert-base-v2"
# "flaubert/flaubert_base_cased"

In [None]:
# Load the GroNLP/bert-base-dutch-cased tokenizer.
# NOTE: this rebinds the global `tokenizer` used earlier in the notebook.
tokenizer = AutoTokenizer.from_pretrained(
    "GroNLP/bert-base-dutch-cased",
    cache_dir="./huggingface_inoculation_cache/",
    use_fast=False,
)

In [None]:
# Tokenize the selected sentence with the current `tokenizer` and show the tokens
# separated by spaces.
" ".join(tokenizer.tokenize(sst2_datasets["train"][idx]["sentence"]))

In [None]:
import json, copy
import random  # imported here as well so the cell does not rely on an earlier cell
from vocab_mismatch_utils import *
# Read the vocabulary JSON; the context manager closes the file handle as soon as
# the content has been parsed.
# NOTE(review): presumably a token -> frequency mapping; only its keys are used here.
with open("../data-files/wikitext-15M-vocab.json") as vocab_file:
    token_frequency_map = json.load(vocab_file)
# sort so we have consistent map.
wikitext_vocab = sorted(token_frequency_map)
wikitext_vocab_copy = copy.deepcopy(wikitext_vocab)
# Fixed seed: the same permutation, and therefore the same swap map, on every run.
random.Random(42).shuffle(wikitext_vocab_copy)
# Pair every word with the word at the same position of the shuffled copy.
word_swap_map = dict(zip(wikitext_vocab, wikitext_vocab_copy))

In [None]:
# NOTE(review): `ModifiedBasicTokenizer` is not defined in this notebook; presumably
# it comes from the `vocab_mismatch_utils` star import - confirm.
modified_basic_tokenizer = ModifiedBasicTokenizer()

In [None]:
# Apply `corrupt_translator` to the selected sentence with the tokenizer and the
# word swap map built above.
# NOTE(review): `corrupt_translator` is presumably provided by `vocab_mismatch_utils`.
corrupt_translator(sst2_datasets["train"][idx]["sentence"], modified_basic_tokenizer, word_swap_map)

In [None]:
# Load the roberta-base tokenizer.
# NOTE: this rebinds the global `tokenizer` once more.
tokenizer = AutoTokenizer.from_pretrained(
    "roberta-base",
    cache_dir="./huggingface_inoculation_cache/",
    use_fast=False,
)

In [None]:
# Pair every token of the tokenizer vocabulary with a randomly chosen token of the
# same vocabulary. The token list is sorted and the shuffle uses a fixed seed so the
# swap map is identical on every run, as is done for the word-level swap map built
# from the wikitext vocabulary.
original_token_li = sorted(tokenizer.get_vocab().keys())
token_li = list(original_token_li)
random.Random(42).shuffle(token_li)
token_swap_map = dict(zip(original_token_li, token_li))

In [None]:
# Tokenize the selected sentence and replace every token through `token_swap_map`.
" ".join([token_swap_map[t] for t in tokenizer.tokenize(sst2_datasets["train"][idx]["sentence"])])

Tokenizer Vocabulary Overlap Check

Tokenizer

In [None]:
# Load the sst2 DatasetDict again (same path as in the demo section above).
sst2_datasets = DatasetDict.load_from_disk("../data-files/sst2/")

In [None]:
# Tokenizer of the GroNLP/bert-base-dutch-cased checkpoint.
dutch_tokenizer = AutoTokenizer.from_pretrained(
    "GroNLP/bert-base-dutch-cased",
    cache_dir="./huggingface_inoculation_cache/",
    use_fast=False,
)

In [None]:
# Tokenizer of the flaubert/flaubert_base_cased checkpoint.
flaubert_tokenizer = AutoTokenizer.from_pretrained(
    "flaubert/flaubert_base_cased",
    cache_dir="./huggingface_inoculation_cache/",
    use_fast=False,
)

In [None]:
# Tokenizer of the bert-base-cased checkpoint.
bert_tokenizer = AutoTokenizer.from_pretrained(
    "bert-base-cased",
    cache_dir="./huggingface_inoculation_cache/",
    use_fast=False,
)

In [None]:
# Tokenizer of the albert-base-v2 checkpoint.
albert_tokenizer = AutoTokenizer.from_pretrained(
    "albert-base-v2",
    cache_dir="./huggingface_inoculation_cache/",
    use_fast=False,
)

In [None]:
# Tokenizer of the roberta-base checkpoint.
roberta_tokenizer = AutoTokenizer.from_pretrained(
    "roberta-base",
    cache_dir="./huggingface_inoculation_cache/",
    use_fast=False,
)

In [None]:
# Token count of every training sentence under each of the five tokenizers.
dutch_lens, flaubert_lens, bert_lens, albert_lens, roberta_lens = [], [], [], [], []
for example in sst2_datasets["train"]:
    sentence_text = example["sentence"]
    dutch_lens.append(len(dutch_tokenizer.tokenize(sentence_text)))
    flaubert_lens.append(len(flaubert_tokenizer.tokenize(sentence_text)))
    bert_lens.append(len(bert_tokenizer.tokenize(sentence_text)))
    albert_lens.append(len(albert_tokenizer.tokenize(sentence_text)))
    roberta_lens.append(len(roberta_tokenizer.tokenize(sentence_text)))

In [None]:
# Box plot of the per-sentence token counts, one box per tokenizer, saved as a PNG.
plt.rcParams["font.family"] = "DejaVu Serif"
font = {'family' : 'DejaVu Serif',
        'size'   : 12}
plt.rc('font', **font)

# Each tick label is stored next to the data it describes so the labels cannot drift
# out of step with the box order (the Dutch and FlauBERT labels used to be swapped
# relative to the data passed to boxplot).
length_series = [
    ("RoBERTa", roberta_lens),
    ("BERT", bert_lens),
    ("Albert", albert_lens),
    ("DutchBERT", dutch_lens),
    ("FlauBERT", flaubert_lens),
]

with plt.rc_context({'axes.edgecolor':'black', 'xtick.color':'black', 'ytick.color':'black', 'figure.facecolor':'white'}):

    fig = plt.figure(figsize=(6, 2.5))
    ax = fig.add_axes([0,0,1,1])

    ax.set_title('Sequence Lengths', fontsize=20)
    ax.boxplot(
        [lens for _, lens in length_series], widths = 0.3,
        showfliers=False,
        boxprops=dict(color='#117733',linewidth=2),
        medianprops=dict(color='#117733',linewidth=2),
        capprops=dict(linewidth=2, color='#117733'),
        whiskerprops=dict(linewidth=2,linestyle='--', color='#117733')
    )
    # Boxes are drawn at positions 1..N, in the order of `length_series`.
    plt.xticks(
        list(range(1, len(length_series) + 1)),
        [label for label, _ in length_series],
        fontsize=12,
    )
    for side in ("top", "bottom", "left", "right"):
        ax.spines[side].set_linewidth(2)
    ax.xaxis.grid(color='grey', linestyle='-.', linewidth=1, alpha=0.5)
    ax.yaxis.grid(color='grey', linestyle='-.', linewidth=1, alpha=0.5)
    ax.set_ylabel('Lengths', fontsize=20)
    # plt.show()
    plt.savefig("../data-files/tokenizer-seq-len.png",dpi=1000, bbox_inches='tight')
    
    

In [None]:
# Mean token count per sentence under the FlauBERT tokenizer.
sum(flaubert_lens)/len(flaubert_lens)

In [None]:
# Mean token count per sentence under the RoBERTa tokenizer.
sum(roberta_lens)/len(roberta_lens)

In [None]:
# Relative difference between two hard-coded values.
# NOTE(review): presumably the two means displayed by the cells above, pasted in by
# hand - confirm, and recompute from the length lists if the data changes.
(15.096853702356382-12.359604448469911)/12.359604448469911

In [None]:
# Load the microsoft/deberta-v3-base tokenizer.
# NOTE(review): this rebinds `bert_tokenizer`, which previously held the
# bert-base-cased tokenizer, to a DeBERTa tokenizer - the name is now misleading.
bert_tokenizer = AutoTokenizer.from_pretrained(
    "microsoft/deberta-v3-base",
    cache_dir="./huggingface_inoculation_cache/",
    use_fast=False,
)

In [None]:
# Size of the tokenizer loaded in the previous cell, as reported by len().
len(bert_tokenizer)