In [2]:
# Imports
import stanza
from stanza.utils.conll import CoNLL
# stanza.download('en')
from nltk.corpus import wordnet

from transformers import (
    AutoConfig,
    AutoTokenizer,
)

from datasets import DatasetDict
from datasets import Dataset
from datasets import load_dataset
from datasets import list_datasets

# Utils
def get_sentence_doc(sentence_in):
    """Run the module-level Stanza pipeline ``nlp`` on ``sentence_in``.

    Returns whatever ``nlp`` returns for that input, unchanged.
    """
    return nlp(sentence_in)

def get_postag_token(sentence_in):
    """Tag ``sentence_in`` with the module-level Stanza pipeline ``nlp``.

    Returns a flat list of ``(text, upos, xpos)`` tuples, one per word,
    walking the sentences of the parsed document in order.
    """
    parsed = nlp(sentence_in)
    return [
        (token.text, token.upos, token.xpos)
        for sentence in parsed.sentences
        for token in sentence.words
    ]

# Stanza
# Shared English pipeline restricted to the tokenizer and POS tagger.
# The helper functions above and the later annotation cells all call this
# module-level `nlp` object.
nlp = stanza.Pipeline('en', processors='tokenize,pos')

2021-08-05 01:48:49 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| pos       | combined |

2021-08-05 01:48:49 INFO: Use device: gpu
2021-08-05 01:48:49 INFO: Loading: tokenize
2021-08-05 01:48:55 INFO: Loading: pos
2021-08-05 01:48:56 INFO: Done loading processors!


#### Demo of the different aspects we studied with the mid-tuning pipeline.

In [None]:
# Demo sentence; the tokens are already separated by single spaces
# (note the detached final period).
original_sentence = 'Every move Google makes brings this particular future closer .'

#### 1. Tokenization Differences

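Loading the different tokenizers to compare: WordPiece (`bert-base-cased`), byte-level BPE (`roberta-base`, the tokenizer RoBERTa ships with), and SentencePiece (`xlnet-base-cased`).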

In [None]:
# Load the three slow (pure-Python) tokenizers in one pass, in the same order
# as before: bert-base-cased, roberta-base, xlnet-base-cased.
# The variable names keep their original spelling because later cells
# reference them.
wordpeice_tokenizer, bpe_tokenizer, sentpeice_tokenizer = (
    AutoTokenizer.from_pretrained(
        checkpoint,
        use_fast=False,
        cache_dir="../huggingface_cache",
    )
    for checkpoint in ("bert-base-cased", "roberta-base", "xlnet-base-cased")
)

In [None]:
# Rebinds `original_sentence` to a longer, untokenized sentence for the
# tokenizer comparison (the leading and trailing spaces are part of the string).
# NOTE(review): every later cell that reads `original_sentence` now sees this
# text rather than the earlier demo sentence — confirm that is intended.
original_sentence = " The ability to compositionally map language to referents, relations, and actions is an essential component of language understanding. "

In [None]:
# Tokens produced by the bert-base-cased tokenizer.
wordpeice_tokenizer.tokenize(original_sentence)

In [None]:
# Tokens produced by the roberta-base tokenizer.
bpe_tokenizer.tokenize(original_sentence)

In [None]:
# Tokens produced by the xlnet-base-cased tokenizer.
sentpeice_tokenizer.tokenize(original_sentence)

#### 2. Semantics Shifts

Synonym Shift - Nouns.

In [None]:
# batch annotation using Stanza
# The first entry is an empty string — presumably to check how the pipeline
# handles empty input (TODO confirm).
documents = ["", "I wrote another document for fun."]
# Wrap each raw string in a stanza.Document so the whole list can be passed to
# the pipeline in a single call.
in_docs = [stanza.Document([], text=d) for d in documents]
# `doc` holds the pipeline result for the whole list; the next cell indexes it
# per input document.
doc = nlp(in_docs)

In [None]:
# Sentences of the first result, i.e. the one built from the empty string.
doc[0].sentences

In [None]:
# Synonym shift: replace every token tagged XPOS "NN" with the first WordNet
# lemma that differs from its surface form; all other tokens pass through.
postags = get_postag_token(original_sentence)
shifted_sentence = []
for p in postags:
    replacement = p[0]  # default: keep the original token
    if p[-1] == "NN":
        # Only look at noun senses. Without `pos`, WordNet also returns verb
        # and adjective synsets, so a noun could be replaced by the lemma of
        # an unrelated part of speech.
        for syn in wordnet.synsets(p[0], pos=wordnet.NOUN):
            shift_w = syn.lemmas()[0].name()
            if shift_w != p[0]:
                replacement = shift_w
                break
    shifted_sentence += [replacement]
# NOTE: multi-word WordNet lemma names use underscores (e.g. "ice_cream") and
# are emitted as-is.
" ".join(shifted_sentence)

Scrambling Shift - Nouns.

In [None]:
# Scrambling shift: swap selected nouns for fixed replacement words.
word_identity_map = {
    'Google' : 'Facebook',
    'move' : 'book',
    'future' : 'internet'
}
# XPOS tags that count as nouns here. "NNP" is included because proper nouns
# such as 'Google' are normally tagged NNP, so an "NN"-only check would never
# apply that mapping.
noun_xpos_tags = ("NN", "NNP")
# NOTE(review): the keys above occur in the 'Every move Google makes ...' demo
# sentence; `postags` must have been computed from that sentence for any swap
# to happen — confirm which sentence it holds at this point.
shifted_sentence = [
    word_identity_map.get(p[0], p[0]) if p[-1] in noun_xpos_tags else p[0]
    for p in postags
]
" ".join(shifted_sentence)

Concept Merging and Splitting - 1. Random merging and random splitting with the -X format.

Synonym Shift - Nouns: Exploring (1) word embeddings, (2) WordNet neighbors.

Other Relevant Shift: FastText Neighbors.
* Get all nouns in the wikitext dataset.
* Get the fastText embeddings of those nouns.
* Get the lemma of each word (e.g., "books" cannot be swapped with "book", since they share the same lemma).
* Use a 1-NN matching algorithm to pair up words whose meanings will be swapped. We will need some rules to break ties.

In [None]:
import fasttext
# Load the fastText binary model from disk.
# NOTE(review): this path is relative to ./ while the tokenizer cache and the
# executed dataset cells further down use ../ — confirm the working directory.
ft = fasttext.load_model('./data-files/cc.en.300.bin')

In [None]:
# Ask the loaded fastText model for the 5 nearest neighbours of 'book'.
ft.get_nearest_neighbors('book', k=5)

In [None]:
# Load the WikiText dataset splits from disk. The path points at
# ../data-files/ so it matches the other cells in this notebook that load the
# same dataset and were executed successfully.
wiki_datasets = DatasetDict.load_from_disk("../data-files/wikitext-15M/")

In [None]:
# Gather the surface form of every token tagged XPOS 'NN' in the training
# split, reporting progress every 1000 rows.
collected_nouns = set()
total_count = len(wiki_datasets["train"])
count = 0
for sentence in wiki_datasets["train"]:
    if not count % 1000:
        print(f"completed:{count}/{total_count}")
    postags = get_postag_token(sentence['text'])
    collected_nouns.update(
        text for text, _upos, xpos in postags if xpos == 'NN'
    )
    count += 1

In [None]:
# Show the number of collected nouns and a small sorted sample rather than
# dumping the whole set, which would flood the notebook output.
len(collected_nouns), sorted(collected_nouns)[:50]

#### 3. Dependency Shifts

CoNLL-U File Generation

In [12]:
# read-in sentences from different files.
# Loads the saved WikiText DatasetDict; the cells below read its "test" split.
wiki_datasets = DatasetDict.load_from_disk("../data-files/wikitext-15M/")

In [13]:
# Pick the row at index 3 of the test split as the working example; the row
# is indexed by "text" in the next cell.
s = wiki_datasets["test"][3]

In [14]:
# Split the row text on single spaces and keep the non-empty, stripped tokens.
# `clean_s` is always bound, as an empty list for blank rows, so the join in
# the next cell cannot fail with a NameError or silently reuse tokens left
# over from a previous row.
clean_s = [t.strip() for t in s["text"].strip().split(" ") if t.strip()]

In [16]:
# Re-join the cleaned tokens with single spaces.
# NOTE: this rebinds `s` from the dataset row to a plain string, so the
# cleaning cell above (which indexes s["text"]) cannot be re-run without first
# re-selecting the row.
s = " ".join(clean_s)

In [17]:
# Display the cleaned text.
s

'Robert Boulter is an English film , television and theatre actor . He had a guest @-@ starring role on the television series The Bill in 2000 . This was followed by a starring role in the play Herons written by Simon Stephens , which was performed in 2001 at the Royal Court Theatre . He had a guest role in the television series Judge John Deed in 2002 . In 2004 Boulter landed a role as " Craig " in the episode " Teddy \'s Story " of the television series The Long Firm ; he starred alongside actors Mark Strong and Derek Jacobi . He was cast in the 2005 theatre productions of the Philip Ridley play Mercury Fur , which was performed at the Drum Theatre in Plymouth and the <unk> Chocolate Factory in London . He was directed by John Tiffany and starred alongside Ben Whishaw , Shane Zaza , Harry Kent , Fraser Ayres , Sophie Stanton and Dominic Hall .'

In [18]:
# Wrap the cleaned text in a single stanza.Document and run the pipeline on
# the one-element list; `docs` is indexed per input document below.
in_docs = [stanza.Document([], text=s)]
docs = nlp(in_docs)

In [19]:
# Number of sentences in the annotated document.
len(docs[0].sentences)

7

In [35]:
# Write the annotated document to ./test.conllu via Stanza's CoNLL helper;
# mode="w" is passed, so an existing file is presumably overwritten.
CoNLL.write_doc2conll(docs[0], "./test.conllu", mode="w")

Galactic Dependencies

In [21]:
# read-in sentences from different files.
# NOTE: same path as the load in the "Conllu Files Generation" section; this
# simply reloads the original dataset for the comparison below.
wiki_datasets = DatasetDict.load_from_disk("../data-files/wikitext-15M/")

In [22]:
# Dataset variant stored under the "en~fr@N~fr@V" suffix — presumably English
# text reordered with French noun and verb word order (TODO confirm).
wiki_datasets_fr_fr = DatasetDict.load_from_disk("../data-files/wikitext-15M-en~fr@N~fr@V/")

In [23]:
# Dataset variant stored under the "en~ja_ktc@N~ja_ktc@V" suffix — presumably
# English text reordered with Japanese noun and verb word order (TODO confirm).
wiki_datasets_ja_ja = DatasetDict.load_from_disk("../data-files/wikitext-15M-en~ja_ktc@N~ja_ktc@V/")

In [24]:
# Dataset variant stored under the "en~fr@N~ja_ktc@V" suffix — presumably
# French noun order combined with Japanese verb order (TODO confirm).
wiki_datasets_fr_ja = DatasetDict.load_from_disk("../data-files/wikitext-15M-en~fr@N~ja_ktc@V/")

In [25]:
# Preview the first 10 rows of the original test split.
wiki_datasets["test"][:10]

{'text': ['',
  ' = Robert Boulter = \n',
  '',
  ' Robert Boulter is an English film , television and theatre actor . He had a guest @-@ starring role on the television series The Bill in 2000 . This was followed by a starring role in the play Herons written by Simon Stephens , which was performed in 2001 at the Royal Court Theatre . He had a guest role in the television series Judge John Deed in 2002 . In 2004 Boulter landed a role as " Craig " in the episode " Teddy \'s Story " of the television series The Long Firm ; he starred alongside actors Mark Strong and Derek Jacobi . He was cast in the 2005 theatre productions of the Philip Ridley play Mercury Fur , which was performed at the Drum Theatre in Plymouth and the <unk> Chocolate Factory in London . He was directed by John Tiffany and starred alongside Ben Whishaw , Shane Zaza , Harry Kent , Fraser Ayres , Sophie Stanton and Dominic Hall . \n',
  ' In 2006 , Boulter starred alongside Whishaw in the play Citizenship written by Mar

In [26]:
# Preview the first 10 rows of the fr/fr variant's test split for comparison
# with the original preview.
wiki_datasets_fr_fr["test"][:10]

{'text': ['= Robert Boulter =',
  'Robert Boulter is an English , and theatre . actor television film a guest @-@ starring role on the television The Bill in 2000 . series had He This was by a in the play by , which was in 2001 at the Royal Court Theatre . performed Stephens Simon written Herons role starring followed had a in the television series Judge John in 2002 . Deed role guest He In 2004 a as " Craig " in the " Teddy \'s " of the television The Long ; starred alongside Strong and Derek . Jacobi Mark actors he Firm series Story episode role landed Boulter He was in the 2005 of the play Fur , which was at the Drum in and the < unk > Chocolate Factory in London . Plymouth Theatre performed Mercury Ridley Philip productions theatre cast He was by and alongside Ben Whishaw , Zaza , Harry Kent , Fraser , Sophie Stanton and Dominic . Hall Ayres Shane starred Tiffany John directed',
  'In 2006 , Boulter alongside in the Citizenship by Mark Ravenhill . written play Whishaw starred on a 