In [2]:
# Imports
import stanza
from stanza.utils.conll import CoNLL
# stanza.download('en')
from nltk.corpus import wordnet

from transformers import (
    AutoConfig,
    AutoTokenizer,
)

from datasets import DatasetDict
from datasets import Dataset
from datasets import load_dataset
from datasets import list_datasets

# Utils
def get_sentence_doc(sentence_in):
    """Annotate ``sentence_in`` with the module-level ``nlp`` pipeline.

    Returns whatever ``nlp(sentence_in)`` returns, unchanged.
    """
    return nlp(sentence_in)

def get_postag_token(sentence_in):
    """Tag ``sentence_in`` with the module-level ``nlp`` pipeline.

    Returns a flat list holding one ``(text, upos, xpos)`` tuple per word,
    in document order, across every sentence of the annotated result.
    """
    annotated = nlp(sentence_in)
    return [
        (word.text, word.upos, word.xpos)
        for sent in annotated.sentences
        for word in sent.words
    ]

# Stanza
# Module-level English pipeline restricted to the tokenize and pos processors.
# get_sentence_doc and get_postag_token both call this global `nlp`.
nlp = stanza.Pipeline('en', processors='tokenize,pos')

2021-08-04 12:51:08 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| pos       | combined |

2021-08-04 12:51:08 INFO: Use device: gpu
2021-08-04 12:51:08 INFO: Loading: tokenize
2021-08-04 12:51:14 INFO: Loading: pos
2021-08-04 12:51:26 INFO: Done loading processors!


#### Demo of the different aspects we studied with the mid-tuning pipeline.

In [None]:
# Example sentence for the demo; it contains the words 'Google', 'move' and 'future'.
# NOTE(review): `original_sentence` is reassigned in a later cell before any visible cell reads it,
# so on a top-to-bottom run this value appears unused — confirm which sentence the shift demos expect.
original_sentence = 'Every move Google makes brings this particular future closer .'

#### 1. Tokenization Differences

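Loading the different tokenizers to compare (from `bert-base-cased`, `roberta-base`, and `xlnet-base-cased`); only the tokenizers are loaded here, not the RoBERTa model itself.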

In [None]:
# All three tokenizers are loaded with the same options (use_fast=False and a
# shared cache directory); only the checkpoint name differs.
_TOKENIZER_OPTIONS = {
    "use_fast": False,
    "cache_dir": "../huggingface_cache",
}

wordpeice_tokenizer = AutoTokenizer.from_pretrained("bert-base-cased", **_TOKENIZER_OPTIONS)

bpe_tokenizer = AutoTokenizer.from_pretrained("roberta-base", **_TOKENIZER_OPTIONS)

sentpeice_tokenizer = AutoTokenizer.from_pretrained('xlnet-base-cased', **_TOKENIZER_OPTIONS)

In [None]:
original_sentence = " The ability to compositionally map language to referents, relations, and actions is an essential component of language understanding. "

In [None]:
wordpeice_tokenizer.tokenize(original_sentence)

In [None]:
bpe_tokenizer.tokenize(original_sentence)

In [None]:
sentpeice_tokenizer.tokenize(original_sentence)

#### 2. Semantic Shifts

Synonym Shift - Nouns.

In [None]:
# batch annotation using Stanza
# Two inputs: an empty string and a one-sentence example.
documents = ["", "I wrote another document for fun."]
# Wrap each raw string in a stanza.Document built with an empty list and the raw text.
in_docs = [stanza.Document([], text=d) for d in documents]
# A single pipeline call over the whole list; `doc` holds the result for all inputs.
doc = nlp(in_docs)

In [None]:
doc[0].sentences

In [None]:
# Synonym shift: replace every token whose xpos tag is "NN" with the first
# WordNet lemma name that differs from the token's surface form. Tokens with
# any other tag, or with no differing lemma, are copied through unchanged.
postags = get_postag_token(original_sentence)
shifted_sentence = []
for text, _upos, xpos in postags:
    replacement = text
    if xpos == "NN":
        # Restrict the lookup to noun synsets: without `pos`, WordNet also
        # returns verb/adjective/adverb senses, so a noun could be swapped
        # for a lemma of a different part of speech.
        for syn in wordnet.synsets(text, pos=wordnet.NOUN):
            # Only the first lemma of each synset is considered.
            candidate = syn.lemmas()[0].name()
            if candidate != text:
                replacement = candidate
                break
    shifted_sentence.append(replacement)
# Last expression of the cell: the shifted sentence as a space-joined string.
" ".join(shifted_sentence)

Scrambling Shift - Nouns.

In [None]:
# Fixed word-for-word substitutions applied to the tagged sentence in `postags`.
word_identity_map = {
    'Google' : 'Facebook',
    'move' : 'book',
    'future' : 'internet'
}
# A token is swapped only when its xpos tag is "NN" and it has an entry in the
# map; every other token is copied through unchanged.
# NOTE(review): 'Google' is presumably tagged NNP rather than NN, in which case
# its entry never fires — confirm against the tagger output.
shifted_sentence = [
    word_identity_map[p[0]] if p[-1] == "NN" and p[0] in word_identity_map else p[0]
    for p in postags
]
" ".join(shifted_sentence)

Concept Merging and Splitting - 1. Random merging and random splitting with the -X format.

Synonym Shift - Nouns: Exploring (1) word embeddings, (2) WordNet neighbors.

Other Relevant Shift: FastText Neighbors.
* Get all nouns in the wikitext dataset.
* Get the fastText embeddings of those nouns.
* We need to get the lemma of each word (i.e., "books" cannot be swapped with "book", since they share the same lemma and are considered the same word).
* Use a 1-NN matching algorithm to pair up words whose meanings will be swapped. We will need some rules to break ties.

In [None]:
# NOTE(review): this import sits mid-notebook rather than with the other imports in the first cell.
import fasttext
# NOTE(review): this path starts with "./data-files/", while the executed dataset-loading
# cells in this notebook read from "../data-files/" — confirm which prefix is correct.
ft = fasttext.load_model('./data-files/cc.en.300.bin')

In [None]:
ft.get_nearest_neighbors('book', k=5)

In [None]:
# Load the wikitext-15M DatasetDict from disk, using the same "../data-files/"
# location as the executed loading cells for this dataset in this notebook.
wiki_datasets = DatasetDict.load_from_disk("../data-files/wikitext-15M/")

In [None]:
# Gather the surface form of every token tagged "NN" across the training split.
collected_nouns = set()
total_count = len(wiki_datasets["train"])
for count, sentence in enumerate(wiki_datasets["train"]):
    # Emit a progress line every 1000 sentences (including the very first).
    if not count % 1000:
        print(f"completed:{count}/{total_count}")
    postags = get_postag_token(sentence['text'])
    collected_nouns.update(tok[0] for tok in postags if tok[-1] == 'NN')

In [None]:
collected_nouns

#### 3. Dependency Shifts

CoNLL-U File Generation

In [13]:
# read-in sentences from different files.
wiki_datasets = DatasetDict.load_from_disk("../data-files/wikitext-15M/")

In [18]:
s = wiki_datasets["train"][3]

In [19]:
# Split the stripped text on single spaces, trim each piece, and drop pieces
# that are empty after trimming.
# Built unconditionally: blank text yields an empty list, so `clean_s` is never
# left undefined (or stale from an earlier run) for the cell that joins it.
clean_s = [tok.strip() for tok in s["text"].strip().split(" ") if tok.strip()]

In [23]:
# Re-join the cleaned tokens with single spaces. The stray trailing "/" was a
# syntax error and has been removed.
# Note: this rebinds `s` from the dataset row to a plain string.
s = " ".join(clean_s)

In [28]:
s

'Senjō no Valkyria 3 : <unk> Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role @-@ playing video game developed by Sega and Media.Vision for the PlayStation Portable . Released in January 2011 in Japan , it is the third game in the Valkyria series . Employing the same fusion of tactical and real @-@ time gameplay as its predecessors , the story runs parallel to the first game and follows the " Nameless " , a penal military unit serving the nation of Gallia during the Second Europan War who perform secret black operations and are pitted against the Imperial unit " <unk> Raven " .'

In [24]:
# Wrap the cleaned text `s` in a one-element list of stanza.Document objects
# and run that list through the module-level pipeline.
in_docs = [stanza.Document([], text=s)]
docs = nlp(in_docs)

In [38]:
len(docs[0].sentences)

3

In [35]:
# Write the first annotated document to ./test.conllu, opened with mode="w".
# NOTE(review): `nlp` is built with only the tokenize and pos processors, so the
# file presumably carries no dependency annotations — confirm this is intended.
CoNLL.write_doc2conll(docs[0], "./test.conllu", mode="w")

Galactic Dependencies

In [3]:
# read-in sentences from different files.
wiki_datasets = DatasetDict.load_from_disk("../data-files/wikitext-15M/")

In [4]:
wiki_datasets_fr_fr = DatasetDict.load_from_disk("../data-files/wikitext-15M-en~fr@N~fr@V/")

In [5]:
wiki_datasets_ja_ja = DatasetDict.load_from_disk("../data-files/wikitext-15M-en~ja_ktc@N~ja_ktc@V/")

In [6]:
wiki_datasets_fr_ja = DatasetDict.load_from_disk("../data-files/wikitext-15M-en~fr@N~ja_ktc@V/")

In [12]:
wiki_datasets["validation"][:10]

{'text': ['',
  ' = Homarus gammarus = \n',
  '',
  ' Homarus gammarus , known as the European lobster or common lobster , is a species of clawed lobster from the eastern Atlantic Ocean , Mediterranean Sea and parts of the Black Sea . It is closely related to the American lobster , H. americanus . It may grow to a length of 60 cm ( 24 in ) and a mass of 6 kilograms ( 13 lb ) , and bears a conspicuous pair of claws . In life , the lobsters are blue , only becoming " lobster red " on cooking . Mating occurs in the summer , producing eggs which are carried by the females for up to a year before hatching into planktonic larvae . Homarus gammarus is a highly esteemed food , and is widely caught using lobster pots , mostly around the British Isles . \n',
  '',
  ' = = Description = = \n',
  '',
  ' Homarus gammarus is a large crustacean , with a body length up to 60 centimetres ( 24 in ) and weighing up to 5 – 6 kilograms ( 11 – 13 lb ) , although the lobsters caught in lobster pots are usua

In [11]:
wiki_datasets_ja_ja["validation"][:10]

{'text': ['= Homarus gammarus =',
  'Homarus gammarus , known as the European lobster or common lobster , is a species of lobster from the eastern Atlantic Ocean , Mediterranean Sea and parts of the Black Sea . It is closely related to the American lobster , americanus . It may grow to a length of 60 cm ( 24 in ) and a mass of 6 kilograms ( 13 lb ) , and a conspicuous pair of claws . In life , the lobsters are blue , only becoming " red " on cooking . Mating occurs in the summer , which are by the females for up to a year before hatching into planktonic larvae . Homarus gammarus is a highly esteemed food , and is widely pots , mostly around the British Isles . lobster using caught carried eggs producing lobster bears H. clawed',
  '= = Description = =',
  'Homarus gammarus is a large crustacean , with a length up to 60 centimetres ( 24 in ) and up to 5 – 6 kilograms ( 11 – 13 lb ) , although the caught in pots are usually 23 – 38 cm ( 9 – 15 in ) long and 0 7 – 2 @.@ 2 kg ( 1 @.@ 5 – 4