In [1]:
# Imports
import stanza
from stanza.utils.conll import CoNLL
# stanza.download('en')
from nltk.corpus import wordnet

from transformers import (
    AutoConfig,
    AutoTokenizer,
)

from datasets import DatasetDict
from datasets import Dataset
from datasets import load_dataset
from datasets import list_datasets

# Utils
def get_sentence_doc(sentence_in):
    """Run the notebook-level ``nlp`` pipeline on ``sentence_in`` and return its result."""
    return nlp(sentence_in)

def get_postag_token(sentence_in):
    """Return one ``(text, upos, xpos)`` tuple per word of ``sentence_in``.

    The input is annotated with the notebook-level ``nlp`` pipeline; the
    tuples follow the order of the sentences and of the words inside them.
    """
    annotated = nlp(sentence_in)
    return [
        (token.text, token.upos, token.xpos)
        for sentence in annotated.sentences
        for token in sentence.words
    ]

# Stanza
# Shared English pipeline read by get_sentence_doc / get_postag_token above.
# Only the tokenize and pos processors are requested, so no dependency parser
# is part of this pipeline.
nlp = stanza.Pipeline('en', processors='tokenize,pos')

2021-07-27 00:27:44 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| pos       | combined |

2021-07-27 00:27:44 INFO: Use device: gpu
2021-07-27 00:27:44 INFO: Loading: tokenize
2021-07-27 00:27:56 INFO: Loading: pos
2021-07-27 00:28:10 INFO: Done loading processors!


#### Demo of the different aspects we studied with the mid-tuning pipeline.

In [15]:
# Example sentence for the demo cells (note the space before the final period).
original_sentence = 'Every move Google makes brings this particular future closer .'

#### 1. Tokenization Differences

Loading the tokenizers of three pretrained models (BERT, RoBERTa and XLNet) to compare how each one splits the same sentence.

In [16]:
# Keyword arguments shared by all three tokenizers: the non-fast tokenizer
# implementation is requested and downloads go to a local cache directory.
_tokenizer_options = {
    "use_fast": False,
    "cache_dir": "../huggingface_cache",
}

# One tokenizer per checkpoint; the variable names are the ones later cells use.
wordpeice_tokenizer = AutoTokenizer.from_pretrained("bert-base-cased", **_tokenizer_options)
bpe_tokenizer = AutoTokenizer.from_pretrained("roberta-base", **_tokenizer_options)
sentpeice_tokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased", **_tokenizer_options)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=570.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=213450.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=29.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=435797.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=481.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898823.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1355863.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=760.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=798011.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1382015.0, style=ProgressStyle(descript…




In [20]:
# Reassigns the example sentence used by the cells that run after this one;
# the string has a leading and a trailing space.
original_sentence = " The ability to compositionally map language to referents, relations, and actions is an essential component of language understanding. "

In [21]:
# Tokenize the example sentence with the bert-base-cased tokenizer.
wordpeice_tokenizer.tokenize(original_sentence)

['The',
 'ability',
 'to',
 'composition',
 '##ally',
 'map',
 'language',
 'to',
 'refer',
 '##ents',
 ',',
 'relations',
 ',',
 'and',
 'actions',
 'is',
 'an',
 'essential',
 'component',
 'of',
 'language',
 'understanding',
 '.']

In [22]:
# Tokenize the example sentence with the roberta-base tokenizer.
bpe_tokenizer.tokenize(original_sentence)

['ĠThe',
 'Ġability',
 'Ġto',
 'Ġcomposition',
 'ally',
 'Ġmap',
 'Ġlanguage',
 'Ġto',
 'Ġrefere',
 'nt',
 's',
 ',',
 'Ġrelations',
 ',',
 'Ġand',
 'Ġactions',
 'Ġis',
 'Ġan',
 'Ġessential',
 'Ġcomponent',
 'Ġof',
 'Ġlanguage',
 'Ġunderstanding',
 '.',
 'Ġ']

In [19]:
# Tokenize the example sentence with the xlnet-base-cased tokenizer.
# NOTE(review): the saved output below still shows the earlier
# "Every move Google ..." sentence — this cell's execution count (19) precedes
# the reassignment of original_sentence (20). Re-run the cell to refresh it.
sentpeice_tokenizer.tokenize(original_sentence)

['▁Every',
 '▁move',
 '▁Google',
 '▁makes',
 '▁brings',
 '▁this',
 '▁particular',
 '▁future',
 '▁closer',
 '▁',
 '.']

#### 2. Semantic Shifts

Synonym Shift - Nouns.

In [None]:
# batch annotation using Stanza
# Each raw string is wrapped in a stanza.Document and the whole list is passed
# to the pipeline in a single call; the first entry is an empty string.
documents = ["", "I wrote another document for fun."]
in_docs = [stanza.Document([], text=d) for d in documents]
doc = nlp(in_docs)

In [None]:
# Inspect the sentences of the first returned document
# (presumably the one built from the empty string — confirm output order).
doc[0].sentences

In [None]:
# Synonym shift: replace every word tagged xpos "NN" with the first WordNet
# noun lemma that differs from it; every other word is kept unchanged.
postags = get_postag_token(original_sentence)
shifted_sentence = []
for text, _upos, xpos in postags:
    replacement = text  # default: keep the word when no distinct synonym exists
    if xpos == "NN":
        # Restrict the lookup to noun synsets so a noun is never replaced by a
        # lemma taken from a verb/adjective sense of the same surface form.
        for syn in wordnet.synsets(text, pos=wordnet.NOUN):
            # WordNet joins multi-word lemmas with "_"; restore the spaces.
            candidate = syn.lemmas()[0].name().replace("_", " ")
            # Case-insensitive comparison: "Book" -> "book" is not a synonym shift.
            if candidate.lower() != text.lower():
                replacement = candidate
                break
    shifted_sentence.append(replacement)
" ".join(shifted_sentence)

Scrambling Shift - Nouns.

In [None]:
# Scrambling shift: swap selected nouns for a fixed, unrelated word.
# `postags` is the (text, upos, xpos) list computed by the synonym-shift cell.
word_identity_map = {
    'Google' : 'Facebook',
    'move' : 'book',
    'future' : 'internet'
}
# xpos tags that count as a swappable noun. "NNP" is needed for proper nouns
# such as 'Google'; filtering on "NN" alone never matches them, which left the
# 'Google' entry of the map without effect.
noun_xpos_tags = {"NN", "NNP"}
shifted_sentence = [
    # fall back to the original word when it has no entry in the map
    word_identity_map.get(text, text) if xpos in noun_xpos_tags else text
    for text, _upos, xpos in postags
]
" ".join(shifted_sentence)

Concept Merging and Splitting - 1. Random merging and random splitting with the -X format.

Synonym Shift - Nouns: Exploring (1) word embeddings, (2) WordNet neighbors.

Other Relevant Shift: FastText Neighbors.
* Get all nouns in the wikitext dataset.
* Get the fastText embeddings of those nouns.
* We need the lemma of each word (e.g., "books" cannot be swapped with "book", since both share the same lemma).
* Use a 1-NN matching algorithm to pair up the words whose meanings will be swapped. We will need some rules to break ties.

In [None]:
import fasttext
# Load a fastText model from the local binary file.
ft = fasttext.load_model('./data-files/cc.en.300.bin')

In [None]:
# Query the loaded fastText model for the 5 nearest neighbours of 'book'.
ft.get_nearest_neighbors('book', k=5)

In [None]:
# Load the dataset dictionary saved on disk.
# NOTE(review): a later cell loads the same name from "../data-files/wikitext-15M/"
# — confirm which working directory the notebook is meant to run from.
wiki_datasets = DatasetDict.load_from_disk("./data-files/wikitext-15M/")

In [None]:
# Gather the surface form of every word tagged xpos 'NN' in the training split.
collected_nouns = set()
total_count = len(wiki_datasets["train"])
count = 0
for sentence in wiki_datasets["train"]:
    # progress line once every 1000 processed rows
    if not count % 1000:
        print(f"completed:{count}/{total_count}")
    postags = get_postag_token(sentence['text'])
    collected_nouns.update(token for token, _, xpos in postags if xpos == 'NN')
    count += 1

In [None]:
# Display the collected set of nouns.
collected_nouns

#### 3. Dependency Shifts

Galactic Dependency

In [None]:
# Use Stanza to annotate the sentence and write it out as a CoNLL-U file.
# NOTE(review): the `nlp` pipeline in this notebook is created with
# processors='tokenize,pos' only — confirm the written file carries the
# dependency information gd-translate needs.
sent_doc = get_sentence_doc(original_sentence)
CoNLL.write_doc2conll(sent_doc, "./data-files/sample.conllu")

# Run the gdtreebank command-line tool on that file with the spec en~fr@N~hi@V.
! GALACTIC_ROOT=./submodules/gdtreebank/ ./submodules/gdtreebank/bin/gd-translate --input ./data-files/sample.conllu --spec en~fr@N~hi@V

# Read the translated file (presumably written by the command above — verify
# the output name) and rebuild the synthetic sentence from its first sentence's words.
to_sent_doc = CoNLL.conll2doc("./data-files/sample-en~fr@N~hi@V.conllu")
" ".join([item.text for item in to_sent_doc.sentences[0].words])


Random Ordering

In [3]:
# Load the dataset dictionary saved on disk.
# NOTE(review): an earlier cell loads the same name from "./data-files/wikitext-15M/"
# — confirm which relative path is the intended one.
wiki_datasets = DatasetDict.load_from_disk("../data-files/wikitext-15M/")

In [17]:
# Number of rows in the test split.
len(wiki_datasets["test"])

4358

In [23]:
# Load the CoNLL-U file of the wikitext-15M test split into a Stanza document.
to_sent_doc = CoNLL.conll2doc("../data-files/wikitext-15M-conllu/wikitext-15M-test.conllu")

In [24]:
# Rebuild the first sentence of the loaded document by joining its word texts with spaces.
" ".join([item.text for item in to_sent_doc.sentences[0].words])


'= Robert Boulter ='

In [9]:
# Print only the first sentence of the document (nothing is printed when there is none).
for s in to_sent_doc.sentences[:1]:
    print(s)

[
  {
    "id": 1,
    "text": "=",
    "upos": "PUNCT",
    "xpos": "NFP",
    "head": 0,
    "misc": "",
    "start_char": 0,
    "end_char": 1
  },
  {
    "id": 2,
    "text": "Robert",
    "upos": "PROPN",
    "xpos": "NNP",
    "feats": "Number=Sing",
    "head": 1,
    "misc": "",
    "start_char": 2,
    "end_char": 8
  },
  {
    "id": 3,
    "text": "Boulter",
    "upos": "PROPN",
    "xpos": "NNP",
    "feats": "Number=Sing",
    "head": 2,
    "misc": "",
    "start_char": 9,
    "end_char": 16
  },
  {
    "id": 4,
    "text": "=",
    "upos": "PUNCT",
    "xpos": ",",
    "head": 3,
    "misc": "",
    "start_char": 17,
    "end_char": 18
  }
]


In [25]:
# Number of sentences read from the CoNLL-U file. The saved value (2891) differs
# from the 4358 rows reported for the test split above.
len(to_sent_doc.sentences)

2891

In [22]:
# NOTE(review): this scratch cell fails — the saved output shows that the
# top-level `stanza` module has no `Sentence` attribute. Remove the cell or
# build the document another way before sharing the notebook.
stanza.Document(sentences=[stanza.Sentence(text="hhhh")])

AttributeError: module 'stanza' has no attribute 'Sentence'