In [2]:
# Imports
import stanza
from stanza.utils.conll import CoNLL
# stanza.download('en')
from nltk.corpus import wordnet

from transformers import (
    AutoConfig,
    AutoTokenizer,
)

# Utils
def get_sentence_doc(sentence_in):
    """Annotate ``sentence_in`` with the module-level Stanza pipeline ``nlp``.

    Args:
        sentence_in: Input handed unchanged to ``nlp``.

    Returns:
        The object returned by ``nlp(sentence_in)``.
    """
    return nlp(sentence_in)

def get_postag_token(sentence_in):
    """Return one ``(text, upos, xpos)`` tuple per word of ``sentence_in``.

    The input is run through the module-level Stanza pipeline ``nlp``; words
    are emitted in document order, sentence by sentence.

    Args:
        sentence_in: Input handed unchanged to ``nlp``.

    Returns:
        A flat list of ``(word.text, word.upos, word.xpos)`` tuples.
    """
    annotated = nlp(sentence_in)
    return [
        (word.text, word.upos, word.xpos)
        for sent in annotated.sentences
        for word in sent.words
    ]

# Stanza
# Build the English pipeline once at module level: get_sentence_doc and
# get_postag_token look up this global `nlp` each time they are called.
nlp = stanza.Pipeline('en')

2021-07-14 01:59:56 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | combined  |
| pos       | combined  |
| lemma     | combined  |
| depparse  | combined  |
| sentiment | sstplus   |
| ner       | ontonotes |

2021-07-14 01:59:56 INFO: Use device: cpu
2021-07-14 01:59:56 INFO: Loading: tokenize
2021-07-14 01:59:56 INFO: Loading: pos
2021-07-14 01:59:56 INFO: Loading: lemma
2021-07-14 01:59:56 INFO: Loading: depparse
2021-07-14 01:59:58 INFO: Loading: sentiment
2021-07-14 01:59:58 INFO: Loading: ner
2021-07-14 01:59:59 INFO: Done loading processors!


#### Demo of the different aspects we studied with the mid-tuning pipeline.

In [13]:
# Demo sentence reused by the cells below; note the final period is already
# separated from the last word by a space.
original_sentence = 'Every move Google makes brings this particular future closer .'

#### 1. Tokenization Differences

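Loading the different tokenizers compared for the RoBERTa model: WordPiece (`bert-base-cased`), byte-level BPE (`roberta-base`), and SentencePiece (`xlnet-base-cased`).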

In [20]:
# All three tokenizers are loaded with the same options: the non-fast
# implementation (use_fast=False) and a shared local download cache.
_tokenizer_options = {
    "use_fast": False,
    "cache_dir": "../huggingface_cache",
}

# One tokenizer per pretrained checkpoint; the variable names are referenced
# by the cells that follow, so they are kept exactly as they were.
wordpeice_tokenizer = AutoTokenizer.from_pretrained("bert-base-cased", **_tokenizer_options)

bpe_tokenizer = AutoTokenizer.from_pretrained("roberta-base", **_tokenizer_options)

sentpeice_tokenizer = AutoTokenizer.from_pretrained('xlnet-base-cased', **_tokenizer_options)

In [21]:
# Tokens produced for the demo sentence by the tokenizer loaded from "bert-base-cased".
wordpeice_tokenizer.tokenize(original_sentence)

['Every',
 'move',
 'Google',
 'makes',
 'brings',
 'this',
 'particular',
 'future',
 'closer',
 '.']

In [22]:
# Tokens produced for the demo sentence by the tokenizer loaded from "roberta-base".
bpe_tokenizer.tokenize(original_sentence)

['Every',
 'Ġmove',
 'ĠGoogle',
 'Ġmakes',
 'Ġbrings',
 'Ġthis',
 'Ġparticular',
 'Ġfuture',
 'Ġcloser',
 'Ġ.']

In [23]:
# Tokens produced for the demo sentence by the tokenizer loaded from 'xlnet-base-cased'.
sentpeice_tokenizer.tokenize(original_sentence)

['▁Every',
 '▁move',
 '▁Google',
 '▁makes',
 '▁brings',
 '▁this',
 '▁particular',
 '▁future',
 '▁closer',
 '▁',
 '.']

#### 2. Semantic Shifts

Synonym Shift - Nouns.

In [32]:
# Synonym shift: replace every token whose xpos tag is "NN" with the first
# WordNet lemma that differs from it; all other tokens are copied unchanged.
# `postags` is kept as a module-level name because later cells reuse it.
postags = get_postag_token(original_sentence)
shifted_sentence = []
for text, _upos, xpos in postags:
    replacement = text  # fall back to the original token if no synonym is found
    if xpos == "NN":
        # Only consult noun synsets: without `pos`, WordNet also returns verb
        # and adjective senses, whose lemmas are not valid noun substitutes.
        for syn in wordnet.synsets(text, pos=wordnet.NOUN):
            candidate = syn.lemmas()[0].name()
            # Compare case-insensitively so a capitalised token (e.g. at the
            # start of a sentence) is not "shifted" to its own lowercase lemma.
            if candidate.lower() != text.lower():
                replacement = candidate
                break
    shifted_sentence.append(replacement)
" ".join(shifted_sentence)

'Every motion Google makes brings this particular future closer .'

Scrambling Shift - Nouns.

In [39]:
# Scrambling shift: swap selected nouns for unrelated words via a fixed map.
# Relies on `postags` computed by the synonym-shift cell above.
word_identity_map = {
    'Google' : 'Facebook',
    'move' : 'book',
    'future' : 'internet'
}
# Only tokens whose xpos tag is exactly "NN" are looked up in the map; every
# other token (and any "NN" token missing from the map) is kept as-is.
# NOTE(review): the 'Google' entry is never applied for the demo sentence
# (the output keeps "Google"), presumably because it is not tagged "NN" —
# confirm whether proper nouns were meant to be scrambled too.
shifted_sentence = [
    word_identity_map.get(text, text) if xpos == "NN" else text
    for text, _upos, xpos in postags
]
" ".join(shifted_sentence)

'Every book Google makes brings this particular internet closer .'

Concept Merging and Splitting - Nouns.

Synonym Shift - Nouns: Exploring (1) word embeddings, (2) WordNet neighbors.

#### 3. Dependency Shifts

Galactic Dependency

In [43]:
# Use Stanza to annotate the demo sentence and write it out as a CoNLL-U file.
sent_doc = get_sentence_doc(original_sentence)
CoNLL.write_doc2conll(sent_doc, "./data-files/sample.conllu")

# Run the gd-translate script from the gdtreebank submodule on that file with
# the spec en~fr@N~hi@V to produce the galactic-dependency version.
! GALACTIC_ROOT=./submodules/gdtreebank/ ./submodules/gdtreebank/bin/gd-translate --input ./data-files/sample.conllu --spec en~fr@N~hi@V

# Read the generated file (input name with the spec appended) back in and join
# the words of its first sentence to get the synthetic sentence.
to_sent_doc = CoNLL.conll2doc("./data-files/sample-en~fr@N~hi@V.conllu")
" ".join([item.text for item in to_sent_doc.sentences[0].words])


java -cp ./submodules/gdtreebank//bin/gdgen.jar -Xmx8g datagen.GalacticGen --task test --inputTB ./data-files/sample.conllu --outputTB ./data-files/sample-en~fr@N~hi@V.conllu --verbose 1 --seed 0 --supStrateModelNOUN ./submodules/gdtreebank//models/GD_French/fr@N.orm --subStrateModelNOUN ./submodules/gdtreebank//models/GD_English/en@N.orm --supStrateModelVERB ./submodules/gdtreebank//models/GD_Hindi/hi@V.orm --subStrateModelVERB ./submodules/gdtreebank//models/GD_English/en@V.orm
0        INFO                GalacticGen - Running with args: --task test --inputTB ./data-files/sample.conllu --outputTB ./data-files/sample-en~fr@N~hi@V.conllu --verbose 1 --seed 0 --supStrateModelNOUN ./submodules/gdtreebank//models/GD_French/fr@N.orm --subStrateModelNOUN ./submodules/gdtreebank//models/GD_English/en@N.orm --supStrateModelVERB ./submodules/gdtreebank//models/GD_Hindi/hi@V.orm --subStrateModelVERB ./submodules/gdtreebank//models/GD_English/en@V.orm
SEED=123456789101112
SEED=0
57 [main] INFO 

'this particular future Every move Google makes closer brings .'

Random Ordering