In [1]:
from util import *
from models import *
import spacy


def print_dict(dict_name, indent=2):
    """Print a labelled, JSON-formatted dump of the object an expression refers to.

    Args:
        dict_name: Source text of an expression (typically the name of a
            module-level dict). It is evaluated with ``eval`` from inside this
            function, so it can only see this function's parameters and the
            globals of the module that defines ``print_dict``.
        indent: Indentation width forwarded to ``json.dumps``.
    """
    # Evaluate first, before any other local is bound, so the expression
    # resolves names exactly as it would with a single inline eval call.
    resolved = eval(dict_name)
    rendered = json.dumps(resolved, indent=indent)
    print(f"{dict_name}: \n" + rendered)

def print_var(var_name):
    """Print ``<expression>: <value>`` for the given expression text.

    Args:
        var_name: Source text of an expression. It is evaluated with ``eval``
            from inside this function, so it can only see this function's
            parameter and the globals of the module that defines ``print_var``.
    """
    # Evaluate before binding anything else so name lookup is unaffected.
    current = eval(var_name)
    print(f"{var_name}: " + format(current))

In [2]:
def tokenize_example_cbow():
    """Walk one hard-coded headline through the CBOW-style preprocessing, printing each stage.

    Rebinds three module-level names and prints each with ``print_dict``:

    * ``example`` - the raw sentence (replaced span marked as ``<word/>``) and the edit word.
    * ``preprocessed_example`` - the marked word, the edit word, and the sentence
      with the marked span removed.
    * ``tokenized_example`` - each of those strings split into tokens by spaCy.
    """
    global example, preprocessed_example, tokenized_example

    # NOTE(review): the 'en' shortcut only resolves on spaCy 2.x installs; spaCy 3
    # requires a full package name (e.g. 'en_core_web_sm') - confirm the target env.
    nlp = spacy.load('en')

    example = {
        "original_sent": "What if <Sociologists/> Had as Much Influence as Economists ?",
        "edit_word": "donkeys",
    }
    print_dict("example")

    sentence = example["original_sent"]
    # The text between "<" and "/>" is the word that gets replaced.
    marked_word = re.findall("<(.*)/>", sentence)[0]
    # The text before and after the marker, trimmed and re-joined, is the context.
    before_after = re.findall("(.*)<.*/>(.*)", sentence)[0]
    context = " ".join(part.strip() for part in before_after).strip()

    preprocessed_example = {
        "original_word": marked_word,
        "edit_word": example['edit_word'],
        "context_word": context,
    }
    print_dict("preprocessed_example")

    # Build the full mapping locally and publish it only once every field is tokenized.
    tokens_by_field = {}
    for field, text in preprocessed_example.items():
        tokens_by_field[field] = np.array(nlp(text)).astype('str').tolist()
    tokenized_example = tokens_by_field
    print_dict("tokenized_example")


In [3]:
# CBOW example: run the spaCy-based preprocessing on the hard-coded sample
# sentence and print every intermediate stage.
tokenize_example_cbow()

example: 
{
  "original_sent": "What if <Sociologists/> Had as Much Influence as Economists ?",
  "edit_word": "donkeys"
}
preprocessed_example: 
{
  "original_word": "Sociologists",
  "edit_word": "donkeys",
  "context_word": "What if Had as Much Influence as Economists ?"
}
tokenized_example: 
{
  "original_word": [
    "Sociologists"
  ],
  "edit_word": [
    "donkeys"
  ],
  "context_word": [
    "What",
    "if",
    "Had",
    "as",
    "Much",
    "Influence",
    "as",
    "Economists",
    "?"
  ]
}


In [17]:
def tokenize_example_transformer(tokenizer, example):
    """Show how a transformer tokenizer processes one word-substitution example.

    Three variants of ``example["original_sent"]`` are built: the sentence as
    given, the sentence with its ``<.../>`` span replaced by
    ``<{edit_word}/>``, and the sentence with the span replaced by
    ``<{mask_token}/>``. Each variant is tokenized with ``tokenize_and_cut``
    and the tokens are mapped to vocabulary ids. Every stage is printed.

    Args:
        tokenizer: Object exposing ``mask_token``, ``sep_token``,
            ``mask_token_id``, ``sep_token_id`` and ``convert_tokens_to_ids``.
        example: Mapping with the keys ``"original_sent"`` and ``"edit_word"``.

    Returns:
        None. As a side effect the module-level names ``special_tokens``,
        ``preprocessed_example``, ``tokenized_example`` and ``idx_example``
        are rebound to the values computed for this call.
    """
    # Function-scope import so this block does not depend on a star import.
    import json

    global special_tokens, preprocessed_example, tokenized_example, idx_example

    def _show(label, mapping):
        # Same "<label>: \n<json>" layout as print_dict, but fed the object
        # itself instead of a name that is eval'ed in another scope.
        print(f"{label}: \n" + json.dumps(mapping, indent=2))

    # Report the arguments actually received. Resolving the names "example" /
    # "tokenizer" through eval in a helper would read the notebook globals,
    # which may differ from (or not exist alongside) what the caller passed.
    _show("example", example)

    special_tokens = dict(
        mask_token = tokenizer.mask_token,
        sep_token = tokenizer.sep_token
    )
    _show("special_tokens", special_tokens)

    marker = "<.*/>"
    # A callable replacement is inserted verbatim; a plain string would be
    # treated as a template, so a backslash in the edit word or mask token
    # would be interpreted as an escape / group reference.
    new_sent = re.sub(marker, lambda _m: f"<{example['edit_word']}/>", example["original_sent"])
    mask_sent = re.sub(marker, lambda _m: f"<{tokenizer.mask_token}/>", example["original_sent"])
    preprocessed_example = dict(
        original_sent = example["original_sent"],
        edit_sent = new_sent,
        mask_sent = mask_sent,
    )
    _show("preprocessed_example", preprocessed_example)

    tokenized_example = {k: tokenize_and_cut(v, tokenizer) for k, v in preprocessed_example.items()}
    _show("tokenized_example", tokenized_example)

    idx_example = {k: tokenizer.convert_tokens_to_ids(v) for k, v in tokenized_example.items()}
    # Labels are kept identical to the earlier output; values come from the argument.
    print(f"tokenizer.sep_token_id: {tokenizer.sep_token_id}")
    print(f"tokenizer.mask_token_id: {tokenizer.mask_token_id}")
    print('')
    _show("idx_example", idx_example)


In [5]:
# Load the tokenizer/model pair for BERT (checkpoint 'bert-base-uncased').
# NOTE(review): init_transformers arrives via a star import and is not shown
# here; presumably it returns (tokenizer, model) - confirm against its source.
bert_tokenizer, bert = init_transformers('bert-base-uncased')

# Load the tokenizer/model pair for RoBERTa (checkpoint 'roberta-base').
roberta_tokenizer, roberta = init_transformers('roberta-base')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=481.0, style=ProgressStyle(description_…




In [6]:
# Sample headline: the span wrapped in <.../> is the word to be replaced by `edit_word`.
example = dict(
    original_sent = "What if <Sociologists/> Had as Much Influence as Economists ?",
    edit_word = "donkeys",
)

# BERT example: WordPiece subword tokenization (continuation pieces carry a
# "##" prefix in the output), not BPE.
tokenizer = bert_tokenizer
tokenize_example_transformer(tokenizer, example)

example: 
{
  "original_sent": "What if <Sociologists/> Had as Much Influence as Economists ?",
  "edit_word": "donkeys"
}
special_tokens: 
{
  "mask_token": "[MASK]",
  "sep_token": "[SEP]"
}
preprocessed_example: 
{
  "original_sent": "What if <Sociologists/> Had as Much Influence as Economists ?",
  "edit_sent": "What if <donkeys/> Had as Much Influence as Economists ?",
  "mask_sent": "What if <[MASK]/> Had as Much Influence as Economists ?"
}
tokenized_example: 
{
  "original_sent": [
    "what",
    "if",
    "[SEP]",
    "sociologist",
    "##s",
    "[SEP]",
    "had",
    "as",
    "much",
    "influence",
    "as",
    "economists",
    "?"
  ],
  "edit_sent": [
    "what",
    "if",
    "[SEP]",
    "donkey",
    "##s",
    "[SEP]",
    "had",
    "as",
    "much",
    "influence",
    "as",
    "economists",
    "?"
  ],
  "mask_sent": [
    "what",
    "if",
    "[SEP]",
    "[MASK]",
    "[SEP]",
    "had",
    "as",
    "much",
    "influence",
    "as",
    "economists"

In [18]:
# Sample headline: the span wrapped in <.../> is the word to be replaced by `edit_word`.
example = dict(
    original_sent = "What if <Sociologists/> Had as Much Influence as Economists ?",
    edit_word = "donkeys",
)

# RoBERTa example: byte-level BPE. In the printed tokens a leading "\u0120"
# marks a token that was preceded by a space in the input text.
tokenizer = roberta_tokenizer
tokenize_example_transformer(tokenizer, example)

example: 
{
  "original_sent": "What if <Sociologists/> Had as Much Influence as Economists ?",
  "edit_word": "donkeys"
}
special_tokens: 
{
  "mask_token": "<mask>",
  "sep_token": "</s>"
}
preprocessed_example: 
{
  "original_sent": "What if <Sociologists/> Had as Much Influence as Economists ?",
  "edit_sent": "What if <donkeys/> Had as Much Influence as Economists ?",
  "mask_sent": "What if <<mask>/> Had as Much Influence as Economists ?"
}
tokenized_example: 
{
  "original_sent": [
    "What",
    "\u0120if",
    "</s>",
    "\u0120Soci",
    "ologists",
    "</s>",
    "\u0120Had",
    "\u0120as",
    "\u0120Much",
    "\u0120Influence",
    "\u0120as",
    "\u0120Econom",
    "ists",
    "\u0120?"
  ],
  "edit_sent": [
    "What",
    "\u0120if",
    "</s>",
    "\u0120don",
    "keys",
    "</s>",
    "\u0120Had",
    "\u0120as",
    "\u0120Much",
    "\u0120Influence",
    "\u0120as",
    "\u0120Econom",
    "ists",
    "\u0120?"
  ],
  "mask_sent": [
    "What",
    "\u0120

In [15]:
# bert_tokenizer.pretrained_vocab_files_map
# bert_vocab = bert_tokenizer.get_vocab()
# len(bert_vocab)

# roberta_tokenizer.pretrained_vocab_files_map
# roberta_vocab = roberta_tokenizer.get_vocab()
# len(roberta_vocab)
# roberta_vocab

In [16]:
# Hand-copied token list: the same sequence as the "original_sent" entry
# printed by the BERT example for the sample headline.
# NOTE(review): `x` is not used by any cell visible here - confirm it is still needed.
x = [
    "what",
    "if",
    "[SEP]",
    "sociologist",
    "##s",
    "[SEP]",
    "had",
    "as",
    "much",
    "influence",
    "as",
    "economists",
    "?"
  ]

In [24]:
# Sample whose replaced span is two words long ("President Trump").
example = dict(
    original_sent = "California and <President Trump/> are going to war with each other",
    edit_word = "monkeys",
)

# BERT example (WordPiece tokenization): this cell uses the BERT tokenizer,
# not the RoBERTa one.
tokenizer = bert_tokenizer
tokenize_example_transformer(tokenizer, example)

example: 
{
  "original_sent": "California and <President Trump/> are going to war with each other",
  "edit_word": "monkeys"
}
special_tokens: 
{
  "mask_token": "[MASK]",
  "sep_token": "[SEP]"
}
preprocessed_example: 
{
  "original_sent": "California and <President Trump/> are going to war with each other",
  "edit_sent": "California and <monkeys/> are going to war with each other",
  "mask_sent": "California and <[MASK]/> are going to war with each other"
}
tokenized_example: 
{
  "original_sent": [
    "california",
    "and",
    "[SEP]",
    "president",
    "trump",
    "[SEP]",
    "are",
    "going",
    "to",
    "war",
    "with",
    "each",
    "other"
  ],
  "edit_sent": [
    "california",
    "and",
    "[SEP]",
    "monkeys",
    "[SEP]",
    "are",
    "going",
    "to",
    "war",
    "with",
    "each",
    "other"
  ],
  "mask_sent": [
    "california",
    "and",
    "[SEP]",
    "[MASK]",
    "[SEP]",
    "are",
    "going",
    "to",
    "war",
    "with",
    