In [1]:
from util import *
from models import *

In [2]:
def tokenize_example(tokenizer):
    """Walk one headline-edit example through preprocessing, tokenization and id lookup.

    Every intermediate stage is printed and also published as a notebook-level
    global (``example``, ``special_tokens``, ``preprocessed_example``,
    ``tokenized_example``, ``idx_example``) so later cells can inspect it.

    Args:
        tokenizer: object exposing ``mask_token``, ``sep_token``,
            ``mask_token_id``, ``sep_token_id`` and ``convert_tokens_to_ids``.
            It is also forwarded to ``tokenize_and_cut``.

    Returns:
        None. Results are printed and stored in the globals listed above.
    """
    global example, special_tokens, preprocessed_example, tokenized_example, idx_example

    # The helpers receive the value explicitly instead of eval()-ing a name.
    # eval() inside a nested function cannot see this function's `tokenizer`
    # argument, so the old version silently read a notebook-level `tokenizer`
    # (NameError on a fresh kernel, wrong ids if the global differed).
    def print_dict(label, value, indent=2):
        print(f"{label}: \n" + json.dumps(value, indent=indent))

    def print_var(label, value):
        print(f"{label}: {value}")

    # Raw example: the word to replace is wrapped as <word/>.
    example = dict(
        original_sent="What if <Sociologists/> Had as Much Influence as Economists ?",
        edit_word="donkeys",
    )
    print_dict("example", example)

    special_tokens = dict(
        mask_token=tokenizer.mask_token,
        sep_token=tokenizer.sep_token,
    )
    print_dict("special_tokens", special_tokens)

    # Build the edited and masked variants by swapping the content of the tag.
    # The pattern is greedy, which is fine for the single tag in this example.
    new_sent = re.sub("<.*/>", f"<{example['edit_word']}/>", example["original_sent"])
    mask_sent = re.sub("<.*/>", f"<{tokenizer.mask_token}/>", example["original_sent"])
    preprocessed_example = dict(
        original_sent=example["original_sent"],
        edit_sent=new_sent,
        mask_sent=mask_sent,
    )
    print_dict("preprocessed_example", preprocessed_example)

    tokenized_example = {
        k: tokenize_and_cut(v, tokenizer) for k, v in preprocessed_example.items()
    }
    print_dict("tokenized_example", tokenized_example)

    idx_example = {
        k: tokenizer.convert_tokens_to_ids(v) for k, v in tokenized_example.items()
    }
    print_var("tokenizer.sep_token_id", tokenizer.sep_token_id)
    print_var("tokenizer.mask_token_id", tokenizer.mask_token_id)
    print('')
    print_dict("idx_example", idx_example)


In [3]:
# Load BERT (bert-base-uncased). `init_transformers` comes from the star imports
# at the top of the notebook; its result is unpacked as (tokenizer, second object)
# — presumably the model; confirm against its definition.
bert_tokenizer, bert = init_transformers('bert-base-uncased')

# Load RoBERTa (roberta-base) the same way.
roberta_tokenizer, roberta = init_transformers('roberta-base')

In [4]:
# BERT example: bert-base-uncased uses a WordPiece vocabulary (lower-cased text,
# word continuations marked with "##", e.g. "sociologist" + "##s"), not BPE.
# Bind it to the notebook-level name `tokenizer`, then run the walkthrough.
tokenizer = bert_tokenizer
tokenize_example(tokenizer)

example: 
{
  "original_sent": "What if <Sociologists/> Had as Much Influence as Economists ?",
  "edit_word": "donkeys"
}
special_tokens: 
{
  "mask_token": "[MASK]",
  "sep_token": "[SEP]"
}
preprocessed_example: 
{
  "original_sent": "What if <Sociologists/> Had as Much Influence as Economists ?",
  "edit_sent": "What if <donkeys/> Had as Much Influence as Economists ?",
  "mask_sent": "What if <[MASK]/> Had as Much Influence as Economists ?"
}
tokenized_example: 
{
  "original_sent": [
    "what",
    "if",
    "[SEP]",
    "sociologist",
    "##s",
    "[SEP]",
    "had",
    "as",
    "much",
    "influence",
    "as",
    "economists",
    "?"
  ],
  "edit_sent": [
    "what",
    "if",
    "[SEP]",
    "donkey",
    "##s",
    "[SEP]",
    "had",
    "as",
    "much",
    "influence",
    "as",
    "economists",
    "?"
  ],
  "mask_sent": [
    "what",
    "if",
    "[SEP]",
    "[MASK]",
    "[SEP]",
    "had",
    "as",
    "much",
    "influence",
    "as",
    "economists"

In [5]:
# RoBERTa example: byte-level BPE. In the output below, pieces that begin a new
# word carry a "\u0120" ("Ġ") prefix and the original casing is preserved.
# Rebind the notebook-level name `tokenizer`, then run the same walkthrough.
tokenizer = roberta_tokenizer
tokenize_example(tokenizer)

example: 
{
  "original_sent": "What if <Sociologists/> Had as Much Influence as Economists ?",
  "edit_word": "donkeys"
}
special_tokens: 
{
  "mask_token": "<mask>",
  "sep_token": "</s>"
}
preprocessed_example: 
{
  "original_sent": "What if <Sociologists/> Had as Much Influence as Economists ?",
  "edit_sent": "What if <donkeys/> Had as Much Influence as Economists ?",
  "mask_sent": "What if <<mask>/> Had as Much Influence as Economists ?"
}
tokenized_example: 
{
  "original_sent": [
    "What",
    "\u0120if",
    "</s>",
    "\u0120Soci",
    "ologists",
    "</s>",
    "\u0120Had",
    "\u0120as",
    "\u0120Much",
    "\u0120Influence",
    "\u0120as",
    "\u0120Econom",
    "ists",
    "\u0120?"
  ],
  "edit_sent": [
    "What",
    "\u0120if",
    "</s>",
    "\u0120don",
    "keys",
    "</s>",
    "\u0120Had",
    "\u0120as",
    "\u0120Much",
    "\u0120Influence",
    "\u0120as",
    "\u0120Econom",
    "ists",
    "\u0120?"
  ],
  "mask_sent": [
    "What",
    "\u0120

In [12]:
# Show where the BERT vocab files come from: a mapping of file kind ->
# checkpoint name -> download URL (see the output below).
bert_tokenizer.pretrained_vocab_files_map

{'vocab_file': {'bert-base-uncased': 'https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt',
  'bert-large-uncased': 'https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt',
  'bert-base-cased': 'https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt',
  'bert-large-cased': 'https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt',
  'bert-base-multilingual-uncased': 'https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-vocab.txt',
  'bert-base-multilingual-cased': 'https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt',
  'bert-base-chinese': 'https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt',
  'bert-base-german-cased': 'https://int-deepset-models-bert.s3.eu-central-1.amazonaws.com/pytorch/bert-base-german-cased-vocab.txt',
  'bert-large-uncased-whole-word-masking': 'https://s3.amazonaws.c

In [11]:
# Keep BERT's vocabulary in `bert_vocab` and display how many entries it has.
# NOTE(review): the saved output under this cell is a token -> id dict dump, not
# an integer, so it appears to predate this code — re-run the cell to refresh it.
bert_vocab = bert_tokenizer.get_vocab()
len(bert_vocab)


{'[PAD]': 0,
 '[unused0]': 1,
 '[unused1]': 2,
 '[unused2]': 3,
 '[unused3]': 4,
 '[unused4]': 5,
 '[unused5]': 6,
 '[unused6]': 7,
 '[unused7]': 8,
 '[unused8]': 9,
 '[unused9]': 10,
 '[unused10]': 11,
 '[unused11]': 12,
 '[unused12]': 13,
 '[unused13]': 14,
 '[unused14]': 15,
 '[unused15]': 16,
 '[unused16]': 17,
 '[unused17]': 18,
 '[unused18]': 19,
 '[unused19]': 20,
 '[unused20]': 21,
 '[unused21]': 22,
 '[unused22]': 23,
 '[unused23]': 24,
 '[unused24]': 25,
 '[unused25]': 26,
 '[unused26]': 27,
 '[unused27]': 28,
 '[unused28]': 29,
 '[unused29]': 30,
 '[unused30]': 31,
 '[unused31]': 32,
 '[unused32]': 33,
 '[unused33]': 34,
 '[unused34]': 35,
 '[unused35]': 36,
 '[unused36]': 37,
 '[unused37]': 38,
 '[unused38]': 39,
 '[unused39]': 40,
 '[unused40]': 41,
 '[unused41]': 42,
 '[unused42]': 43,
 '[unused43]': 44,
 '[unused44]': 45,
 '[unused45]': 46,
 '[unused46]': 47,
 '[unused47]': 48,
 '[unused48]': 49,
 '[unused49]': 50,
 '[unused50]': 51,
 '[unused51]': 52,
 '[unused52]': 53,

In [13]:
# Show where the RoBERTa files come from: unlike BERT's single vocab file, the
# output lists both a 'vocab_file' (JSON) and a 'merges_file' per checkpoint.
roberta_tokenizer.pretrained_vocab_files_map

{'vocab_file': {'roberta-base': 'https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json',
  'roberta-large': 'https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-vocab.json',
  'roberta-large-mnli': 'https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-vocab.json',
  'distilroberta-base': 'https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-vocab.json',
  'roberta-base-openai-detector': 'https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json',
  'roberta-large-openai-detector': 'https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-vocab.json'},
 'merges_file': {'roberta-base': 'https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt',
  'roberta-large': 'https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-merges.txt',
  'roberta-large-mnli': 'https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-merges.txt',
  'distilroberta-base':

In [14]:
# Keep RoBERTa's vocabulary in `roberta_vocab` and display how many entries it
# has (50265 in the saved output).
roberta_vocab = roberta_tokenizer.get_vocab()
len(roberta_vocab)

50265

In [15]:
# Display the RoBERTa token -> id mapping built in the previous cell.
# NOTE(review): this renders the entire vocabulary; consider showing only a
# small slice to keep the saved notebook readable.
roberta_vocab

{'<s>': 0,
 '<pad>': 1,
 '</s>': 2,
 '<unk>': 3,
 '.': 4,
 'Ġthe': 5,
 ',': 6,
 'Ġto': 7,
 'Ġand': 8,
 'Ġof': 9,
 'Ġa': 10,
 'Ġin': 11,
 '-': 12,
 'Ġfor': 13,
 'Ġthat': 14,
 'Ġon': 15,
 'Ġis': 16,
 'âĢ': 17,
 "'s": 18,
 'Ġwith': 19,
 'ĠThe': 20,
 'Ġwas': 21,
 'Ġ"': 22,
 'Ġat': 23,
 'Ġit': 24,
 'Ġas': 25,
 'Ġsaid': 26,
 'Ļ': 27,
 'Ġbe': 28,
 's': 29,
 'Ġby': 30,
 'Ġfrom': 31,
 'Ġare': 32,
 'Ġhave': 33,
 'Ġhas': 34,
 ':': 35,
 'Ġ(': 36,
 'Ġhe': 37,
 'ĠI': 38,
 'Ġhis': 39,
 'Ġwill': 40,
 'Ġan': 41,
 'Ġthis': 42,
 ')': 43,
 'ĠâĢ': 44,
 'Ġnot': 45,
 'Ŀ': 46,
 'Ġyou': 47,
 'ľ': 48,
 'Ġtheir': 49,
 'Ġor': 50,
 'Ġthey': 51,
 'Ġwe': 52,
 'Ġbut': 53,
 'Ġwho': 54,
 'Ġmore': 55,
 'Ġhad': 56,
 'Ġbeen': 57,
 'Ġwere': 58,
 'Ġabout': 59,
 ',"': 60,
 'Ġwhich': 61,
 'Ġup': 62,
 'Ġits': 63,
 'Ġcan': 64,
 'Ġone': 65,
 'Ġout': 66,
 'Ġalso': 67,
 'Ġ$': 68,
 'Ġher': 69,
 'Ġall': 70,
 'Ġafter': 71,
 '."': 72,
 '/': 73,
 'Ġwould': 74,
 "'t": 75,
 'Ġyear': 76,
 'Ġwhen': 77,
 'Ġfirst': 78,
 'Ġshe': 79,
 'Ġtwo': 

In [None]:
def tokenize_example_cbow(tokenizer=None):
    """Walk one headline-edit example through preprocessing, tokenization and id lookup.

    Every intermediate stage is printed and also published as a notebook-level
    global (``example``, ``special_tokens``, ``preprocessed_example``,
    ``tokenized_example``, ``idx_example``).

    Args:
        tokenizer: optional object exposing ``mask_token``, ``sep_token``,
            ``mask_token_id``, ``sep_token_id`` and ``convert_tokens_to_ids``.
            When omitted, the notebook-level ``tokenizer`` is used, which is
            what this function always relied on implicitly.

    Returns:
        None. Results are printed and stored in the globals listed above.

    Raises:
        NameError: if no tokenizer is passed and no notebook-level
            ``tokenizer`` exists.
    """
    global example, special_tokens, preprocessed_example, tokenized_example, idx_example

    # Make the hidden dependency explicit while staying call-compatible with
    # the old zero-argument form.
    if tokenizer is None:
        try:
            tokenizer = globals()["tokenizer"]
        except KeyError:
            raise NameError(
                "name 'tokenizer' is not defined; pass a tokenizer or define a "
                "notebook-level `tokenizer` first"
            ) from None

    # The helpers receive the value explicitly instead of eval()-ing a name, so
    # what is printed is always the object actually used here.
    def print_dict(label, value, indent=2):
        print(f"{label}: \n" + json.dumps(value, indent=indent))

    def print_var(label, value):
        print(f"{label}: {value}")

    # Raw example: the word to replace is wrapped as <word/>.
    example = dict(
        original_sent="What if <Sociologists/> Had as Much Influence as Economists ?",
        edit_word="donkeys",
    )
    print_dict("example", example)

    special_tokens = dict(
        mask_token=tokenizer.mask_token,
        sep_token=tokenizer.sep_token,
    )
    print_dict("special_tokens", special_tokens)

    # Build the edited and masked variants by swapping the content of the tag.
    # The pattern is greedy, which is fine for the single tag in this example.
    new_sent = re.sub("<.*/>", f"<{example['edit_word']}/>", example["original_sent"])
    mask_sent = re.sub("<.*/>", f"<{tokenizer.mask_token}/>", example["original_sent"])
    preprocessed_example = dict(
        original_sent=example["original_sent"],
        edit_sent=new_sent,
        mask_sent=mask_sent,
    )
    print_dict("preprocessed_example", preprocessed_example)

    tokenized_example = {
        k: tokenize_and_cut(v, tokenizer) for k, v in preprocessed_example.items()
    }
    print_dict("tokenized_example", tokenized_example)

    idx_example = {
        k: tokenizer.convert_tokens_to_ids(v) for k, v in tokenized_example.items()
    }
    print_var("tokenizer.sep_token_id", tokenizer.sep_token_id)
    print_var("tokenizer.mask_token_id", tokenizer.mask_token_id)
    print('')
    print_dict("idx_example", idx_example)



In [None]:
        # Split each tagged sentence in `original` into the tagged word and the
        # text around it, appending to `old_word_list` / `context_list`.
        # NOTE(review): this is an indented fragment; `original`, `old_word_list`
        # and `context_list` are not defined in this cell — confirm where it belongs.
        for i in original:
            # Text between "<" and "/>"; [0] raises IndexError if the sentence has no tag.
            old_word = re.findall("<(.*)/>", i)[0]
            # Two capture groups -> first match is a (before_tag, after_tag) tuple.
            l = re.findall("(.*)<.*/>(.*)", i)[0]
            l = [j.strip() for j in l]
            # Rejoin both stripped sides with one space; the outer strip() drops the
            # stray space left when one side is empty.
            context = " ".join(l).strip()
            old_word_list.append(old_word)
            context_list.append(context)