In [1]:
import re

import pandas as pd
import torch
from transformers import RobertaTokenizer, RobertaForMaskedLM

# Tokenizer and masked-LM head, both loaded from the same 'roberta-base' checkpoint.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForMaskedLM.from_pretrained('roberta-base')

# Trial data: parsed as a header-less, tab-separated file whose two columns
# are named "sentence" and "complex_word" (read_table defaults to sep='\t').
filename = "./data/trial/tsar2022_en_trial_none.tsv"
data = pd.read_table(
    filename,
    header=None,
    names=["sentence", "complex_word"],
)

# For every (sentence, complex word) pair: mask the complex word in the
# sentence, run the masked language model, and print the top-k tokens
# predicted for the masked position.

# Number of candidate substitutes kept per sentence (loop-invariant).
top_k = 30

for sentence, complex_word in zip(data["sentence"], data["complex_word"]):
    # Mask exactly ONE occurrence of the complex word. Replacing every
    # occurrence would yield several mask tokens and make the single-position
    # lookup below fail. A whole-word match is tried first so that the word is
    # not masked inside a longer word; the lookarounds (rather than \b) also
    # work when the complex word starts or ends with a non-word character.
    whole_word = re.compile(rf"(?<!\w){re.escape(complex_word)}(?!\w)")
    sentence_maskedword, n_masked = whole_word.subn(
        # a callable replacement keeps the mask token from being parsed as a
        # regex replacement template
        lambda _match: tokenizer.mask_token, sentence, count=1
    )
    if n_masked == 0:
        # No whole-word hit: fall back to the first plain substring match.
        sentence_maskedword = sentence.replace(complex_word, tokenizer.mask_token, 1)

    # Tokenize the masked sentence into a (1, sequence_length) tensor of ids.
    sentence_maskedword_tokenized = tokenizer.encode(sentence_maskedword, return_tensors='pt')

    # Column indices of the mask token in the tokenized sentence.
    mask_positions = torch.where(sentence_maskedword_tokenized == tokenizer.mask_token_id)[1]
    if mask_positions.numel() == 0:
        # The complex word does not occur in the sentence, so nothing was
        # masked. Report the row and move on instead of aborting the whole run.
        print(f"Sentence: {sentence}")
        print(f"Complex word: {complex_word}")
        print("Skipped: complex word not found in sentence\n")
        continue
    mask_location = mask_positions[0].item()

    # Score every vocabulary token at every position; no gradients needed.
    with torch.no_grad():
        outputs = model(sentence_maskedword_tokenized)
        predictions = outputs.logits

    # Ids of the top_k highest-scoring tokens at the masked position.
    top_tokens = torch.topk(predictions[0, mask_location], top_k).indices

    # Decode each id to text; strip the whitespace that decoding may add.
    substitutes = [tokenizer.decode(token.item()).strip() for token in top_tokens]

    print(f"Sentence: {sentence}")
    print(f"Complex word: {complex_word}")
    print(f"Top {top_k} substitutes: {substitutes}\n")


Sentence: A Spanish government source, however, later said that banks able to cover by themselves losses on their toxic property assets will not be forced to remove them from their books while it will be compulsory for those receiving public help.
Complex word: compulsory
Top 30 substitutes: ['possible', 'done', 'reserved', 'easier', 'available', 'true', 'allowed', 'difficult', 'necessary', 'so', 'mandatory', 'legal', 'only', 'different', 'fine', 'appropriate', 'easy', 'compulsory', 'required', 'applied', 'enforced', 'waived', 'obligatory', 'provided', 'impossible', 'harder', 'permitted', 'cheaper', 'beneficial', 'free']

Sentence: Rajoy's conservative government had instilled markets with a brief dose of confidence by stepping into Bankia, performing a U-turn on its refusal to spend public money to rescue banks.
Complex word: instilled
Top 30 substitutes: ['provided', 'injected', 'supplied', 'presented', 'surprised', 'left', 'infused', 'hit', 'rewarded', 'shocked', 'struck', 'pumped',