In [1]:
import csv

import pandas as pd
import spacy
import torch
from nltk.corpus import wordnet as wn
from transformers import RobertaTokenizer, RobertaForMaskedLM

# for SS step: use spaCy for lemmatizing in steps a-d
nlp = spacy.load("en_core_web_sm")

# load the RoBERTa masked-language model and its tokenizer (used for substitute generation)
model = RobertaForMaskedLM.from_pretrained('roberta-large')
tokenizer = RobertaTokenizer.from_pretrained('roberta-large')

# read the tsv file: no header row, one sentence and its complex word per line
filename = "./data/trial/tsar2022_en_trial_none.tsv"
data = pd.read_csv(
    filename,
    sep='\t',
    header=None,
    names=["sentence", "complex_word"],
    # The sentences are free text and can contain unbalanced double quotes.
    # With the default CSV quoting, a field that *starts* with '"' is parsed as a
    # quoted field, which strips the quotes or swallows the following tab/newline.
    # QUOTE_NONE makes pandas treat quote characters as ordinary text.
    quoting=csv.QUOTE_NONE,
)

In [20]:
# 1. substitute generation

# number of candidate tokens taken from the masked-LM output for every complex word
top_k = 30

# in each row, mask the complex word and generate substitutes
for index, row in data.iterrows():
    sentence, complex_word = row["sentence"], row["complex_word"]

    # Skip rows that cannot be processed (missing values, empty complex word, or a
    # complex word that does not occur in the sentence) instead of crashing later.
    if (
        not isinstance(sentence, str)
        or not isinstance(complex_word, str)
        or not complex_word
        or complex_word not in sentence
    ):
        print(f"Skipping row {index}: complex word {complex_word!r} not found in sentence\n")
        continue

    # In the sentence, replace the complex word with the mask token.
    # Only the first occurrence is masked: masking every occurrence would produce
    # several mask tokens and make the single-position lookup below fail.
    sentence_masked_word = sentence.replace(complex_word, tokenizer.mask_token, 1)

    # concatenate the sentence with the complex word and the sentence with the masked word
    sentences_concat = f"{sentence} {tokenizer.sep_token} {sentence_masked_word}"

    # tokenize the concatenated sentence
    sentences_concat_tokenized = tokenizer.encode(sentences_concat, return_tensors='pt')

    # find the (first) masked position in the tokenized sentence
    mask_location = torch.where(sentences_concat_tokenized == tokenizer.mask_token_id)[1][0].item()

    # generate predictions for the masked word
    with torch.no_grad():
        outputs = model(sentences_concat_tokenized)
        predictions = outputs.logits

    # get the top-k predictions
    top_tokens = torch.topk(predictions[0, mask_location], top_k).indices

    ## decode the top-k substitutes and lowercase them
    substitutes = [tokenizer.decode(token.item()).strip().lower() for token in top_tokens]

    # print sentence, complex word, and the top_k substitutes for the complex word
    print(f"Sentence: {sentence}")
    print(f"Complex word: {complex_word}")
    print(f"SG step: generated substitutes: {substitutes}\n")

    # 2. Substitute Selection (SS):

    # a) remove duplicates within the substitute list (dict keys keep first-seen order)
    substitutes_no_dupl = list(dict.fromkeys(substitutes))
    print(f"SS step: a) substitute list without duplicates: {substitutes_no_dupl}\n")

    # b) remove duplicates and inflected forms of the complex word from the substitute list
    ## Lemmatize the complex word with spaCy, so it can be compared with the lemma of each substitute
    doc_complex_word = nlp(complex_word)
    complex_word_lemma = doc_complex_word[0].lemma_
    print(f"complex_word_lemma for complex word '{complex_word}': {complex_word_lemma}\n")

    ## keep only substitutes whose lemma differs from the lemma of the complex word;
    ## remember each substitute's lemma so step c) uses the right one
    substitutes_no_dupl_complex_word = []
    substitute_lemmas = {}
    for substitute in substitutes_no_dupl:
        doc_substitute = nlp(substitute)
        if len(doc_substitute) == 0:
            # a token that decoded to an empty string has no lemma and is not a usable substitute
            continue
        substitute_lemma = doc_substitute[0].lemma_
        if substitute_lemma != complex_word_lemma:
            substitutes_no_dupl_complex_word.append(substitute)
            substitute_lemmas[substitute] = substitute_lemma
    print(f"SS step: b) substitute list without duplicates and inflected forms of the complex word: {substitutes_no_dupl_complex_word}\n")

    # c) remove antonyms of the complex word from the substitute list
    ## collect the WordNet antonyms of the complex word once per row (all of its synsets)
    complex_word_antonyms = {
        antonym.name().lower()
        for synset in wn.synsets(complex_word_lemma)
        for lemma in synset.lemmas()
        for antonym in lemma.antonyms()
    }
    substitutes_no_dupl_complex_word_no_antonym = []
    for substitute in substitutes_no_dupl_complex_word:
        substitute_lemma = substitute_lemmas[substitute]
        # a substitute is dropped when its surface form or its lemma is a listed antonym
        if substitute in complex_word_antonyms or substitute_lemma.lower() in complex_word_antonyms:
            print(f"Antonym removed (lemma): {substitute_lemma}")
        else:
            substitutes_no_dupl_complex_word_no_antonym.append(substitute)
    print(f"SS step: c): substitute list without antonyms of the complex word: {substitutes_no_dupl_complex_word_no_antonym}\n")

    # create sentences with the complex word replaced by the substitutes
    # (same occurrence as the one that was masked above)
    sentences_with_substitutes = [sentence.replace(complex_word, sub, 1) for sub in substitutes_no_dupl_complex_word_no_antonym]
    #print(f"SG step: sentences with substitutes: {sentences_with_substitutes}\n")


    print('-----------------------------------------------------------------------------------------')

   
  
    
    

Sentence: A Spanish government source, however, later said that banks able to cover by themselves losses on their toxic property assets will not be forced to remove them from their books while it will be compulsory for those receiving public help.
Complex word: compulsory
SG step: generated substitutes: ['compulsory', 'mandatory', 'mandated', 'voluntary', 'obligatory', 'statutory', 'redundant', 'enforced', 'routine', 'relevant', 'required', 'vital', 'clandestine', 'obliged', 'bureaucratic', 'ministerial', 'mandate', 'necessary', 'lifelong', 'strict', 'gradual', 'demanded', 'lax', 'continuous', 'practicable', 'indispensable', 'forced', 'habitual', 'plain', 'preferable']

SS step: a) substitute list without duplicates: ['compulsory', 'mandatory', 'mandated', 'voluntary', 'obligatory', 'statutory', 'redundant', 'enforced', 'routine', 'relevant', 'required', 'vital', 'clandestine', 'obliged', 'bureaucratic', 'ministerial', 'mandate', 'necessary', 'lifelong', 'strict', 'gradual', 'demanded'

#### Update the code to calculate BERTScores

In [2]:
import bert_score
from bert_score import score

In [None]:
# # check if this code is still needed
# NOTE(review): the names below are deliberately distinct from `tokenizer` / `model`.
# AutoModel loads the bare RoBERTa encoder without the language-modelling head, so its
# output has no `.logits`; rebinding `model` here would break the substitute-generation
# cells, which read `outputs.logits`. It would also leave the model on `device` while
# those cells build their input tensors on the CPU.

from transformers import AutoTokenizer, AutoModel

# set the device to use
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# load the tokenizer and the encoder-only model, in inference mode, on the chosen device
encoder_tokenizer = AutoTokenizer.from_pretrained("roberta-large")
encoder_model = AutoModel.from_pretrained("roberta-large")
encoder_model.eval()
encoder_model = encoder_model.to(device)



In [7]:
# 1. substitute generation

# number of candidate tokens taken from the masked-LM output for every complex word
top_k = 30

# Build the BERTScore scorer once. bert_score.score() loads roberta-large again on
# every call, which repeats the model-loading cost (and its warning) for every row.
bert_scorer = bert_score.BERTScorer(lang="en", model_type='roberta-large')

# in each row, mask the complex word and generate substitutes
for index, row in data.iterrows():
    sentence, complex_word = row["sentence"], row["complex_word"]

    # Skip rows that cannot be processed (missing values, empty complex word, or a
    # complex word that does not occur in the sentence) instead of crashing later.
    if (
        not isinstance(sentence, str)
        or not isinstance(complex_word, str)
        or not complex_word
        or complex_word not in sentence
    ):
        print(f"Skipping row {index}: complex word {complex_word!r} not found in sentence\n")
        continue

    # In the sentence, replace the complex word with the mask token.
    # Only the first occurrence is masked: masking every occurrence would produce
    # several mask tokens and make the single-position lookup below fail.
    sentence_masked_word = sentence.replace(complex_word, tokenizer.mask_token, 1)

    # concatenate the sentence with the complex word and the sentence with the masked word
    sentences_concat = f"{sentence} {tokenizer.sep_token} {sentence_masked_word}"

    # tokenize the concatenated sentence
    sentences_concat_tokenized = tokenizer.encode(sentences_concat, return_tensors='pt')

    # find the (first) masked position in the tokenized sentence
    mask_location = torch.where(sentences_concat_tokenized == tokenizer.mask_token_id)[1][0].item()

    # generate predictions for the masked word
    with torch.no_grad():
        outputs = model(sentences_concat_tokenized)
        predictions = outputs.logits

    # get the top-k predictions
    top_tokens = torch.topk(predictions[0, mask_location], top_k).indices

    ## decode the top-k substitutes and lowercase them
    substitutes = [tokenizer.decode(token.item()).strip().lower() for token in top_tokens]

    # print sentence, complex word, and the top_k substitutes for the complex word
    print(f"Sentence: {sentence}")
    print(f"Complex word: {complex_word}")
    print(f"SG step: generated substitutes: {substitutes}\n")

    # 2. Substitute Selection (SS):

    # a) remove duplicates within the substitute list (dict keys keep first-seen order)
    substitutes_no_dupl = list(dict.fromkeys(substitutes))
    print(f"SS step: a) substitute list without duplicates: {substitutes_no_dupl}\n")

    # b) remove duplicates and inflected forms of the complex word from the substitute list
    ## Lemmatize the complex word with spaCy, so it can be compared with the lemma of each substitute
    doc_complex_word = nlp(complex_word)
    complex_word_lemma = doc_complex_word[0].lemma_
    print(f"complex_word_lemma for complex word '{complex_word}': {complex_word_lemma}\n")

    ## keep only substitutes whose lemma differs from the lemma of the complex word;
    ## remember each substitute's lemma so step c) uses the right one
    substitutes_no_dupl_complex_word = []
    substitute_lemmas = {}
    for substitute in substitutes_no_dupl:
        doc_substitute = nlp(substitute)
        if len(doc_substitute) == 0:
            # a token that decoded to an empty string has no lemma and is not a usable substitute
            continue
        substitute_lemma = doc_substitute[0].lemma_
        if substitute_lemma != complex_word_lemma:
            substitutes_no_dupl_complex_word.append(substitute)
            substitute_lemmas[substitute] = substitute_lemma
    print(f"SS step: b) substitute list without duplicates and inflected forms of the complex word: {substitutes_no_dupl_complex_word}\n")

    # c) remove antonyms of the complex word from the substitute list
    ## collect the WordNet antonyms of the complex word once per row (all of its synsets)
    complex_word_antonyms = {
        antonym.name().lower()
        for synset in wn.synsets(complex_word_lemma)
        for lemma in synset.lemmas()
        for antonym in lemma.antonyms()
    }
    substitutes_no_dupl_complex_word_no_antonym = []
    for substitute in substitutes_no_dupl_complex_word:
        substitute_lemma = substitute_lemmas[substitute]
        # a substitute is dropped when its surface form or its lemma is a listed antonym
        if substitute in complex_word_antonyms or substitute_lemma.lower() in complex_word_antonyms:
            print(f"Antonym removed (lemma): {substitute_lemma}")
        else:
            substitutes_no_dupl_complex_word_no_antonym.append(substitute)
    print(f"SS step: c): substitute list without antonyms of the complex word: {substitutes_no_dupl_complex_word_no_antonym}\n")

    # create sentences with the complex word replaced by the substitutes
    # (same occurrence as the one that was masked above)
    sentences_with_substitutes = [sentence.replace(complex_word, sub, 1) for sub in substitutes_no_dupl_complex_word_no_antonym]
    #print(f"SG step: sentences with substitutes: {sentences_with_substitutes}\n")


    # d) use BERTScore for sorting
    if sentences_with_substitutes:
        # The scorer returns a (precision, recall, F1) tuple of tensors, one value per pair.
        # Argument order and the use of element 0 are kept exactly as before.
        # NOTE(review): element 0 is precision with the original sentence passed as the
        # candidate; confirm that this, rather than F1 (element 2), is the intended ranking.
        scores = bert_scorer.score([sentence] * len(sentences_with_substitutes), sentences_with_substitutes, verbose=False)
        sorted_substitutes = [substitute for _, substitute in sorted(zip(scores[0].tolist(), substitutes_no_dupl_complex_word_no_antonym), reverse=True)]
    else:
        # nothing survived the selection steps, so there is nothing to score
        sorted_substitutes = []
    print(f"SS step: d) substitute list sorted by descending BERTScore: {sorted_substitutes}\n")

    

Sentence: A Spanish government source, however, later said that banks able to cover by themselves losses on their toxic property assets will not be forced to remove them from their books while it will be compulsory for those receiving public help.
Complex word: compulsory
SG step: generated substitutes: ['compulsory', 'mandatory', 'mandated', 'voluntary', 'obligatory', 'statutory', 'redundant', 'enforced', 'routine', 'relevant', 'required', 'vital', 'clandestine', 'obliged', 'bureaucratic', 'ministerial', 'mandate', 'necessary', 'lifelong', 'strict', 'gradual', 'demanded', 'lax', 'continuous', 'practicable', 'indispensable', 'forced', 'habitual', 'plain', 'preferable']

SS step: a) substitute list without duplicates: ['compulsory', 'mandatory', 'mandated', 'voluntary', 'obligatory', 'statutory', 'redundant', 'enforced', 'routine', 'relevant', 'required', 'vital', 'clandestine', 'obliged', 'bureaucratic', 'ministerial', 'mandate', 'necessary', 'lifelong', 'strict', 'gradual', 'demanded'

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


SS step: d) substitute list sorted by descending BERTScore: ['mandatory', 'required', 'mandated', 'necessary', 'voluntary', 'enforced', 'obligatory', 'forced', 'preferable', 'vital', 'routine', 'demanded', 'indispensable', 'redundant', 'relevant', 'obliged', 'gradual', 'strict', 'practicable', 'bureaucratic', 'lax', 'statutory', 'continuous', 'plain', 'mandate', 'clandestine', 'habitual', 'ministerial', 'lifelong']

Sentence: Rajoy's conservative government had instilled markets with a brief dose of confidence by stepping into Bankia, performing a U-turn on its refusal to spend public money to rescue banks.
Complex word: instilled
SG step: generated substitutes: ['infused', 'injected', 'filled', 'inst', 'invested', 'illed', 'impressed', 'infected', 'revived', 'endowed', 'gifted', 'reassured', 'implanted', 'infiltrated', 'pumped', 'inject', 'flooded', 'sprinkled', 'installed', 'vested', 'thrilled', 'assured', 'penetrated', 'hit', 'provided', 'insulated', 'vaccinated', 'stocked', 'stoked

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


SS step: d) substitute list sorted by descending BERTScore: ['infused', 'filled', 'injected', 'stoked', 'flooded', 'assured', 'provided', 'pumped', 'reassured', 'gifted', 'endowed', 'infected', 'revived', 'stocked', 'impressed', 'thrilled', 'hit', 'invested', 'sprinkled', 'infiltrated', 'elevated', 'penetrated', 'installed', 'vested', 'insulated', 'implanted', 'vaccinated', 'illed', 'inject', 'inst']

Sentence: #34-3 "War maniacs of the South Korean puppet military made another grave provocation to the DPRK in the central western sector of the front on Thursday afternoon.
Complex word: maniacs
SG step: generated substitutes: ['maniac', 'criminals', 'killers', 'thugs', 'fighters', 'murderers', 'mercenaries', 'militias', 'combatants', 'factions', 'fighters', 'commanders', 'lords', 'jihadists', 'gangs', 'fascists', 'helmets', 'assassins', 'killers', 'crimes', 'squads', 'hunters', 'gunmen', 'partisans', 'detainees', 'killings', 'troops', 'monsters', 'chiefs', 'perpetrators']

SS step: a) s

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


SS step: d) substitute list sorted by descending BERTScore: ['criminals', 'lords', 'troops', 'commanders', 'fighters', 'killers', 'crimes', 'chiefs', 'perpetrators', 'thugs', 'combatants', 'assassins', 'factions', 'gangs', 'killings', 'partisans', 'murderers', 'squads', 'monsters', 'militias', 'mercenaries', 'jihadists', 'gunmen', 'detainees', 'helmets', 'hunters', 'fascists']

Sentence: The daily death toll in Syria has declined as the number of observers has risen, but few experts expect the U.N. plan to succeed in its entirety.
Complex word: observers
SG step: generated substitutes: ['observers', 'monitors', 'demonstrators', 'participants', 'opponents', 'advisors', 'experts', 'observer', 'supervisors', 'analysts', 'operators', 'responders', 'observer', 'reporters', 'witnesses', 'observations', 'inspectors', 'observes', 'investigators', 'dissidents', 'bystanders', 'visitors', 'officials', 'educators', 'airstrikes', 'organizers', 'followers', 'observed', 'outsiders', 'reinforcements']

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


SS step: d) substitute list sorted by descending BERTScore: ['monitors', 'inspectors', 'investigators', 'responders', 'advisors', 'experts', 'observations', 'reporters', 'witnesses', 'officials', 'opponents', 'analysts', 'educators', 'dissidents', 'participants', 'visitors', 'outsiders', 'reinforcements', 'bystanders', 'demonstrators', 'operators', 'followers', 'airstrikes', 'organizers', 'observed', 'supervisors', 'observes']

Sentence: An amateur video showed a young girl who apparently suffered shrapnel wounds in her thigh undergoing treatment in a makeshift Rastan hospital while screaming in pain.
Complex word: shrapnel
SG step: generated substitutes: ['gunshot', 'rapnel', 'bullet', 'gunfire', 'grenade', 'gun', 'rifle', 'mortar', 'sh', 'shell', 'sniper', 'projectile', 'stab', 'multiple', 'shot', 'gunshots', 'several', 'blast', 'arrow', 'a', 'shotgun', 'spear', 'shock', 'dagger', 'blaster', 'the', 'stray', 'explosive', 'unspecified', 'knife']

SS step: a) substitute list without dup

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


SS step: d) substitute list sorted by descending BERTScore: ['bullet', 'stab', 'gunshot', 'projectile', 'knife', 'gunfire', 'multiple', 'blast', 'several', 'sniper', 'gun', 'mortar', 'dagger', 'gunshots', 'shell', 'grenade', 'shotgun', 'shot', 'explosive', 'unspecified', 'rifle', 'arrow', 'shock', 'spear', 'the', 'stray', 'rapnel', 'blaster', 'sh', 'a']

Sentence: A local witness said a separate group of attackers disguised in burqas — the head-to-toe robes worn by conservative Afghan women — then tried to storm the compound.
Complex word: disguised
SG step: generated substitutes: ['disguised', 'cloaked', 'masked', 'dressed', 'concealed', 'clothed', 'disguise', 'clad', 'recognised', 'portrayed', 'shrouded', 'styled', 'wrapped', 'packaged', 'posed', 'adorned', 'imprisoned', 'dispersed', 'united', 'disgu', 'displayed', 'veiled', 'fitted', 'framed', 'armoured', 'frightened', 'formed', 'revealed', 'organised', 'branded']

SS step: a) substitute list without duplicates: ['disguised', 'cloak

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


SS step: d) substitute list sorted by descending BERTScore: ['masked', 'concealed', 'clothed', 'cloaked', 'dressed', 'clad', 'veiled', 'shrouded', 'adorned', 'wrapped', 'posed', 'styled', 'formed', 'portrayed', 'displayed', 'united', 'dispersed', 'fitted', 'organised', 'disguise', 'branded', 'imprisoned', 'framed', 'recognised', 'revealed', 'frightened', 'packaged', 'armoured', 'disgu']

Sentence: Syria's Sunni majority is at the forefront of the uprising against Assad, whose minority Alawite sect is an offshoot of Shi'ite Islam.
Complex word: offshoot
SG step: generated substitutes: ['shoot', 'affiliate', 'extension', 'outpost', 'adherent', 'offspring', 'ally', 'adaptation', 'off', 'off', 'adjunct', 'iteration', 'incarnation', 'enclave', 'arm', 'embryo', 'branch', 'affiliation', 'example', 'acronym', 'outreach', 'extremist', 'outline', 'insurgency', 'affiliated', 'embrace', 'orthodoxy', 'undercut', 'outlet', 'subset']

SS step: a) substitute list without duplicates: ['shoot', 'affilia

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


SS step: d) substitute list sorted by descending BERTScore: ['extension', 'affiliate', 'arm', 'offspring', 'iteration', 'incarnation', 'outpost', 'adaptation', 'ally', 'branch', 'adjunct', 'embryo', 'adherent', 'affiliation', 'acronym', 'enclave', 'embrace', 'example', 'outlet', 'extremist', 'orthodoxy', 'outline', 'off', 'subset', 'affiliated', 'insurgency', 'outreach', 'shoot', 'undercut']

Sentence: Although not as rare in the symphonic literature as sharper keys , examples of symphonies in A major are not as numerous as for D major or G major .
Complex word: symphonic
SG step: generated substitutes: ['musical', 'classical', 'harmonic', 'music', 'popular', 'onic', 'sym', 'canonical', 'instrumental', 'piano', 'the', 'of', 'historical', 'dramatic', 'general', 'modern', ',', 'theoretical', 'or', 'and', 'traditional', '</s>', 'concert', 'romantic', 'vocal', 'in', 'musical', 'contemporary', 'formal', 'analytic']

SS step: a) substitute list without duplicates: ['musical', 'classical', 'h

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


SS step: d) substitute list sorted by descending BERTScore: ['classical', 'musical', 'dramatic', 'harmonic', 'formal', 'concert', 'contemporary', 'instrumental', 'music', 'general', 'popular', 'modern', 'traditional', 'historical', 'canonical', 'romantic', 'theoretical', 'the', 'analytic', 'onic', 'piano', 'of', 'vocal', 'in', 'or', 'and', 'sym', ',', '</s>']

Sentence: That prompted the military to deploy its largest warship, the BRP Gregorio del Pilar, which was recently acquired from the United States.
Complex word: deploy
SG step: generated substitutes: ['deploy', 'deployed', 'deployment', 'mobilize', 'dispatch', 'ploy', 'employ', 'deploying', 'deploy', 'maneuver', 'disperse', 'send', 'launch', 'recruit', 'contract', 'use', 'move', 'tether', 'expend', 'monitor', 'deliver', 'secure', 'operate', 'construct', 'wield', 'patrol', 'combat', 'command', 'withdraw', 'transport']

SS step: a) substitute list without duplicates: ['deploy', 'deployed', 'deployment', 'mobilize', 'dispatch', 'pl

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


SS step: d) substitute list sorted by descending BERTScore: ['mobilize', 'dispatch', 'employ', 'use', 'launch', 'operate', 'send', 'deliver', 'command', 'construct', 'secure', 'contract', 'patrol', 'transport', 'recruit', 'move', 'maneuver', 'wield', 'expend', 'monitor', 'disperse', 'withdraw', 'deployment', 'combat', 'ploy', 'tether']

Sentence: #35-14 UK police were expressly forbidden, at a ministerial level, to provide any assistance to Thai authorities as the case involves the death penalty.
Complex word: authorities
SG step: generated substitutes: ['authorities', 'police', 'officials', 'authorities', 'authority', 'regulators', 'governments', 'arrests', 'investigators', 'colleagues', 'superiors', 'officers', 'employees', 'authorities', 'courts', 'residents', 'prosecutors', 'detainees', 'authors', 'paramedics', 'policemen', 'applications', 'forces', 'institutions', 'representatives', 'neighbours', 'bodies', 'regimes', 'individuals', 'requirements']

SS step: a) substitute list with

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


SS step: d) substitute list sorted by descending BERTScore: ['police', 'officials', 'investigators', 'prosecutors', 'policemen', 'courts', 'residents', 'forces', 'detainees', 'governments', 'officers', 'regulators', 'institutions', 'neighbours', 'representatives', 'regimes', 'employees', 'individuals', 'paramedics', 'colleagues', 'bodies', 'superiors', 'authors', 'arrests', 'applications', 'requirements']

