#### Evaluation of RoBERTa on the trial set (10 sentences)

In [1]:
from transformers import AutoTokenizer, AutoModelForMaskedLM
import pandas as pd
from transformers import pipeline

# load the trial set: a tab-separated file without a header row, read as the two columns "sentence" and "complex_word"
filename = "./data/trial/tsar2022_en_trial_none_no_noise.tsv"
data = pd.read_csv(
    filename,
    sep='\t',
    header=None,
    names=["sentence", "complex_word"],
)

# empty dataframe that collects the predictions for evaluation:
# the two input columns followed by substitute_1 ... substitute_10
substitutes_df = pd.DataFrame(
    columns=["sentence", "complex_word", *(f"substitute_{rank}" for rank in range(1, 11))]
)


In [2]:
import logging

In [3]:
# the code below is used for morphological adjustments in step MA
from nltk.corpus import wordnet as wn
import spacy
nlp = spacy.load("en_core_web_sm")

import string

In [4]:
# the code below is used when word embeddings are used in step SS
from transformers import TFAutoModel
import tensorflow as tf
import numpy as np

In [5]:
# the code below is used when Bertscore is used in step SS
import bert_score
from bert_score import score

In [None]:
# show every row when a dataframe is displayed: a value of None removes the row limit
pd.options.display.max_rows = None

#### Roberta-base

In [5]:
# Load the tokenizer and the masked-language-model weights of the roberta-base checkpoint
lm_tokenizer = AutoTokenizer.from_pretrained("roberta-base")
lm_model = AutoModelForMaskedLM.from_pretrained("roberta-base")

# Wrap model and tokenizer in a fill-mask pipeline; the cells below call it to generate candidate substitutes
fill_mask = pipeline(
    task="fill-mask",
    model=lm_model,
    tokenizer=lm_tokenizer,
)

#### Substitute Generation and Morphological Adaptation:

##### For the roberta-base model:

In [6]:
# Substitute Generation (SG) and Morphological Adaptation (MA) with roberta-base:
# for every (sentence, complex word) pair, mask the complex word, let the fill-mask pipeline
# propose candidates, clean the candidates, and keep the 10 highest-ranked ones for evaluation.

top_k = 30                    # number of candidates requested from the fill-mask pipeline
required_for_dataframe = 10   # number of substitute columns in the predictions file

# start from an empty dataframe, so that rows written by a previously executed cell cannot end up in this export
substitutes_df = pd.DataFrame(columns=["sentence", "complex_word"] + [f"substitute_{i+1}" for i in range(required_for_dataframe)])

# characters that disqualify a candidate (built once, it is the same for every row)
punctuation_set = set(string.punctuation) - set('-') # retained hyphens in case tokenizers don't split on hyphenated compounds
punctuation_set.update({'“','”'})   # as these curly quotes appeared in the Electra (SG step) results but were not part of the string set

# in each row, for each complex word:
for index, row in data.iterrows():
    sentence, complex_word = row["sentence"], row["complex_word"]

    # 1. Substitute Generation (SG): perform masking and generate substitutes:
    if complex_word in sentence:
        ## mask only the first occurrence of the complex word: masking every occurrence puts several mask tokens
        ## in the input, and the pipeline then returns one candidate list per mask instead of one flat list
        sentence_masked_word = sentence.replace(complex_word, lm_tokenizer.mask_token, 1)

        ## concatenate the original sentence and the masked sentence
        sentences_concat = f"{sentence} {lm_tokenizer.sep_token} {sentence_masked_word}"

        ## generate and rank candidate substitutes for the masked word using the fill_mask pipeline
        result = fill_mask(sentences_concat, top_k=top_k)
    else:
        ## nothing to mask; still write a (padded) row below so the predictions stay aligned with the gold file
        logging.warning("Complex word %r does not occur in sentence %r; no substitutes generated", complex_word, sentence)
        result = []

    # 2. Morphological Generation and Context Adaptation (Morphological Adaptation):
    ## a) remove noise: skip malformed entries and candidates that are empty, contain unwanted punctuation
    ##    or start with '##'; lowercase and strip the rest (roberta puts a leading space before each substitute).
    ##    Malformed entries are skipped one by one instead of dropping the whole row.
    substitutes = []
    for prediction in result:
        token_str = prediction.get("token_str") if isinstance(prediction, dict) else None
        if not isinstance(token_str, str):
            continue
        if any(char in punctuation_set for char in token_str):
            continue
        if token_str.startswith('##') or token_str.strip() == "":
            continue
        substitutes.append(token_str.lower().strip())

    ## b) remove duplicates while keeping the model's ranking; the first (highest-ranked) occurrence is kept
    substitutes_no_dupl = list(dict.fromkeys(substitutes))

    ## c) + d) remove the complex word itself, its inflected forms, and its antonyms
    ## lemmatize the complex word with spaCy, to compare it with the lemma of each substitute
    complex_word_lemma = nlp(complex_word)[0].lemma_

    ## collect the WordNet antonyms of the complex word
    antonyms_complex_word = {
        antonym.name()
        for syn in wn.synsets(complex_word_lemma)
        for lemma in syn.lemmas()
        for antonym in lemma.antonyms()
    }

    ## each substitute is lemmatized once and checked against both filters
    substitutes_no_antonyms = []
    for substitute in substitutes_no_dupl:
        substitute_lemma = nlp(substitute)[0].lemma_
        if substitute_lemma == complex_word_lemma:
            continue  # same word or an inflected form of the complex word
        if substitute_lemma in antonyms_complex_word:
            continue  # antonym of the complex word
        substitutes_no_antonyms.append(substitute)

    # limit the substitutes to the 10 highest ranked ones for evaluation
    top_10_substitutes = substitutes_no_antonyms[:required_for_dataframe]

    # pad the list with None until it has 10 elements, as the dataframe row needs a value for every column
    top_10_substitutes += [None] * (required_for_dataframe - len(top_10_substitutes))

    # add the sentence, complex_word, and the substitutes to the dataframe
    substitutes_df.loc[index] = [sentence, complex_word] + top_10_substitutes


# export the dataframe to a tsv file for evaluation
substitutes_df.to_csv("./predictions/trial/SG_MA_robertabase.tsv", sep="\t", index=False, header=False)
print("SG_MA_robertabase exported to csv in path './predictions/trial/SG_MA_robertabase.tsv'\n")


Antonyms for complex word 'compulsory': []

Morphological Adaptation step d): substitute list without antonyms of the complex word for robertabase model: ['mandatory', 'obligatory', 'voluntary', 'required', 'optional', 'obliged', 'uniform', 'necessary', 'available', 'mandated', 'sufficient', 'routine', 'forced', 'customary', 'prerequisite', 'feasible', 'indispensable', 'forthcoming', 'universal', 'requirement', 'involuntary', 'obligated', 'compelled', 'conditional', 'enforced', 'contingent', 'possible', 'compulsion']

Antonyms for complex word 'instilled': []

Morphological Adaptation step d): substitute list without antonyms of the complex word for robertabase model: ['infused', 'injected', 'endowed', 'illed', 'inst', 'furnished', 'supplied', 'bolstered', 'implanted', 'impressed', 'reinforced', 'invested', 'provided', 'filled', 'reassured', 'undermined', 'pumped', 'struck', 'augmented', 'enriched', 'stirred', 'vested', 'imb', 'intoxicated', 'seeded', 'misled', 'stunned', 'inspired', '

python tsar_eval.py --gold_file ./data/trial/tsar2022_en_trial_gold_no_noise.tsv --predictions_file ./predictions/trial/SG_MA_robertabase.tsv --output_file ./output/trial/SG_MA_robertabase.tsv

#### Substitute Generation, Morphological Adaptation, and Contextualized embeddings:

In [None]:
# Calculates the similarity between the original sentence and the sentences in which the
# complex word has been replaced by a candidate substitute retrieved in the SG step.

# loaded TF models, keyed by checkpoint name, so the weights are loaded once instead of once per call
_tf_model_cache = {}


def calculate_similarity_scores(sentence, sentence_with_substitutes):
    """Return the cosine similarity of each substitute sentence to the original sentence.

    Every text is embedded with the TF "roberta-base" model (tokenized with the
    module-level ``lm_tokenizer``); the hidden state at position 0 of the last
    layer is L2-normalized and used as the sentence embedding, so the inner
    product of two embeddings is their cosine similarity.

    Args:
        sentence: the original sentence (a single string).
        sentence_with_substitutes: list of sentences, one per candidate substitute.

    Returns:
        A 1-D array with one similarity score per entry of
        ``sentence_with_substitutes``, in the same order. An empty array is
        returned for an empty list.
    """
    # nothing to score: return early instead of sending an empty batch to the tokenizer
    if len(sentence_with_substitutes) == 0:
        return np.zeros(0, dtype=np.float32)

    tokenizer = lm_tokenizer

    # this function is called once per data row; reuse the model after the first call
    model_name = "roberta-base"
    if model_name not in _tf_model_cache:
        _tf_model_cache[model_name] = TFAutoModel.from_pretrained(model_name)
    tf_model = _tf_model_cache[model_name]

    def embed_text(text):
        # text may be a single string or a list of strings; padding makes the batch rectangular
        tokens = tokenizer(text, padding=True, truncation=True, return_tensors="tf")
        outputs = tf_model(**tokens)
        # take the hidden state at position 0 of every sequence and scale it to unit length
        embeddings = outputs.last_hidden_state[:, 0, :]
        embeddings = tf.nn.l2_normalize(embeddings, axis=1)
        return embeddings

    original_sentence_embedding = embed_text(sentence)
    substitute_sentence_embeddings = embed_text(sentence_with_substitutes)

    # inner product of unit vectors; row 0 holds the scores of the (single) original sentence
    cosine_similarity = np.inner(original_sentence_embedding, substitute_sentence_embeddings)
    similarity_scores = cosine_similarity[0]

    return similarity_scores

In [None]:
# Substitute Generation (SG), Morphological Adaptation (MA) and Substitute Selection (SS) with
# contextualized embeddings: candidates are generated and cleaned as in the SG+MA run, and then
# re-ranked by the cosine similarity between the original sentence and the substituted sentence.

top_k = 30                    # number of candidates requested from the fill-mask pipeline
required_for_dataframe = 10   # number of substitute columns in the predictions file

# start from an empty dataframe, so that rows written by a previously executed cell cannot end up in this export
substitutes_df = pd.DataFrame(columns=["sentence", "complex_word"] + [f"substitute_{i+1}" for i in range(required_for_dataframe)])

# characters that disqualify a candidate (built once, it is the same for every row)
punctuation_set = set(string.punctuation) - set('-') # retained hyphens in case tokenizers don't split on hyphenated compounds
punctuation_set.update({'“','”'})   # as these curly quotes appeared in the Electra (SG step) results but were not part of the string set

transformers_logger = logging.getLogger('transformers')

# in each row, for each complex word:
for index, row in data.iterrows():
    sentence, complex_word = row["sentence"], row["complex_word"]

    # 1. Substitute Generation (SG): perform masking and generate substitutes:
    if complex_word in sentence:
        ## mask only the first occurrence of the complex word: masking every occurrence puts several mask tokens
        ## in the input, and the pipeline then returns one candidate list per mask instead of one flat list
        sentence_masked_word = sentence.replace(complex_word, lm_tokenizer.mask_token, 1)

        ## concatenate the original sentence and the masked sentence
        sentences_concat = f"{sentence} {lm_tokenizer.sep_token} {sentence_masked_word}"

        ## generate and rank candidate substitutes for the masked word using the fill_mask pipeline
        result = fill_mask(sentences_concat, top_k=top_k)
    else:
        ## nothing to mask; still write a (padded) row below so the predictions stay aligned with the gold file
        logging.warning("Complex word %r does not occur in sentence %r; no substitutes generated", complex_word, sentence)
        result = []

    # 2. Morphological Generation and Context Adaptation (Morphological Adaptation):
    ## a) remove noise: skip malformed entries and candidates that are empty, contain unwanted punctuation
    ##    or start with '##'; lowercase and strip the rest (roberta puts a leading space before each substitute).
    ##    Malformed entries are skipped one by one instead of dropping the whole row.
    substitutes = []
    for prediction in result:
        token_str = prediction.get("token_str") if isinstance(prediction, dict) else None
        if not isinstance(token_str, str):
            continue
        if any(char in punctuation_set for char in token_str):
            continue
        if token_str.startswith('##') or token_str.strip() == "":
            continue
        substitutes.append(token_str.lower().strip())

    ## b) remove duplicates while keeping the model's ranking; the first (highest-ranked) occurrence is kept
    substitutes_no_dupl = list(dict.fromkeys(substitutes))

    ## c) + d) remove the complex word itself, its inflected forms, and its antonyms
    ## lemmatize the complex word with spaCy, to compare it with the lemma of each substitute
    complex_word_lemma = nlp(complex_word)[0].lemma_

    ## collect the WordNet antonyms of the complex word
    antonyms_complex_word = {
        antonym.name()
        for syn in wn.synsets(complex_word_lemma)
        for lemma in syn.lemmas()
        for antonym in lemma.antonyms()
    }

    ## each substitute is lemmatized once and checked against both filters
    substitutes_no_antonyms = []
    for substitute in substitutes_no_dupl:
        substitute_lemma = nlp(substitute)[0].lemma_
        if substitute_lemma == complex_word_lemma:
            continue  # same word or an inflected form of the complex word
        if substitute_lemma in antonyms_complex_word:
            continue  # antonym of the complex word
        substitutes_no_antonyms.append(substitute)

    # 3. Substitute Selection (SS) by contextualized embeddings and cosine similarity scores:
    ## create sentences with the (masked occurrence of the) complex word replaced by each substitute
    sentence_with_substitutes = [sentence.replace(complex_word, sub, 1) for sub in substitutes_no_antonyms]

    ## calculate cosine similarity scores, and rank the substitutes based on their similarity score
    if sentence_with_substitutes:  # only score when there is at least one substitute
        transformers_logger.setLevel(logging.ERROR)  # to prevent the same warnings from being printed x times
        try:
            similarity_scores = calculate_similarity_scores(sentence, sentence_with_substitutes)
        finally:
            transformers_logger.setLevel(logging.WARNING)  # reset the logging level, also when scoring fails

        embeddings_ranked_substitutes_with_scores = sorted(zip(substitutes_no_antonyms, similarity_scores), key=lambda pair: pair[1], reverse=True)
        embeddings_ranked_substitutes_only = [substitute for substitute, _ in embeddings_ranked_substitutes_with_scores]

        # limit the substitutes to the 10 first ones for evaluation
        embeddings_top_10_substitutes = embeddings_ranked_substitutes_only[:required_for_dataframe]
    else:
        embeddings_top_10_substitutes = []

    # pad the list with None until it has 10 elements, as the dataframe row needs a value for every column
    embeddings_top_10_substitutes += [None] * (required_for_dataframe - len(embeddings_top_10_substitutes))

    # add the sentence, complex_word, and the substitutes to the dataframe
    substitutes_df.loc[index] = [sentence, complex_word] + embeddings_top_10_substitutes


# export the dataframe to a tsv file for evaluation
substitutes_df.to_csv("./predictions/trial/SG_MA_ce_Robertabase_robertabase.tsv", sep="\t", index=False, header=False)
print("SG_MA_ce_Robertabase_robertabase exported to csv in path './predictions/trial/SG_MA_ce_Robertabase_robertabase.tsv'\n")

In [None]:
# Run the TSAR evaluation script on the exported predictions.
# A bare shell command is not valid Python inside a code cell, so the script is started as a subprocess
# with the interpreter that runs this notebook; its output is captured and printed below the cell.
import subprocess
import sys

eval_run = subprocess.run(
    [
        sys.executable, "tsar_eval.py",
        "--gold_file", "./data/trial/tsar2022_en_trial_gold_no_noise.tsv",
        "--predictions_file", "./predictions/trial/SG_MA_ce_Robertabase_robertabase.tsv",
        "--output_file", "./output/trial/SG_MA_ce_Robertabase_robertabase.tsv",
    ],
    capture_output=True,
    text=True,
)
print(eval_run.stdout)
if eval_run.stderr:
    print(eval_run.stderr, file=sys.stderr)
eval_run.check_returncode()  # raise if the evaluation script exited with an error

#### Substitute Generation, Morphological Adaptation, and BertScore:

##### BERTScore with bert-base-uncased:

In [None]:
# Substitute Generation (SG), Morphological Adaptation (MA) and Substitute Selection (SS) with
# BERTScore (bert-base-uncased): candidates are generated and cleaned as in the SG+MA run, and then
# re-ranked by the BERTScore between the original sentence and the substituted sentence.

top_k = 30                    # number of candidates requested from the fill-mask pipeline
required_for_dataframe = 10   # number of substitute columns in the predictions file

# start from an empty dataframe, so that rows written by a previously executed cell cannot end up in this export
substitutes_df = pd.DataFrame(columns=["sentence", "complex_word"] + [f"substitute_{i+1}" for i in range(required_for_dataframe)])

# characters that disqualify a candidate (built once, it is the same for every row)
punctuation_set = set(string.punctuation) - set('-') # retained hyphens in case tokenizers don't split on hyphenated compounds
punctuation_set.update({'“','”'})   # as these curly quotes appeared in the Electra (SG step) results but were not part of the string set

transformers_logger = logging.getLogger('transformers')

# in each row, for each complex word:
for index, row in data.iterrows():
    sentence, complex_word = row["sentence"], row["complex_word"]

    # 1. Substitute Generation (SG): perform masking and generate substitutes:
    if complex_word in sentence:
        ## mask only the first occurrence of the complex word: masking every occurrence puts several mask tokens
        ## in the input, and the pipeline then returns one candidate list per mask instead of one flat list
        sentence_masked_word = sentence.replace(complex_word, lm_tokenizer.mask_token, 1)

        ## concatenate the original sentence and the masked sentence
        sentences_concat = f"{sentence} {lm_tokenizer.sep_token} {sentence_masked_word}"

        ## generate and rank candidate substitutes for the masked word using the fill_mask pipeline
        result = fill_mask(sentences_concat, top_k=top_k)
    else:
        ## nothing to mask; still write a (padded) row below so the predictions stay aligned with the gold file
        logging.warning("Complex word %r does not occur in sentence %r; no substitutes generated", complex_word, sentence)
        result = []

    # 2. Morphological Generation and Context Adaptation (Morphological Adaptation):
    ## a) remove noise: skip malformed entries and candidates that are empty, contain unwanted punctuation
    ##    or start with '##'; lowercase and strip the rest (roberta puts a leading space before each substitute).
    ##    Malformed entries are skipped one by one instead of dropping the whole row.
    substitutes = []
    for prediction in result:
        token_str = prediction.get("token_str") if isinstance(prediction, dict) else None
        if not isinstance(token_str, str):
            continue
        if any(char in punctuation_set for char in token_str):
            continue
        if token_str.startswith('##') or token_str.strip() == "":
            continue
        substitutes.append(token_str.lower().strip())

    ## b) remove duplicates while keeping the model's ranking; the first (highest-ranked) occurrence is kept
    substitutes_no_dupl = list(dict.fromkeys(substitutes))

    ## c) + d) remove the complex word itself, its inflected forms, and its antonyms
    ## lemmatize the complex word with spaCy, to compare it with the lemma of each substitute
    complex_word_lemma = nlp(complex_word)[0].lemma_

    ## collect the WordNet antonyms of the complex word
    antonyms_complex_word = {
        antonym.name()
        for syn in wn.synsets(complex_word_lemma)
        for lemma in syn.lemmas()
        for antonym in lemma.antonyms()
    }

    ## each substitute is lemmatized once and checked against both filters
    substitutes_no_antonyms = []
    for substitute in substitutes_no_dupl:
        substitute_lemma = nlp(substitute)[0].lemma_
        if substitute_lemma == complex_word_lemma:
            continue  # same word or an inflected form of the complex word
        if substitute_lemma in antonyms_complex_word:
            continue  # antonym of the complex word
        substitutes_no_antonyms.append(substitute)

    # 3. Substitute Selection (SS) by calculating Bert scores:
    ## create sentences with the (masked occurrence of the) complex word replaced by each substitute
    sentence_with_substitutes = [sentence.replace(complex_word, sub, 1) for sub in substitutes_no_antonyms]

    ## calculate BERTScores, and rank the substitutes based on these scores
    if sentence_with_substitutes:  # only score when there is at least one substitute
        transformers_logger.setLevel(logging.ERROR)  # to prevent the same warnings from being printed x times
        try:
            scores = bert_score.score([sentence]*len(sentence_with_substitutes), sentence_with_substitutes, lang="en", model_type='bert-base-uncased', verbose=False)
        finally:
            transformers_logger.setLevel(logging.WARNING)  # reset the logging level, also when scoring fails

        # pair each substitute with its score
        # NOTE(review): bert_score.score returns a (precision, recall, F1) tuple, so index 0 ranks by precision - confirm this is the intended measure
        substitute_score_pairs = list(zip(substitutes_no_antonyms, scores[0].tolist()))

        # sort the pairs by score, in descending order
        sorted_substitute_score_pairs = sorted(substitute_score_pairs, key=lambda pair: pair[1], reverse=True)

        # extract the list of substitutes from the sorted pairs
        bertscore_ranked_substitutes_only = [substitute for substitute, _ in sorted_substitute_score_pairs]

        # limit the substitutes to the 10 first ones for evaluation
        bertscore_top_10_substitutes = bertscore_ranked_substitutes_only[:required_for_dataframe]
    else:
        bertscore_top_10_substitutes = []

    # pad the list with None until it has 10 elements, as the dataframe row needs a value for every column
    bertscore_top_10_substitutes += [None] * (required_for_dataframe - len(bertscore_top_10_substitutes))

    # add the sentence, complex_word, and substitutes to the dataframe
    substitutes_df.loc[index] = [sentence, complex_word] + bertscore_top_10_substitutes


# export the dataframe to a tsv file for evaluation
substitutes_df.to_csv("./predictions/trial/SG_MA_SS_bsBertbase_robertabase.tsv", sep="\t", index=False, header=False)
print("SG_MA_SS_bsBertbase_robertabase exported to csv in path './predictions/trial/SG_MA_SS_bsBertbase_robertabase.tsv'\n")

In [None]:
# Run the TSAR evaluation script on the exported predictions.
# A bare shell command is not valid Python inside a code cell, so the script is started as a subprocess
# with the interpreter that runs this notebook; its output is captured and printed below the cell.
import subprocess
import sys

eval_run = subprocess.run(
    [
        sys.executable, "tsar_eval.py",
        "--gold_file", "./data/trial/tsar2022_en_trial_gold_no_noise.tsv",
        "--predictions_file", "./predictions/trial/SG_MA_SS_bsBertbase_robertabase.tsv",
        "--output_file", "./output/trial/SG_MA_SS_bsBertbase_robertabase.tsv",
    ],
    capture_output=True,
    text=True,
)
print(eval_run.stdout)
if eval_run.stderr:
    print(eval_run.stderr, file=sys.stderr)
eval_run.check_returncode()  # raise if the evaluation script exited with an error

##### BERTScore with bert-large-uncased:

In [None]:
# in each row, for each complex word: 
for index, row in data.iterrows():
    
    # print the sentence and the complex word
    sentence, complex_word = row["sentence"], row["complex_word"]
    #print(f"Sentence: {sentence}")
    #print(f"Complex word: {complex_word}")
    
    
       
    # 1. Substitute Generation (SG): perform masking and generate substitutes:

    ## in the sentence, replace the complex word with a masked word
    sentence_masked_word = sentence.replace(complex_word, lm_tokenizer.mask_token)

    ## concatenate the original sentence and the masked sentence
    sentences_concat= f"{sentence} {lm_tokenizer.sep_token} {sentence_masked_word}"

    ## generate and rank candidate substitutes for the masked word using the fill_mask pipeline (removing elements without token_str key; as this gave errors in the ELECTRA models) .
    top_k = 30
    result= fill_mask(sentences_concat, top_k=top_k)
    substitutes = [substitute["token_str"] for substitute in result if "token_str" in substitute]
    #print(f"Substitute Generation step: initial substitute list: {substitutes}\n")


    #2: Morphological Generation and Context Adaptation (Morphological Adaptation):  
    ## a) remove noise in the substitutes, by ignoring generated substitutes that are empty or that have unwanted punctuation characters or that start with '##' (this returned errors with the ELECTRA model), and lowercase the substitutes (as some models don't lowercase by default)
    ## and lowercase all substitutes. Use try/except statement to prevent other character-related problems to happen

    punctuation_set = set(string.punctuation) - set('-') # retained hyphens in case tokenizers don't split on hyphenated compounds
    punctuation_set.update({'“','”'})   # as these curly quotes appeared in the Electra (SG step) results but were not part of the string set

    try:
        substitutes = [substitute["token_str"].lower().strip() for substitute in result if not any(char in punctuation_set for char in substitute["token_str"]) # added .strip as roberta uses a leading space before each substitute
                      and not substitute["token_str"].startswith('##') and substitute["token_str"].strip() != ""]
        # print(f"Morphological Adaptation step a): substitute list without unwanted punctuation characters: {substitutes}\n")
    except TypeError as error:
        continue



    ## b) remove duplicates within the substitute list from the substitute list (duplicates are likely for models that did not lowercase by default)
    ## the last mentioned duplicate is removed on purpose, as this may probably be the (previously) uppercased variant of the lowercased substitute (lowercased subs are most likely higher ranked by the model)
    substitutes_no_dupl = []
    for sub in substitutes:
        if sub not in substitutes_no_dupl:
            substitutes_no_dupl.append(sub)
    #print(f"Morphological Adaptation step b): substitute list without duplicates of substitutes: {substitutes_no_dupl}\n")



    ## c) remove duplicates and inflected forms of the complex word from the substitute list

    ## first Lemmatize the complex word with spaCy, in order to compare it with the lemmatized substitute later to see if their mutual lemmas are the same
    doc_complex_word = nlp(complex_word)
    complex_word_lemma = doc_complex_word[0].lemma_
    #print(f"complex_word_lemma for complex word '{complex_word}': {complex_word_lemma}\n")


    ## then, remove duplicates and inflected forms of the complex word from the list with substitutes
    substitutes_no_dupl_complex_word = []
    for substitute in substitutes_no_dupl:
        doc_substitute = nlp(substitute)
        substitute_lemma = doc_substitute[0].lemma_
        if substitute_lemma != complex_word_lemma:
            substitutes_no_dupl_complex_word.append(substitute)
    #print(f"Morphological Adaptation step c): substitute list without duplicates of the complex word nor inflected forms of the complex word: {substitutes_no_dupl_complex_word}\n")


     ## d) remove antonyms of the complex word from the substitute list
    ## step 1: get the antonyms of the complex word
    antonyms_complex_word = []
    for syn in wn.synsets(complex_word_lemma):
        for lemma in syn.lemmas():
            for antonym in lemma.antonyms():
                    antonyms_complex_word.append(antonym.name())

    #print(f"Antonyms for complex word '{complex_word}': {antonyms_complex_word}\n")

    ## step 2: remove antonyms of the complex word from the list with substitutes
    substitutes_no_antonyms= []
    for substitute in substitutes_no_dupl_complex_word:
        doc_substitute = nlp(substitute)
        substitute_lemma = doc_substitute[0].lemma_
        if substitute_lemma not in antonyms_complex_word:
            substitutes_no_antonyms.append(substitute)
        # else:
        #     print(f"Removed antonym: {substitute}")
    #print(f"Morphological Adaptation step d): substitute list without antonyms of the complex word: {substitutes_no_antonyms}\n") 
  
    
    
    
    #3: Substitute Selection (SS) by calculating Bert scores: 

    ## create sentence with the complex word replaced by the substitutes
    sentence_with_substitutes = [sentence.replace(complex_word, sub) for sub in substitutes_no_antonyms]
    #print(f"List with sentences where complex word is substituted: {sentence_with_substitutes}\n")


    ## calculate BERTScores, and rank the substitutes based on these scores
    if len(sentence_with_substitutes) > 0: # to make sure the list with substitutes is always filled
        logging.getLogger('transformers').setLevel(logging.ERROR)  # to prevent the same warnings from being printed x times 
        scores = bert_score.score([sentence]*len(sentence_with_substitutes), sentence_with_substitutes, lang="en", model_type='bert-large-uncased', verbose=False)
        logging.getLogger('transformers').setLevel(logging.WARNING) # to reset the logging level back to printing warnings
        
        # create a list of tuples, each tuple containing a substitute and its score
        substitute_score_pairs = list(zip(substitutes_no_antonyms, scores[0].tolist()))

        # sort the list of tuples by the scores (the second element of each tuple), in descending order
        sorted_substitute_score_pairs = sorted(substitute_score_pairs, key=lambda x: x[1], reverse=True)

        # # print each substitute with its score
        # for substitute, score in sorted_substitute_score_pairs:
        #     print(f"Substitute: {substitute}, BertScore: {score}")

        # extract the list of substitutes from the sorted pairs
        bertscore_ranked_substitutes_only = [substitute for substitute, _ in sorted_substitute_score_pairs]
        #print(f"substitutes based on bertscores in context: {bertscore_ranked_substitutes_only}\n")

        # limit the substitutes to the 10 first ones for evaluation
        bertscore_top_10_substitutes = bertscore_ranked_substitutes_only[:10]
        #print(f"top-10 substitutes based on bertscores in context: {bertscore_top_10_substitutes}\n")

    else:
        bertscore_top_10_substitutes = []


    ## add the results to the dataframe
    # fill the dataframe with 10 elements even if there are less than 10 in the previous list
    required_for_dataframe = 10

    # pad the list with None until it has 10 elements
    bertscore_top_10_substitutes += [None] * (required_for_dataframe - len(bertscore_top_10_substitutes))
  


    # add the sentence, complex_word, and substitutes to the dataframe 
    substitutes_df.loc[index] = [sentence, complex_word] + bertscore_top_10_substitutes

    #print('---------------------------------------------------------------------------------------------------------------------------------------------')
   
    
    
# export the dataframe to a tab-separated file for evaluation (no index column, no header row)
substitutes_df.to_csv("./predictions/trial/SG_MA_SS_bsBertlarge_robertabase.tsv", sep="\t", index=False, header=False)
# confirmation message; the quoted path is closed by the single quote only (no trailing brace)
print("SG_MA_SS_bsBertlarge_robertabase exported to csv in path './predictions/trial/SG_MA_SS_bsBertlarge_robertabase.tsv'\n")

python tsar_eval.py --gold_file ./data/trial/tsar2022_en_trial_gold_no_noise.tsv --predictions_file ./predictions/trial/SG_MA_SS_bsBertlarge_robertabase.tsv --output_file ./output/trial/SG_MA_SS_bsBertlarge_robertabase.tsv

##### bs with electrabase:

In [None]:
# For every (sentence, complex word) pair in `data`:
#   1. SG: generate candidate substitutes with the fill-mask pipeline,
#   2. MA: clean the candidates (noise, duplicates, forms of the complex word, antonyms),
#   3. SS: rank the remaining candidates with BERTScore (google/electra-base-generator),
# then store the ten best-ranked substitutes in `substitutes_df` and export it as tsv.

# loop-invariant settings
top_k = 30                    # number of candidates requested from the fill-mask pipeline
required_for_dataframe = 10   # number of substitute columns in substitutes_df

# characters that disqualify a candidate; hyphens are retained in case tokenizers don't split on hyphenated compounds
punctuation_set = set(string.punctuation) - set('-')
punctuation_set.update({'“', '”'})   # curly quotes appeared in the Electra (SG step) results but are not part of string.punctuation

for index, row in data.iterrows():

    sentence, complex_word = row["sentence"], row["complex_word"]
    #print(f"Sentence: {sentence}")
    #print(f"Complex word: {complex_word}")


    # 1. Substitute Generation (SG): perform masking and generate substitutes:

    ## in the sentence, replace the complex word with the mask token
    ## NOTE(review): str.replace masks every occurrence of the complex word — confirm that this is intended
    sentence_masked_word = sentence.replace(complex_word, lm_tokenizer.mask_token)

    ## concatenate the original sentence and the masked sentence
    sentences_concat = f"{sentence} {lm_tokenizer.sep_token} {sentence_masked_word}"

    ## generate and rank candidate substitutes for the masked word using the fill_mask pipeline;
    ## entries without a token_str key are dropped here (these gave errors in the ELECTRA models)
    result = fill_mask(sentences_concat, top_k=top_k)
    raw_substitutes = [candidate["token_str"] for candidate in result if "token_str" in candidate]
    #print(f"Substitute Generation step: initial substitute list: {raw_substitutes}\n")


    # 2. Morphological Generation and Context Adaptation (Morphological Adaptation):

    ## a) remove noise: ignore candidates that are empty, contain unwanted punctuation or start with '##',
    ##    and lowercase/strip the rest (roberta uses a leading space before each substitute)
    try:
        substitutes = [
            token.lower().strip()
            for token in raw_substitutes
            if not any(char in punctuation_set for char in token)
            and not token.startswith('##')
            and token.strip() != ""
        ]
    except TypeError:
        # a token_str that is not a string: fall back to an empty candidate list instead of skipping
        # the row, so that the row is still (over)written below and the exported file keeps one
        # line per input row rather than a missing row or values left from an earlier run
        substitutes = []
    #print(f"Morphological Adaptation step a): substitute list without unwanted punctuation characters: {substitutes}\n")

    ## b) remove duplicates within the substitute list; the first (highest-ranked) occurrence is kept
    substitutes_no_dupl = list(dict.fromkeys(substitutes))
    #print(f"Morphological Adaptation step b): substitute list without duplicates of substitutes: {substitutes_no_dupl}\n")

    ## c) + d) remove the complex word itself, its inflected forms, and its antonyms

    ## lemma of the complex word (first spaCy token), compared with the lemma of each substitute
    complex_word_lemma = nlp(complex_word)[0].lemma_
    #print(f"complex_word_lemma for complex word '{complex_word}': {complex_word_lemma}\n")

    ## antonyms of the complex word: every WordNet antonym of every lemma of every synset of its lemma
    antonyms_complex_word = {
        antonym.name()
        for syn in wn.synsets(complex_word_lemma)
        for lemma in syn.lemmas()
        for antonym in lemma.antonyms()
    }
    #print(f"Antonyms for complex word '{complex_word}': {antonyms_complex_word}\n")

    ## each substitute is lemmatized once and checked against both filters
    substitutes_no_antonyms = []
    for substitute in substitutes_no_dupl:
        substitute_lemma = nlp(substitute)[0].lemma_
        if substitute_lemma == complex_word_lemma:
            continue   # same lemma as the complex word (duplicate or inflected form)
        if substitute_lemma in antonyms_complex_word:
            continue   # antonym of the complex word
        substitutes_no_antonyms.append(substitute)
    #print(f"Morphological Adaptation step d): substitute list without antonyms of the complex word: {substitutes_no_antonyms}\n")


    # 3. Substitute Selection (SS) by calculating BERTScores:

    ## create sentences with the complex word replaced by each substitute
    sentence_with_substitutes = [sentence.replace(complex_word, sub) for sub in substitutes_no_antonyms]
    #print(f"List with sentences where complex word is substituted: {sentence_with_substitutes}\n")

    ## calculate BERTScores, and rank the substitutes based on these scores
    if sentence_with_substitutes:   # only score when at least one substitute is left
        transformers_logger = logging.getLogger('transformers')
        transformers_logger.setLevel(logging.ERROR)   # to prevent the same warnings from being printed x times
        try:
            scores = bert_score.score([sentence] * len(sentence_with_substitutes), sentence_with_substitutes, lang="en", model_type='google/electra-base-generator', verbose=False)
        finally:
            transformers_logger.setLevel(logging.WARNING)   # reset the logging level, also when scoring fails

        ## pair each substitute with its score; scores[0] is the first tensor of the returned tuple
        ## (presumably precision — confirm against the bert_score documentation)
        substitute_score_pairs = zip(substitutes_no_antonyms, scores[0].tolist())

        ## sort by score in descending order (stable, so ties keep the model's ranking)
        sorted_substitute_score_pairs = sorted(substitute_score_pairs, key=lambda pair: pair[1], reverse=True)

        ## keep only the substitutes of the best-ranked pairs for evaluation
        bertscore_top_10_substitutes = [substitute for substitute, _ in sorted_substitute_score_pairs[:required_for_dataframe]]
        #print(f"top-10 substitutes based on bertscores in context: {bertscore_top_10_substitutes}\n")
    else:
        bertscore_top_10_substitutes = []


    ## add the results to the dataframe
    ## pad the list with None until it has 10 elements, so that every row fills all substitute columns
    bertscore_top_10_substitutes += [None] * (required_for_dataframe - len(bertscore_top_10_substitutes))

    ## add the sentence, complex_word, and substitutes to the dataframe
    substitutes_df.loc[index] = [sentence, complex_word] + bertscore_top_10_substitutes

    #print('---------------------------------------------------------------------------------------------------------------------------------------------')


# export the dataframe to a tsv file for evaluation
substitutes_df.to_csv("./predictions/trial/SG_MA_SS_bsElectrabase_robertabase.tsv", sep="\t", index=False, header=False)
print("SG_MA_SS_bsElectrabase_robertabase exported to csv in path './predictions/trial/SG_MA_SS_bsElectrabase_robertabase.tsv'\n")

python tsar_eval.py --gold_file ./data/trial/tsar2022_en_trial_gold_no_noise.tsv --predictions_file ./predictions/trial/SG_MA_SS_bsElectrabase_robertabase.tsv --output_file ./output/trial/SG_MA_SS_bsElectrabase_robertabase.tsv

##### bs with electralarge:

# For every (sentence, complex word) pair in `data`:
#   1. SG: generate candidate substitutes with the fill-mask pipeline,
#   2. MA: clean the candidates (noise, duplicates, forms of the complex word, antonyms),
#   3. SS: rank the remaining candidates with BERTScore (google/electra-large-generator),
# then store the ten best-ranked substitutes in `substitutes_df` and export it as tsv.

# loop-invariant settings
top_k = 30                    # number of candidates requested from the fill-mask pipeline
required_for_dataframe = 10   # number of substitute columns in substitutes_df

# characters that disqualify a candidate; hyphens are retained in case tokenizers don't split on hyphenated compounds
punctuation_set = set(string.punctuation) - set('-')
punctuation_set.update({'“', '”'})   # curly quotes appeared in the Electra (SG step) results but are not part of string.punctuation

for index, row in data.iterrows():

    sentence, complex_word = row["sentence"], row["complex_word"]
    #print(f"Sentence: {sentence}")
    #print(f"Complex word: {complex_word}")


    # 1. Substitute Generation (SG): perform masking and generate substitutes:

    ## in the sentence, replace the complex word with the mask token
    ## NOTE(review): str.replace masks every occurrence of the complex word — confirm that this is intended
    sentence_masked_word = sentence.replace(complex_word, lm_tokenizer.mask_token)

    ## concatenate the original sentence and the masked sentence
    sentences_concat = f"{sentence} {lm_tokenizer.sep_token} {sentence_masked_word}"

    ## generate and rank candidate substitutes for the masked word using the fill_mask pipeline;
    ## entries without a token_str key are dropped here (these gave errors in the ELECTRA models)
    result = fill_mask(sentences_concat, top_k=top_k)
    raw_substitutes = [candidate["token_str"] for candidate in result if "token_str" in candidate]
    #print(f"Substitute Generation step: initial substitute list: {raw_substitutes}\n")


    # 2. Morphological Generation and Context Adaptation (Morphological Adaptation):

    ## a) remove noise: ignore candidates that are empty, contain unwanted punctuation or start with '##',
    ##    and lowercase/strip the rest (roberta uses a leading space before each substitute)
    try:
        substitutes = [
            token.lower().strip()
            for token in raw_substitutes
            if not any(char in punctuation_set for char in token)
            and not token.startswith('##')
            and token.strip() != ""
        ]
    except TypeError:
        # a token_str that is not a string: fall back to an empty candidate list instead of skipping
        # the row, so that the row is still (over)written below and the exported file keeps one
        # line per input row rather than a missing row or values left from an earlier run
        substitutes = []
    #print(f"Morphological Adaptation step a): substitute list without unwanted punctuation characters: {substitutes}\n")

    ## b) remove duplicates within the substitute list; the first (highest-ranked) occurrence is kept
    substitutes_no_dupl = list(dict.fromkeys(substitutes))
    #print(f"Morphological Adaptation step b): substitute list without duplicates of substitutes: {substitutes_no_dupl}\n")

    ## c) + d) remove the complex word itself, its inflected forms, and its antonyms

    ## lemma of the complex word (first spaCy token), compared with the lemma of each substitute
    complex_word_lemma = nlp(complex_word)[0].lemma_
    #print(f"complex_word_lemma for complex word '{complex_word}': {complex_word_lemma}\n")

    ## antonyms of the complex word: every WordNet antonym of every lemma of every synset of its lemma
    antonyms_complex_word = {
        antonym.name()
        for syn in wn.synsets(complex_word_lemma)
        for lemma in syn.lemmas()
        for antonym in lemma.antonyms()
    }
    #print(f"Antonyms for complex word '{complex_word}': {antonyms_complex_word}\n")

    ## each substitute is lemmatized once and checked against both filters
    substitutes_no_antonyms = []
    for substitute in substitutes_no_dupl:
        substitute_lemma = nlp(substitute)[0].lemma_
        if substitute_lemma == complex_word_lemma:
            continue   # same lemma as the complex word (duplicate or inflected form)
        if substitute_lemma in antonyms_complex_word:
            continue   # antonym of the complex word
        substitutes_no_antonyms.append(substitute)
    #print(f"Morphological Adaptation step d): substitute list without antonyms of the complex word: {substitutes_no_antonyms}\n")


    # 3. Substitute Selection (SS) by calculating BERTScores:

    ## create sentences with the complex word replaced by each substitute
    sentence_with_substitutes = [sentence.replace(complex_word, sub) for sub in substitutes_no_antonyms]
    #print(f"List with sentences where complex word is substituted: {sentence_with_substitutes}\n")

    ## calculate BERTScores, and rank the substitutes based on these scores
    if sentence_with_substitutes:   # only score when at least one substitute is left
        transformers_logger = logging.getLogger('transformers')
        transformers_logger.setLevel(logging.ERROR)   # to prevent the same warnings from being printed x times
        try:
            scores = bert_score.score([sentence] * len(sentence_with_substitutes), sentence_with_substitutes, lang="en", model_type='google/electra-large-generator', verbose=False)
        finally:
            transformers_logger.setLevel(logging.WARNING)   # reset the logging level, also when scoring fails

        ## pair each substitute with its score; scores[0] is the first tensor of the returned tuple
        ## (presumably precision — confirm against the bert_score documentation)
        substitute_score_pairs = zip(substitutes_no_antonyms, scores[0].tolist())

        ## sort by score in descending order (stable, so ties keep the model's ranking)
        sorted_substitute_score_pairs = sorted(substitute_score_pairs, key=lambda pair: pair[1], reverse=True)

        ## keep only the substitutes of the best-ranked pairs for evaluation
        bertscore_top_10_substitutes = [substitute for substitute, _ in sorted_substitute_score_pairs[:required_for_dataframe]]
        #print(f"top-10 substitutes based on bertscores in context: {bertscore_top_10_substitutes}\n")
    else:
        bertscore_top_10_substitutes = []


    ## add the results to the dataframe
    ## pad the list with None until it has 10 elements, so that every row fills all substitute columns
    bertscore_top_10_substitutes += [None] * (required_for_dataframe - len(bertscore_top_10_substitutes))

    ## add the sentence, complex_word, and substitutes to the dataframe
    substitutes_df.loc[index] = [sentence, complex_word] + bertscore_top_10_substitutes

    #print('---------------------------------------------------------------------------------------------------------------------------------------------')


# export the dataframe to a tsv file for evaluation
substitutes_df.to_csv("./predictions/trial/SG_MA_SS_bsElectralarge_robertabase.tsv", sep="\t", index=False, header=False)
print("SG_MA_SS_bsElectralarge_robertabase exported to csv in path './predictions/trial/SG_MA_SS_bsElectralarge_robertabase.tsv'\n")

python tsar_eval.py --gold_file ./data/trial/tsar2022_en_trial_gold_no_noise.tsv --predictions_file ./predictions/trial/SG_MA_SS_bsElectralarge_robertabase.tsv --output_file ./output/trial/SG_MA_SS_bsElectralarge_robertabase.tsv

##### bs with robertabase:

In [None]:
# For every (sentence, complex word) pair in `data`:
#   1. SG: generate candidate substitutes with the fill-mask pipeline,
#   2. MA: clean the candidates (noise, duplicates, forms of the complex word, antonyms),
#   3. SS: rank the remaining candidates with BERTScore (roberta-base),
# then store the ten best-ranked substitutes in `substitutes_df` and export it as tsv.

# loop-invariant settings
top_k = 30                    # number of candidates requested from the fill-mask pipeline
required_for_dataframe = 10   # number of substitute columns in substitutes_df

# characters that disqualify a candidate; hyphens are retained in case tokenizers don't split on hyphenated compounds
punctuation_set = set(string.punctuation) - set('-')
punctuation_set.update({'“', '”'})   # curly quotes appeared in the Electra (SG step) results but are not part of string.punctuation

for index, row in data.iterrows():

    sentence, complex_word = row["sentence"], row["complex_word"]
    #print(f"Sentence: {sentence}")
    #print(f"Complex word: {complex_word}")


    # 1. Substitute Generation (SG): perform masking and generate substitutes:

    ## in the sentence, replace the complex word with the mask token
    ## NOTE(review): str.replace masks every occurrence of the complex word — confirm that this is intended
    sentence_masked_word = sentence.replace(complex_word, lm_tokenizer.mask_token)

    ## concatenate the original sentence and the masked sentence
    sentences_concat = f"{sentence} {lm_tokenizer.sep_token} {sentence_masked_word}"

    ## generate and rank candidate substitutes for the masked word using the fill_mask pipeline;
    ## entries without a token_str key are dropped here (these gave errors in the ELECTRA models)
    result = fill_mask(sentences_concat, top_k=top_k)
    raw_substitutes = [candidate["token_str"] for candidate in result if "token_str" in candidate]
    #print(f"Substitute Generation step: initial substitute list: {raw_substitutes}\n")


    # 2. Morphological Generation and Context Adaptation (Morphological Adaptation):

    ## a) remove noise: ignore candidates that are empty, contain unwanted punctuation or start with '##',
    ##    and lowercase/strip the rest (roberta uses a leading space before each substitute)
    try:
        substitutes = [
            token.lower().strip()
            for token in raw_substitutes
            if not any(char in punctuation_set for char in token)
            and not token.startswith('##')
            and token.strip() != ""
        ]
    except TypeError:
        # a token_str that is not a string: fall back to an empty candidate list instead of skipping
        # the row, so that the row is still (over)written below and the exported file keeps one
        # line per input row rather than a missing row or values left from an earlier run
        substitutes = []
    #print(f"Morphological Adaptation step a): substitute list without unwanted punctuation characters: {substitutes}\n")

    ## b) remove duplicates within the substitute list; the first (highest-ranked) occurrence is kept
    substitutes_no_dupl = list(dict.fromkeys(substitutes))
    #print(f"Morphological Adaptation step b): substitute list without duplicates of substitutes: {substitutes_no_dupl}\n")

    ## c) + d) remove the complex word itself, its inflected forms, and its antonyms

    ## lemma of the complex word (first spaCy token), compared with the lemma of each substitute
    complex_word_lemma = nlp(complex_word)[0].lemma_
    #print(f"complex_word_lemma for complex word '{complex_word}': {complex_word_lemma}\n")

    ## antonyms of the complex word: every WordNet antonym of every lemma of every synset of its lemma
    antonyms_complex_word = {
        antonym.name()
        for syn in wn.synsets(complex_word_lemma)
        for lemma in syn.lemmas()
        for antonym in lemma.antonyms()
    }
    #print(f"Antonyms for complex word '{complex_word}': {antonyms_complex_word}\n")

    ## each substitute is lemmatized once and checked against both filters
    substitutes_no_antonyms = []
    for substitute in substitutes_no_dupl:
        substitute_lemma = nlp(substitute)[0].lemma_
        if substitute_lemma == complex_word_lemma:
            continue   # same lemma as the complex word (duplicate or inflected form)
        if substitute_lemma in antonyms_complex_word:
            continue   # antonym of the complex word
        substitutes_no_antonyms.append(substitute)
    #print(f"Morphological Adaptation step d): substitute list without antonyms of the complex word: {substitutes_no_antonyms}\n")


    # 3. Substitute Selection (SS) by calculating BERTScores:

    ## create sentences with the complex word replaced by each substitute
    sentence_with_substitutes = [sentence.replace(complex_word, sub) for sub in substitutes_no_antonyms]
    #print(f"List with sentences where complex word is substituted: {sentence_with_substitutes}\n")

    ## calculate BERTScores, and rank the substitutes based on these scores
    if sentence_with_substitutes:   # only score when at least one substitute is left
        transformers_logger = logging.getLogger('transformers')
        transformers_logger.setLevel(logging.ERROR)   # to prevent the same warnings from being printed x times
        try:
            scores = bert_score.score([sentence] * len(sentence_with_substitutes), sentence_with_substitutes, lang="en", model_type='roberta-base', verbose=False)
        finally:
            transformers_logger.setLevel(logging.WARNING)   # reset the logging level, also when scoring fails

        ## pair each substitute with its score; scores[0] is the first tensor of the returned tuple
        ## (presumably precision — confirm against the bert_score documentation)
        substitute_score_pairs = zip(substitutes_no_antonyms, scores[0].tolist())

        ## sort by score in descending order (stable, so ties keep the model's ranking)
        sorted_substitute_score_pairs = sorted(substitute_score_pairs, key=lambda pair: pair[1], reverse=True)

        ## keep only the substitutes of the best-ranked pairs for evaluation
        bertscore_top_10_substitutes = [substitute for substitute, _ in sorted_substitute_score_pairs[:required_for_dataframe]]
        #print(f"top-10 substitutes based on bertscores in context: {bertscore_top_10_substitutes}\n")
    else:
        bertscore_top_10_substitutes = []


    ## add the results to the dataframe
    ## pad the list with None until it has 10 elements, so that every row fills all substitute columns
    bertscore_top_10_substitutes += [None] * (required_for_dataframe - len(bertscore_top_10_substitutes))

    ## add the sentence, complex_word, and substitutes to the dataframe
    substitutes_df.loc[index] = [sentence, complex_word] + bertscore_top_10_substitutes

    #print('---------------------------------------------------------------------------------------------------------------------------------------------')


# export the dataframe to a tsv file for evaluation
substitutes_df.to_csv("./predictions/trial/SG_MA_SS_bsRobertabase_robertabase.tsv", sep="\t", index=False, header=False)
print("SG_MA_SS_bsRobertabase_robertabase exported to csv in path './predictions/trial/SG_MA_SS_bsRobertabase_robertabase.tsv'\n")

python tsar_eval.py --gold_file ./data/trial/tsar2022_en_trial_gold_no_noise.tsv --predictions_file ./predictions/trial/SG_MA_SS_bsRobertabase_robertabase.tsv --output_file ./output/trial/SG_MA_SS_bsRobertabase_robertabase.tsv

##### bs with robertalarge:

In [None]:
# in each row, for each complex word: 
for index, row in data.iterrows():
    
    # print the sentence and the complex word
    sentence, complex_word = row["sentence"], row["complex_word"]
    #print(f"Sentence: {sentence}")
    #print(f"Complex word: {complex_word}")
    
       
    # 1. Substitute Generation (SG): perform masking and generate substitutes:

    ## in the sentence, replace the complex word with a masked word
    sentence_masked_word = sentence.replace(complex_word, lm_tokenizer.mask_token)

    ## concatenate the original sentence and the masked sentence
    sentences_concat = f"{sentence} {lm_tokenizer.sep_token} {sentence_masked_word}"

    ## generate and rank candidate substitutes for the masked word using the fill_mask pipeline (removing elements without token_str key; as this gave errors in the ELECTRA models) .
    top_k = 30
    result = fill_mask(sentences_concat, top_k=top_k)
    substitutes = [substitute["token_str"] for substitute in result if "token_str" in substitute]
    #print(f"Substitute Generation step: initial substitute list: {substitutes}\n")


    #2: Morphological Generation and Context Adaptation (Morphological Adaptation):  
    ## a) remove noise in the substitutes, by ignoring generated substitutes that are empty or that have unwanted punctuation characters or that start with '##' (this returned errors with the ELECTRA model), and lowercase the substitutes (as some models don't lowercase by default)
    ## and lowercase all substitutes. Use try/except statement to prevent other character-related problems to happen

    punctuation_set = set(string.punctuation) - set('-') # retained hyphens in case tokenizers don't split on hyphenated compounds
    punctuation_set.update({'“','”'})   # as these curly quotes appeared in the Electra (SG step) results but were not part of the string set

    try:
        substitutes = [substitute["token_str"].lower().strip() for substitute in result if not any(char in punctuation_set for char in substitute["token_str"]) # added .strip as roberta uses a leading space before each substitute
                      and not substitute["token_str"].startswith('##') and substitute["token_str"].strip() != ""]
        # print(f"Morphological Adaptation step a): substitute list without unwanted punctuation characters: {substitutes}\n")
    except TypeError as error:
        continue



    ## b) remove duplicates within the substitute list from the substitute list (duplicates are likely for models that did not lowercase by default)
    ## the last mentioned duplicate is removed on purpose, as this may probably be the (previously) uppercased variant of the lowercased substitute (lowercased subs are most likely higher ranked by the model)
    substitutes_no_dupl = []
    for sub in substitutes:
        if sub not in substitutes_no_dupl:
            substitutes_no_dupl.append(sub)
    #print(f"Morphological Adaptation step b): substitute list without duplicates of substitutes: {substitutes_no_dupl}\n")



    ## c) remove duplicates and inflected forms of the complex word from the substitute list

    ## first Lemmatize the complex word with spaCy, in order to compare it with the lemmatized substitute later to see if their mutual lemmas are the same
    doc_complex_word = nlp(complex_word)
    complex_word_lemma = doc_complex_word[0].lemma_
    #print(f"complex_word_lemma for complex word '{complex_word}': {complex_word_lemma}\n")


    ## then, remove duplicates and inflected forms of the complex word from the list with substitutes
    substitutes_no_dupl_complex_word = []
    for substitute in substitutes_no_dupl:
        doc_substitute = nlp(substitute)
        substitute_lemma = doc_substitute[0].lemma_
        if substitute_lemma != complex_word_lemma:
            substitutes_no_dupl_complex_word.append(substitute)
    #print(f"Morphological Adaptation step c): substitute list without duplicates of the complex word nor inflected forms of the complex word: {substitutes_no_dupl_complex_word}\n")


     ## d) remove antonyms of the complex word from the substitute list
    ## step 1: get the antonyms of the complex word
    antonyms_complex_word = []
    for syn in wn.synsets(complex_word_lemma):
        for lemma in syn.lemmas():
            for antonym in lemma.antonyms():
                    antonyms_complex_word.append(antonym.name())

    #print(f"Antonyms for complex word '{complex_word}': {antonyms_complex_word}\n")

    ## step 2: remove antonyms of the complex word from the list with substitutes
    substitutes_no_antonyms = []
    for substitute in substitutes_no_dupl_complex_word:
        doc_substitute = nlp(substitute)
        substitute_lemma = doc_substitute[0].lemma_
        if substitute_lemma not in antonyms_complex_word:
            substitutes_no_antonyms.append(substitute)
        # else:
        #     print(f"Removed antonym: {substitute}")
    #print(f"Morphological Adaptation step d): substitute list without antonyms of the complex word: {substitutes_no_antonyms}\n") 
  
    
    
    
    
    
    #3: Substitute Selection (SS) by calculating Bert scores: 

    ## create sentence with the complex word replaced by the substitutes
    sentence_with_substitutes = [sentence.replace(complex_word, sub) for sub in substitutes_no_antonyms]
    #print(f"List with sentences where complex word is substituted: {sentence_with_substitutes}\n")


    ## calculate BERTScores, and rank the substitutes based on these scores
    if len(sentence_with_substitutes) > 0: # to make sure the list with substitutes is always filled
        logging.getLogger('transformers').setLevel(logging.ERROR)  # to prevent the same warnings from being printed x times 
        scores = bert_score.score([sentence]*len(sentence_with_substitutes), sentence_with_substitutes, lang="en", model_type='roberta-large', verbose=False)
        logging.getLogger('transformers').setLevel(logging.WARNING) # to reset the logging level back to printing warnings
        
        # create a list of tuples, each tuple containing a substitute and its score
        substitute_score_pairs = list(zip(substitutes_no_antonyms, scores[0].tolist()))

        # sort the list of tuples by the scores (the second element of each tuple), in descending order
        sorted_substitute_score_pairs = sorted(substitute_score_pairs, key=lambda x: x[1], reverse=True)

        # # print each substitute with its score
        # for substitute, score in sorted_substitute_score_pairs:
        #     print(f"Substitute: {substitute}, BertScore: {score}")

        # extract the list of substitutes from the sorted pairs
        bertscore_ranked_substitutes_only = [substitute for substitute, _ in sorted_substitute_score_pairs]
        #print(f"substitutes based on bertscores in context: {bertscore_ranked_substitutes_only}\n")

        # limit the substitutes to the 10 first ones for evaluation
        bertscore_top_10_substitutes = bertscore_ranked_substitutes_only[:10]
        #print(f"top-10 substitutes based on bertscores in context: {bertscore_top_10_substitutes}\n")

    else:
        bertscore_top_10_substitutes = []


    ## add the results to the dataframe
    # fill the dataframe with 10 elements even if there are less than 10 in the previous list
    required_for_dataframe = 10

    # pad the list with None until it has 10 elements
    bertscore_top_10_substitutes += [None] * (required_for_dataframe - len(bertscore_top_10_substitutes))
  


    # add the sentence, complex_word, and substitutes to the dataframe 
    substitutes_df.loc[index] = [sentence, complex_word] + bertscore_top_10_substitutes

    #print('---------------------------------------------------------------------------------------------------------------------------------------------')
   
   
    
# write the collected predictions to disk as a tab-separated file without header
# or index column, which is the layout passed to the evaluation script
substitutes_df.to_csv(
    "./predictions/trial/SG_MA_SS_bsRobertalarge_robertabase.tsv",
    sep="\t",
    header=False,
    index=False,
)
print("SG_MA_SS_bsRobertalarge_robertabase exported to csv in path './predictions/trial/SG_MA_SS_bsRobertalarge_robertabase.tsv'}\n")

python tsar_eval.py --gold_file ./data/trial/tsar2022_en_trial_gold_no_noise.tsv --predictions_file ./predictions/trial/SG_MA_SS_bsRobertalarge_robertabase.tsv --output_file ./output/trial/SG_MA_SS_bsRobertalarge_robertabase.tsv

#### Roberta-large:

In [7]:
# Load the roberta-large checkpoint: the masked-language model and its matching tokenizer
lm_model = AutoModelForMaskedLM.from_pretrained("roberta-large")
lm_tokenizer = AutoTokenizer.from_pretrained("roberta-large")

# Wrap both in a fill-mask pipeline; the cells below use it to generate candidate substitutes
fill_mask = pipeline("fill-mask", model=lm_model, tokenizer=lm_tokenizer)

In [8]:
# SG + MA with roberta-large: for each (sentence, complex word) row, generate
# candidate substitutes with the fill-mask pipeline, clean them up, and store
# the 10 highest-ranked survivors in substitutes_df.

top_k = 30                   # number of candidates requested from the fill-mask pipeline
required_for_dataframe = 10  # number of substitute columns in substitutes_df

# characters that disqualify a candidate; hyphens are retained in case tokenizers
# don't split on hyphenated compounds, and the curly quotes are added because they
# appeared in the ELECTRA (SG step) results but are not part of string.punctuation.
# Built once, before the loop, because it does not depend on the row.
punctuation_set = set(string.punctuation) - set('-')
punctuation_set.update({'“','”'})

for index, row in data.iterrows():
    sentence, complex_word = row["sentence"], row["complex_word"]

    # 1. Substitute Generation (SG): perform masking and generate substitutes
    ## replace the complex word with the mask token and feed
    ## "<original sentence> <sep> <masked sentence>" to the model
    sentence_masked_word = sentence.replace(complex_word, lm_tokenizer.mask_token)
    sentences_concat = f"{sentence} {lm_tokenizer.sep_token} {sentence_masked_word}"
    result = fill_mask(sentences_concat, top_k=top_k)

    # 2. Morphological Generation and Context Adaptation (Morphological Adaptation)
    ## a) remove noise: skip entries without a token_str key (these gave errors with the
    ##    ELECTRA models), candidates containing unwanted punctuation, candidates starting
    ##    with '##', and empty candidates; lowercase and strip the rest (roberta puts a
    ##    leading space before each substitute)
    try:
        substitutes = [
            substitute["token_str"].lower().strip()
            for substitute in result
            if "token_str" in substitute
            and not any(char in punctuation_set for char in substitute["token_str"])
            and not substitute["token_str"].startswith('##')
            and substitute["token_str"].strip() != ""
        ]
    except TypeError:
        # substitutes_df is shared between experiments, so silently skipping the row would
        # leave a stale (or missing) prediction in the exported file; store an explicit
        # empty row instead.
        # NOTE(review): presumably triggered by unexpectedly shaped pipeline output - confirm
        print(f"No usable fill-mask output for complex word '{complex_word}'; storing an empty prediction row\n")
        substitutes_df.loc[index] = [sentence, complex_word] + [None] * required_for_dataframe
        continue

    ## b) remove duplicates, keeping the first occurrence: a later duplicate is probably the
    ##    previously uppercased variant, which the model most likely ranked lower
    substitutes_no_dupl = list(dict.fromkeys(substitutes))

    ## c) + d) remove the complex word itself, its inflected forms, and its antonyms.
    ## Both filters compare spaCy lemmas, so each substitute is lemmatized only once.
    complex_word_lemma = nlp(complex_word)[0].lemma_

    ## WordNet antonyms of every sense of the complex word's lemma
    antonyms_complex_word = [
        antonym.name()
        for syn in wn.synsets(complex_word_lemma)
        for lemma in syn.lemmas()
        for antonym in lemma.antonyms()
    ]

    substitutes_no_dupl_complex_word = []  # result of step c)
    substitutes_no_antonyms = []           # result of step d)
    for substitute in substitutes_no_dupl:
        substitute_lemma = nlp(substitute)[0].lemma_
        if substitute_lemma == complex_word_lemma:
            continue  # same lemma as the complex word: duplicate or inflected form
        substitutes_no_dupl_complex_word.append(substitute)
        if substitute_lemma not in antonyms_complex_word:
            substitutes_no_antonyms.append(substitute)

    # limit the substitutes to the 10 highest ranked ones for evaluation,
    # padding with None so that every dataframe row has exactly 10 substitute columns
    top_10_substitutes = substitutes_no_antonyms[:required_for_dataframe]
    top_10_substitutes += [None] * (required_for_dataframe - len(top_10_substitutes))

    # add the sentence, complex_word, and the substitutes to the dataframe
    substitutes_df.loc[index] = [sentence, complex_word] + top_10_substitutes


# export the dataframe to a tsv file for evaluation
substitutes_df.to_csv("./predictions/trial/SG_MA_robertalarge.tsv", sep="\t", index=False, header=False)
print("SG_MA_robertalarge exported to csv in path './predictions/trial/SG_MA_robertalarge.tsv'}\n")


Antonyms for complex word 'compulsory': []

Morphological Adaptation step d): substitute list without antonyms of the complex word for robertabase model: ['mandatory', 'mandated', 'voluntary', 'obligatory', 'statutory', 'redundant', 'enforced', 'routine', 'relevant', 'required', 'vital', 'clandestine', 'obliged', 'bureaucratic', 'ministerial', 'mandate', 'necessary', 'lifelong', 'strict', 'gradual', 'demanded', 'lax', 'continuous', 'practicable', 'indispensable', 'forced', 'habitual', 'plain', 'preferable']

Antonyms for complex word 'instilled': []

Morphological Adaptation step d): substitute list without antonyms of the complex word for robertabase model: ['infused', 'injected', 'filled', 'inst', 'invested', 'illed', 'impressed', 'infected', 'revived', 'endowed', 'gifted', 'reassured', 'implanted', 'infiltrated', 'pumped', 'inject', 'flooded', 'sprinkled', 'installed', 'vested', 'thrilled', 'assured', 'penetrated', 'hit', 'provided', 'insulated', 'vaccinated', 'stocked', 'stoked', '

In [None]:
python tsar_eval.py --gold_file ./data/trial/tsar2022_en_trial_gold_no_noise.tsv --predictions_file ./predictions/trial/SG_MA_robertalarge.tsv --output_file ./output/trial/SG_MA_robertalarge.tsv

#### Substitute Generation, Morphological Adaptation, and Contextualized embeddings:

In [8]:
# Calculates the similarity between the original sentence and each sentence in which
# the complex word has been replaced by a candidate substitute from the SG step.

# cache for the TensorFlow encoder, keyed by checkpoint name, so that the weights are
# loaded once instead of on every call of calculate_similarity_scores
_SIMILARITY_MODELS = {}


def calculate_similarity_scores(sentence, sentence_with_substitutes):
    """Return the cosine similarity of `sentence` to each substituted sentence.

    Every text is tokenized with the global `lm_tokenizer` and encoded with the
    "roberta-large" TensorFlow model; the hidden state at position 0 of each
    sequence is used as its embedding and L2-normalized, so the inner product
    of two embeddings equals their cosine similarity.

    Args:
        sentence: the original sentence (a single string).
        sentence_with_substitutes: list of sentences, one per candidate substitute.

    Returns:
        One similarity score per entry of `sentence_with_substitutes`, in the
        same order.
    """
    tokenizer = lm_tokenizer
    model_name = "roberta-large"

    # load the encoder only on first use; later calls reuse the cached instance
    tf_model = _SIMILARITY_MODELS.get(model_name)
    if tf_model is None:
        tf_model = TFAutoModel.from_pretrained(model_name)
        _SIMILARITY_MODELS[model_name] = tf_model

    def embed_text(text):
        # accepts a single string or a list of strings; padding makes a list batchable
        tokens = tokenizer(text, padding=True, truncation=True, return_tensors="tf")
        outputs = tf_model(**tokens)
        # keep only the hidden state of the first token of every sequence
        embeddings = outputs.last_hidden_state[:, 0, :]
        embeddings = tf.nn.l2_normalize(embeddings, axis=1)
        return embeddings

    original_sentence_embedding = embed_text(sentence)
    substitute_sentence_embeddings = embed_text(sentence_with_substitutes)

    # inner product of unit vectors = cosine similarity; row 0 belongs to the
    # single original sentence
    cosine_similarity = np.inner(original_sentence_embedding, substitute_sentence_embeddings)
    similarity_scores = cosine_similarity[0]

    return similarity_scores

In [9]:
# SG + MA + SS (contextualized embeddings) with roberta-large: for each
# (sentence, complex word) row, generate candidate substitutes with the fill-mask
# pipeline, clean them up, rank them by the cosine similarity between the original
# sentence and the sentence with the substitute filled in, and store the top 10.

top_k = 30                   # number of candidates requested from the fill-mask pipeline
required_for_dataframe = 10  # number of substitute columns in substitutes_df

# characters that disqualify a candidate; hyphens are retained in case tokenizers
# don't split on hyphenated compounds, and the curly quotes are added because they
# appeared in the ELECTRA (SG step) results but are not part of string.punctuation.
# Built once, before the loop, because it does not depend on the row.
punctuation_set = set(string.punctuation) - set('-')
punctuation_set.update({'“','”'})

for index, row in data.iterrows():
    sentence, complex_word = row["sentence"], row["complex_word"]

    # 1. Substitute Generation (SG): perform masking and generate substitutes
    ## replace the complex word with the mask token and feed
    ## "<original sentence> <sep> <masked sentence>" to the model
    sentence_masked_word = sentence.replace(complex_word, lm_tokenizer.mask_token)
    sentences_concat = f"{sentence} {lm_tokenizer.sep_token} {sentence_masked_word}"
    result = fill_mask(sentences_concat, top_k=top_k)

    # 2. Morphological Generation and Context Adaptation (Morphological Adaptation)
    ## a) remove noise: skip entries without a token_str key (these gave errors with the
    ##    ELECTRA models), candidates containing unwanted punctuation, candidates starting
    ##    with '##', and empty candidates; lowercase and strip the rest (roberta puts a
    ##    leading space before each substitute)
    try:
        substitutes = [
            substitute["token_str"].lower().strip()
            for substitute in result
            if "token_str" in substitute
            and not any(char in punctuation_set for char in substitute["token_str"])
            and not substitute["token_str"].startswith('##')
            and substitute["token_str"].strip() != ""
        ]
    except TypeError:
        # substitutes_df is shared between experiments, so silently skipping the row would
        # leave a stale (or missing) prediction in the exported file; store an explicit
        # empty row instead.
        # NOTE(review): presumably triggered by unexpectedly shaped pipeline output - confirm
        print(f"No usable fill-mask output for complex word '{complex_word}'; storing an empty prediction row\n")
        substitutes_df.loc[index] = [sentence, complex_word] + [None] * required_for_dataframe
        continue

    ## b) remove duplicates, keeping the first occurrence: a later duplicate is probably the
    ##    previously uppercased variant, which the model most likely ranked lower
    substitutes_no_dupl = list(dict.fromkeys(substitutes))

    ## c) + d) remove the complex word itself, its inflected forms, and its antonyms.
    ## Both filters compare spaCy lemmas, so each substitute is lemmatized only once.
    complex_word_lemma = nlp(complex_word)[0].lemma_

    ## WordNet antonyms of every sense of the complex word's lemma
    antonyms_complex_word = [
        antonym.name()
        for syn in wn.synsets(complex_word_lemma)
        for lemma in syn.lemmas()
        for antonym in lemma.antonyms()
    ]

    substitutes_no_dupl_complex_word = []  # result of step c)
    substitutes_no_antonyms = []           # result of step d)
    for substitute in substitutes_no_dupl:
        substitute_lemma = nlp(substitute)[0].lemma_
        if substitute_lemma == complex_word_lemma:
            continue  # same lemma as the complex word: duplicate or inflected form
        substitutes_no_dupl_complex_word.append(substitute)
        if substitute_lemma not in antonyms_complex_word:
            substitutes_no_antonyms.append(substitute)

    # 3. Substitute Selection (SS) by contextualized embeddings and cosine similarity scores
    ## create one sentence per substitute, with the complex word replaced by that substitute
    sentence_with_substitutes = [sentence.replace(complex_word, sub) for sub in substitutes_no_antonyms]

    ## calculate cosine similarity scores, and rank the substitutes on them (highest first)
    if sentence_with_substitutes:  # scoring is only possible when candidates are left
        # silence transformers warnings while scoring so they are not repeated for every
        # row; the finally clause restores the level even if scoring raises
        logging.getLogger('transformers').setLevel(logging.ERROR)
        try:
            similarity_scores = calculate_similarity_scores(sentence, sentence_with_substitutes)
        finally:
            logging.getLogger('transformers').setLevel(logging.WARNING)

        embeddings_ranked_substitutes_with_scores = sorted(
            zip(substitutes_no_antonyms, similarity_scores),
            key=lambda pair: pair[1],
            reverse=True,
        )
        embeddings_ranked_substitutes_only = [
            substitute for substitute, _ in embeddings_ranked_substitutes_with_scores
        ]

        # limit the substitutes to the 10 first ones for evaluation
        embeddings_top_10_substitutes = embeddings_ranked_substitutes_only[:required_for_dataframe]
    else:
        embeddings_top_10_substitutes = []

    # pad with None so that every dataframe row has exactly 10 substitute columns
    embeddings_top_10_substitutes += [None] * (required_for_dataframe - len(embeddings_top_10_substitutes))

    # add the sentence, complex_word, and the substitutes to the dataframe
    substitutes_df.loc[index] = [sentence, complex_word] + embeddings_top_10_substitutes


# export the dataframe to a tsv file for evaluation
substitutes_df.to_csv("./predictions/trial/SG_MA_SS_ceRobertalarge_robertalarge.tsv", sep="\t", index=False, header=False) 
print("SG_MA_SS_ceRobertalarge_robertalarge exported to csv in path './predictions/trial/SG_MA_SS_ceRobertalarge_robertalarge.tsv'}\n")

Sentence: A Spanish government source, however, later said that banks able to cover by themselves losses on their toxic property assets will not be forced to remove them from their books while it will be compulsory for those receiving public help.
Complex word: compulsory
top-10 substitutes based on embedding scores in context: [' compulsory', ' mandatory', ' required', ' requirement', ' necessary', ' possible', ' optional', ' voluntary', ' routine', ' prerequisite']

---------------------------------------------------------------------------------------------------------------------------------------------
Sentence: Rajoy's conservative government had instilled markets with a brief dose of confidence by stepping into Bankia, performing a U-turn on its refusal to spend public money to rescue banks.
Complex word: instilled
top-10 substitutes based on embedding scores in context: ['filled', ' provided', ' bolstered', ' struck', 'illed', ' filled', ' undermined', ' stirred', ' stunned', '

python tsar_eval.py --gold_file ./data/trial/tsar2022_en_trial_gold_no_noise.tsv --predictions_file ./predictions/trial/SG_MA_SS_ceRobertalarge_robertalarge.tsv --output_file ./output/trial/SG_MA_SS_ceRobertalarge_robertalarge.tsv

Result: BertBase_SG_MA_SS_ce is better than BertBase_SG_MA

#### Substitute Generation, Morphological Adaptation, and BertScore:

##### BERTScore based on BERT-base:

In [None]:
# SG + MA + SS (BERTScore, bert-base-uncased) with roberta-large: for each
# (sentence, complex word) row, generate candidate substitutes with the fill-mask
# pipeline, clean them up, rank them by the BERTScore between the original sentence
# and the sentence with the substitute filled in, and store the top 10.

top_k = 30                   # number of candidates requested from the fill-mask pipeline
required_for_dataframe = 10  # number of substitute columns in substitutes_df

# characters that disqualify a candidate; hyphens are retained in case tokenizers
# don't split on hyphenated compounds, and the curly quotes are added because they
# appeared in the ELECTRA (SG step) results but are not part of string.punctuation.
# Built once, before the loop, because it does not depend on the row.
punctuation_set = set(string.punctuation) - set('-')
punctuation_set.update({'“','”'})

for index, row in data.iterrows():
    sentence, complex_word = row["sentence"], row["complex_word"]

    # 1. Substitute Generation (SG): perform masking and generate substitutes
    ## replace the complex word with the mask token and feed
    ## "<original sentence> <sep> <masked sentence>" to the model
    sentence_masked_word = sentence.replace(complex_word, lm_tokenizer.mask_token)
    sentences_concat = f"{sentence} {lm_tokenizer.sep_token} {sentence_masked_word}"
    result = fill_mask(sentences_concat, top_k=top_k)

    # 2. Morphological Generation and Context Adaptation (Morphological Adaptation)
    ## a) remove noise: skip entries without a token_str key (these gave errors with the
    ##    ELECTRA models), candidates containing unwanted punctuation, candidates starting
    ##    with '##', and empty candidates; lowercase and strip the rest (roberta puts a
    ##    leading space before each substitute)
    try:
        substitutes = [
            substitute["token_str"].lower().strip()
            for substitute in result
            if "token_str" in substitute
            and not any(char in punctuation_set for char in substitute["token_str"])
            and not substitute["token_str"].startswith('##')
            and substitute["token_str"].strip() != ""
        ]
    except TypeError:
        # substitutes_df is shared between experiments, so silently skipping the row would
        # leave a stale (or missing) prediction in the exported file; store an explicit
        # empty row instead.
        # NOTE(review): presumably triggered by unexpectedly shaped pipeline output - confirm
        print(f"No usable fill-mask output for complex word '{complex_word}'; storing an empty prediction row\n")
        substitutes_df.loc[index] = [sentence, complex_word] + [None] * required_for_dataframe
        continue

    ## b) remove duplicates, keeping the first occurrence: a later duplicate is probably the
    ##    previously uppercased variant, which the model most likely ranked lower
    substitutes_no_dupl = list(dict.fromkeys(substitutes))

    ## c) + d) remove the complex word itself, its inflected forms, and its antonyms.
    ## Both filters compare spaCy lemmas, so each substitute is lemmatized only once.
    complex_word_lemma = nlp(complex_word)[0].lemma_

    ## WordNet antonyms of every sense of the complex word's lemma
    antonyms_complex_word = [
        antonym.name()
        for syn in wn.synsets(complex_word_lemma)
        for lemma in syn.lemmas()
        for antonym in lemma.antonyms()
    ]

    substitutes_no_dupl_complex_word = []  # result of step c)
    substitutes_no_antonyms = []           # result of step d)
    for substitute in substitutes_no_dupl:
        substitute_lemma = nlp(substitute)[0].lemma_
        if substitute_lemma == complex_word_lemma:
            continue  # same lemma as the complex word: duplicate or inflected form
        substitutes_no_dupl_complex_word.append(substitute)
        if substitute_lemma not in antonyms_complex_word:
            substitutes_no_antonyms.append(substitute)

    # 3. Substitute Selection (SS) by calculating Bert scores
    ## create one sentence per substitute, with the complex word replaced by that substitute
    sentence_with_substitutes = [sentence.replace(complex_word, sub) for sub in substitutes_no_antonyms]

    ## calculate BERTScores, and rank the substitutes based on these scores (highest first)
    if sentence_with_substitutes:  # scoring is only possible when candidates are left
        # silence transformers warnings while scoring so they are not repeated for every
        # row; the finally clause restores the level even if scoring raises
        logging.getLogger('transformers').setLevel(logging.ERROR)
        try:
            scores = bert_score.score(
                [sentence] * len(sentence_with_substitutes),
                sentence_with_substitutes,
                lang="en",
                model_type='bert-base-uncased',
                verbose=False,
            )
        finally:
            logging.getLogger('transformers').setLevel(logging.WARNING)

        # pair each substitute with its score; scores[0] is the first tensor returned by
        # bert_score.score (NOTE(review): precision per its (P, R, F1) order - confirm)
        substitute_score_pairs = list(zip(substitutes_no_antonyms, scores[0].tolist()))

        # sort the pairs by score, in descending order
        sorted_substitute_score_pairs = sorted(substitute_score_pairs, key=lambda pair: pair[1], reverse=True)

        # extract the list of substitutes from the sorted pairs
        bertscore_ranked_substitutes_only = [substitute for substitute, _ in sorted_substitute_score_pairs]

        # limit the substitutes to the 10 first ones for evaluation
        bertscore_top_10_substitutes = bertscore_ranked_substitutes_only[:required_for_dataframe]
    else:
        bertscore_top_10_substitutes = []

    # pad with None so that every dataframe row has exactly 10 substitute columns
    bertscore_top_10_substitutes += [None] * (required_for_dataframe - len(bertscore_top_10_substitutes))

    # add the sentence, complex_word, and substitutes to the dataframe
    substitutes_df.loc[index] = [sentence, complex_word] + bertscore_top_10_substitutes


# export the dataframe to a tsv file for evaluation
substitutes_df.to_csv("./predictions/trial/SG_MA_SS_bsBertbase_robertalarge.tsv", sep="\t", index=False, header=False)
print("SG_MA_SS_bsBertbase_robertalarge exported to csv in path './predictions/trial/SG_MA_SS_bsBertbase_robertalarge.tsv'}\n")

python tsar_eval.py --gold_file ./data/trial/tsar2022_en_trial_gold_no_noise.tsv --predictions_file ./predictions/trial/SG_MA_SS_bsBertbase_robertalarge.tsv --output_file ./output/trial/SG_MA_SS_bsBertbase_robertalarge.tsv

In [10]:
# SG -> MA -> SS pipeline for every (sentence, complex word) row in `data`.
# The Substitute Selection step ranks candidates with BERTScore computed by
# 'bert-large-uncased'. Every row of `data` is always written to
# `substitutes_df`, even when no usable substitute is found, so the exported
# file never silently misses a row or keeps a row left over from an earlier cell.

# settings that do not change between rows (hoisted out of the loop)
top_k = 30                    # number of candidates requested from fill_mask
required_for_dataframe = 10   # number of substitute columns in substitutes_df
bertscore_model_type = 'bert-large-uncased'
predictions_path = "./predictions/trial/SG_MA_SS_bsBertlarge_robertalarge.tsv"

punctuation_set = set(string.punctuation) - set('-')  # retained hyphens in case tokenizers don't split on hyphenated compounds
punctuation_set.update({'“', '”'})  # as these curly quotes appeared in the Electra (SG step) results but were not part of the string set

# in each row, for each complex word:
for index, row in data.iterrows():
    sentence, complex_word = row["sentence"], row["complex_word"]

    # 1. Substitute Generation (SG): perform masking and generate substitutes.
    ## in the sentence, replace the complex word with the mask token
    sentence_masked_word = sentence.replace(complex_word, lm_tokenizer.mask_token)

    ## concatenate the original sentence and the masked sentence
    sentences_concat = f"{sentence} {lm_tokenizer.sep_token} {sentence_masked_word}"

    ## generate and rank candidate substitutes for the masked word
    result = fill_mask(sentences_concat, top_k=top_k)

    ## Keep only well-formed candidates: dict entries carrying a string
    ## 'token_str'. Anything else (the original comments mention such entries
    ## for the ELECTRA models; presumably also the nested result returned when
    ## the sentence ends up with more than one mask -- TODO confirm) is ignored
    ## instead of raising, so the row is still written below rather than skipped.
    candidate_tokens = [
        candidate["token_str"]
        for candidate in result
        if isinstance(candidate, dict) and isinstance(candidate.get("token_str"), str)
    ]

    # 2. Morphological Generation and Context Adaptation (Morphological Adaptation):
    ## a) drop empty candidates, candidates containing unwanted punctuation and
    ##    word pieces starting with '##'; lowercase and strip the rest
    ##    (.strip because roberta uses a leading space before each substitute)
    substitutes = [
        token.lower().strip()
        for token in candidate_tokens
        if token.strip() != ""
        and not token.startswith('##')
        and not any(char in punctuation_set for char in token)
    ]

    ## b) remove duplicates while keeping the first (highest ranked) occurrence
    substitutes_no_dupl = list(dict.fromkeys(substitutes))

    ## c) remove the complex word itself and its inflected forms by comparing
    ##    spaCy lemmas; each substitute is lemmatised once and reused in step d)
    complex_word_lemma = nlp(complex_word)[0].lemma_
    substitute_lemmas = {sub: nlp(sub)[0].lemma_ for sub in substitutes_no_dupl}
    substitutes_no_dupl_complex_word = [
        sub for sub in substitutes_no_dupl
        if substitute_lemmas[sub] != complex_word_lemma
    ]

    ## d) remove antonyms of the complex word
    ## step 1: collect the WordNet antonyms of the complex word's lemma
    antonyms_complex_word = set()
    for syn in wn.synsets(complex_word_lemma):
        for lemma in syn.lemmas():
            for antonym in lemma.antonyms():
                antonyms_complex_word.add(antonym.name())

    ## step 2: drop every substitute whose lemma is one of those antonyms
    substitutes_no_antonyms = [
        sub for sub in substitutes_no_dupl_complex_word
        if substitute_lemmas[sub] not in antonyms_complex_word
    ]

    # 3. Substitute Selection (SS) by calculating BERTScores:
    ## create sentences with the complex word replaced by each substitute
    sentence_with_substitutes = [sentence.replace(complex_word, sub) for sub in substitutes_no_antonyms]

    ## calculate BERTScores, and rank the substitutes based on these scores
    if sentence_with_substitutes:  # only score when there is at least one substitute
        # silence repeated transformers warnings while scoring; the level is
        # restored even if scoring raises
        logging.getLogger('transformers').setLevel(logging.ERROR)
        try:
            # NOTE(review): the original sentence is passed first and the
            # substituted sentences second, and ranking uses scores[0] (the
            # first tensor returned by bert_score.score) -- confirm that this
            # orientation and metric are the intended ones.
            scores = bert_score.score([sentence] * len(sentence_with_substitutes), sentence_with_substitutes,
                                      lang="en", model_type=bertscore_model_type, verbose=False)
        finally:
            logging.getLogger('transformers').setLevel(logging.WARNING)

        # pair every substitute with its score and sort by score, descending
        substitute_score_pairs = list(zip(substitutes_no_antonyms, scores[0].tolist()))
        sorted_substitute_score_pairs = sorted(substitute_score_pairs, key=lambda x: x[1], reverse=True)

        # keep only the substitutes, limited to the number of dataframe columns
        bertscore_ranked_substitutes_only = [substitute for substitute, _ in sorted_substitute_score_pairs]
        bertscore_top_10_substitutes = bertscore_ranked_substitutes_only[:required_for_dataframe]
    else:
        bertscore_top_10_substitutes = []

    ## pad the list with None so the row always has the required number of substitute cells
    bertscore_top_10_substitutes += [None] * (required_for_dataframe - len(bertscore_top_10_substitutes))

    ## add the sentence, complex_word, and substitutes to the dataframe
    substitutes_df.loc[index] = [sentence, complex_word] + bertscore_top_10_substitutes


# export the dataframe to a tsv file for evaluation
substitutes_df.to_csv(predictions_path, sep="\t", index=False, header=False)
# the status message previously ended in a stray "}" after the closing quote
print(f"SG_MA_SS_bsBertlarge_robertalarge exported to csv in path '{predictions_path}'\n")

Sentence: A Spanish government source, however, later said that banks able to cover by themselves losses on their toxic property assets will not be forced to remove them from their books while it will be compulsory for those receiving public help.
Complex word: compulsory
top-10 substitutes based on bertscores in context: [' compulsory', ' mandatory', ' obligatory', ' optional', ' voluntary', ' mandated', ' required', ' universal', ' conditional', ' enforced']

---------------------------------------------------------------------------------------------------------------------------------------------
Sentence: Rajoy's conservative government had instilled markets with a brief dose of confidence by stepping into Bankia, performing a U-turn on its refusal to spend public money to rescue banks.
Complex word: instilled
top-10 substitutes based on bertscores in context: [' implanted', ' bolstered', ' injected', ' infused', ' undermined', ' enriched', ' reinforced', ' endowed', ' seeded', ' 

python tsar_eval.py --gold_file ./data/trial/tsar2022_en_trial_gold_no_noise.tsv --predictions_file ./predictions/trial/SG_MA_SS_bsBertlarge_robertalarge.tsv --output_file ./output/trial/SG_MA_SS_bsBertlarge_robertalarge.tsv

#### BERTScore based on ELECTRA-base:

In [None]:
# SG -> MA -> SS pipeline for every (sentence, complex word) row in `data`.
# The Substitute Selection step ranks candidates with BERTScore computed by
# 'google/electra-base-generator'. Every row of `data` is always written to
# `substitutes_df`, even when no usable substitute is found, so the exported
# file never silently misses a row or keeps a row left over from an earlier cell.

# settings that do not change between rows (hoisted out of the loop)
top_k = 30                    # number of candidates requested from fill_mask
required_for_dataframe = 10   # number of substitute columns in substitutes_df
bertscore_model_type = 'google/electra-base-generator'
predictions_path = "./predictions/trial/SG_MA_SS_bsElectrabase_robertalarge.tsv"

punctuation_set = set(string.punctuation) - set('-')  # retained hyphens in case tokenizers don't split on hyphenated compounds
punctuation_set.update({'“', '”'})  # as these curly quotes appeared in the Electra (SG step) results but were not part of the string set

# in each row, for each complex word:
for index, row in data.iterrows():
    sentence, complex_word = row["sentence"], row["complex_word"]

    # 1. Substitute Generation (SG): perform masking and generate substitutes.
    ## in the sentence, replace the complex word with the mask token
    sentence_masked_word = sentence.replace(complex_word, lm_tokenizer.mask_token)

    ## concatenate the original sentence and the masked sentence
    sentences_concat = f"{sentence} {lm_tokenizer.sep_token} {sentence_masked_word}"

    ## generate and rank candidate substitutes for the masked word
    result = fill_mask(sentences_concat, top_k=top_k)

    ## Keep only well-formed candidates: dict entries carrying a string
    ## 'token_str'. Anything else (the original comments mention such entries
    ## for the ELECTRA models; presumably also the nested result returned when
    ## the sentence ends up with more than one mask -- TODO confirm) is ignored
    ## instead of raising, so the row is still written below rather than skipped.
    candidate_tokens = [
        candidate["token_str"]
        for candidate in result
        if isinstance(candidate, dict) and isinstance(candidate.get("token_str"), str)
    ]

    # 2. Morphological Generation and Context Adaptation (Morphological Adaptation):
    ## a) drop empty candidates, candidates containing unwanted punctuation and
    ##    word pieces starting with '##'; lowercase and strip the rest
    ##    (.strip because roberta uses a leading space before each substitute)
    substitutes = [
        token.lower().strip()
        for token in candidate_tokens
        if token.strip() != ""
        and not token.startswith('##')
        and not any(char in punctuation_set for char in token)
    ]

    ## b) remove duplicates while keeping the first (highest ranked) occurrence
    substitutes_no_dupl = list(dict.fromkeys(substitutes))

    ## c) remove the complex word itself and its inflected forms by comparing
    ##    spaCy lemmas; each substitute is lemmatised once and reused in step d)
    complex_word_lemma = nlp(complex_word)[0].lemma_
    substitute_lemmas = {sub: nlp(sub)[0].lemma_ for sub in substitutes_no_dupl}
    substitutes_no_dupl_complex_word = [
        sub for sub in substitutes_no_dupl
        if substitute_lemmas[sub] != complex_word_lemma
    ]

    ## d) remove antonyms of the complex word
    ## step 1: collect the WordNet antonyms of the complex word's lemma
    antonyms_complex_word = set()
    for syn in wn.synsets(complex_word_lemma):
        for lemma in syn.lemmas():
            for antonym in lemma.antonyms():
                antonyms_complex_word.add(antonym.name())

    ## step 2: drop every substitute whose lemma is one of those antonyms
    substitutes_no_antonyms = [
        sub for sub in substitutes_no_dupl_complex_word
        if substitute_lemmas[sub] not in antonyms_complex_word
    ]

    # 3. Substitute Selection (SS) by calculating BERTScores:
    ## create sentences with the complex word replaced by each substitute
    sentence_with_substitutes = [sentence.replace(complex_word, sub) for sub in substitutes_no_antonyms]

    ## calculate BERTScores, and rank the substitutes based on these scores
    if sentence_with_substitutes:  # only score when there is at least one substitute
        # silence repeated transformers warnings while scoring; the level is
        # restored even if scoring raises
        logging.getLogger('transformers').setLevel(logging.ERROR)
        try:
            # NOTE(review): the original sentence is passed first and the
            # substituted sentences second, and ranking uses scores[0] (the
            # first tensor returned by bert_score.score) -- confirm that this
            # orientation and metric are the intended ones.
            scores = bert_score.score([sentence] * len(sentence_with_substitutes), sentence_with_substitutes,
                                      lang="en", model_type=bertscore_model_type, verbose=False)
        finally:
            logging.getLogger('transformers').setLevel(logging.WARNING)

        # pair every substitute with its score and sort by score, descending
        substitute_score_pairs = list(zip(substitutes_no_antonyms, scores[0].tolist()))
        sorted_substitute_score_pairs = sorted(substitute_score_pairs, key=lambda x: x[1], reverse=True)

        # keep only the substitutes, limited to the number of dataframe columns
        bertscore_ranked_substitutes_only = [substitute for substitute, _ in sorted_substitute_score_pairs]
        bertscore_top_10_substitutes = bertscore_ranked_substitutes_only[:required_for_dataframe]
    else:
        bertscore_top_10_substitutes = []

    ## pad the list with None so the row always has the required number of substitute cells
    bertscore_top_10_substitutes += [None] * (required_for_dataframe - len(bertscore_top_10_substitutes))

    ## add the sentence, complex_word, and substitutes to the dataframe
    substitutes_df.loc[index] = [sentence, complex_word] + bertscore_top_10_substitutes


# export the dataframe to a tsv file for evaluation
substitutes_df.to_csv(predictions_path, sep="\t", index=False, header=False)
# the status message previously ended in a stray "}" after the closing quote
print(f"SG_MA_SS_bsElectrabase_robertalarge exported to csv in path '{predictions_path}'\n")

python tsar_eval.py --gold_file ./data/trial/tsar2022_en_trial_gold_no_noise.tsv --predictions_file ./predictions/trial/SG_MA_SS_bsElectrabase_robertalarge.tsv --output_file ./output/trial/SG_MA_SS_bsElectrabase_robertalarge.tsv

#### BERTScore based on ELECTRA-large:

In [None]:
# SG -> MA -> SS pipeline for every (sentence, complex word) row in `data`.
# The Substitute Selection step ranks candidates with BERTScore computed by
# 'google/electra-large-generator'. Every row of `data` is always written to
# `substitutes_df`, even when no usable substitute is found, so the exported
# file never silently misses a row or keeps a row left over from an earlier cell.

# settings that do not change between rows (hoisted out of the loop)
top_k = 30                    # number of candidates requested from fill_mask
required_for_dataframe = 10   # number of substitute columns in substitutes_df
bertscore_model_type = 'google/electra-large-generator'
predictions_path = "./predictions/trial/SG_MA_SS_bsElectralarge_robertalarge.tsv"

punctuation_set = set(string.punctuation) - set('-')  # retained hyphens in case tokenizers don't split on hyphenated compounds
punctuation_set.update({'“', '”'})  # as these curly quotes appeared in the Electra (SG step) results but were not part of the string set

# in each row, for each complex word:
for index, row in data.iterrows():
    sentence, complex_word = row["sentence"], row["complex_word"]

    # 1. Substitute Generation (SG): perform masking and generate substitutes.
    ## in the sentence, replace the complex word with the mask token
    sentence_masked_word = sentence.replace(complex_word, lm_tokenizer.mask_token)

    ## concatenate the original sentence and the masked sentence
    sentences_concat = f"{sentence} {lm_tokenizer.sep_token} {sentence_masked_word}"

    ## generate and rank candidate substitutes for the masked word
    result = fill_mask(sentences_concat, top_k=top_k)

    ## Keep only well-formed candidates: dict entries carrying a string
    ## 'token_str'. Anything else (the original comments mention such entries
    ## for the ELECTRA models; presumably also the nested result returned when
    ## the sentence ends up with more than one mask -- TODO confirm) is ignored
    ## instead of raising, so the row is still written below rather than skipped.
    candidate_tokens = [
        candidate["token_str"]
        for candidate in result
        if isinstance(candidate, dict) and isinstance(candidate.get("token_str"), str)
    ]

    # 2. Morphological Generation and Context Adaptation (Morphological Adaptation):
    ## a) drop empty candidates, candidates containing unwanted punctuation and
    ##    word pieces starting with '##'; lowercase and strip the rest
    ##    (.strip because roberta uses a leading space before each substitute)
    substitutes = [
        token.lower().strip()
        for token in candidate_tokens
        if token.strip() != ""
        and not token.startswith('##')
        and not any(char in punctuation_set for char in token)
    ]

    ## b) remove duplicates while keeping the first (highest ranked) occurrence
    substitutes_no_dupl = list(dict.fromkeys(substitutes))

    ## c) remove the complex word itself and its inflected forms by comparing
    ##    spaCy lemmas; each substitute is lemmatised once and reused in step d)
    complex_word_lemma = nlp(complex_word)[0].lemma_
    substitute_lemmas = {sub: nlp(sub)[0].lemma_ for sub in substitutes_no_dupl}
    substitutes_no_dupl_complex_word = [
        sub for sub in substitutes_no_dupl
        if substitute_lemmas[sub] != complex_word_lemma
    ]

    ## d) remove antonyms of the complex word
    ## step 1: collect the WordNet antonyms of the complex word's lemma
    antonyms_complex_word = set()
    for syn in wn.synsets(complex_word_lemma):
        for lemma in syn.lemmas():
            for antonym in lemma.antonyms():
                antonyms_complex_word.add(antonym.name())

    ## step 2: drop every substitute whose lemma is one of those antonyms
    substitutes_no_antonyms = [
        sub for sub in substitutes_no_dupl_complex_word
        if substitute_lemmas[sub] not in antonyms_complex_word
    ]

    # 3. Substitute Selection (SS) by calculating BERTScores:
    ## create sentences with the complex word replaced by each substitute
    sentence_with_substitutes = [sentence.replace(complex_word, sub) for sub in substitutes_no_antonyms]

    ## calculate BERTScores, and rank the substitutes based on these scores
    if sentence_with_substitutes:  # only score when there is at least one substitute
        # silence repeated transformers warnings while scoring; the level is
        # restored even if scoring raises
        logging.getLogger('transformers').setLevel(logging.ERROR)
        try:
            # NOTE(review): the original sentence is passed first and the
            # substituted sentences second, and ranking uses scores[0] (the
            # first tensor returned by bert_score.score) -- confirm that this
            # orientation and metric are the intended ones.
            scores = bert_score.score([sentence] * len(sentence_with_substitutes), sentence_with_substitutes,
                                      lang="en", model_type=bertscore_model_type, verbose=False)
        finally:
            logging.getLogger('transformers').setLevel(logging.WARNING)

        # pair every substitute with its score and sort by score, descending
        substitute_score_pairs = list(zip(substitutes_no_antonyms, scores[0].tolist()))
        sorted_substitute_score_pairs = sorted(substitute_score_pairs, key=lambda x: x[1], reverse=True)

        # keep only the substitutes, limited to the number of dataframe columns
        bertscore_ranked_substitutes_only = [substitute for substitute, _ in sorted_substitute_score_pairs]
        bertscore_top_10_substitutes = bertscore_ranked_substitutes_only[:required_for_dataframe]
    else:
        bertscore_top_10_substitutes = []

    ## pad the list with None so the row always has the required number of substitute cells
    bertscore_top_10_substitutes += [None] * (required_for_dataframe - len(bertscore_top_10_substitutes))

    ## add the sentence, complex_word, and substitutes to the dataframe
    substitutes_df.loc[index] = [sentence, complex_word] + bertscore_top_10_substitutes


# export the dataframe to a tsv file for evaluation
substitutes_df.to_csv(predictions_path, sep="\t", index=False, header=False)
# the status message previously ended in a stray "}" after the closing quote
print(f"SG_MA_SS_bsElectralarge_robertalarge exported to csv in path '{predictions_path}'\n")

python tsar_eval.py --gold_file ./data/trial/tsar2022_en_trial_gold_no_noise.tsv --predictions_file ./predictions/trial/SG_MA_SS_bsElectralarge_robertalarge.tsv --output_file ./output/trial/SG_MA_SS_bsElectralarge_robertalarge.tsv

#### BERTScore based on RoBERTa-base:

In [None]:
# in each row, for each complex word: 
for index, row in data.iterrows():
    
    # print the sentence and the complex word
    sentence, complex_word = row["sentence"], row["complex_word"]
    #print(f"Sentence: {sentence}")
    #print(f"Complex word: {complex_word}")
    
       
    # 1. Substitute Generation (SG): perform masking and generate substitutes:

    ## in the sentence, replace the complex word with a masked word
    sentence_masked_word = sentence.replace(complex_word, lm_tokenizer.mask_token)

    ## concatenate the original sentence and the masked sentence
    sentences_concat = f"{sentence} {lm_tokenizer.sep_token} {sentence_masked_word}"

    ## generate and rank candidate substitutes for the masked word using the fill_mask pipeline (removing elements without token_str key; as this gave errors in the ELECTRA models) .
    top_k = 30
    result = fill_mask(sentences_concat, top_k=top_k)
    substitutes = [substitute["token_str"] for substitute in result if "token_str" in substitute]
    #print(f"Substitute Generation step: initial substitute list: {substitutes}\n")


    #2: Morphological Generation and Context Adaptation (Morphological Adaptation):  
    ## a) remove noise in the substitutes, by ignoring generated substitutes that are empty or that have unwanted punctuation characters or that start with '##' (this returned errors with the ELECTRA model), and lowercase the substitutes (as some models don't lowercase by default)
    ## and lowercase all substitutes. Use try/except statement to prevent other character-related problems to happen

    punctuation_set = set(string.punctuation) - set('-') # retained hyphens in case tokenizers don't split on hyphenated compounds
    punctuation_set.update({'“','”'})   # as these curly quotes appeared in the Electra (SG step) results but were not part of the string set

    try:
        substitutes = [substitute["token_str"].lower().strip() for substitute in result if not any(char in punctuation_set for char in substitute["token_str"]) # added .strip as roberta uses a leading space before each substitute
                      and not substitute["token_str"].startswith('##') and substitute["token_str"].strip() != ""]
        # print(f"Morphological Adaptation step a): substitute list without unwanted punctuation characters: {substitutes}\n")
    except TypeError as error:
        continue



    ## b) remove duplicates within the substitute list from the substitute list (duplicates are likely for models that did not lowercase by default)
    ## the last mentioned duplicate is removed on purpose, as this may probably be the (previously) uppercased variant of the lowercased substitute (lowercased subs are most likely higher ranked by the model)
    substitutes_no_dupl = []
    for sub in substitutes:
        if sub not in substitutes_no_dupl:
            substitutes_no_dupl.append(sub)
    #print(f"Morphological Adaptation step b): substitute list without duplicates of substitutes: {substitutes_no_dupl}\n")



    ## c) remove duplicates and inflected forms of the complex word from the substitute list

    ## first Lemmatize the complex word with spaCy, in order to compare it with the lemmatized substitute later to see if their mutual lemmas are the same
    doc_complex_word = nlp(complex_word)
    complex_word_lemma = doc_complex_word[0].lemma_
    #print(f"complex_word_lemma for complex word '{complex_word}': {complex_word_lemma}\n")


    ## then, remove duplicates and inflected forms of the complex word from the list with substitutes
    substitutes_no_dupl_complex_word = []
    for substitute in substitutes_no_dupl:
        doc_substitute = nlp(substitute)
        substitute_lemma = doc_substitute[0].lemma_
        if substitute_lemma != complex_word_lemma:
            substitutes_no_dupl_complex_word.append(substitute)
    #print(f"Morphological Adaptation step c): substitute list without duplicates of the complex word nor inflected forms of the complex word: {substitutes_no_dupl_complex_word}\n")


     ## d) remove antonyms of the complex word from the substitute list
    ## step 1: get the antonyms of the complex word
    antonyms_complex_word = []
    for syn in wn.synsets(complex_word_lemma):
        for lemma in syn.lemmas():
            for antonym in lemma.antonyms():
                    antonyms_complex_word.append(antonym.name())

    #print(f"Antonyms for complex word '{complex_word}': {antonyms_complex_word}\n")

    ## step 2: remove antonyms of the complex word from the list with substitutes
    substitutes_no_antonyms = []
    for substitute in substitutes_no_dupl_complex_word:
        doc_substitute = nlp(substitute)
        substitute_lemma = doc_substitute[0].lemma_
        if substitute_lemma not in antonyms_complex_word:
            substitutes_no_antonyms.append(substitute)
        # else:
        #     print(f"Removed antonym: {substitute}")
    #print(f"Morphological Adaptation step d): substitute list without antonyms of the complex word: {substitutes_no_antonyms}\n") 
  
    
    
    
    
    
    #3: Substitute Selection (SS) by calculating Bert scores: 

    ## create sentence with the complex word replaced by the substitutes
    sentence_with_substitutes = [sentence.replace(complex_word, sub) for sub in substitutes_no_antonyms]
    #print(f"List with sentences where complex word is substituted: {sentence_with_substitutes}\n")


    ## calculate BERTScores, and rank the substitutes based on these scores
    if len(sentence_with_substitutes) > 0: # to make sure the list with substitutes is always filled
        logging.getLogger('transformers').setLevel(logging.ERROR)  # to prevent the same warnings from being printed x times 
        scores = bert_score.score([sentence]*len(sentence_with_substitutes), sentence_with_substitutes, lang="en", model_type='roberta-base', verbose=False)
        logging.getLogger('transformers').setLevel(logging.WARNING) # to reset the logging level back to printing warnings
        
        # create a list of tuples, each tuple containing a substitute and its score
        substitute_score_pairs = list(zip(substitutes_no_antonyms, scores[0].tolist()))

        # sort the list of tuples by the scores (the second element of each tuple), in descending order
        sorted_substitute_score_pairs = sorted(substitute_score_pairs, key=lambda x: x[1], reverse=True)

        # # print each substitute with its score
        # for substitute, score in sorted_substitute_score_pairs:
        #     print(f"Substitute: {substitute}, BertScore: {score}")

        # extract the list of substitutes from the sorted pairs
        bertscore_ranked_substitutes_only = [substitute for substitute, _ in sorted_substitute_score_pairs]
        #print(f"substitutes based on bertscores in context: {bertscore_ranked_substitutes_only}\n")

        # limit the substitutes to the 10 first ones for evaluation
        bertscore_top_10_substitutes = bertscore_ranked_substitutes_only[:10]
        #print(f"top-10 substitutes based on bertscores in context: {bertscore_top_10_substitutes}\n")

    else:
        bertscore_top_10_substitutes = []


    ## add the results to the dataframe
    # fill the dataframe with 10 elements even if there are less than 10 in the previous list
    required_for_dataframe = 10

    # pad the list with None until it has 10 elements
    bertscore_top_10_substitutes += [None] * (required_for_dataframe - len(bertscore_top_10_substitutes))
  


    # add the sentence, complex_word, and substitutes to the dataframe 
    substitutes_df.loc[index] = [sentence, complex_word] + bertscore_top_10_substitutes

    #print('---------------------------------------------------------------------------------------------------------------------------------------------')
   
   
    
# export the dataframe as a headerless, tab-separated file for evaluation
# (the path is kept in one variable so the written file and the message cannot drift apart)
predictions_path = "./predictions/trial/SG_MA_SS_bsRobertabase_robertalarge.tsv"
substitutes_df.to_csv(predictions_path, sep="\t", index=False, header=False)
# the original message ended in a stray '}' left over from an f-string; it is removed here
print(f"SG_MA_SS_bsRobertabase_robertalarge exported to csv in path '{predictions_path}'\n")

python tsar_eval.py --gold_file ./data/trial/tsar2022_en_trial_gold_no_noise.tsv --predictions_file ./predictions/trial/SG_MA_SS_bsRobertabase_robertalarge.tsv --output_file ./output/trial/SG_MA_SS_bsRobertabase_robertalarge.tsv

#### BERTScore based on RoBERTa-large:

In [11]:
# For every (sentence, complex word) row in `data`: generate substitutes with the fill-mask
# pipeline (SG), clean them up (MA), rank them with BERTScore based on roberta-large (SS),
# and store the ten best ones in `substitutes_df`.

# --- loop-invariant set-up (hoisted so it is not rebuilt for every row) ---

# punctuation that disqualifies a generated substitute
punctuation_set = set(string.punctuation) - set('-') # retained hyphens in case tokenizers don't split on hyphenated compounds
punctuation_set.update({'“','”'})   # as these curly quotes appeared in the Electra (SG step) results but were not part of the string set

top_k = 30                   # number of candidates requested from the fill-mask pipeline
required_for_dataframe = 10  # number of substitute columns in substitutes_df

# Load the BERTScore model once instead of once per row (bert_score.score() builds a new
# model on every call). Warnings of the transformers logger are silenced while loading and
# scoring; the previous level is restored afterwards, also when an exception is raised.
transformers_logger = logging.getLogger('transformers')
previous_log_level = transformers_logger.level
transformers_logger.setLevel(logging.ERROR)
try:
    bertscore_scorer = bert_score.BERTScorer(model_type='roberta-large', lang="en")
finally:
    transformers_logger.setLevel(previous_log_level)


# in each row, for each complex word:
for index, row in data.iterrows():

    sentence, complex_word = row["sentence"], row["complex_word"]
    #print(f"Sentence: {sentence}")
    #print(f"Complex word: {complex_word}")


    # 1. Substitute Generation (SG): perform masking and generate substitutes:

    ## in the sentence, replace the complex word with the mask token of the model
    sentence_masked_word = sentence.replace(complex_word, lm_tokenizer.mask_token)

    ## concatenate the original sentence and the masked sentence
    sentences_concat = f"{sentence} {lm_tokenizer.sep_token} {sentence_masked_word}"

    ## generate and rank candidate substitutes for the masked word using the fill_mask pipeline
    result = fill_mask(sentences_concat, top_k=top_k)


    # 2. Morphological Generation and Context Adaptation (Morphological Adaptation):
    ## a) remove noise: ignore candidates that are empty, that contain unwanted punctuation or that
    ##    start with '##' (this returned errors with the ELECTRA model); lowercase and strip the rest
    ##    (roberta puts a leading space before each substitute).
    ##    A TypeError here means `result` does not have the expected list-of-dicts shape
    ##    (presumably when the sentence ends up with more than one mask token - TODO confirm).
    ##    The row is then kept with an empty substitute list instead of being skipped:
    ##    substitutes_df is shared with earlier cells, so skipping would leave a stale or
    ##    missing row for this instance in the exported predictions.
    try:
        substitutes = [
            candidate["token_str"].lower().strip()
            for candidate in result
            if not any(char in punctuation_set for char in candidate["token_str"])
            and not candidate["token_str"].startswith('##')
            and candidate["token_str"].strip() != ""
        ]
        # print(f"Morphological Adaptation step a): substitute list without unwanted punctuation characters: {substitutes}\n")
    except TypeError as error:
        logging.warning("Row %s (complex word %r): no usable substitutes, storing an empty row (%s)",
                        index, complex_word, error)
        substitutes = []


    ## b) remove duplicates within the substitute list (likely for models that do not lowercase by default).
    ##    The first occurrence is kept, as that is the one ranked highest by the model.
    substitutes_no_dupl = list(dict.fromkeys(substitutes))
    #print(f"Morphological Adaptation step b): substitute list without duplicates of substitutes: {substitutes_no_dupl}\n")


    ## c) remove duplicates and inflected forms of the complex word from the substitute list

    ## lemmatize the complex word with spaCy, to compare it with the lemmas of the substitutes
    complex_word_lemma = nlp(complex_word)[0].lemma_
    #print(f"complex_word_lemma for complex word '{complex_word}': {complex_word_lemma}\n")

    ## lemmatize every substitute once; the lemmas are needed in steps c) and d)
    substitute_lemmas = {sub: nlp(sub)[0].lemma_ for sub in substitutes_no_dupl}

    ## keep only the substitutes whose lemma differs from the lemma of the complex word
    substitutes_no_dupl_complex_word = [
        sub for sub in substitutes_no_dupl if substitute_lemmas[sub] != complex_word_lemma
    ]
    #print(f"Morphological Adaptation step c): substitute list without duplicates of the complex word nor inflected forms of the complex word: {substitutes_no_dupl_complex_word}\n")


    ## d) remove antonyms of the complex word from the substitute list
    ## step 1: collect the WordNet antonyms of all lemmas in all synsets of the complex word
    antonyms_complex_word = {
        antonym.name()
        for syn in wn.synsets(complex_word_lemma)
        for lemma in syn.lemmas()
        for antonym in lemma.antonyms()
    }
    #print(f"Antonyms for complex word '{complex_word}': {antonyms_complex_word}\n")

    ## step 2: keep only the substitutes whose lemma is not one of these antonyms
    substitutes_no_antonyms = [
        sub for sub in substitutes_no_dupl_complex_word
        if substitute_lemmas[sub] not in antonyms_complex_word
    ]
    #print(f"Morphological Adaptation step d): substitute list without antonyms of the complex word: {substitutes_no_antonyms}\n")


    # 3. Substitute Selection (SS) by calculating BERTScores:

    ## create sentences with the complex word replaced by each of the substitutes
    sentence_with_substitutes = [sentence.replace(complex_word, sub) for sub in substitutes_no_antonyms]
    #print(f"List with sentences where complex word is substituted: {sentence_with_substitutes}\n")

    ## calculate BERTScores, and rank the substitutes based on these scores
    if sentence_with_substitutes: # scoring needs at least one candidate sentence
        transformers_logger.setLevel(logging.ERROR)  # to prevent the same warnings from being printed x times
        try:
            scores = bertscore_scorer.score([sentence] * len(sentence_with_substitutes), sentence_with_substitutes, verbose=False)
        finally:
            transformers_logger.setLevel(previous_log_level)  # reset the logging level to what it was before

        # pair each substitute with the first score tensor (precision, per the bert_score documentation)
        substitute_score_pairs = list(zip(substitutes_no_antonyms, scores[0].tolist()))

        # sort the pairs by score, in descending order
        sorted_substitute_score_pairs = sorted(substitute_score_pairs, key=lambda pair: pair[1], reverse=True)

        # # print each substitute with its score
        # for substitute, score in sorted_substitute_score_pairs:
        #     print(f"Substitute: {substitute}, BertScore: {score}")

        # extract the list of substitutes from the sorted pairs
        bertscore_ranked_substitutes_only = [substitute for substitute, _ in sorted_substitute_score_pairs]
        #print(f"substitutes based on bertscores in context: {bertscore_ranked_substitutes_only}\n")

        # limit the substitutes to the 10 first ones for evaluation
        bertscore_top_10_substitutes = bertscore_ranked_substitutes_only[:required_for_dataframe]
        #print(f"top-10 substitutes based on bertscores in context: {bertscore_top_10_substitutes}\n")

    else:
        bertscore_top_10_substitutes = []


    ## add the results to the dataframe
    # pad the list with None until it has 10 elements, as the dataframe has 10 substitute columns
    bertscore_top_10_substitutes += [None] * (required_for_dataframe - len(bertscore_top_10_substitutes))

    # add the sentence, complex_word, and substitutes to the dataframe
    substitutes_df.loc[index] = [sentence, complex_word] + bertscore_top_10_substitutes

    #print('---------------------------------------------------------------------------------------------------------------------------------------------')
   
   
   
     
# export the dataframe as a headerless, tab-separated file for evaluation
# (the path is kept in one variable so the written file and the message cannot drift apart)
predictions_path = "./predictions/trial/SG_MA_SS_bsRobertalarge_robertalarge.tsv"
substitutes_df.to_csv(predictions_path, sep="\t", index=False, header=False)
# the original message ended in a stray '}' left over from an f-string; it is removed here
print(f"SG_MA_SS_bsRobertalarge_robertalarge exported to csv in path '{predictions_path}'\n")

Sentence: A Spanish government source, however, later said that banks able to cover by themselves losses on their toxic property assets will not be forced to remove them from their books while it will be compulsory for those receiving public help.
Complex word: compulsory
top-10 substitutes based on bertscores in context: [' compulsory', ' mandatory', ' optional', ' routine', ' voluntary', ' required', ' obligatory', ' requirement', ' mandated', ' necessary']

---------------------------------------------------------------------------------------------------------------------------------------------
Sentence: Rajoy's conservative government had instilled markets with a brief dose of confidence by stepping into Bankia, performing a U-turn on its refusal to spend public money to rescue banks.
Complex word: instilled
top-10 substitutes based on bertscores in context: ['filled', ' filled', ' bolstered', ' provided', ' struck', ' stirred', ' supplied', ' injected', ' infused', 'illed']

---

python tsar_eval.py --gold_file ./data/trial/tsar2022_en_trial_gold_no_noise.tsv --predictions_file ./predictions/trial/SG_MA_SS_bsRobertalarge_robertalarge.tsv --output_file ./output/trial/SG_MA_SS_bsRobertalarge_robertalarge.tsv

### OLD CODE: remove before submission

### Single-instance runs via `data.loc[1]` (one substitute list per run; examples below)

In [14]:
# #for SG_MA step: 

# row = data.loc[1]
       
# # 1. Substitute Generation (SG): perform masking and generate substitutes:

# ## print the sentence and the complex word
# sentence, complex_word = row["sentence"], row["complex_word"]
# print(f"Sentence: {sentence}")
# print(f"Complex word: {complex_word}")

# ## in the sentence, replace the complex word with a masked word
# sentence_masked_word = sentence.replace(complex_word, "<mask>") # this is different per model (this code line applies to Roberta)


# ## concatenate the original sentence and the masked sentence
# sentences_concat = f"{sentence} {lm_tokenizer.sep_token} {sentence_masked_word}"

# ## generate and rank candidate substitutes for the masked word using the fill_mask pipeline (removing elements without token_str key; as this gave errors in the ELECTRA models) .
# top_k = 30
# result = fill_mask(sentences_concat, top_k=top_k)

# substitutes = [substitute["token_str"] for substitute in result if "token_str" in substitute]
# #print(f"Substitute Generation step: initial substitute list: {substitutes}\n")


# #2: Morphological Generation and Context Adaptation (Morphological Adaptation):  
# ## a) remove noise in the substitutes, by ignoring generated substitutes that are empty or that have unwanted punctuation characters or that start with '##' (this returned errors with the ELECTRA model), and lowercase the substitutes (as some models don't lowercase by default)
# ## and lowercase all substitutes (not needed for BERT-base-uncased but by default applied). Use try/except statement to prevent other character-related problems to happen

# punctuation_set = set(string.punctuation) - set('-') # retained hyphens in case tokenizers don't split on hyphenated compounds
# punctuation_set.update({'“','”'})   # as these curly quotes appeared in the Electra (SG step) results but were not part of the string set

# try:
#     substitutes = [substitute["token_str"].lower() for substitute in result if not any(char in punctuation_set for char in substitute["token_str"]) 
#                    and not substitute["token_str"].startswith('##') and substitute["token_str"].strip() != ""]
#     # print(f"Morphological Adaptation step a): substitute list without unwanted punctuation characters: {substitutes}\n")
# except TypeError as error:
#     print(f"Error occurred: {error}")



# ## b) remove duplicates within the substitute list from the substitute list (duplicates are likely for models that did not lowercase by default)
# ## the last mentioned duplicate is removed on purpose, as this may probably be the (previously) uppercased variant of the lowercased substitute (lowercased subs are most likely higher ranked by the model)
# substitutes_no_dupl = []
# for sub in substitutes:
#     if sub not in substitutes_no_dupl:
#         substitutes_no_dupl.append(sub)
# #print(f"Morphological Adaptation step b): substitute list without duplicates of substitutes: {substitutes_no_dupl}\n")



# ## c) remove duplicates and inflected forms of the complex word from the substitute list

# ## first Lemmatize the complex word with spaCy, in order to compare it with the lemmatized substitute later to see if their mutual lemmas are the same
# doc_complex_word = nlp(complex_word)
# complex_word_lemma = doc_complex_word[0].lemma_
# #print(f"complex_word_lemma for complex word '{complex_word}': {complex_word_lemma}\n")


# ## then, remove duplicates and inflected forms of the complex word from the list with substitutes
# substitutes_no_dupl_complex_word = []
# for substitute in substitutes_no_dupl:
#     doc_substitute = nlp(substitute)
#     substitute_lemma = doc_substitute[0].lemma_
#     if substitute_lemma != complex_word_lemma:
#         substitutes_no_dupl_complex_word.append(substitute)
# #print(f"Morphological Adaptation step c): substitute list without duplicates of the complex word nor inflected forms of the complex word: {substitutes_no_dupl_complex_word}\n")


# ## d) remove antonyms of the complex word from the substitute list
# substitutes_no_dupl_complex_word_no_antonym = []
# for substitute in substitutes_no_dupl_complex_word:
#     syn = wn.synsets(complex_word_lemma)
#     if syn:
#         syn = syn[0]
#         for lemma in syn.lemmas():
#             if lemma.antonyms() and lemma.name() == substitute_lemma:
#                 print(f"Antonym removed (lemma): {lemma.antonyms()[0].name()}")
#                 break
#         else:
#             substitutes_no_dupl_complex_word_no_antonym.append(substitute)
#     else:
#         substitutes_no_dupl_complex_word_no_antonym.append(substitute)
# #print(f"Morphological Adaptation step d): substitute list without antonyms of the complex word: {substitutes_no_dupl_complex_word_no_antonym}\n")


# # limit the substitutes to the 10 first ones for evaluation
# top_10_substitutes = substitutes_no_dupl_complex_word_no_antonym[:10]
# print(f"top-10 substitutes SG and MA: {top_10_substitutes}\n")

# # add the sentence, complex_word, and substitutes to the dataframe 
# substitutes_df.loc[1] = [sentence, complex_word] + top_10_substitutes

# print('---------------------------------------------------------------------------------------------------------------------------------------------')

# # export the dataframe to a tsv file for evaluation
# substitutes_df.to_csv("./predictions/trial/RobertaBase_SG_MA_dataloc1.tsv", sep="\t", index=False, header=False)

In [15]:
# # including all print statements to check where errors occur

# #for SG_MA_SS (Bertscore) step:

# row = data.loc[1]

# # 1. Substitute Generation (SG): perform masking and generate substitutes:

# ## print the sentence and the complex word
# sentence, complex_word = row["sentence"], row["complex_word"]
# print(f"Sentence: {sentence}")
# print(f"Complex word: {complex_word}")

# ## in the sentence, replace the complex word with a masked word
# sentence_masked_word = sentence.replace(complex_word, "<mask>") # this is different per model (this code line applies to Roberta)

# ## concatenate the original sentence and the masked sentence
# sentences_concat = f"{sentence} {lm_tokenizer.sep_token} {sentence_masked_word}"

# ## generate and rank candidate substitutes for the masked word using the fill_mask pipeline (removing elements without token_str key; as this gave errors in the ELECTRA models) .
# top_k = 30
# result = fill_mask(sentences_concat, top_k=top_k)
# substitutes = [substitute["token_str"] for substitute in result if "token_str" in substitute]
# print(f"Substitute Generation step: initial substitute list: {substitutes}\n")


# #2: Morphological Generation and Context Adaptation (Morphological Adaptation):  
# ## a) remove noise in the substitutes, by ignoring generated substitutes that are empty or that have unwanted punctuation characters or that start with '##' (this returned errors with the ELECTRA model), and lowercase the substitutes (as some models don't lowercase by default)
# ## and lowercase all substitutes (not needed for BERT-base-uncased but by default applied). Use try/except statement to prevent other character-related problems from happening

# punctuation_set = set(string.punctuation) - set('-') # retained hyphens in case tokenizers don't split on hyphenated compounds
# punctuation_set.update({'“','”'})   # as these curly quotes appeared in the Electra (SG step) results but were not part of the string set

# try:
#     substitutes = [substitute["token_str"].lower() for substitute in result if not any(char in punctuation_set for char in substitute["token_str"]) 
#                    and not substitute["token_str"].startswith('##') and substitute["token_str"].strip() != ""]
#     print(f"Morphological Adaptation step a): substitute list without unwanted punctuation characters: {substitutes}\n")
# except TypeError as error:
#     print(f"Error occurred: {error}")



# ## b) remove duplicates within the substitute list from the substitute list (duplicates are likely for models that did not lowercase by default)
# ## the last mentioned duplicate is removed on purpose, as this may probably be the (previously) uppercased variant of the lowercased substitute (lowercased subs are most likely higher ranked by the model)
# substitutes_no_dupl = []
# for sub in substitutes:
#     if sub not in substitutes_no_dupl:
#         substitutes_no_dupl.append(sub)
# print(f"Morphological Adaptation step b): substitute list without duplicates of substitutes: {substitutes_no_dupl}\n")



# ## c) remove duplicates and inflected forms of the complex word from the substitute list

# ## first Lemmatize the complex word with spaCy, in order to compare it with the lemmatized substitute later to see if their mutual lemmas are the same
# doc_complex_word = nlp(complex_word)
# complex_word_lemma = doc_complex_word[0].lemma_
# print(f"complex_word_lemma for complex word '{complex_word}': {complex_word_lemma}\n")


# ## then, remove duplicates and inflected forms of the complex word from the list with substitutes
# substitutes_no_dupl_complex_word = []
# for substitute in substitutes_no_dupl:
#     doc_substitute = nlp(substitute)
#     substitute_lemma = doc_substitute[0].lemma_
#     if substitute_lemma != complex_word_lemma:
#         substitutes_no_dupl_complex_word.append(substitute)
# print(f"Morphological Adaptation step c): substitute list without duplicates of the complex word nor inflected forms of the complex word: {substitutes_no_dupl_complex_word}\n")


# ## d) remove antonyms of the complex word from the substitute list
# substitutes_no_dupl_complex_word_no_antonym = []
# for substitute in substitutes_no_dupl_complex_word:
#     syn = wn.synsets(complex_word_lemma)
#     if syn:
#         syn = syn[0]
#         for lemma in syn.lemmas():
#             if lemma.antonyms() and lemma.name() == substitute_lemma:
#                 print(f"Antonym removed (lemma): {lemma.antonyms()[0].name()}")
#                 break
#         else:
#             substitutes_no_dupl_complex_word_no_antonym.append(substitute)
#     else:
#         substitutes_no_dupl_complex_word_no_antonym.append(substitute)
# print(f"Morphological Adaptation step d): substitute list without antonyms of the complex word: {substitutes_no_dupl_complex_word_no_antonym}\n")


# # create sentence with the complex word replaced by the substitutes
# sentence_with_substitutes = [sentence.replace(complex_word, sub) for sub in substitutes_no_dupl_complex_word_no_antonym]
# print(f"List with sentences where complex word is substituted: {sentence_with_substitutes}\n")


# # d) use BERTScore for sorting
# logging.getLogger('transformers').setLevel(logging.ERROR)  # to prevent the same warnings from being printed x times 

# scores = bert_score.score([sentence]*len(sentence_with_substitutes), sentence_with_substitutes, lang="en", model_type='roberta-base', verbose=False)  #use bert for BERTscore

# logging.getLogger('transformers').setLevel(logging.WARNING) # to reset the logging level back to printing warnings


# # create a list of tuples, each tuple containing a substitute and its score
# substitute_score_pairs = list(zip(substitutes_no_dupl_complex_word_no_antonym, scores[0].tolist()))

# # sort the list of tuples by the scores (the second element of each tuple), in descending order
# sorted_substitute_score_pairs = sorted(substitute_score_pairs, key=lambda x: x[1], reverse=True)

# warnings.resetwarnings() # reset the warnings so that any other warning will still be printed

# # print each substitute with its score
# for substitute, score in sorted_substitute_score_pairs:
#     print(f"Substitute: {substitute}, BertScore: {score}")

# # extract the list of substitutes from the sorted pairs
# bertscore_ranked_substitutes_only = [substitute for substitute, _ in sorted_substitute_score_pairs]
# print(f"substitutes based on bertscores in context: {bertscore_ranked_substitutes_only}")


# # limit the substitutes to the 10 first ones for evaluation
# bertscore_top_10_substitutes = bertscore_ranked_substitutes_only[:10]
# print(f"top-10 substitutes based on bertscores in context: {bertscore_top_10_substitutes}\n")
# # add the sentence, complex_word, and substitutes to the dataframe 
# substitutes_df.loc[1] = [sentence, complex_word] + bertscore_top_10_substitutes


# print('---------------------------------------------------------------------------------------------------------------------------------------------')
    
# # export the dataframe to a tsv file for evaluation
# substitutes_df.to_csv("./predictions/trial/RobertaBase_SG_MA_SS_bsRoberta_dataloc1.tsv", sep="\t", index=False, header=False)   

In [16]:
# row = data.loc[1]
       
# # 1. Substitute Generation (SG): perform masking and generate substitutes:

# ## print the sentence and the complex word
# sentence, complex_word = row["sentence"], row["complex_word"]
# print(f"Sentence: {sentence}")
# print(f"Complex word: {complex_word}")

# ## in the sentence, replace the complex word with a masked word
# sentence_masked_word = sentence.replace(complex_word, "<mask>") # this is different per model (this code line applies to Roberta)


# ## concatenate the original sentence and the masked sentence
# sentences_concat = f"{sentence} {lm_tokenizer.sep_token} {sentence_masked_word}"

# ## generate and rank candidate substitutes for the masked word using the fill_mask pipeline (removing elements without token_str key; as this gave errors in the ELECTRA models) .
# top_k = 30
# result = fill_mask(sentences_concat, top_k=top_k)

# substitutes = [substitute["token_str"] for substitute in result if "token_str" in substitute]
# #print(f"Substitute Generation step: initial substitute list: {substitutes}\n")


# #2: Morphological Generation and Context Adaptation (Morphological Adaptation):  
# ## a) remove noise in the substitutes, by ignoring generated substitutes that are empty or that have unwanted punctuation characters or that start with '##' (this returned errors with the ELECTRA model), and lowercase the substitutes (as some models don't lowercase by default)
# ## and lowercase all substitutes (not needed for BERT-base-uncased but by default applied). Use try/except statement to prevent other character-related problems to happen

# punctuation_set = set(string.punctuation) - set('-') # retained hyphens in case tokenizers don't split on hyphenated compounds
# punctuation_set.update({'“','”'})   # as these curly quotes appeared in the Electra (SG step) results but were not part of the string set

# try:
#     substitutes = [substitute["token_str"].lower() for substitute in result if not any(char in punctuation_set for char in substitute["token_str"]) 
#                    and not substitute["token_str"].startswith('##') and substitute["token_str"].strip() != ""]
#     # print(f"Morphological Adaptation step a): substitute list without unwanted punctuation characters: {substitutes}\n")
# except TypeError as error:
#     print(f"Error occurred: {error}")



# ## b) remove duplicates within the substitute list from the substitute list (duplicates are likely for models that did not lowercase by default)
# ## the last mentioned duplicate is removed on purpose, as this may probably be the (previously) uppercased variant of the lowercased substitute (lowercased subs are most likely higher ranked by the model)
# substitutes_no_dupl = []
# for sub in substitutes:
#     if sub not in substitutes_no_dupl:
#         substitutes_no_dupl.append(sub)
# #print(f"Morphological Adaptation step b): substitute list without duplicates of substitutes: {substitutes_no_dupl}\n")



# ## c) remove duplicates and inflected forms of the complex word from the substitute list

# ## first Lemmatize the complex word with spaCy, in order to compare it with the lemmatized substitute later to see if their mutual lemmas are the same
# doc_complex_word = nlp(complex_word)
# complex_word_lemma = doc_complex_word[0].lemma_
# #print(f"complex_word_lemma for complex word '{complex_word}': {complex_word_lemma}\n")


# ## then, remove duplicates and inflected forms of the complex word from the list with substitutes
# substitutes_no_dupl_complex_word = []
# for substitute in substitutes_no_dupl:
#     doc_substitute = nlp(substitute)
#     substitute_lemma = doc_substitute[0].lemma_
#     if substitute_lemma != complex_word_lemma:
#         substitutes_no_dupl_complex_word.append(substitute)
# #print(f"Morphological Adaptation step c): substitute list without duplicates of the complex word nor inflected forms of the complex word: {substitutes_no_dupl_complex_word}\n")


# ## d) remove antonyms of the complex word from the substitute list
# substitutes_no_dupl_complex_word_no_antonym = []
# for substitute in substitutes_no_dupl_complex_word:
#     syn = wn.synsets(complex_word_lemma)
#     if syn:
#         syn = syn[0]
#         for lemma in syn.lemmas():
#             if lemma.antonyms() and lemma.name() == substitute_lemma:
#                 print(f"Antonym removed (lemma): {lemma.antonyms()[0].name()}")
#                 break
#         else:
#             substitutes_no_dupl_complex_word_no_antonym.append(substitute)
#     else:
#         substitutes_no_dupl_complex_word_no_antonym.append(substitute)
# #print(f"Morphological Adaptation step d): substitute list without antonyms of the complex word: {substitutes_no_dupl_complex_word_no_antonym}\n")


# # limit the substitutes to the 10 first ones for evaluation
# top_10_substitutes = substitutes_no_dupl_complex_word_no_antonym[:10]
# print(f"top-10 substitutes SG and MA: {top_10_substitutes}\n")

# # add the sentence, complex_word, and substitutes to the dataframe 
# substitutes_df.loc[1] = [sentence, complex_word] + top_10_substitutes

# print('---------------------------------------------------------------------------------------------------------------------------------------------')

# # export the dataframe to a tsv file for evaluation
# substitutes_df.to_csv("./predictions/trial/RobertaBase_SG_MA_dataloc1.tsv", sep="\t", index=False, header=False)