#### Evaluation of RoBERTa on the trial set (10 sentences)

In [1]:
from transformers import AutoTokenizer, AutoModelForMaskedLM
import pandas as pd
from transformers import pipeline

# Load the trial set: a tab-separated file without a header row, whose two columns
# are named here as the sentence and the complex word it contains
filename = "./data/trial/tsar2022_en_trial_none_no_noise.tsv"
data = pd.read_csv(
    filename,
    sep='\t',
    header=None,
    names=["sentence", "complex_word"],
)

# Empty frame that collects the evaluation rows: the sentence, the complex word
# and ten substitute columns (substitute_1 ... substitute_10)
substitute_columns = [f"substitute_{rank}" for rank in range(1, 11)]
substitutes_df = pd.DataFrame(columns=["sentence", "complex_word"] + substitute_columns)


In [2]:
import logging

In [3]:
# the code below is used for morphological adjustments in step MA
from nltk.corpus import wordnet as wn
import spacy
nlp = spacy.load("en_core_web_sm")

import string

In [4]:
# the code below is used when word embeddings are used in step SS
from transformers import TFAutoModel
import tensorflow as tf
import numpy as np

In [4]:
# the code below is used when Bertscore is used in step SS
import bert_score
from bert_score import score

In [5]:
# lift pandas' row limit for displayed frames (None = no limit), so that all rows are shown
pd.options.display.max_rows = None

#### Roberta-base

In [6]:
# Load the tokenizer and the masked-language model of the pretrained roberta-base checkpoint
roberta_base_checkpoint = "roberta-base"
lm_tokenizer_robertabase = AutoTokenizer.from_pretrained(roberta_base_checkpoint)
lm_model_robertabase = AutoModelForMaskedLM.from_pretrained(roberta_base_checkpoint)

# Wrap model and tokenizer in a fill-mask pipeline; it is called later on to
# propose candidate substitutes for the masked complex word
fill_mask_robertabase = pipeline(
    task="fill-mask",
    model=lm_model_robertabase,
    tokenizer=lm_tokenizer_robertabase,
)

#### Substitute Generation and Morphological Adaptation:

In [7]:
# for robertabase model: Substitute Generation (SG) followed by Morphological Adaptation (MA).
# For every (sentence, complex word) row of `data`, one row of `substitutes_df` is filled with
# the (at most) 10 highest-ranked substitutes; afterwards the frame is exported as tsv.

# settings that are the same for every row
top_k = 30                    # number of candidates requested from the fill-mask pipeline
required_for_dataframe = 10   # number of substitute columns in substitutes_df

punctuation_set = set(string.punctuation) - set('-') # retained hyphens in case tokenizers don't split on hyphenated compounds
punctuation_set.update({'“','”'})   # as these curly quotes appeared in the Electra (SG step) results but were not part of the string set

# in each row, for each complex word:
for index, row in data.iterrows():

    # print the sentence and the complex word
    sentence, complex_word = row["sentence"], row["complex_word"]
    print(f"Sentence: {sentence}")
    print(f"Complex word: {complex_word}")

    # 1. Substitute Generation (SG): perform masking and generate substitutes:

    ## in the sentence, replace the complex word with a masked word
    sentence_masked_word_robertabase = sentence.replace(complex_word, lm_tokenizer_robertabase.mask_token)

    ## concatenate the original sentence and the masked sentence
    sentences_concat_robertabase = f"{sentence} {lm_tokenizer_robertabase.sep_token} {sentence_masked_word_robertabase}"

    ## generate and rank candidate substitutes for the masked word using the fill_mask pipeline
    result_robertabase = fill_mask_robertabase(sentences_concat_robertabase, top_k=top_k)

    # 2. Morphological Generation and Context Adaptation (Morphological Adaptation):
    ## a) remove noise: ignore candidates that are empty, contain unwanted punctuation or start with '##'
    ##    (this returned errors with the ELECTRA model); lowercase the rest (some models don't lowercase
    ##    by default) and strip them, as roberta uses a leading space before each substitute
    try:
        substitutes_robertabase = [
            substitute["token_str"].lower().strip()
            for substitute in result_robertabase
            if not any(char in punctuation_set for char in substitute["token_str"])
            and not substitute["token_str"].startswith('##')
            and substitute["token_str"].strip() != ""
        ]
    except TypeError as error:
        # NOTE(review): presumably raised when the pipeline output is not a flat list of dicts
        # (e.g. the complex word occurs more than once, giving several masks) - confirm.
        # The row is left out of the export, as before, but no longer silently.
        print(f"Skipped row {index}: fill-mask output could not be processed ({error})\n")
        continue

    ## b) remove duplicates within the substitute list, keeping the first (highest ranked) occurrence;
    ##    the later duplicate is probably the (previously) uppercased variant of the lowercased substitute
    substitutes_no_dupl_robertabase = list(dict.fromkeys(substitutes_robertabase))

    ## c) remove duplicates and inflected forms of the complex word from the substitute list

    ## lemmatize the complex word with spaCy, to compare it with the lemma of each substitute
    doc_complex_word = nlp(complex_word)
    complex_word_lemma = doc_complex_word[0].lemma_

    ## lemmatize every substitute once; the lemmas are needed in step c) and in step d)
    substitute_lemmas = {
        substitute: nlp(substitute)[0].lemma_ for substitute in substitutes_no_dupl_robertabase
    }

    ## keep only the substitutes whose lemma differs from the lemma of the complex word
    substitutes_no_dupl_complex_word_robertabase = [
        substitute
        for substitute in substitutes_no_dupl_robertabase
        if substitute_lemmas[substitute] != complex_word_lemma
    ]

    ## d) remove antonyms of the complex word from the substitute list
    ## step 1: get the antonyms of the complex word from WordNet
    antonyms_complex_word = [
        antonym.name()
        for syn in wn.synsets(complex_word_lemma)
        for lemma in syn.lemmas()
        for antonym in lemma.antonyms()
    ]
    print(f"Antonyms for complex word '{complex_word}': {antonyms_complex_word}\n")

    ## step 2: remove antonyms of the complex word from the list with substitutes
    substitutes_no_antonyms_robertabase = []
    for substitute in substitutes_no_dupl_complex_word_robertabase:
        if substitute_lemmas[substitute] not in antonyms_complex_word:
            substitutes_no_antonyms_robertabase.append(substitute)
        else:
            print(f"Removed antonym: {substitute}")
    print(f"Morphological Adaptation step d): substitute list without antonyms of the complex word for robertabase model: {substitutes_no_antonyms_robertabase}\n") 

    # limit the substitutes to the 10 highest ranked ones for evaluation
    top_10_substitutes = substitutes_no_antonyms_robertabase[:required_for_dataframe]

    # pad the list with None until it has 10 elements, as the dataframe row needs a value for every column
    top_10_substitutes += [None] * (required_for_dataframe - len(top_10_substitutes))

    # add the sentence, complex_word, and the substitutes to the dataframe
    substitutes_df.loc[index] = [sentence, complex_word] + top_10_substitutes

# export the dataframe to a tsv file for evaluation
substitutes_df.to_csv("./predictions/trial/RobertaBase_SG_MA.tsv", sep="\t", index=False, header=False)
print("RobertaBase_SG_MA exported to csv in path './predictions/trial/RobertaBase_SG_MA.tsv'}\n")
    

Sentence: A Spanish government source, however, later said that banks able to cover by themselves losses on their toxic property assets will not be forced to remove them from their books while it will be compulsory for those receiving public help.
Complex word: compulsory
top-10 substitutes SG and MA: [' compulsory', ' mandatory', ' obligatory', ' voluntary', ' required', ' optional', ' obliged', ' uniform', ' necessary', ' available']

---------------------------------------------------------------------------------------------------------------------------------------------
Sentence: Rajoy's conservative government had instilled markets with a brief dose of confidence by stepping into Bankia, performing a U-turn on its refusal to spend public money to rescue banks.
Complex word: instilled
top-10 substitutes SG and MA: [' infused', ' injected', ' endowed', 'illed', ' inst', ' furnished', ' supplied', ' bolstered', ' implanted', ' impressed']

------------------------------------------

python tsar_eval.py --gold_file ./data/trial/tsar2022_en_trial_gold_no_noise.tsv --predictions_file ./predictions/trial/RobertaBase_SG_MA.tsv --output_file ./output/trial/RobertaBase_SG_MA.tsv

#### Substitute Generation and Morphological Adaptation, with additions:
- moving already generated synonyms from the generated top-30 to the top-10 (if not already there) 
- adding synonyms to the generated list, to the top-10 so that the top-10 can be ranked on SR in the SR step. 

Question to consider: should synonyms be added or moved before the SS step (BERTScore) or after it? Even if the BERTScore values reveal that they are not very similar, did annotators still use synonyms for simplification? If so, the synonyms should be added after the SS step (BERTScore). 
Answer: do both (before and after BERTScore), and decide from the trial set results which approach works best. 

### Before the SS step:

In [7]:
# for robertabase model: Substitute Generation (SG) and Morphological Adaptation (MA), after which the
# substitutes that share a WordNet lemma with the complex word (synonyms) are moved to the front of the
# ranking. One row per (sentence, complex word) pair is stored in `substitutes_df` and exported as tsv.

# settings that are the same for every row
top_k = 30                    # number of candidates requested from the fill-mask pipeline
required_for_dataframe = 10   # number of substitute columns in substitutes_df

punctuation_set = set(string.punctuation) - set('-') # retained hyphens in case tokenizers don't split on hyphenated compounds
punctuation_set.update({'“','”'})   # as these curly quotes appeared in the Electra (SG step) results but were not part of the string set

# start from an empty dataframe, so that the export below only holds rows produced by this cell
substitutes_df = pd.DataFrame(columns=["sentence", "complex_word"] + [f"substitute_{i+1}" for i in range(required_for_dataframe)])

# in each row, for each complex word:
for index, row in data.iterrows():

    # print the sentence and the complex word
    sentence, complex_word = row["sentence"], row["complex_word"]
    print(f"Sentence: {sentence}")
    print(f"Complex word: {complex_word}")

    # 1. Substitute Generation (SG): perform masking and generate substitutes:

    ## in the sentence, replace the complex word with a masked word
    sentence_masked_word_robertabase = sentence.replace(complex_word, lm_tokenizer_robertabase.mask_token)

    ## concatenate the original sentence and the masked sentence
    sentences_concat_robertabase = f"{sentence} {lm_tokenizer_robertabase.sep_token} {sentence_masked_word_robertabase}"

    ## generate and rank candidate substitutes for the masked word using the fill_mask pipeline
    result_robertabase = fill_mask_robertabase(sentences_concat_robertabase, top_k=top_k)

    # 2. Morphological Generation and Context Adaptation (Morphological Adaptation):
    ## a) remove noise: ignore candidates that are empty, contain unwanted punctuation or start with '##'
    ##    (this returned errors with the ELECTRA model); lowercase the rest (some models don't lowercase
    ##    by default) and strip them, as roberta uses a leading space before each substitute
    try:
        substitutes_robertabase = [
            substitute["token_str"].lower().strip()
            for substitute in result_robertabase
            if not any(char in punctuation_set for char in substitute["token_str"])
            and not substitute["token_str"].startswith('##')
            and substitute["token_str"].strip() != ""
        ]
    except TypeError as error:
        # NOTE(review): presumably raised when the pipeline output is not a flat list of dicts
        # (e.g. the complex word occurs more than once, giving several masks) - confirm.
        # The row is left out of the export, as before, but no longer silently.
        print(f"Skipped row {index}: fill-mask output could not be processed ({error})\n")
        continue

    ## b) remove duplicates within the substitute list, keeping the first (highest ranked) occurrence;
    ##    the later duplicate is probably the (previously) uppercased variant of the lowercased substitute
    substitutes_no_dupl_robertabase = list(dict.fromkeys(substitutes_robertabase))

    ## c) remove duplicates and inflected forms of the complex word from the substitute list

    ## lemmatize the complex word with spaCy, to compare it with the lemma of each substitute
    doc_complex_word = nlp(complex_word)
    complex_word_lemma = doc_complex_word[0].lemma_

    ## lemmatize every substitute once; the lemmas are needed in step c) and in step d)
    substitute_lemmas = {
        substitute: nlp(substitute)[0].lemma_ for substitute in substitutes_no_dupl_robertabase
    }

    ## keep only the substitutes whose lemma differs from the lemma of the complex word
    substitutes_no_dupl_complex_word_robertabase = [
        substitute
        for substitute in substitutes_no_dupl_robertabase
        if substitute_lemmas[substitute] != complex_word_lemma
    ]

    ## d) remove antonyms of the complex word from the substitute list
    ## step 1: get the antonyms of the complex word from WordNet
    antonyms_complex_word = [
        antonym.name()
        for syn in wn.synsets(complex_word_lemma)
        for lemma in syn.lemmas()
        for antonym in lemma.antonyms()
    ]
    print(f"Antonyms for complex word '{complex_word}': {antonyms_complex_word}\n")

    ## step 2: remove antonyms of the complex word from the list with substitutes
    substitutes_no_antonyms_robertabase = []
    for substitute in substitutes_no_dupl_complex_word_robertabase:
        if substitute_lemmas[substitute] not in antonyms_complex_word:
            substitutes_no_antonyms_robertabase.append(substitute)
        else:
            print(f"Removed antonym: {substitute}")
    print(f"Morphological Adaptation step d): substitute list without antonyms of the complex word for robertabase model: {substitutes_no_antonyms_robertabase}\n") 

    # 3. split the substitutes into synonyms and non-synonyms of the complex word

    ## lemma names of all WordNet synsets of the complex word; they do not depend on the
    ## substitute, so they are collected once per row instead of once per substitute
    complex_word_synsets = wn.synsets(complex_word_lemma)
    complex_word_syn_lemmas = [lemma.name() for syn in complex_word_synsets for lemma in syn.lemmas()]
    complex_word_syn_lemma_set = set(complex_word_syn_lemmas)

    synonyms = []
    non_synonyms = []

    for substitute in substitutes_no_antonyms_robertabase:
        ## lemma names of all WordNet synsets of the substitute
        substitute_synsets = wn.synsets(substitute)
        substitute_syn_lemmas = [lemma.name() for syn in substitute_synsets for lemma in syn.lemmas()]

        ## a substitute counts as a synonym when it shares at least one lemma name with the complex word
        intersection = complex_word_syn_lemma_set.intersection(substitute_syn_lemmas)

        if intersection:
            synonyms.append(substitute)
        else:
            non_synonyms.append(substitute)

    # Print the list of synonyms
    print(f"List of substitutes that are synonyms: {synonyms}\n")

    # Combine the lists with synonyms appearing first (the model's ranking is kept within each list)
    final_list = synonyms + non_synonyms
    print(f"Final list with synonyms appearing first: {final_list}\n")

    print('-----------------------------------------------------------------------------------')

    # limit the substitutes to the 10 highest ranked ones for evaluation
    top_10_substitutes = final_list[:required_for_dataframe]
    print(f"top_10_substitutes: {top_10_substitutes}\n")

    # pad the list with None until it has 10 elements: the dataframe row needs a value for every
    # column, and without padding the assignment below fails when fewer than 10 substitutes are left
    top_10_substitutes += [None] * (required_for_dataframe - len(top_10_substitutes))

    # add the sentence, complex_word, and the substitutes to the dataframe
    substitutes_df.loc[index] = [sentence, complex_word] + top_10_substitutes

# export the dataframe to a tsv file for evaluation
substitutes_df.to_csv("./predictions/trial/RobertaBase_SG_MA_SS_synsfirst.tsv", sep="\t", index=False, header=False)
print("RobertaBase_SG_MA_SS_synsfirst exported to csv in path './predictions/trial/RobertaBase_SG_MA_SS_synsfirst.tsv'}\n")

    
    
 

Sentence: A Spanish government source, however, later said that banks able to cover by themselves losses on their toxic property assets will not be forced to remove them from their books while it will be compulsory for those receiving public help.
Complex word: compulsory
Antonyms for complex word 'compulsory': []

Morphological Adaptation step d): substitute list without antonyms of the complex word for robertabase model: ['mandatory', 'obligatory', 'voluntary', 'required', 'optional', 'obliged', 'uniform', 'necessary', 'available', 'mandated', 'sufficient', 'routine', 'forced', 'customary', 'prerequisite', 'feasible', 'indispensable', 'forthcoming', 'universal', 'requirement', 'involuntary', 'obligated', 'compelled', 'conditional', 'enforced', 'contingent', 'possible', 'compulsion']

List of substitutes that are synonyms: ['mandatory', 'required']

Final list with synonyms appearing first: ['mandatory', 'required', 'obligatory', 'voluntary', 'optional', 'obliged', 'uniform', 'necessa

python tsar_eval.py --gold_file ./data/trial/tsar2022_en_trial_gold_no_noise.tsv --predictions_file ./predictions/trial/RobertaBase_SG_MA_SS_synsfirst.tsv --output_file ./output/trial/RobertaBase_SG_MA_SS_synsfirst.tsv

### Results are worse than without putting the synonyms first!