#### Evaluation of BERT-style masked language models on the TEST set

In [1]:
import re

import pandas as pd
from transformers import AutoTokenizer, AutoModelForMaskedLM
from transformers import pipeline

# Load the test instances: one tab-separated (sentence, complex word) pair per line, no header row
# NOTE(review): read_csv applies default quote handling here; if sentences contain
# double quotes, consider quoting=csv.QUOTE_NONE — confirm against the data file.
filename = "./data/test/tsar2022_en_test_none_no_noise.tsv"
input_columns = ["sentence", "complex_word"]
data = pd.read_csv(filename, sep="\t", header=None, names=input_columns)

# Empty frame that will receive, per instance, the input pair plus ten ranked substitutes
substitute_columns = [f"substitute_{rank}" for rank in range(1, 11)]
substitutes_df = pd.DataFrame(columns=input_columns + substitute_columns)


In [2]:
import logging

In [3]:
# Resources for the Morphological Adaptation (MA) step:
#   - WordNet (through NLTK) supplies the synsets, lemmas and antonyms of a word
#   - spaCy supplies the lemma of the complex word and of each substitute
#   - string.punctuation is the basis for filtering out noisy candidates
from nltk.corpus import wordnet as wn
import spacy
nlp = spacy.load("en_core_web_sm")  # loaded once here and reused by every model's loop

import string

#### Bert-large-uncased

In [4]:
# Load tokenizer and masked-LM weights from the same pretrained checkpoint
model_checkpoint = "bert-large-uncased"
lm_tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
lm_model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

# Wrap model and tokenizer in a fill-mask pipeline
fill_mask = pipeline(task="fill-mask", model=lm_model, tokenizer=lm_tokenizer)

Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


#### Substitute Generation and Morphological Adaptation: substitutes that share WordNet synonyms with the complex word are ranked first

In [10]:
# For every (sentence, complex word) pair:
#   1. Substitute Generation (SG): mask the complex word and let the model propose candidates
#   2. Morphological Adaptation (MA): drop noise, duplicates, inflected forms and antonyms
#   3. Ranking: candidates that share a WordNet lemma with the complex word come first
# The top substitutes of each instance are then exported for evaluation.

top_k = 30  # number of candidates requested from the fill-mask pipeline

# Characters that disqualify a candidate. Hyphens are retained in case tokenizers don't
# split hyphenated compounds; the curly quotes appeared in the ELECTRA results but are
# not part of string.punctuation. Built once, as it does not depend on the instance.
punctuation_set = (set(string.punctuation) - {"-"}) | {"“", "”"}

# The rows are collected in a fresh list and the frame is rebuilt after the loop, so
# predictions written by a previously evaluated model can never end up in this file.
prediction_columns = list(substitutes_df.columns)
n_substitutes = len(prediction_columns) - 2  # everything after "sentence" and "complex_word"
prediction_rows, prediction_index = [], []

for index, row in data.iterrows():
    sentence, complex_word = row["sentence"], row["complex_word"]

    # 1. Substitute Generation (SG)
    ## Mask exactly ONE whole-word occurrence of the complex word. A plain str.replace masks
    ## every occurrence (including substrings of longer words), which yields several mask
    ## tokens and a nested pipeline result that the filtering below cannot process.
    target_pattern = rf"(?<!\w){re.escape(complex_word)}(?!\w)"
    sentence_masked_word, n_masked = re.subn(
        target_pattern, lambda _match: lm_tokenizer.mask_token, sentence, count=1
    )
    if n_masked == 0:
        # no whole-word match: fall back to the first plain occurrence
        sentence_masked_word = sentence.replace(complex_word, lm_tokenizer.mask_token, 1)

    ## concatenate the original sentence and the masked sentence
    sentences_concat = f"{sentence} {lm_tokenizer.sep_token} {sentence_masked_word}"

    ## generate and rank candidate substitutes for the masked word
    result = fill_mask(sentences_concat, top_k=top_k)
    if result and isinstance(result[0], list):
        # several mask tokens in the input (recent transformers versions then return one
        # candidate list per mask): keep the candidates of the first mask
        result = result[0]

    # 2. Morphological Adaptation (MA)
    ## a) remove noise: skip candidates without a token string, word-piece continuations
    ##    ('##...'), candidates containing unwanted punctuation and empty candidates;
    ##    lowercase and strip the rest (RoBERTa puts a leading space before each candidate)
    substitutes = []
    for candidate in result:
        token = candidate.get("token_str")
        if not isinstance(token, str):
            continue
        if token.startswith("##") or any(char in punctuation_set for char in token):
            continue
        token = token.lower().strip()
        if token:
            substitutes.append(token)

    ## b) remove duplicates, keeping the first (highest ranked) occurrence
    substitutes_no_dupl = list(dict.fromkeys(substitutes))

    ## c) remove the complex word itself and its inflected forms (same spaCy lemma);
    ##    each substitute is lemmatized once and the lemma is reused in step d)
    complex_word_lemma = nlp(complex_word)[0].lemma_
    substitute_lemmas = {substitute: nlp(substitute)[0].lemma_ for substitute in substitutes_no_dupl}
    substitutes_no_dupl_complex_word = [
        substitute for substitute in substitutes_no_dupl
        if substitute_lemmas[substitute] != complex_word_lemma
    ]

    ## d) remove WordNet antonyms of the complex word
    complex_word_synsets = wn.synsets(complex_word_lemma)
    antonyms_complex_word = {
        antonym.name()
        for syn in complex_word_synsets
        for lemma in syn.lemmas()
        for antonym in lemma.antonyms()
    }
    substitutes_no_antonyms = [
        substitute for substitute in substitutes_no_dupl_complex_word
        if substitute_lemmas[substitute] not in antonyms_complex_word
    ]

    # 3. Ranking: substitutes sharing at least one WordNet lemma with the complex word first.
    ## The lemmas of the complex word do not depend on the substitute, so they are
    ## collected once per instance instead of once per substitute.
    complex_word_syn_lemmas = {lemma.name() for syn in complex_word_synsets for lemma in syn.lemmas()}
    synonyms, non_synonyms = [], []
    for substitute in substitutes_no_antonyms:
        substitute_syn_lemmas = {lemma.name() for syn in wn.synsets(substitute) for lemma in syn.lemmas()}
        if complex_word_syn_lemmas & substitute_syn_lemmas:
            synonyms.append(substitute)
        else:
            non_synonyms.append(substitute)
    final_list = synonyms + non_synonyms

    ## keep the highest ranked substitutes; pad with empty strings when fewer survive the
    ## filtering, as a shorter row would not fit the frame's columns
    ## NOTE(review): padded cells are written as empty fields — confirm tsar_eval.py accepts them
    top_10_substitutes = final_list[:n_substitutes]
    padding = [""] * (n_substitutes - len(top_10_substitutes))
    prediction_rows.append([sentence, complex_word] + top_10_substitutes + padding)
    prediction_index.append(index)

substitutes_df = pd.DataFrame(prediction_rows, index=prediction_index, columns=prediction_columns)

# export the dataframe to a tsv file for evaluation
substitutes_df.to_csv("./predictions/test/SG_MA_bertlarge_synsfirst.tsv", sep="\t", index=False, header=False)
print("SG_MA_bertlarge_synsfirst exported to csv in path './predictions/test/SG_MA_bertlarge_synsfirst.tsv'\n")

List of substitutes that are synonyms with the complex word in Wordnet: ['mandatory', 'required']

List of substitutes that are not synonyms with the complex word in Wordnet: ['obligatory', 'voluntary', 'optional', 'mandated', 'necessary', 'forbidden', 'permitted', 'prescribed', 'illegal', 'available', 'beneficial', 'prohibited', 'scheduled', 'obliged', 'tertiary', 'secondary', 'statutory', 'legal', 'free', 'used', 'customary', 'primary', 'authorised', 'canonical', 'standard', 'lawful']

Final list with synonyms appearing first: ['mandatory', 'required', 'obligatory', 'voluntary', 'optional', 'mandated', 'necessary', 'forbidden', 'permitted', 'prescribed', 'illegal', 'available', 'beneficial', 'prohibited', 'scheduled', 'obliged', 'tertiary', 'secondary', 'statutory', 'legal', 'free', 'used', 'customary', 'primary', 'authorised', 'canonical', 'standard', 'lawful']

Final top-10 with synonyms appearing first: ['mandatory', 'required', 'obligatory', 'voluntary', 'optional', 'mandated', '

python tsar_eval.py --gold_file ./data/test/tsar2022_en_test_gold_no_noise.tsv --predictions_file ./predictions/test/SG_MA_bertlarge_synsfirst.tsv --output_file ./output/test/SG_MA_bertlarge_synsfirst.tsv

#### Electra-large:

In [11]:
# Load tokenizer and masked-LM weights from the same pretrained checkpoint
model_checkpoint = "google/electra-large-generator"
lm_tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
lm_model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

# Wrap model and tokenizer in a fill-mask pipeline
fill_mask = pipeline(task="fill-mask", model=lm_model, tokenizer=lm_tokenizer)

In [12]:
# For every (sentence, complex word) pair:
#   1. Substitute Generation (SG): mask the complex word and let the model propose candidates
#   2. Morphological Adaptation (MA): drop noise, duplicates, inflected forms and antonyms
#   3. Ranking: candidates that share a WordNet lemma with the complex word come first
# The top substitutes of each instance are then exported for evaluation.

top_k = 30  # number of candidates requested from the fill-mask pipeline

# Characters that disqualify a candidate. Hyphens are retained in case tokenizers don't
# split hyphenated compounds; the curly quotes appeared in the ELECTRA results but are
# not part of string.punctuation. Built once, as it does not depend on the instance.
punctuation_set = (set(string.punctuation) - {"-"}) | {"“", "”"}

# The rows are collected in a fresh list and the frame is rebuilt after the loop, so
# predictions written by a previously evaluated model can never end up in this file.
prediction_columns = list(substitutes_df.columns)
n_substitutes = len(prediction_columns) - 2  # everything after "sentence" and "complex_word"
prediction_rows, prediction_index = [], []

for index, row in data.iterrows():
    sentence, complex_word = row["sentence"], row["complex_word"]

    # 1. Substitute Generation (SG)
    ## Mask exactly ONE whole-word occurrence of the complex word. A plain str.replace masks
    ## every occurrence (including substrings of longer words), which yields several mask
    ## tokens and a nested pipeline result that the filtering below cannot process.
    target_pattern = rf"(?<!\w){re.escape(complex_word)}(?!\w)"
    sentence_masked_word, n_masked = re.subn(
        target_pattern, lambda _match: lm_tokenizer.mask_token, sentence, count=1
    )
    if n_masked == 0:
        # no whole-word match: fall back to the first plain occurrence
        sentence_masked_word = sentence.replace(complex_word, lm_tokenizer.mask_token, 1)

    ## concatenate the original sentence and the masked sentence
    sentences_concat = f"{sentence} {lm_tokenizer.sep_token} {sentence_masked_word}"

    ## generate and rank candidate substitutes for the masked word
    result = fill_mask(sentences_concat, top_k=top_k)
    if result and isinstance(result[0], list):
        # several mask tokens in the input (recent transformers versions then return one
        # candidate list per mask): keep the candidates of the first mask
        result = result[0]

    # 2. Morphological Adaptation (MA)
    ## a) remove noise: skip candidates without a token string, word-piece continuations
    ##    ('##...'), candidates containing unwanted punctuation and empty candidates;
    ##    lowercase and strip the rest (RoBERTa puts a leading space before each candidate)
    substitutes = []
    for candidate in result:
        token = candidate.get("token_str")
        if not isinstance(token, str):
            continue
        if token.startswith("##") or any(char in punctuation_set for char in token):
            continue
        token = token.lower().strip()
        if token:
            substitutes.append(token)

    ## b) remove duplicates, keeping the first (highest ranked) occurrence
    substitutes_no_dupl = list(dict.fromkeys(substitutes))

    ## c) remove the complex word itself and its inflected forms (same spaCy lemma);
    ##    each substitute is lemmatized once and the lemma is reused in step d)
    complex_word_lemma = nlp(complex_word)[0].lemma_
    substitute_lemmas = {substitute: nlp(substitute)[0].lemma_ for substitute in substitutes_no_dupl}
    substitutes_no_dupl_complex_word = [
        substitute for substitute in substitutes_no_dupl
        if substitute_lemmas[substitute] != complex_word_lemma
    ]

    ## d) remove WordNet antonyms of the complex word
    complex_word_synsets = wn.synsets(complex_word_lemma)
    antonyms_complex_word = {
        antonym.name()
        for syn in complex_word_synsets
        for lemma in syn.lemmas()
        for antonym in lemma.antonyms()
    }
    substitutes_no_antonyms = [
        substitute for substitute in substitutes_no_dupl_complex_word
        if substitute_lemmas[substitute] not in antonyms_complex_word
    ]

    # 3. Ranking: substitutes sharing at least one WordNet lemma with the complex word first.
    ## The lemmas of the complex word do not depend on the substitute, so they are
    ## collected once per instance instead of once per substitute.
    complex_word_syn_lemmas = {lemma.name() for syn in complex_word_synsets for lemma in syn.lemmas()}
    synonyms, non_synonyms = [], []
    for substitute in substitutes_no_antonyms:
        substitute_syn_lemmas = {lemma.name() for syn in wn.synsets(substitute) for lemma in syn.lemmas()}
        if complex_word_syn_lemmas & substitute_syn_lemmas:
            synonyms.append(substitute)
        else:
            non_synonyms.append(substitute)
    final_list = synonyms + non_synonyms

    ## keep the highest ranked substitutes; pad with empty strings when fewer survive the
    ## filtering, as a shorter row would not fit the frame's columns
    ## NOTE(review): padded cells are written as empty fields — confirm tsar_eval.py accepts them
    top_10_substitutes = final_list[:n_substitutes]
    padding = [""] * (n_substitutes - len(top_10_substitutes))
    prediction_rows.append([sentence, complex_word] + top_10_substitutes + padding)
    prediction_index.append(index)

substitutes_df = pd.DataFrame(prediction_rows, index=prediction_index, columns=prediction_columns)

# export the dataframe to a tsv file for evaluation
substitutes_df.to_csv("./predictions/test/SG_MA_electralarge_synsfirst.tsv", sep="\t", index=False, header=False)
print("SG_MA_electralarge_synsfirst exported to csv in path './predictions/test/SG_MA_electralarge_synsfirst.tsv'\n")

List of substitutes that are synonyms with the complex word in Wordnet: ['mandatory', 'required']

List of substitutes that are not synonyms with the complex word in Wordnet: ['obligatory', 'optional', 'necessary', 'voluntary', 'standard', 'mandated', 'redundant', 'unnecessary', 'possible', 'essential', 'prescribed', 'free', 'universal', 'consent', 'obsolete', 'sanctioned', 'fashionable', 'valid', 'taboo', 'supplementary', 'forbidden', 'continuous', 'illegal', 'requirement', 'prohibited', 'banned', 'sufficient']

Final list with synonyms appearing first: ['mandatory', 'required', 'obligatory', 'optional', 'necessary', 'voluntary', 'standard', 'mandated', 'redundant', 'unnecessary', 'possible', 'essential', 'prescribed', 'free', 'universal', 'consent', 'obsolete', 'sanctioned', 'fashionable', 'valid', 'taboo', 'supplementary', 'forbidden', 'continuous', 'illegal', 'requirement', 'prohibited', 'banned', 'sufficient']

Final top-10 with synonyms with the complex word in Wordnet appearing 

python tsar_eval.py --gold_file ./data/test/tsar2022_en_test_gold_no_noise.tsv --predictions_file ./predictions/test/SG_MA_electralarge_synsfirst.tsv --output_file ./output/test/SG_MA_electralarge_synsfirst.tsv

#### Roberta-base:

In [13]:
# Load tokenizer and masked-LM weights from the same pretrained checkpoint
model_checkpoint = "roberta-base"
lm_tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
lm_model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

# Wrap model and tokenizer in a fill-mask pipeline
fill_mask = pipeline(task="fill-mask", model=lm_model, tokenizer=lm_tokenizer)

In [14]:
# For every (sentence, complex word) pair:
#   1. Substitute Generation (SG): mask the complex word and let the model propose candidates
#   2. Morphological Adaptation (MA): drop noise, duplicates, inflected forms and antonyms
#   3. Ranking: candidates that share a WordNet lemma with the complex word come first
# The top substitutes of each instance are then exported for evaluation.

top_k = 30  # number of candidates requested from the fill-mask pipeline

# Characters that disqualify a candidate. Hyphens are retained in case tokenizers don't
# split hyphenated compounds; the curly quotes appeared in the ELECTRA results but are
# not part of string.punctuation. Built once, as it does not depend on the instance.
punctuation_set = (set(string.punctuation) - {"-"}) | {"“", "”"}

# The rows are collected in a fresh list and the frame is rebuilt after the loop, so
# predictions written by a previously evaluated model can never end up in this file.
prediction_columns = list(substitutes_df.columns)
n_substitutes = len(prediction_columns) - 2  # everything after "sentence" and "complex_word"
prediction_rows, prediction_index = [], []

for index, row in data.iterrows():
    sentence, complex_word = row["sentence"], row["complex_word"]

    # 1. Substitute Generation (SG)
    ## Mask exactly ONE whole-word occurrence of the complex word. A plain str.replace masks
    ## every occurrence (including substrings of longer words), which yields several mask
    ## tokens and a nested pipeline result that the filtering below cannot process.
    target_pattern = rf"(?<!\w){re.escape(complex_word)}(?!\w)"
    sentence_masked_word, n_masked = re.subn(
        target_pattern, lambda _match: lm_tokenizer.mask_token, sentence, count=1
    )
    if n_masked == 0:
        # no whole-word match: fall back to the first plain occurrence
        sentence_masked_word = sentence.replace(complex_word, lm_tokenizer.mask_token, 1)

    ## concatenate the original sentence and the masked sentence
    sentences_concat = f"{sentence} {lm_tokenizer.sep_token} {sentence_masked_word}"

    ## generate and rank candidate substitutes for the masked word
    result = fill_mask(sentences_concat, top_k=top_k)
    if result and isinstance(result[0], list):
        # several mask tokens in the input (recent transformers versions then return one
        # candidate list per mask): keep the candidates of the first mask
        result = result[0]

    # 2. Morphological Adaptation (MA)
    ## a) remove noise: skip candidates without a token string, word-piece continuations
    ##    ('##...'), candidates containing unwanted punctuation and empty candidates;
    ##    lowercase and strip the rest (RoBERTa puts a leading space before each candidate)
    substitutes = []
    for candidate in result:
        token = candidate.get("token_str")
        if not isinstance(token, str):
            continue
        if token.startswith("##") or any(char in punctuation_set for char in token):
            continue
        token = token.lower().strip()
        if token:
            substitutes.append(token)

    ## b) remove duplicates, keeping the first (highest ranked) occurrence
    substitutes_no_dupl = list(dict.fromkeys(substitutes))

    ## c) remove the complex word itself and its inflected forms (same spaCy lemma);
    ##    each substitute is lemmatized once and the lemma is reused in step d)
    complex_word_lemma = nlp(complex_word)[0].lemma_
    substitute_lemmas = {substitute: nlp(substitute)[0].lemma_ for substitute in substitutes_no_dupl}
    substitutes_no_dupl_complex_word = [
        substitute for substitute in substitutes_no_dupl
        if substitute_lemmas[substitute] != complex_word_lemma
    ]

    ## d) remove WordNet antonyms of the complex word
    complex_word_synsets = wn.synsets(complex_word_lemma)
    antonyms_complex_word = {
        antonym.name()
        for syn in complex_word_synsets
        for lemma in syn.lemmas()
        for antonym in lemma.antonyms()
    }
    substitutes_no_antonyms = [
        substitute for substitute in substitutes_no_dupl_complex_word
        if substitute_lemmas[substitute] not in antonyms_complex_word
    ]

    # 3. Ranking: substitutes sharing at least one WordNet lemma with the complex word first.
    ## The lemmas of the complex word do not depend on the substitute, so they are
    ## collected once per instance instead of once per substitute.
    complex_word_syn_lemmas = {lemma.name() for syn in complex_word_synsets for lemma in syn.lemmas()}
    synonyms, non_synonyms = [], []
    for substitute in substitutes_no_antonyms:
        substitute_syn_lemmas = {lemma.name() for syn in wn.synsets(substitute) for lemma in syn.lemmas()}
        if complex_word_syn_lemmas & substitute_syn_lemmas:
            synonyms.append(substitute)
        else:
            non_synonyms.append(substitute)
    final_list = synonyms + non_synonyms

    ## keep the highest ranked substitutes; pad with empty strings when fewer survive the
    ## filtering, as a shorter row would not fit the frame's columns
    ## NOTE(review): padded cells are written as empty fields — confirm tsar_eval.py accepts them
    top_10_substitutes = final_list[:n_substitutes]
    padding = [""] * (n_substitutes - len(top_10_substitutes))
    prediction_rows.append([sentence, complex_word] + top_10_substitutes + padding)
    prediction_index.append(index)

substitutes_df = pd.DataFrame(prediction_rows, index=prediction_index, columns=prediction_columns)

# export the dataframe to a tsv file for evaluation
substitutes_df.to_csv("./predictions/test/SG_MA_robertabase_synsfirst.tsv", sep="\t", index=False, header=False)
print("SG_MA_robertabase_synsfirst exported to csv in path './predictions/test/SG_MA_robertabase_synsfirst.tsv'\n")

List of substitutes that are synonyms with the complex word in Wordnet: ['mandatory', 'required']

List of substitutes that are not synonyms with the complex word in Wordnet: ['obligatory', 'voluntary', 'optional', 'obliged', 'uniform', 'necessary', 'available', 'mandated', 'sufficient', 'routine', 'forced', 'customary', 'prerequisite', 'feasible', 'indispensable', 'forthcoming', 'universal', 'requirement', 'involuntary', 'obligated', 'compelled', 'conditional', 'enforced', 'contingent', 'possible', 'compulsion']

Final list with synonyms appearing first: ['mandatory', 'required', 'obligatory', 'voluntary', 'optional', 'obliged', 'uniform', 'necessary', 'available', 'mandated', 'sufficient', 'routine', 'forced', 'customary', 'prerequisite', 'feasible', 'indispensable', 'forthcoming', 'universal', 'requirement', 'involuntary', 'obligated', 'compelled', 'conditional', 'enforced', 'contingent', 'possible', 'compulsion']

Final top-10 with synonyms with the complex word in Wordnet appearin

In [None]:
python tsar_eval.py --gold_file ./data/test/tsar2022_en_test_gold_no_noise.tsv --predictions_file ./predictions/test/SG_MA_robertabase_synsfirst.tsv --output_file ./output/test/SG_MA_robertabase_synsfirst.tsv

#### Roberta-large:

In [15]:
# Load tokenizer and masked-LM weights from the same pretrained checkpoint
model_checkpoint = "roberta-large"
lm_tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
lm_model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

# Wrap model and tokenizer in a fill-mask pipeline
fill_mask = pipeline(task="fill-mask", model=lm_model, tokenizer=lm_tokenizer)

In [16]:
# Substitute Generation (SG) + Morphological Adaptation (MA) for roberta-large,
# ranking substitutes that share a WordNet synonym lemma with the complex word first.
#
# Reads : data, lm_tokenizer, fill_mask, nlp, wn, string, pd, logging (defined in earlier cells).
# Writes: substitutes_df (rebuilt from scratch) and the predictions TSV at `predictions_path`.
#
# Every input row yields exactly one output row, so the predictions file stays
# line-aligned with the gold file even when no usable substitute is found.

# Cell-local import: `re` is not part of the notebook's import cells.
import re

top_k = 30           # number of candidates requested from the fill-mask pipeline
n_substitutes = 10   # number of substitute columns expected by the evaluation
predictions_path = "./predictions/test/SG_MA_robertalarge_synsfirst.tsv"

# Characters that disqualify a candidate. Hyphens are retained in case tokenizers
# don't split on hyphenated compounds; curly quotes appeared in the ELECTRA (SG step)
# results but are not part of string.punctuation.
punctuation_set = set(string.punctuation) - set('-')
punctuation_set.update({'“', '”'})

# Rows are collected in a list and turned into a DataFrame once at the end, so that
# rows left in `substitutes_df` by a previously run model cell cannot leak into this export.
prediction_rows = []

# in each row, for each complex word:
for index, row in data.iterrows():

    sentence, complex_word = row["sentence"], row["complex_word"]

    # 1. Substitute Generation (SG): perform masking and generate substitutes:

    ## Mask exactly ONE occurrence of the complex word, preferring a whole-word match.
    ## Masking every occurrence (plain str.replace) can produce several mask tokens, for which
    ## the pipeline returns one prediction list per mask instead of a flat list.
    whole_word = re.compile(rf"(?<!\w){re.escape(complex_word)}(?!\w)")
    sentence_masked_word, n_masked = whole_word.subn(lambda _match: lm_tokenizer.mask_token, sentence, count=1)
    if n_masked == 0:
        # no whole-word hit: fall back to the first plain substring occurrence
        sentence_masked_word = sentence.replace(complex_word, lm_tokenizer.mask_token, 1)

    if lm_tokenizer.mask_token not in sentence_masked_word:
        # the complex word does not occur in the sentence: nothing can be masked;
        # keep an empty prediction row so the output stays aligned with the input
        logging.warning("Complex word %r not found in sentence at row %s; writing empty substitutes.", complex_word, index)
        prediction_rows.append([sentence, complex_word] + [""] * n_substitutes)
        continue

    ## concatenate the original sentence and the masked sentence
    sentences_concat = f"{sentence} {lm_tokenizer.sep_token} {sentence_masked_word}"

    ## generate and rank candidate substitutes for the masked word using the fill_mask pipeline;
    ## predictions without a string token_str are dropped here (these gave errors in the ELECTRA models)
    result = fill_mask(sentences_concat, top_k=top_k)
    raw_substitutes = [
        prediction["token_str"]
        for prediction in result
        if isinstance(prediction, dict) and isinstance(prediction.get("token_str"), str)
    ]

    # 2. Morphological Generation and Context Adaptation (Morphological Adaptation):

    ## a) remove noise: ignore candidates that are empty, contain unwanted punctuation or start
    ##    with '##' (word-piece continuations), then lowercase and strip them
    ##    (.strip because roberta uses a leading space before each substitute)
    substitutes = [
        token.lower().strip()
        for token in raw_substitutes
        if token.strip() != ""
        and not token.startswith('##')
        and not any(char in punctuation_set for char in token)
    ]

    ## b) remove duplicates while keeping the first (highest ranked) occurrence; duplicates are
    ##    likely for models that did not lowercase by default
    substitutes_no_dupl = list(dict.fromkeys(substitutes))

    ## c) + d) remove the complex word itself, its inflected forms and its antonyms

    ## lemmatize the complex word with spaCy (first token only) to compare it with each substitute's lemma
    doc_complex_word = nlp(complex_word)
    complex_word_lemma = doc_complex_word[0].lemma_ if len(doc_complex_word) > 0 else complex_word

    ## WordNet lookups for the complex word are computed once per row, not once per substitute
    complex_word_synsets = wn.synsets(complex_word_lemma)
    complex_word_syn_lemmas = {lemma.name() for syn in complex_word_synsets for lemma in syn.lemmas()}
    antonyms_complex_word = {
        antonym.name()
        for syn in complex_word_synsets
        for lemma in syn.lemmas()
        for antonym in lemma.antonyms()
    }

    substitutes_no_antonyms = []
    for substitute in substitutes_no_dupl:
        substitute_lemma = nlp(substitute)[0].lemma_
        if substitute_lemma == complex_word_lemma:
            continue  # same lemma: the complex word itself or one of its inflected forms
        if substitute_lemma in antonyms_complex_word:
            continue  # antonym of the complex word
        substitutes_no_antonyms.append(substitute)

    ## e) split the remaining substitutes into WordNet synonyms of the complex word and the rest,
    ##    preserving the model's ranking inside each group
    synonyms = []
    non_synonyms = []
    for substitute in substitutes_no_antonyms:
        substitute_syn_lemmas = {lemma.name() for syn in wn.synsets(substitute) for lemma in syn.lemmas()}
        if complex_word_syn_lemmas & substitute_syn_lemmas:
            synonyms.append(substitute)
        else:
            non_synonyms.append(substitute)

    # combine the lists with synonyms appearing first
    final_list = synonyms + non_synonyms

    # limit the substitutes to the highest ranked ones for evaluation and pad with empty
    # strings, so that the row always has one value per substitute column
    top_10_substitutes = final_list[:n_substitutes]
    padding = [""] * (n_substitutes - len(top_10_substitutes))

    # add the sentence, complex_word, and the substitutes to the collected rows
    prediction_rows.append([sentence, complex_word] + top_10_substitutes + padding)


# build the dataframe in one go (same columns and index labels as before)
substitutes_df = pd.DataFrame(
    prediction_rows,
    columns=["sentence", "complex_word"] + [f"substitute_{i+1}" for i in range(n_substitutes)],
    index=data.index,
)

# export the dataframe to a tsv file for evaluation
substitutes_df.to_csv(predictions_path, sep="\t", index=False, header=False)
print(f"SG_MA_robertalarge_synsfirst exported to csv in path '{predictions_path}'\n")

List of substitutes that are synonyms with the complex word in Wordnet: ['mandatory', 'required', 'mandate']

List of substitutes that are not synonyms with the complex word in Wordnet: ['mandated', 'voluntary', 'obligatory', 'statutory', 'redundant', 'enforced', 'routine', 'relevant', 'vital', 'clandestine', 'obliged', 'bureaucratic', 'ministerial', 'necessary', 'lifelong', 'strict', 'gradual', 'demanded', 'lax', 'continuous', 'practicable', 'indispensable', 'forced', 'habitual', 'plain', 'preferable']

Final list with synonyms appearing first: ['mandatory', 'required', 'mandate', 'mandated', 'voluntary', 'obligatory', 'statutory', 'redundant', 'enforced', 'routine', 'relevant', 'vital', 'clandestine', 'obliged', 'bureaucratic', 'ministerial', 'necessary', 'lifelong', 'strict', 'gradual', 'demanded', 'lax', 'continuous', 'practicable', 'indispensable', 'forced', 'habitual', 'plain', 'preferable']

Final top-10 with synonyms with the complex word in Wordnet appearing first: ['mandatory

In [None]:
python tsar_eval.py --gold_file ./data/test/tsar2022_en_test_gold_no_noise.tsv --predictions_file ./predictions/test/SG_MA_robertalarge_synsfirst.tsv --output_file ./output/test/SG_MA_robertalarge_synsfirst.tsv