### All code below uses concatenated sentence pairs in the Substitute Generation step in order to generate similar substitutes (as opposed to generation of fitting substitutes only)

In [1]:
from transformers import AutoTokenizer, AutoModelForMaskedLM
import pandas as pd
from transformers import pipeline

# load the trial data: tab-separated, no header row, columns named explicitly
filename = './data/trial/tsar2022_en_trial_none_no_noise.tsv'
data = pd.read_table(filename, header=None, names=["sentence", "complex_word"])

# empty frame that collects, per row, the sentence, the complex word and ten substitute columns for evaluation
substitutes_df = pd.DataFrame(
    columns=["sentence", "complex_word", *(f"substitute_{rank}" for rank in range(1, 11))]
)


In [2]:
import logging

In [3]:
import string

import nltk
import spacy

# fetch the WordNet corpus (used for the antonym lookup) before importing its reader
nltk.download('wordnet')
from nltk.corpus import wordnet as wn

# spaCy pipeline used to lemmatize the complex word and the substitutes
nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\IrmaT\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
# the code below is used when Bertscore is used in step SS
import bert_score
from bert_score import score

In [5]:
# show every row when a dataframe is displayed (None disables the row limit)
pd.options.display.max_rows = None

In [None]:
# # Instantiate the tokenizer and the model

# # for roberta-base:
# lm_tokenizer_robertabase = AutoTokenizer.from_pretrained("roberta-base")
# lm_model_robertabase = AutoModelForMaskedLM.from_pretrained("roberta-base")


# # Instantiate the fill-mask pipeline with the model
# fill_mask_robertabase = pipeline("fill-mask", lm_model_robertabase, tokenizer = lm_tokenizer_robertabase)

In [7]:
# keys of the masked language models that the cells below iterate over
models = [
    'bertbase',
    'electrabase',
]

#### Top 5 of bertbase model with bertscore bertbase and bertscore electrabase, and electrabase model with bertscore bertbase and bertscore electrabase (4 models):


In [None]:

## Ensemble of four rankings: the substitutes generated by each masked language model
## (bertbase, electrabase) are ranked twice, once per BERTScore backbone (BERT, ELECTRA).
## Every (model, row) combination appends exactly one entry to its result lists, so list
## position i always corresponds to row i of `data`.
import itertools
import re

# create empty lists, to use in step 3 Substitute Selection
bsBert_bertbase = []
bsBert_electrabase = []
bsElectra_bertbase = []
bsElectra_electrabase = []

bsBert_bertbase_2 = []
bsBert_electrabase_2 = []
bsElectra_bertbase_2 = []
bsElectra_electrabase_2 = []

# number of candidates requested from the fill-mask pipeline
top_k = 20

# characters that disqualify a candidate; hyphens are retained in case tokenizers don't split on hyphenated compounds,
# and curly quotes are added as they appeared in the Electra (SG step) results but are not part of string.punctuation
punctuation_set = set(string.punctuation) - set('-')
punctuation_set.update({'“','”'})

# checkpoint to load for each entry of `models`
model_checkpoints = {
    'bertbase': "bert-base-uncased",
    'electrabase': "google/electra-base-generator",
}

# result lists (top-5, second half of the top-10) for each (model, BERTScore backbone) combination
collectors = {
    ('bertbase', 'bsBert'): (bsBert_bertbase, bsBert_bertbase_2),
    ('bertbase', 'bsElectra'): (bsElectra_bertbase, bsElectra_bertbase_2),
    ('electrabase', 'bsBert'): (bsBert_electrabase, bsBert_electrabase_2),
    ('electrabase', 'bsElectra'): (bsElectra_electrabase, bsElectra_electrabase_2),
}


def _replace_target(sentence, complex_word, replacement):
    """Replace one occurrence of `complex_word` in `sentence` with `replacement`.

    The first whole-word occurrence is preferred; if there is none, the first plain
    substring occurrence is used. Replacing a single occurrence guarantees at most one
    mask token, so the fill-mask pipeline returns a flat list of candidates.
    """
    pattern = r"(?<!\w)" + re.escape(complex_word) + r"(?!\w)"
    # a function replacement keeps backslashes in `replacement` literal
    replaced, n_replaced = re.subn(pattern, lambda _match: replacement, sentence, count=1)
    if n_replaced == 0:
        replaced = sentence.replace(complex_word, replacement, 1)
    return replaced


def _clean_candidates(result):
    """Return the usable candidates of a fill-mask result: lowercased, stripped, without duplicates.

    Entries without a string `token_str`, word pieces starting with '##', candidates
    containing unwanted punctuation and empty candidates are ignored. Of two candidates
    that are equal after lowercasing, the higher-ranked (first) one is kept.
    """
    cleaned = []
    for candidate in result:
        token = candidate.get("token_str") if isinstance(candidate, dict) else None
        if not isinstance(token, str):
            continue
        if token.startswith('##') or any(char in punctuation_set for char in token):
            continue
        token = token.lower().strip()  # .strip as roberta uses a leading space before each substitute
        if token and token not in cleaned:
            cleaned.append(token)
    return cleaned


def _filter_substitutes(substitutes, complex_word):
    """Drop substitutes that share the complex word's lemma or are WordNet antonyms of it."""
    complex_word_lemma = nlp(complex_word)[0].lemma_

    antonyms_complex_word = set()
    for syn in wn.synsets(complex_word_lemma):
        for lemma in syn.lemmas():
            for antonym in lemma.antonyms():
                antonyms_complex_word.add(antonym.name())

    kept = []
    for substitute in substitutes:
        substitute_lemma = nlp(substitute)[0].lemma_  # lemmatized once, used for both checks
        if substitute_lemma == complex_word_lemma:
            continue
        if substitute_lemma in antonyms_complex_word:
            print(f"Removed antonym: {substitute}")
            continue
        kept.append(substitute)
    return kept


def _rank_top_10(scorer, sentence, substitutes, sentence_with_substitutes):
    """Return up to ten substitutes, ordered by descending BERTScore of their sentence."""
    if not sentence_with_substitutes:
        return []
    # same argument order and same (first) score tensor as the former bert_score.score(...)[0] call
    scores = scorer.score([sentence] * len(sentence_with_substitutes), sentence_with_substitutes)[0].tolist()
    ranked = sorted(zip(substitutes, scores), key=lambda pair: pair[1], reverse=True)
    return [substitute for substitute, _ in ranked][:10]


def _combine_rankings(top_5_lists, second_half_lists, limit=10):
    """Merge several rankings of the same row into one list without duplicates.

    Top-5 candidates come first, ordered by the number of lists they occur in (stable,
    so ties keep first-seen order); candidates shared by only some of the lists are kept
    instead of being dropped. The second halves are then interleaved rank by rank,
    without truncating to the shortest list and without repeating a candidate.
    """
    count_dict = {}
    for lst in top_5_lists:
        for elem in lst:
            count_dict[elem] = count_dict.get(elem, 0) + 1
    all_combined = sorted(count_dict, key=count_dict.get, reverse=True)

    for same_rank_items in itertools.zip_longest(*second_half_lists):
        for item in same_rank_items:
            if item is not None and item not in all_combined:
                all_combined.append(item)
    return all_combined[:limit]


# load each BERTScore backbone once instead of once per row
logging.getLogger('transformers').setLevel(logging.ERROR)  # to prevent the same warnings from being printed x times
scorers = {
    'bsBert': bert_score.BERTScorer(model_type='bert-base-uncased', lang="en"),
    'bsElectra': bert_score.BERTScorer(model_type='google/electra-base-generator', lang="en"),
}
logging.getLogger('transformers').setLevel(logging.WARNING) # to reset the logging level back to printing warnings


# iterate over the models
for model in models:
    lm_tokenizer = AutoTokenizer.from_pretrained(model_checkpoints[model])
    lm_model = AutoModelForMaskedLM.from_pretrained(model_checkpoints[model])
    fill_mask = pipeline("fill-mask", lm_model, tokenizer = lm_tokenizer)

    # in each row, for each complex word:
    for _, row in data.iterrows():
        sentence, complex_word = row["sentence"], row["complex_word"]

        # 1. Substitute Generation (SG): perform masking and generate substitutes:

        ## in the sentence, replace the complex word with a masked word
        sentence_masked_word = _replace_target(sentence, complex_word, lm_tokenizer.mask_token)

        if lm_tokenizer.mask_token in sentence_masked_word:
            ## concatenate the original sentence and the masked sentence
            sentences_concat = f"{sentence} {lm_tokenizer.sep_token} {sentence_masked_word}"
            ## generate and rank candidate substitutes for the masked word using the fill_mask pipeline
            result = fill_mask(sentences_concat, top_k=top_k)
        else:
            ## nothing could be masked; keep the row (with empty rankings) so the lists stay aligned with `data`
            print(f"Complex word '{complex_word}' not found in sentence; no substitutes generated for {model}")
            result = []

        #2: Morphological Generation and Context Adaptation (Morphological Adaptation):
        ## a) + b) remove noise and duplicates from the generated substitutes
        substitutes_no_dupl = _clean_candidates(result)

        ## c) + d) remove inflected forms and antonyms of the complex word
        substitutes_no_antonyms = _filter_substitutes(substitutes_no_dupl, complex_word)

        #3: Substitute Selection (SS) by calculating Bert scores:

        ## create sentence with the complex word replaced by the substitutes (same occurrence as the masked one)
        sentence_with_substitutes = [_replace_target(sentence, complex_word, sub) for sub in substitutes_no_antonyms]

        ## rank with each BERTScore backbone; store the top-5 and the remainder of the top-10 separately
        for scorer_name, scorer in scorers.items():
            top_10_substitutes = _rank_top_10(scorer, sentence, substitutes_no_antonyms, sentence_with_substitutes)
            top_5_list, second_half_list = collectors[(model, scorer_name)]
            top_5_list.append(top_10_substitutes[:5])
            second_half_list.append(top_10_substitutes[5:])


top_5_sources = [bsBert_bertbase, bsElectra_bertbase, bsBert_electrabase, bsElectra_electrabase] # try the other way around to see if score increases
second_half_sources = [bsBert_bertbase_2, bsElectra_bertbase_2, bsBert_electrabase_2, bsElectra_electrabase_2] # try the other way around to see if score increases

## walk over the rows: each step yields the four top-5 lists and the four second halves of row i
for i, (all_lists, second_halves) in enumerate(zip(zip(*top_5_sources), zip(*second_half_sources))):

    ## combined list, limited to the top 10 items for evaluation purposes
    SG_MA_SS_bs_4models_top5dup = _combine_rankings(all_lists, second_halves, limit=10)

    ## the dataframe has ten substitute columns; pad shorter lists with empty strings so the row always fits
    padded_substitutes = SG_MA_SS_bs_4models_top5dup + [""] * (10 - len(SG_MA_SS_bs_4models_top5dup))

    ## add the sentence, complex_word, and substitutes to the dataframe
    substitutes_df.loc[i] = [data['sentence'].iloc[i], data['complex_word'].iloc[i]] + padded_substitutes


# export the dataframe to a tsv file for evaluation
substitutes_df.to_csv('./predictions/trial/SG_MA_SS_bs_4models_top5dup_k20.tsv', sep="\t", index=False, header=False)
print("SG_MA_SS_bs_4models_top5dup_k20 exported to csv in path './predictions/trial/SG_MA_SS_bs_4models_top5dup_k20.tsv'\n")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


python tsar_eval.py --gold_file ./data/trial/tsar2022_en_trial_gold_no_noise.tsv --predictions_file ./predictions/trial/SG_MA_SS_bs_4models_top5dup_k20.tsv --output_file ./output/trial/SG_MA_SS_bs_4models_top5dup_k20.tsv

### Top 5 of bertbase bsBert and electrabase bsElectra (2 models):

In [None]:
## Ensemble of two rankings: bertbase substitutes ranked with the BERT-based BERTScore and
## electrabase substitutes ranked with the ELECTRA-based BERTScore.
## Every (model, row) combination appends exactly one entry to its result lists, so list
## position i always corresponds to row i of `data`.
import itertools
import re

# create empty lists, to use in step 3 Substitute Selection
bsBert_bertbase = []
bsElectra_electrabase = []

bsBert_bertbase_2 = []
bsElectra_electrabase_2 = []

# number of candidates requested from the fill-mask pipeline
top_k = 20

# characters that disqualify a candidate; hyphens are retained in case tokenizers don't split on hyphenated compounds,
# and curly quotes are added as they appeared in the Electra (SG step) results but are not part of string.punctuation
punctuation_set = set(string.punctuation) - set('-')
punctuation_set.update({'“','”'})

# per model: checkpoint (used for both generation and BERTScore), top-5 list, second-half list
model_setup = {
    'bertbase': ("bert-base-uncased", bsBert_bertbase, bsBert_bertbase_2),
    'electrabase': ("google/electra-base-generator", bsElectra_electrabase, bsElectra_electrabase_2),
}


def _replace_target(sentence, complex_word, replacement):
    """Replace one occurrence of `complex_word` in `sentence` with `replacement`.

    The first whole-word occurrence is preferred; if there is none, the first plain
    substring occurrence is used. Replacing a single occurrence guarantees at most one
    mask token, so the fill-mask pipeline returns a flat list of candidates.
    """
    pattern = r"(?<!\w)" + re.escape(complex_word) + r"(?!\w)"
    # a function replacement keeps backslashes in `replacement` literal
    replaced, n_replaced = re.subn(pattern, lambda _match: replacement, sentence, count=1)
    if n_replaced == 0:
        replaced = sentence.replace(complex_word, replacement, 1)
    return replaced


def _clean_candidates(result):
    """Return the usable candidates of a fill-mask result: lowercased, stripped, without duplicates.

    Entries without a string `token_str`, word pieces starting with '##', candidates
    containing unwanted punctuation and empty candidates are ignored. Of two candidates
    that are equal after lowercasing, the higher-ranked (first) one is kept.
    """
    cleaned = []
    for candidate in result:
        token = candidate.get("token_str") if isinstance(candidate, dict) else None
        if not isinstance(token, str):
            continue
        if token.startswith('##') or any(char in punctuation_set for char in token):
            continue
        token = token.lower().strip()  # .strip as roberta uses a leading space before each substitute
        if token and token not in cleaned:
            cleaned.append(token)
    return cleaned


def _filter_substitutes(substitutes, complex_word):
    """Drop substitutes that share the complex word's lemma or are WordNet antonyms of it."""
    complex_word_lemma = nlp(complex_word)[0].lemma_

    antonyms_complex_word = set()
    for syn in wn.synsets(complex_word_lemma):
        for lemma in syn.lemmas():
            for antonym in lemma.antonyms():
                antonyms_complex_word.add(antonym.name())

    kept = []
    for substitute in substitutes:
        substitute_lemma = nlp(substitute)[0].lemma_  # lemmatized once, used for both checks
        if substitute_lemma == complex_word_lemma:
            continue
        if substitute_lemma in antonyms_complex_word:
            print(f"Removed antonym: {substitute}")
            continue
        kept.append(substitute)
    return kept


def _rank_top_10(scorer, sentence, substitutes, sentence_with_substitutes):
    """Return up to ten substitutes, ordered by descending BERTScore of their sentence."""
    if not sentence_with_substitutes:
        return []
    # same argument order and same (first) score tensor as the former bert_score.score(...)[0] call
    scores = scorer.score([sentence] * len(sentence_with_substitutes), sentence_with_substitutes)[0].tolist()
    ranked = sorted(zip(substitutes, scores), key=lambda pair: pair[1], reverse=True)
    return [substitute for substitute, _ in ranked][:10]


def _combine_rankings(top_5_lists, second_half_lists, limit=10):
    """Merge several rankings of the same row into one list without duplicates.

    Top-5 candidates come first, ordered by the number of lists they occur in (stable,
    so ties keep first-seen order). The second halves are then interleaved rank by rank,
    without truncating to the shortest list and without repeating a candidate.
    """
    count_dict = {}
    for lst in top_5_lists:
        for elem in lst:
            count_dict[elem] = count_dict.get(elem, 0) + 1
    all_combined = sorted(count_dict, key=count_dict.get, reverse=True)

    for same_rank_items in itertools.zip_longest(*second_half_lists):
        for item in same_rank_items:
            if item is not None and item not in all_combined:
                all_combined.append(item)
    return all_combined[:limit]


# iterate over the models
for model in models:
    checkpoint, top_5_list, second_half_list = model_setup[model]

    lm_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    lm_model = AutoModelForMaskedLM.from_pretrained(checkpoint)
    fill_mask = pipeline("fill-mask", lm_model, tokenizer = lm_tokenizer)

    # only the BERTScore backbone matching the model is needed here; load it once per model instead of once per row
    logging.getLogger('transformers').setLevel(logging.ERROR)  # to prevent the same warnings from being printed x times
    scorer = bert_score.BERTScorer(model_type=checkpoint, lang="en")
    logging.getLogger('transformers').setLevel(logging.WARNING) # to reset the logging level back to printing warnings

    # in each row, for each complex word:
    for _, row in data.iterrows():
        sentence, complex_word = row["sentence"], row["complex_word"]

        # 1. Substitute Generation (SG): perform masking and generate substitutes:

        ## in the sentence, replace the complex word with a masked word
        sentence_masked_word = _replace_target(sentence, complex_word, lm_tokenizer.mask_token)

        if lm_tokenizer.mask_token in sentence_masked_word:
            ## concatenate the original sentence and the masked sentence
            sentences_concat = f"{sentence} {lm_tokenizer.sep_token} {sentence_masked_word}"
            ## generate and rank candidate substitutes for the masked word using the fill_mask pipeline
            result = fill_mask(sentences_concat, top_k=top_k)
        else:
            ## nothing could be masked; keep the row (with empty rankings) so the lists stay aligned with `data`
            print(f"Complex word '{complex_word}' not found in sentence; no substitutes generated for {model}")
            result = []

        #2: Morphological Generation and Context Adaptation (Morphological Adaptation):
        ## a) + b) remove noise and duplicates from the generated substitutes
        substitutes_no_dupl = _clean_candidates(result)

        ## c) + d) remove inflected forms and antonyms of the complex word
        substitutes_no_antonyms = _filter_substitutes(substitutes_no_dupl, complex_word)

        #3: Substitute Selection (SS) by calculating Bert scores:

        ## create sentence with the complex word replaced by the substitutes (same occurrence as the masked one)
        sentence_with_substitutes = [_replace_target(sentence, complex_word, sub) for sub in substitutes_no_antonyms]

        ## rank the substitutes; store the top-5 and the remainder of the top-10 separately
        top_10_substitutes = _rank_top_10(scorer, sentence, substitutes_no_antonyms, sentence_with_substitutes)
        top_5_list.append(top_10_substitutes[:5])
        second_half_list.append(top_10_substitutes[5:])


top_5_sources = [bsBert_bertbase, bsElectra_electrabase] # try the other way around to see if score increases
second_half_sources = [bsBert_bertbase_2, bsElectra_electrabase_2] # try the other way around to see if score increases

## walk over the rows: each step yields the two top-5 lists and the two second halves of row i
for i, (all_lists, second_halves) in enumerate(zip(zip(*top_5_sources), zip(*second_half_sources))):

    ## combined list, limited to the top 10 items for evaluation purposes
    SG_MA_SS_bs_2models_top5dup = _combine_rankings(all_lists, second_halves, limit=10)

    ## the dataframe has ten substitute columns; pad shorter lists with empty strings so the row always fits
    padded_substitutes = SG_MA_SS_bs_2models_top5dup + [""] * (10 - len(SG_MA_SS_bs_2models_top5dup))

    ## add the sentence, complex_word, and substitutes to the dataframe
    substitutes_df.loc[i] = [data['sentence'].iloc[i], data['complex_word'].iloc[i]] + padded_substitutes


# export the dataframe to a tsv file for evaluation
substitutes_df.to_csv('./predictions/trial/SG_MA_SS_bs_2models_top5dup_k20.tsv', sep="\t", index=False, header=False)
print("SG_MA_SS_bs_2models_top5dup_k20 exported to csv in path './predictions/trial/SG_MA_SS_bs_2models_top5dup_k20.tsv'\n")

python tsar_eval.py --gold_file ./data/trial/tsar2022_en_trial_gold_no_noise.tsv --predictions_file ./predictions/trial/SG_MA_SS_bs_2models_top5dup_k20.tsv --output_file ./output/trial/SG_MA_SS_bs_2models_top5dup_k20.tsv