### Cosine similarity - semantics

#### importing, loading and reading files

In [1]:
import torch
from transformers import RobertaTokenizer, RobertaForMaskedLM
import pandas as pd
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer # however, does not work for verbs
import spacy

import numpy as np
import gensim.downloader as api

In [2]:
# Input data: a headerless TSV with one sentence and its complex word per row.
filename = "./data/trial/tsar2022_en_trial_none.tsv"
column_names = ["sentence", "complex_word"]
data = pd.read_csv(filename, sep='\t', header=None, names=column_names)


# SG step: masked-LM head and tokenizer, both taken from the same RoBERTa checkpoint
# (roberta-base, because roberta-large did not fit in memory).
roberta_checkpoint = 'roberta-base'
model = RobertaForMaskedLM.from_pretrained(roberta_checkpoint)
tokenizer = RobertaTokenizer.from_pretrained(roberta_checkpoint)

# SS step: spaCy pipeline, used for lemmatizing because the WordNet lemmatizer did not work for verbs.
nlp = spacy.load("en_core_web_sm")

# SS step: GloVe word vectors used for the similarity ranking (loading can take some minutes).
glove = api.load("glove-wiki-gigaword-100")




#### word embeddings with glove:

In [3]:
# Lexical simplification pipeline, GloVe ranking + WordNet lemmatizer.
# Caveat: lemmatize() is called without a part-of-speech argument here; with that call the
# lemmatizer was observed not to reduce verb inflections.

top_k = 30  # number of substitute candidates taken from the masked-LM predictions
lemmatizer = WordNetLemmatizer()  # does not depend on the row, so build it once

# in each row, for each complex word:
for index, row in data.iterrows():

    # 1. Substitute Generation (SG): perform masking and generate substitutes:

    ## print the sentence and the complex word
    sentence, complex_word = row["sentence"], row["complex_word"]
    print(f"Sentence: {sentence}")
    print(f"Complex word: {complex_word}")

    ## rows whose complex word is missing or does not occur in the sentence cannot be masked
    if not isinstance(complex_word, str) or not complex_word or complex_word not in sentence:
        print(f"Complex word '{complex_word}' not found in sentence; row skipped\n")
        continue
    complex_word_lower = complex_word.lower()  # substitutes are lowercased, so compare in lowercase

    ## mask only the first occurrence: the lookup below needs exactly one mask token
    sentence_masked_word = sentence.replace(complex_word, tokenizer.mask_token, 1)

    ## join the original and the masked sentence with RoBERTa's separator token into one string
    sentences_concat = f"{sentence} {tokenizer.sep_token} {sentence_masked_word}"

    ## tokenize the concatenated sentence
    sentences_concat_tokenized = tokenizer.encode(sentences_concat, return_tensors='pt')

    ## find the position of the mask token in the tokenized input
    mask_location = torch.where(sentences_concat_tokenized == tokenizer.mask_token_id)[1].item()

    ## forward pass for the predictions; gradients are only needed for training, so disable them
    with torch.no_grad():
        outputs = model(sentences_concat_tokenized)
        predictions = outputs.logits

    ## get the top-k substitutes based on the predicted logits
    top_tokens = torch.topk(predictions[0, mask_location], top_k).indices

    ## decode the top-k substitutes, lowercase and print them
    substitutes = [tokenizer.decode(token.item()).strip().lower() for token in top_tokens]
    print(f"SG step: generated substitutes: {substitutes}\n")

    # 2. Substitute Selection (SS):

    # a) remove duplicates (keeping the first occurrence) and tokens that decode to an empty string
    substitutes_no_dupl = list(dict.fromkeys(sub for sub in substitutes if sub))
    print(f"SS step: a) substitute list without duplicates: {substitutes_no_dupl}\n")

    # b) remove the complex word and its inflected forms from the substitute list

    ## lemma of the complex word; substitutes with the same lemma are dropped
    complex_word_lemma = lemmatizer.lemmatize(complex_word_lower)
    print(f"Complex_word_lemma: {complex_word_lemma}\n")

    substitutes_no_dupl_complex_word = [
        substitute
        for substitute in substitutes_no_dupl
        if lemmatizer.lemmatize(substitute) != complex_word_lemma
    ]
    print(f"SS step: b) substitute list without duplicates and inflected forms of the complex word: {substitutes_no_dupl_complex_word}\n")

    # c) remove antonyms of the complex word from the substitute list

    ## collect the WordNet antonyms of every lemma in every synset of the complex word
    complex_word_antonyms = {
        antonym.name().lower()
        for synset in wn.synsets(complex_word_lower)
        for lemma in synset.lemmas()
        for antonym in lemma.antonyms()
    }
    substitutes_no_dupl_complex_word_no_antonym = []
    for substitute in substitutes_no_dupl_complex_word:
        if substitute in complex_word_antonyms:
            print(f"Antonym removed: {substitute}")
        else:
            substitutes_no_dupl_complex_word_no_antonym.append(substitute)
    print(f"SS step: c): substitute list without antonyms of the complex word: {substitutes_no_dupl_complex_word_no_antonym}\n")

    # d) rank substitutes_no_dupl_complex_word_no_antonym on semantic similarity with the complex word

    ## glove.similarity() raises KeyError for unknown words, so both words must be in the vocabulary
    complex_word_in_glove = complex_word_lower in glove.key_to_index
    if not complex_word_in_glove:
        print(f"Complex word not found in embeddings (substitutes keep their current order): {complex_word}")

    similarities = []
    for substitute in substitutes_no_dupl_complex_word_no_antonym:
        if complex_word_in_glove and substitute in glove.key_to_index:
            similarity = glove.similarity(complex_word_lower, substitute)
        else:
            if complex_word_in_glove:
                print(f"Word not found in embeddings (will be put at the end of the list): {substitute}")
            similarity = -1.0  # negative sentinel so the word ends up at the very end of the list
        similarities.append(similarity)

    ## sort on the similarity only; the sort is stable, so ties keep the order from the model
    ranked_pairs = sorted(
        zip(similarities, substitutes_no_dupl_complex_word_no_antonym),
        key=lambda pair: pair[0],
        reverse=True,
    )
    substitutes_ranked = [substitute for _, substitute in ranked_pairs]
    print(f"SS step: d) substitute list ranked based on semantic similarity: {substitutes_ranked}\n")
    print('----------------------------------------------------------------------------------------------------------------------')
    print()

     

Sentence: A Spanish government source, however, later said that banks able to cover by themselves losses on their toxic property assets will not be forced to remove them from their books while it will be compulsory for those receiving public help.
Complex word: compulsory
SG step: generated substitutes: ['compulsory', 'mandatory', 'obligatory', 'voluntary', 'required', 'optional', 'obliged', 'uniform', 'necessary', 'available', 'mandated', 'sufficient', 'routine', 'forced', 'customary', 'prerequisite', 'feasible', 'indispensable', 'forthcoming', 'universal', 'requirement', 'involuntary', 'obligated', 'compelled', 'conditional', 'enforced', 'contingent', 'possible', 'compulsion', 'mandatory']

SS step: a) substitute list without duplicates: ['compulsory', 'mandatory', 'obligatory', 'voluntary', 'required', 'optional', 'obliged', 'uniform', 'necessary', 'available', 'mandated', 'sufficient', 'routine', 'forced', 'customary', 'prerequisite', 'feasible', 'indispensable', 'forthcoming', 'un

In [4]:
# Experiment: remove substitutes whose WordNet part of speech does not match the complex word.
# Lemmas come from spaCy, because the WordNet lemmatizer did not work for verbs.
# The part of speech is taken from the first WordNet synset of each lemma.

top_k = 30  # number of substitute candidates taken from the masked-LM predictions

# in each row, for each complex word:
for index, row in data.iterrows():

    # 1. Substitute Generation (SG): perform masking and generate substitutes:

    ## print the sentence and the complex word
    sentence, complex_word = row["sentence"], row["complex_word"]
    print(f"Sentence: {sentence}")
    print(f"Complex word: {complex_word}")

    ## rows whose complex word is missing or does not occur in the sentence cannot be masked
    if not isinstance(complex_word, str) or not complex_word or complex_word not in sentence:
        print(f"Complex word '{complex_word}' not found in sentence; row skipped\n")
        continue
    complex_word_lower = complex_word.lower()  # substitutes are lowercased, so compare in lowercase

    ## mask only the first occurrence: the lookup below needs exactly one mask token
    sentence_masked_word = sentence.replace(complex_word, tokenizer.mask_token, 1)

    ## join the original and the masked sentence with RoBERTa's separator token into one string
    sentences_concat = f"{sentence} {tokenizer.sep_token} {sentence_masked_word}"

    ## tokenize the concatenated sentence
    sentences_concat_tokenized = tokenizer.encode(sentences_concat, return_tensors='pt')

    ## find the position of the mask token in the tokenized input
    mask_location = torch.where(sentences_concat_tokenized == tokenizer.mask_token_id)[1].item()

    ## forward pass for the predictions; gradients are only needed for training, so disable them
    with torch.no_grad():
        outputs = model(sentences_concat_tokenized)
        predictions = outputs.logits

    ## get the top-k substitutes based on the predicted logits
    top_tokens = torch.topk(predictions[0, mask_location], top_k).indices

    ## decode the top-k substitutes, lowercase and print them
    substitutes = [tokenizer.decode(token.item()).strip().lower() for token in top_tokens]
    print(f"SG step: generated substitutes: {substitutes}\n")

    # 2. Substitute Selection (SS):

    # a) remove duplicates (keeping the first occurrence) and tokens that decode to an empty string;
    #    an empty string would give an empty spaCy doc and make doc[0] fail below
    substitutes_no_dupl = list(dict.fromkeys(sub for sub in substitutes if sub))
    print(f"SS step: a) substitute list without duplicates: {substitutes_no_dupl}\n")

    # b) remove the complex word and its inflected forms from the substitute list
    complex_word_doc = nlp(complex_word)
    complex_word_lemma = complex_word_doc[0].lemma_.lower()
    print(f"Complex_word_lemma: {complex_word_lemma}\n")

    ## lemmatize every substitute once; steps b) and d) both need the lemma
    substitute_lemmas = {substitute: nlp(substitute)[0].lemma_ for substitute in substitutes_no_dupl}

    substitutes_no_dupl_complex_word = [
        substitute
        for substitute in substitutes_no_dupl
        if substitute_lemmas[substitute] != complex_word_lemma
    ]
    print(f"SS step: b) substitute list without duplicates and inflected forms of the complex word: {substitutes_no_dupl_complex_word}\n")

    # c) remove antonyms of the complex word from the substitute list

    ## collect the WordNet antonyms of every lemma in every synset of the complex word
    complex_word_antonyms = {
        antonym.name().lower()
        for synset in wn.synsets(complex_word_lower)
        for lemma in synset.lemmas()
        for antonym in lemma.antonyms()
    }
    substitutes_no_dupl_complex_word_no_antonym = []
    for substitute in substitutes_no_dupl_complex_word:
        if substitute in complex_word_antonyms:
            print(f"Antonym removed: {substitute}")
        else:
            substitutes_no_dupl_complex_word_no_antonym.append(substitute)
    print(f"SS step: c): substitute list without antonyms of the complex word: {substitutes_no_dupl_complex_word_no_antonym}\n")

    # d) remove substitutes whose WordNet part of speech does not fit the complex word:
    #    - complex word is a verb ('v') or noun ('n'): keep only substitutes with the same tag
    #    - complex word is an adjective ('a' or 's'): drop only nouns, which keeps e.g. verb forms
    #      like "required" as substitutes for "compulsory"
    #    - any other tag, or no synset for either word: keep the substitute
    substitutes_no_dupl_complex_word_no_antonym_no_matchpos = []
    complex_word_synsets = wn.synsets(complex_word_lemma)
    print(f"complex_word_lemma for complex word '{complex_word}': {complex_word_lemma}")
    if complex_word_synsets:
        complex_word_pos = complex_word_synsets[0].pos()
        print (f"complex_word_pos: {complex_word_pos}")
        for substitute in substitutes_no_dupl_complex_word_no_antonym:
            substitute_lemma = substitute_lemmas[substitute]
            print(f"substitute_lemma for substitute '{substitute}': {substitute_lemma}")
            substitute_synsets = wn.synsets(substitute_lemma)
            if not substitute_synsets:
                # no part of speech available, so there is nothing to compare
                substitutes_no_dupl_complex_word_no_antonym_no_matchpos.append(substitute)
                continue
            substitute_pos = substitute_synsets[0].pos()
            print(f"substitute_pos: {substitute_pos}")
            pos_mismatch = (
                (complex_word_pos in ('v', 'n') and substitute_pos != complex_word_pos)
                or (complex_word_pos in ('a', 's') and substitute_pos == 'n')
            )
            if pos_mismatch:
                print(f"Non-matching pos removed: {substitute}")
            else:
                substitutes_no_dupl_complex_word_no_antonym_no_matchpos.append(substitute)
    else:
        # copy instead of aliasing, so both names never share one list object
        substitutes_no_dupl_complex_word_no_antonym_no_matchpos = list(substitutes_no_dupl_complex_word_no_antonym)
    print(f"SS step: d) substitute list without non-matching pos tags: {substitutes_no_dupl_complex_word_no_antonym_no_matchpos}\n")

    # e) rank substitutes_no_dupl_complex_word_no_antonym_no_matchpos on semantic similarity

    ## glove.similarity() raises KeyError for unknown words, so both words must be in the vocabulary
    complex_word_in_glove = complex_word_lower in glove.key_to_index
    if not complex_word_in_glove:
        print(f"Complex word not found in embeddings (substitutes keep their current order): {complex_word}")

    similarities = []
    for substitute in substitutes_no_dupl_complex_word_no_antonym_no_matchpos:
        if complex_word_in_glove and substitute in glove.key_to_index:
            similarity = glove.similarity(complex_word_lower, substitute)
        else:
            if complex_word_in_glove:
                print(f"Word not found in embeddings (will be put at the end of the list): {substitute}")
            similarity = -1.0  # negative sentinel so the word ends up at the very end of the list
        similarities.append(similarity)

    ## sort on the similarity only; the sort is stable, so ties keep the order from the model
    ranked_pairs = sorted(
        zip(similarities, substitutes_no_dupl_complex_word_no_antonym_no_matchpos),
        key=lambda pair: pair[0],
        reverse=True,
    )
    substitutes_ranked = [substitute for _, substitute in ranked_pairs]
    print(f"SS step: e) substitute list ranked based on semantic similarity: {substitutes_ranked}\n")
    print('----------------------------------------------------------------------------------------------------------------------')
    print()

Sentence: A Spanish government source, however, later said that banks able to cover by themselves losses on their toxic property assets will not be forced to remove them from their books while it will be compulsory for those receiving public help.
Complex word: compulsory
SG step: generated substitutes: ['compulsory', 'mandatory', 'obligatory', 'voluntary', 'required', 'optional', 'obliged', 'uniform', 'necessary', 'available', 'mandated', 'sufficient', 'routine', 'forced', 'customary', 'prerequisite', 'feasible', 'indispensable', 'forthcoming', 'universal', 'requirement', 'involuntary', 'obligated', 'compelled', 'conditional', 'enforced', 'contingent', 'possible', 'compulsion', 'mandatory']

SS step: a) substitute list without duplicates: ['compulsory', 'mandatory', 'obligatory', 'voluntary', 'required', 'optional', 'obliged', 'uniform', 'necessary', 'available', 'mandated', 'sufficient', 'routine', 'forced', 'customary', 'prerequisite', 'feasible', 'indispensable', 'forthcoming', 'un

#### Results:
SS step d): both lemmatizing and POS tagging with WordNet are unsatisfactory: the WordNet lemmatizer does not lemmatize verbs, and the POS tagging is often wrong. Moved over to spaCy's POS tagger, which does not rely on WordNet.

In [5]:
# Experiment: use spaCy for both lemmatizing and part-of-speech tagging.
# Each word is tagged on its own, without sentence context.

top_k = 30  # number of substitute candidates taken from the masked-LM predictions

# in each row, for each complex word:
for index, row in data.iterrows():

    # 1. Substitute Generation (SG): perform masking and generate substitutes:

    ## print the sentence and the complex word
    sentence, complex_word = row["sentence"], row["complex_word"]
    print(f"Sentence: {sentence}")
    print(f"Complex word: {complex_word}")

    ## rows whose complex word is missing or does not occur in the sentence cannot be masked
    if not isinstance(complex_word, str) or not complex_word or complex_word not in sentence:
        print(f"Complex word '{complex_word}' not found in sentence; row skipped\n")
        continue
    complex_word_lower = complex_word.lower()  # substitutes are lowercased, so compare in lowercase

    ## mask only the first occurrence: the lookup below needs exactly one mask token
    sentence_masked_word = sentence.replace(complex_word, tokenizer.mask_token, 1)

    ## join the original and the masked sentence with RoBERTa's separator token into one string
    sentences_concat = f"{sentence} {tokenizer.sep_token} {sentence_masked_word}"

    ## tokenize the concatenated sentence
    sentences_concat_tokenized = tokenizer.encode(sentences_concat, return_tensors='pt')

    ## find the position of the mask token in the tokenized input
    mask_location = torch.where(sentences_concat_tokenized == tokenizer.mask_token_id)[1].item()

    ## forward pass for the predictions; gradients are only needed for training, so disable them
    with torch.no_grad():
        outputs = model(sentences_concat_tokenized)
        predictions = outputs.logits

    ## get the top-k substitutes based on the predicted logits
    top_tokens = torch.topk(predictions[0, mask_location], top_k).indices

    ## decode the top-k substitutes, lowercase and print them
    substitutes = [tokenizer.decode(token.item()).strip().lower() for token in top_tokens]
    print(f"SG step: generated substitutes: {substitutes}\n")

    # 2. Substitute Selection (SS):

    # a) remove duplicates (keeping the first occurrence) and tokens that decode to an empty string;
    #    an empty string would give an empty spaCy doc and make doc[0] fail below
    substitutes_no_dupl = list(dict.fromkeys(sub for sub in substitutes if sub))
    print(f"SS step: a) substitute list without duplicates: {substitutes_no_dupl}\n")

    # b) remove the complex word and its inflected forms from the substitute list
    ## the complex word gets its own doc variable: step d) still needs its POS tag after the
    ## substitutes have been parsed, so it must not be overwritten in the loop below
    complex_word_doc = nlp(complex_word)
    complex_word_lemma = complex_word_doc[0].lemma_.lower()
    print(f"complex_word_lemma for complex word '{complex_word}': {complex_word_lemma}")

    ## parse every substitute once; steps b) and d) both use the result
    substitute_docs = {substitute: nlp(substitute) for substitute in substitutes_no_dupl}

    substitutes_no_dupl_complex_word = [
        substitute
        for substitute in substitutes_no_dupl
        if substitute_docs[substitute][0].lemma_ != complex_word_lemma
    ]
    print(f"SS step: b) substitute list without duplicates and inflected forms of the complex word: {substitutes_no_dupl_complex_word}\n")

    # c) remove antonyms of the complex word from the substitute list

    ## collect the WordNet antonyms of every lemma in every synset of the complex word
    complex_word_antonyms = {
        antonym.name().lower()
        for synset in wn.synsets(complex_word_lower)
        for lemma in synset.lemmas()
        for antonym in lemma.antonyms()
    }
    substitutes_no_dupl_complex_word_no_antonym = []
    for substitute in substitutes_no_dupl_complex_word:
        if substitute in complex_word_antonyms:
            print(f"Antonym removed: {substitute}")
        else:
            substitutes_no_dupl_complex_word_no_antonym.append(substitute)
    print(f"SS step: c): substitute list without antonyms of the complex word: {substitutes_no_dupl_complex_word_no_antonym}\n")

    # d) in the last substitute list, remove selected non-matching pos tags
    substitutes_no_dupl_complex_word_no_antonym_no_matchpos = []
    complex_word_pos = complex_word_doc[0].pos_
    print(f"complex_word_pos: {complex_word_pos}")
    for substitute in substitutes_no_dupl_complex_word_no_antonym:
        substitute_token = substitute_docs[substitute][0]
        substitute_lemma = substitute_token.lemma_
        print(f"substitute_lemma for substitute '{substitute}': {substitute_lemma}")
        substitute_pos = substitute_token.pos_
        print(f"substitute_pos: {substitute_pos}")

        # NOTE(review): adpositions and proper nouns are dropped whatever the complex word's tag is.
        # This matches how Python evaluated the earlier unparenthesised `and ... or ... or` test;
        # confirm this is intended rather than "only when the complex word is a verb".
        always_removed = substitute_pos in ('ADP', 'PROPN')
        noun_for_verb = complex_word_pos == 'VERB' and substitute_pos == 'NOUN'
        verb_for_non_verb = complex_word_pos in ('NOUN', 'ADJ', 'ADV') and substitute_pos == 'VERB'

        if always_removed or noun_for_verb or verb_for_non_verb:
            print(f"Non-matching pos removed: {substitute}")
        else:
            substitutes_no_dupl_complex_word_no_antonym_no_matchpos.append(substitute)
    print(f"SS step: d) substitute list without non-matching pos tags: {substitutes_no_dupl_complex_word_no_antonym_no_matchpos}\n")

    # e) rank substitutes_no_dupl_complex_word_no_antonym_no_matchpos on semantic similarity

    ## glove.similarity() raises KeyError for unknown words, so both words must be in the vocabulary
    complex_word_in_glove = complex_word_lower in glove.key_to_index
    if not complex_word_in_glove:
        print(f"Complex word not found in embeddings (substitutes keep their current order): {complex_word}")

    similarities = []
    for substitute in substitutes_no_dupl_complex_word_no_antonym_no_matchpos:
        if complex_word_in_glove and substitute in glove.key_to_index:
            similarity = glove.similarity(complex_word_lower, substitute)
        else:
            if complex_word_in_glove:
                print(f"Word not found in embeddings (will be put at the end of the list): {substitute}")
            similarity = -1.0  # negative sentinel so the word ends up at the very end of the list
        similarities.append(similarity)

    ## sort on the similarity only; the sort is stable, so ties keep the order from the model
    ranked_pairs = sorted(
        zip(similarities, substitutes_no_dupl_complex_word_no_antonym_no_matchpos),
        key=lambda pair: pair[0],
        reverse=True,
    )
    substitutes_ranked = [substitute for _, substitute in ranked_pairs]
    print(f"SS step: e) substitute list ranked based on semantic similarity: {substitutes_ranked}\n")
    print('----------------------------------------------------------------------------------------------------------------------')
    print()

Sentence: A Spanish government source, however, later said that banks able to cover by themselves losses on their toxic property assets will not be forced to remove them from their books while it will be compulsory for those receiving public help.
Complex word: compulsory
SG step: generated substitutes: ['compulsory', 'mandatory', 'obligatory', 'voluntary', 'required', 'optional', 'obliged', 'uniform', 'necessary', 'available', 'mandated', 'sufficient', 'routine', 'forced', 'customary', 'prerequisite', 'feasible', 'indispensable', 'forthcoming', 'universal', 'requirement', 'involuntary', 'obligated', 'compelled', 'conditional', 'enforced', 'contingent', 'possible', 'compulsion', 'mandatory']

SS step: a) substitute list without duplicates: ['compulsory', 'mandatory', 'obligatory', 'voluntary', 'required', 'optional', 'obliged', 'uniform', 'necessary', 'available', 'mandated', 'sufficient', 'routine', 'forced', 'customary', 'prerequisite', 'feasible', 'indispensable', 'forthcoming', 'un

#### Result: spaCy's POS tagger does not work well either. Will continue without POS tagging, so step d) was removed and replaced with the previous step e).

In [6]:
# Pipeline without POS filtering: SG with RoBERTa, then SS steps a)-d), with spaCy lemmas and
# GloVe similarity ranking.

top_k = 30  # number of substitute candidates taken from the masked-LM predictions

# in each row, for each complex word:
for index, row in data.iterrows():

    # 1. Substitute Generation (SG): perform masking and generate substitutes:

    ## print the sentence and the complex word
    sentence, complex_word = row["sentence"], row["complex_word"]
    print(f"Sentence: {sentence}")
    print(f"Complex word: {complex_word}")

    ## rows whose complex word is missing or does not occur in the sentence cannot be masked
    if not isinstance(complex_word, str) or not complex_word or complex_word not in sentence:
        print(f"Complex word '{complex_word}' not found in sentence; row skipped\n")
        continue
    complex_word_lower = complex_word.lower()  # substitutes are lowercased, so compare in lowercase

    ## mask only the first occurrence: the lookup below needs exactly one mask token
    sentence_masked_word = sentence.replace(complex_word, tokenizer.mask_token, 1)

    ## join the original and the masked sentence with RoBERTa's separator token into one string
    sentences_concat = f"{sentence} {tokenizer.sep_token} {sentence_masked_word}"

    ## tokenize the concatenated sentence
    sentences_concat_tokenized = tokenizer.encode(sentences_concat, return_tensors='pt')

    ## find the position of the mask token in the tokenized input
    mask_location = torch.where(sentences_concat_tokenized == tokenizer.mask_token_id)[1].item()

    ## forward pass for the predictions; gradients are only needed for training, so disable them
    with torch.no_grad():
        outputs = model(sentences_concat_tokenized)
        predictions = outputs.logits

    ## get the top-k substitutes based on the predicted logits
    top_tokens = torch.topk(predictions[0, mask_location], top_k).indices

    ## decode the top-k substitutes, lowercase and print them
    substitutes = [tokenizer.decode(token.item()).strip().lower() for token in top_tokens]
    print(f"SG step: generated substitutes: {substitutes}\n")

    # 2. Substitute Selection (SS):

    # a) remove duplicates (keeping the first occurrence) and tokens that decode to an empty string;
    #    an empty string would give an empty spaCy doc and make doc[0] fail below
    substitutes_no_dupl = list(dict.fromkeys(sub for sub in substitutes if sub))
    print(f"SS step: a) substitute list without duplicates: {substitutes_no_dupl}\n")

    # b) remove the complex word and its inflected forms from the substitute list
    ## lemma of the complex word (spaCy); substitutes with the same lemma are dropped
    complex_word_doc = nlp(complex_word)
    complex_word_lemma = complex_word_doc[0].lemma_.lower()
    print(f"complex_word_lemma for complex word '{complex_word}': {complex_word_lemma}")

    substitutes_no_dupl_complex_word = [
        substitute
        for substitute in substitutes_no_dupl
        if nlp(substitute)[0].lemma_ != complex_word_lemma
    ]
    print(f"SS step: b) substitute list without duplicates and inflected forms of the complex word: {substitutes_no_dupl_complex_word}\n")

    # c) remove antonyms of the complex word from the substitute list

    ## collect the WordNet antonyms of every lemma in every synset of the complex word
    complex_word_antonyms = {
        antonym.name().lower()
        for synset in wn.synsets(complex_word_lower)
        for lemma in synset.lemmas()
        for antonym in lemma.antonyms()
    }
    substitutes_no_dupl_complex_word_no_antonym = []
    for substitute in substitutes_no_dupl_complex_word:
        if substitute in complex_word_antonyms:
            print(f"Antonym removed: {substitute}")
        else:
            substitutes_no_dupl_complex_word_no_antonym.append(substitute)
    print(f"SS step: c): substitute list without antonyms of the complex word: {substitutes_no_dupl_complex_word_no_antonym}\n")

    # d) rank substitutes_no_dupl_complex_word_no_antonym on semantic similarity with the complex word

    ## glove.similarity() raises KeyError for unknown words, so both words must be in the vocabulary
    complex_word_in_glove = complex_word_lower in glove.key_to_index
    if not complex_word_in_glove:
        print(f"Complex word not found in embeddings (substitutes keep their current order): {complex_word}")

    similarities = []
    for substitute in substitutes_no_dupl_complex_word_no_antonym:
        if complex_word_in_glove and substitute in glove.key_to_index:
            similarity = glove.similarity(complex_word_lower, substitute)
        else:
            if complex_word_in_glove:
                print(f"Word not found in embeddings (will be put at the end of the list): {substitute}")
            similarity = -1.0  # negative sentinel so the word ends up at the very end of the list
        similarities.append(similarity)

    ## sort on the similarity only; the sort is stable, so ties keep the order from the model
    ranked_pairs = sorted(
        zip(similarities, substitutes_no_dupl_complex_word_no_antonym),
        key=lambda pair: pair[0],
        reverse=True,
    )
    substitutes_ranked = [substitute for _, substitute in ranked_pairs]
    ## this is step d) in this cell: the POS filter of the earlier experiments was dropped
    print(f"SS step: d) substitute list ranked with glove on semantic similarity: {substitutes_ranked}\n")
    print('----------------------------------------------------------------------------------------------------------------------')
    print()

Sentence: A Spanish government source, however, later said that banks able to cover by themselves losses on their toxic property assets will not be forced to remove them from their books while it will be compulsory for those receiving public help.
Complex word: compulsory
SG step: generated substitutes: ['compulsory', 'mandatory', 'obligatory', 'voluntary', 'required', 'optional', 'obliged', 'uniform', 'necessary', 'available', 'mandated', 'sufficient', 'routine', 'forced', 'customary', 'prerequisite', 'feasible', 'indispensable', 'forthcoming', 'universal', 'requirement', 'involuntary', 'obligated', 'compelled', 'conditional', 'enforced', 'contingent', 'possible', 'compulsion', 'mandatory']

SS step: a) substitute list without duplicates: ['compulsory', 'mandatory', 'obligatory', 'voluntary', 'required', 'optional', 'obliged', 'uniform', 'necessary', 'available', 'mandated', 'sufficient', 'routine', 'forced', 'customary', 'prerequisite', 'feasible', 'indispensable', 'forthcoming', 'un