### All code below uses concatenated sentence pairs in the Substitute Generation step in order to generate similar substitutes (as opposed to generation of fitting substitutes only)

In [1]:
import re

import numpy as np
import pandas as pd
import tensorflow as tf
from fitbert import FitBert
from transformers import AutoTokenizer, AutoModelForMaskedLM
from transformers import pipeline

# load the trial set: tab-separated, no header row, columns named sentence / complex_word
filename = "./data/trial/tsar2022_en_trial_none.tsv"
data = pd.read_csv(
    filename,
    sep="\t",
    header=None,
    names=["sentence", "complex_word"],
)

In [2]:
# empty frame collecting, per instance, the sentence, the complex word and
# ten substitute columns (substitute_1 .. substitute_10) for the evaluation script
substitutes_df = pd.DataFrame(
    columns=[
        "sentence",
        "complex_word",
        *(f"substitute_{rank}" for rank in range(1, 11)),
    ]
)

### Bert-base

In [3]:
# initialize the tokenizer and the masked-language model once from the same checkpoint
MODEL_NAME = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
lm_model = AutoModelForMaskedLM.from_pretrained(MODEL_NAME)

# create a fill-mask pipeline; reuse the tokenizer loaded above instead of
# loading a second, identical tokenizer from the hub/cache
fill_mask = pipeline("fill-mask", model=lm_model, tokenizer=tokenizer)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
# WordNet (through NLTK) is queried further below via wn.synsets(...) in the antonym-removal step
from nltk.corpus import wordnet as wn
# spaCy supplies the lemmas (token.lemma_) used below to compare substitutes with the complex word
import spacy
# load the spaCy pipeline once; `nlp` is called on single words inside the main loop
nlp = spacy.load("en_core_web_sm")

#### Substitute Generation with BERT-base, Substitute Selection steps a-c, and ranking of the resulting list with contextualized embeddings (TensorFlow Hub BERT via Keras layers)

In [12]:
import tensorflow_hub as hub
import tensorflow_text as text

In [13]:
# updated model with SS on contextualized embeddings, based on TensorFlow Hub BERT with Keras layers.
# calculates similarity between the original sentence and the sentences with candidate substitutes.
# model used for similarity calculations: bert_en_uncased (similar to bert_base_uncased)

# TF Hub handles of the text preprocessor and the BERT encoder used for sentence embeddings
_BERT_PREPROCESS_URL = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
_BERT_ENCODER_URL = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4"

# cache of the loaded Keras layers, filled on first use, so that the models are
# not re-loaded for every row of the data set
_bert_layers = {}


def _embed_with_bert(texts):
    """Return the encoder's ``pooled_output`` for a list of strings, loading the layers once."""
    if not _bert_layers:
        preprocess = hub.KerasLayer(_BERT_PREPROCESS_URL)
        encoder = hub.KerasLayer(_BERT_ENCODER_URL)
        # only publish the layers after both loaded successfully
        _bert_layers.update(preprocess=preprocess, encoder=encoder)
    preprocessed = _bert_layers["preprocess"](texts)
    return _bert_layers["encoder"](preprocessed)["pooled_output"]


def _unit_rows(matrix):
    """L2-normalise every row of a 2-D array; all-zero rows are left unchanged."""
    norms = np.linalg.norm(matrix, axis=1, keepdims=True)
    return matrix / np.where(norms == 0.0, 1.0, norms)


def calculate_similarity_scores(sentence, sentence_with_substitutes, embed_fn=None):
    """Cosine similarity between a sentence and each of its substituted variants.

    Parameters
    ----------
    sentence : str
        The original sentence.
    sentence_with_substitutes : list of str
        Variants of ``sentence`` in which the complex word was replaced by a candidate.
    embed_fn : callable, optional
        Maps a list of strings to a 2-D array-like of embeddings (one row per string).
        Defaults to the TF Hub BERT ``pooled_output`` embedding.

    Returns
    -------
    numpy.ndarray
        1-D array with one cosine similarity per entry of ``sentence_with_substitutes``
        (empty when no candidates are given).
    """
    candidates = list(sentence_with_substitutes)
    if not candidates:
        # nothing to score; avoid feeding an empty batch to the encoder
        return np.array([], dtype=np.float64)

    if embed_fn is None:
        embed_fn = _embed_with_bert

    original_embedding = np.asarray(embed_fn([sentence]), dtype=np.float64)
    candidate_embeddings = np.asarray(embed_fn(candidates), dtype=np.float64)

    # normalise before taking the inner product: the embeddings are not unit
    # length, so a plain inner product would not be a cosine similarity
    return np.inner(_unit_rows(original_embedding), _unit_rows(candidate_embeddings)).flatten()



def _replace_first_word(text, target, replacement):
    """Replace the first whole-word occurrence of ``target`` in ``text`` by ``replacement``.

    Falls back to replacing the first plain substring occurrence when no
    whole-word match exists. Only one occurrence is replaced so that the masked
    sentence contains exactly one mask token.
    """
    pattern = rf"(?<!\w){re.escape(target)}(?!\w)"
    # a function replacement keeps backslashes in ``replacement`` literal
    replaced, n_hits = re.subn(pattern, lambda _match: replacement, text, count=1)
    if n_hits == 0:
        replaced = text.replace(target, replacement, 1)
    return replaced


def _first_synset_antonyms(lemma_text):
    """Return the lowercased antonym names of the first WordNet synset of ``lemma_text``."""
    synsets = wn.synsets(lemma_text)
    if not synsets:
        return set()
    return {
        antonym.name().lower()
        for lemma in synsets[0].lemmas()
        for antonym in lemma.antonyms()
    }


# the pipeline's tokenizer supplies the separator and mask tokens (loop-invariant)
tokenizer = fill_mask.tokenizer
top_k = 30
# number of substitute columns expected by the output frame (all columns but sentence and complex_word)
n_output_substitutes = len(substitutes_df.columns) - 2

# in each row, for each complex word:
for index, row in data.iterrows():

    # 1. Substitute Generation (SG): perform masking and generate substitutes:

    ## print the sentence and the complex word
    sentence, complex_word = row["sentence"], row["complex_word"]
    print(f"Sentence: {sentence}")
    print(f"Complex word: {complex_word}")

    ## in the sentence, replace the complex word with a masked word
    ## (first whole-word occurrence only, so the pipeline sees a single mask)
    sentence_masked_word = _replace_first_word(sentence, complex_word, tokenizer.mask_token)

    ## concatenate the original sentence and the masked sentence
    sentences_concat = f"{sentence} {tokenizer.sep_token} {sentence_masked_word}"

    ## generate and rank candidate substitutes for the masked word using the fill_mask pipeline
    result = fill_mask(sentences_concat, top_k=top_k)

    ## lowercase and print the top-k substitutes
    substitutes = [substitute["token_str"].lower() for substitute in result]
    print(f"SG step: generated substitutes: {substitutes}\n")

    # 2. Substitute Selection (SS):

    # a) remove duplicates within the substitute list (order-preserving)
    substitutes_no_dupl = list(dict.fromkeys(substitutes))
    print(f"SS step: a) substitute list without duplicates: {substitutes_no_dupl}\n")

    # b) remove duplicates and inflected forms of the complex word from the substitute list
    ## lemmatize the complex word with spaCy, in order to compare it with the lemmatized substitutes
    complex_word_lemma = nlp(complex_word)[0].lemma_
    print(f"complex_word_lemma for complex word '{complex_word}': {complex_word_lemma}\n")

    ## keep each surviving substitute's lemma for the antonym check in step c)
    substitute_lemmas = {}
    substitutes_no_dupl_complex_word = []
    for substitute in substitutes_no_dupl:
        doc_substitute = nlp(substitute)
        if len(doc_substitute) == 0:
            # spaCy produced no token (e.g. whitespace-only string); nothing to compare or substitute
            continue
        substitute_lemma = doc_substitute[0].lemma_.lower()
        # substitutes were lowercased above, so compare the lemmas case-insensitively
        if substitute_lemma != complex_word_lemma.lower():
            substitutes_no_dupl_complex_word.append(substitute)
            substitute_lemmas[substitute] = substitute_lemma
    print(f"SS step: b) substitute list without duplicates and inflected forms of the complex word: {substitutes_no_dupl_complex_word}\n")

    # c) remove antonyms of the complex word from the substitute list
    ## the antonyms depend only on the complex word, so look them up once per row
    antonym_names = _first_synset_antonyms(complex_word_lemma)
    substitutes_no_dupl_complex_word_no_antonym = []
    for substitute in substitutes_no_dupl_complex_word:
        # compare this substitute (surface form and lemma) against the antonym names
        if substitute in antonym_names or substitute_lemmas[substitute] in antonym_names:
            print(f"Antonym removed (lemma): {substitute}")
        else:
            substitutes_no_dupl_complex_word_no_antonym.append(substitute)
    print(f"SS step: c): substitute list without antonyms of the complex word: {substitutes_no_dupl_complex_word_no_antonym}\n")

    # create sentences with the complex word replaced by the substitutes
    # (same occurrence that was masked in the SG step)
    sentence_with_substitutes = [
        _replace_first_word(sentence, complex_word, sub)
        for sub in substitutes_no_dupl_complex_word_no_antonym
    ]

    # d) calculate cosine similarity scores, and rank the substitutes based on their similarity score
    if sentence_with_substitutes:
        similarity_scores = calculate_similarity_scores(sentence, sentence_with_substitutes)
    else:
        # every candidate was filtered out; there is nothing to embed or rank
        similarity_scores = []
    ranked_substitutes_withscores = sorted(
        zip(substitutes_no_dupl_complex_word_no_antonym, similarity_scores),
        key=lambda pair: pair[1],
        reverse=True,
    )
    ranked_substitutes = [substitute for substitute, score in ranked_substitutes_withscores]
    print(f"SS step d) Ranked substitutes, based on cosine similarity scores in context: {ranked_substitutes}\n")

    print('-----------------------------------------------------------------------------------------')
    print()

    # limit the substitutes to the first ones that fit the output frame and pad
    # short lists, so that the row always matches the frame's column count
    top_substitutes = ranked_substitutes[:n_output_substitutes]
    padding = [None] * (n_output_substitutes - len(top_substitutes))

    # remove the #34-3 and #35-14 character combinations from the sentence before storing it
    sentence_for_output = sentence.replace("#34-3 \"", "").replace("#35-14 ", "")

    # add the sentence, complex_word, and substitutes to the dataframe
    substitutes_df.loc[index] = [sentence_for_output, complex_word] + top_substitutes + padding


# export the dataframe to a tsv file
substitutes_df.to_csv("./predictions/trial/BertBase_SG_SS_abc_keras.tsv", sep="\t", index=False, header=False)




Sentence: A Spanish government source, however, later said that banks able to cover by themselves losses on their toxic property assets will not be forced to remove them from their books while it will be compulsory for those receiving public help.
Complex word: compulsory
SG step: generated substitutes: ['compulsory', 'mandatory', 'obligatory', 'optional', 'required', 'necessary', 'standard', 'voluntary', 'customary', 'impossible', 'easier', 'only', 'illegal', 'sufficient', 'unnecessary', 'easy', 'normal', 'permitted', 'mandated', 'difficult', 'simple', 'appropriate', 'expensive', 'possible', 'commonplace', 'essential', 'proper', 'available', 'enough', 'affordable']

SS step: a) substitute list without duplicates: ['compulsory', 'mandatory', 'obligatory', 'optional', 'required', 'necessary', 'standard', 'voluntary', 'customary', 'impossible', 'easier', 'only', 'illegal', 'sufficient', 'unnecessary', 'easy', 'normal', 'permitted', 'mandated', 'difficult', 'simple', 'appropriate', 'expen

python tsar_eval.py --gold_file .\gold_trial.tsv --predictions_file ./predictions/trial/BertBase_SG_SS_abc_keras.tsv --output_file .\output

bad scores!