In [1]:
from transformers import AutoTokenizer, AutoModelForMaskedLM, AutoModelForSequenceClassification, AutoModel
from fitbert import FitBert
import spacy
import pandas as pd
from nltk.corpus import wordnet as wn
import numpy as np



In [2]:
# read the tsv file
filename = "./data/trial/tsar2022_en_trial_none.tsv"
data = pd.read_csv(filename, sep='\t', header=None, names=["sentence", "complex_word"])

In [3]:
from transformers import pipeline

In [4]:
# for SS step: use spacy for lemmatizing in steps a-d
nlp = spacy.load("en_core_web_sm")

In [5]:
# # initialize the tokenizer and the model
tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased")
lm_model = AutoModelForMaskedLM.from_pretrained("bert-large-uncased")

Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias',

In [6]:
## default model for contextualized embeddings later


# create a fill-mask pipeline using BERT
fill_mask = pipeline("fill-mask", lm_model, tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased"))

# in each row, for each complex word: 
for index, row in data.iterrows():
       
    # 1. Substitute Generation (SG)
    
    ## print the sentence and the complex word
    sentence, complex_word = row["sentence"], row["complex_word"]
    print(f"Sentence: {sentence}")
    print(f"Complex word: {complex_word}")

    ## in the sentence, replace the complex word with a masked word
    sentence_masked_word = sentence.replace(complex_word, "[MASK]")

    ## concatenate the original sentence and the masked sentence
    tokenizer = fill_mask.tokenizer
    sentences_concat = f"{sentence} {tokenizer.sep_token} {sentence_masked_word}"

    ## generate and rank candidate substitutes for the masked word using the fill_mask pipeline
    top_k = 30
    result = fill_mask(sentences_concat, top_k=top_k)
   
    ## lowercase and print the top-k substitutes
    substitutes = [substitute["token_str"].lower() for substitute in result]
    print(f"SG step: generated substitutes: {substitutes}\n")
    
    
    # 2. Substitute Selection (SS):   
    
    # a) remove duplicates within the substitute list from the substitute list 
    
    substitutes_no_dupl = []
    for sub in substitutes:
        if sub not in substitutes_no_dupl:
            substitutes_no_dupl.append(sub)
    print(f"SS step: a) substitute list without duplicates: {substitutes_no_dupl}\n")

   
    # b) remove duplicates and inflected forms of the complex word from the substitute list
    ## Lemmatize the complex word with spaCy, in order to compare it with the lemmatized substitute later to see if their mutual lemmas are the same
    doc_complex_word = nlp(complex_word)
    complex_word_lemma = doc_complex_word[0].lemma_
    print(f"complex_word_lemma for complex word '{complex_word}': {complex_word_lemma}\n")


    ## remove duplicates and inflected forms of the complex word from the list with substitutes
    substitutes_no_dupl_complex_word = []
    for substitute in substitutes_no_dupl:
        doc_substitute = nlp(substitute)
        substitute_lemma = doc_substitute[0].lemma_
        if substitute_lemma != complex_word_lemma:
            substitutes_no_dupl_complex_word.append(substitute)
    print(f"SS step: b) substitute list without duplicates and inflected forms of the complex word: {substitutes_no_dupl_complex_word}\n")

    # c) remove antonyms of the complex word from the substitute list
    substitutes_no_dupl_complex_word_no_antonym = []
    for substitute in substitutes_no_dupl_complex_word:
        syn = wn.synsets(complex_word_lemma)
        if syn:
            syn = syn[0]
            for lemma in syn.lemmas():
                if lemma.antonyms() and lemma.name() == substitute_lemma:
                    print(f"Antonym removed (lemma): {lemma.antonyms()[0].name()}")
                    break
            else:
                substitutes_no_dupl_complex_word_no_antonym.append(substitute)
        else:
            substitutes_no_dupl_complex_word_no_antonym.append(substitute)
    print(f"SS step: c): substitute list without antonyms of the complex word: {substitutes_no_dupl_complex_word_no_antonym}\n")
    
    
    # create sentences with the complex word replaced by the substitutes
    sentences_with_substitutes = [sentence.replace(complex_word, sub) for sub in substitutes_no_dupl_complex_word_no_antonym]
    #print(f"SG step: sentences with substitutes: {sentences_with_substitutes}\n")
    
    
    print('-----------------------------------------------------------------------------------------')
    print()

Sentence: A Spanish government source, however, later said that banks able to cover by themselves losses on their toxic property assets will not be forced to remove them from their books while it will be compulsory for those receiving public help.
Complex word: compulsory
SG step: generated substitutes: ['compulsory', 'mandatory', 'obligatory', 'required', 'voluntary', 'optional', 'mandated', 'necessary', 'forbidden', 'permitted', 'prescribed', '##rricular', 'illegal', 'available', 'beneficial', 'prohibited', 'scheduled', 'obliged', 'tertiary', 'secondary', 'statutory', 'legal', 'free', 'used', 'customary', 'primary', 'authorised', 'canonical', 'standard', 'lawful']

SS step: a) substitute list without duplicates: ['compulsory', 'mandatory', 'obligatory', 'required', 'voluntary', 'optional', 'mandated', 'necessary', 'forbidden', 'permitted', 'prescribed', '##rricular', 'illegal', 'available', 'beneficial', 'prohibited', 'scheduled', 'obliged', 'tertiary', 'secondary', 'statutory', 'leg

In [7]:
from transformers import TFAutoModel

In [8]:
import tensorflow as tf
import numpy as np

In [9]:
# updated default model with SS on contextualized embeddings,  with cosine sim scores in context of the sentence, with bert-large-uncased and tensorflow.
# Calculates similarity between the original sentence and the sentences with candidate substitutes retrieved via masking in the SG step (also with bert-large-uncased).
# creates a list with sentences with substitute words filled in (commented out for oversight purposes)

# create a fill-mask pipeline using BERT
fill_mask = pipeline("fill-mask", lm_model, tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased"))

def calculate_similarity_scores(sentence, sentence_with_substitutes):
    tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased")
    tf_model = TFAutoModel.from_pretrained("bert-large-uncased")

    def embed_text(text):
        tokens = tokenizer(text, padding=True, truncation=True, return_tensors="tf")
        outputs = tf_model(**tokens)
        embeddings = outputs.last_hidden_state[:, 0, :]
        embeddings = tf.nn.l2_normalize(embeddings, axis=1)
        return embeddings

    original_sentence_embedding = embed_text(sentence)
    substitute_sentence_embeddings = embed_text(sentence_with_substitutes)

    cosine_similarity = np.inner(original_sentence_embedding, substitute_sentence_embeddings)
    similarity_scores = cosine_similarity[0]

    return similarity_scores



# in each row, for each complex word: 
for index, row in data.iterrows():
       
    # 1. Substitute Generation (SG)
    
    ## print the sentence and the complex word
    sentence, complex_word = row["sentence"], row["complex_word"]
    print(f"Sentence: {sentence}")
    print(f"Complex word: {complex_word}")

    ## in the sentence, replace the complex word with a masked word
    sentence_masked_word = sentence.replace(complex_word, "[MASK]")

    ## concatenate the original sentence and the masked sentence
    tokenizer = fill_mask.tokenizer
    sentences_concat = f"{sentence} {tokenizer.sep_token} {sentence_masked_word}"

    ## generate and rank candidate substitutes for the masked word using the fill_mask pipeline
    top_k = 30
    result = fill_mask(sentences_concat, top_k=top_k)
   
    ## lowercase and print the top-k substitutes
    substitutes = [substitute["token_str"].lower() for substitute in result]
    print(f"SG step: generated substitutes: {substitutes}\n")
    
    
    # 2. Substitute Selection (SS):   
    
    # a) remove duplicates from the substitute list 
    
    substitutes_no_dupl = []
    for sub in substitutes:
        if sub not in substitutes_no_dupl:
            substitutes_no_dupl.append(sub)
    print(f"SS step: a) substitute list without duplicates: {substitutes_no_dupl}\n")

   
    # b) remove duplicates and inflected forms of the complex word from the substitute list
    ## Lemmatize the complex word with spaCy, in order to compare it with the lemmatized substitute later to see if their mutual lemmas are the same (will be deleted)
    doc_complex_word = nlp(complex_word)
    complex_word_lemma = doc_complex_word[0].lemma_
    print(f"complex_word_lemma for complex word '{complex_word}': {complex_word_lemma}\n")


     ## remove duplicates and inflected forms of the complex word from the list with substitutes
    substitutes_no_dupl_complex_word = []
    for substitute in substitutes_no_dupl:
        doc_substitute = nlp(substitute)
        substitute_lemma = doc_substitute[0].lemma_
        if substitute_lemma != complex_word_lemma:
            substitutes_no_dupl_complex_word.append(substitute)
    print(f"SS step: b) substitute list without duplicates and inflected forms of the complex word: {substitutes_no_dupl_complex_word}\n")

    # c) remove antonyms of the complex word from the substitute list
    substitutes_no_dupl_complex_word_no_antonym = []
    for substitute in substitutes_no_dupl_complex_word:
        syn = wn.synsets(complex_word_lemma)
        if syn:
            syn = syn[0]
            for lemma in syn.lemmas():
                if lemma.antonyms() and lemma.name() == substitute_lemma:
                    print(f"Antonym removed: {lemma.antonyms()[0].name()}")
                    break
            else:
                substitutes_no_dupl_complex_word_no_antonym.append(substitute)
        else:
            substitutes_no_dupl_complex_word_no_antonym.append(substitute)
    print(f"SS step: c): substitute list without antonyms of the complex word: {substitutes_no_dupl_complex_word_no_antonym}\n")
    
    
    # create sentence with the complex word replaced by the substitutes
    sentence_with_substitutes = [sentence.replace(complex_word, sub) for sub in substitutes_no_dupl_complex_word_no_antonym]
    #print(f"List with sentences where complex word is substituted: {sentence_with_substitutes}\n")

    
    # d) calculate similarity scores, and rank the substitutes based on their similarity score
    similarity_scores = calculate_similarity_scores(sentence, sentence_with_substitutes)
    #print(f"Similarity scores: {similarity_scores}\n")
    ranked_substitutes = sorted(zip(substitutes_no_dupl_complex_word_no_antonym, similarity_scores), key=lambda x: x[1], reverse=True)
    print(f"SS step d) Ranked substitutes in context, including on similarity scores: {ranked_substitutes}\n")
    ranked_substitutes_only = [substitute for substitute, score in ranked_substitutes]
    print(f"Ranked substitutes in context only: {ranked_substitutes_only}\n")
        
    print('-----------------------------------------------------------------------------------------')
    print()

Sentence: A Spanish government source, however, later said that banks able to cover by themselves losses on their toxic property assets will not be forced to remove them from their books while it will be compulsory for those receiving public help.
Complex word: compulsory
SG step: generated substitutes: ['compulsory', 'mandatory', 'obligatory', 'required', 'voluntary', 'optional', 'mandated', 'necessary', 'forbidden', 'permitted', 'prescribed', '##rricular', 'illegal', 'available', 'beneficial', 'prohibited', 'scheduled', 'obliged', 'tertiary', 'secondary', 'statutory', 'legal', 'free', 'used', 'customary', 'primary', 'authorised', 'canonical', 'standard', 'lawful']

SS step: a) substitute list without duplicates: ['compulsory', 'mandatory', 'obligatory', 'required', 'voluntary', 'optional', 'mandated', 'necessary', 'forbidden', 'permitted', 'prescribed', '##rricular', 'illegal', 'available', 'beneficial', 'prohibited', 'scheduled', 'obliged', 'tertiary', 'secondary', 'statutory', 'leg

Some layers from the model checkpoint at bert-large-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-large-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


SS step d) Ranked substitutes in context, including on similarity scores: [('obligatory', 0.998616), ('forbidden', 0.99840474), ('voluntary', 0.9980571), ('legal', 0.9978615), ('secondary', 0.99772567), ('statutory', 0.99741924), ('primary', 0.9973586), ('mandatory', 0.9973109), ('authorised', 0.9972798), ('necessary', 0.9971266), ('illegal', 0.99701357), ('free', 0.9967106), ('available', 0.995743), ('scheduled', 0.9954355), ('lawful', 0.9951608), ('obliged', 0.99502474), ('standard', 0.99489534), ('prescribed', 0.9948679), ('used', 0.99431866), ('beneficial', 0.99399704), ('prohibited', 0.99366176), ('mandated', 0.99360925), ('permitted', 0.9934938), ('required', 0.99324286), ('optional', 0.9925215), ('customary', 0.9924901), ('tertiary', 0.98908913), ('canonical', 0.9854608), ('##rricular', 0.96308446)]

Ranked substitutes in context only: ['obligatory', 'forbidden', 'voluntary', 'legal', 'secondary', 'statutory', 'primary', 'mandatory', 'authorised', 'necessary', 'illegal', 'free',

Some layers from the model checkpoint at bert-large-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-large-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


SS step d) Ranked substitutes in context, including on similarity scores: [('infused', 0.99678254), ('filled', 0.9956644), ('injected', 0.9955159), ('fitted', 0.99550927), ('delivered', 0.9946965), ('seeded', 0.99427503), ('equipped', 0.99379665), ('infected', 0.99372524), ('left', 0.9935599), ('created', 0.9930353), ('fed', 0.99293506), ('tested', 0.9929239), ('gifted', 0.99291635), ('encouraged', 0.9928454), ('given', 0.9928283), ('inspired', 0.9928073), ('raised', 0.9927577), ('impressed', 0.992678), ('shaken', 0.9925828), ('endowed', 0.99193335), ('introduced', 0.9912216), ('hit', 0.991197), ('supplied', 0.99068856), ('treated', 0.9906582), ('offered', 0.99051076), ('struck', 0.99043155), ('provided', 0.9903593), ('rewarded', 0.9901767), ('furnished', 0.9859725), ('presented', 0.9850863)]

Ranked substitutes in context only: ['infused', 'filled', 'injected', 'fitted', 'delivered', 'seeded', 'equipped', 'infected', 'left', 'created', 'fed', 'tested', 'gifted', 'encouraged', 'given',

Some layers from the model checkpoint at bert-large-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-large-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


SS step d) Ranked substitutes in context, including on similarity scores: [('monsters', 0.99311703), ('demons', 0.991179), ('insects', 0.98955345), ('vampires', 0.9891384), ('zombies', 0.9885709), ('freaks', 0.98829615), ('dogs', 0.98787796), ('pigs', 0.9875864), ('gods', 0.98693633), ('lovers', 0.9859979), ('fantasies', 0.9859177), ('dolls', 0.98540473), ('mania', 0.9849972), ('giants', 0.9838166), ('sims', 0.981274), ('leaders', 0.9803264), ('machines', 0.9778185), ('fans', 0.97766393), ('hysteria', 0.9751682), ('followers', 0.9726733), ('children', 0.9722737), ('puppets', 0.9717666), ('organs', 0.97094476), ('heroes', 0.9695369), ('victims', 0.9654965), ('killers', 0.9633757), ('theorists', 0.9610765), ('criminals', 0.95898163), ('crimes', 0.9454692), ('##atics', 0.91807973)]

Ranked substitutes in context only: ['monsters', 'demons', 'insects', 'vampires', 'zombies', 'freaks', 'dogs', 'pigs', 'gods', 'lovers', 'fantasies', 'dolls', 'mania', 'giants', 'sims', 'leaders', 'machines', 

Some layers from the model checkpoint at bert-large-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-large-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


SS step d) Ranked substitutes in context, including on similarity scores: [('monitoring', 0.997315), ('observation', 0.99698967), ('troops', 0.996557), ('monitors', 0.9958497), ('soldiers', 0.9958277), ('analysts', 0.9956386), ('experts', 0.9955363), ('victims', 0.9950533), ('casualties', 0.9948847), ('observations', 0.99488217), ('refugees', 0.9947773), ('fighters', 0.9947088), ('observing', 0.9938984), ('civilians', 0.9937451), ('volunteers', 0.9935777), ('individuals', 0.9930544), ('observe', 0.9892867), ('indicators', 0.98927087), ('participants', 0.9890274), ('journalists', 0.9888512), ('people', 0.9887333), ('combatants', 0.98693126), ('citizens', 0.9865445), ('witnesses', 0.9825927), ('instruments', 0.966725), ('activists', 0.96632993), ('##keepers', 0.9511474), ('spectators', 0.8668432)]

Ranked substitutes in context only: ['monitoring', 'observation', 'troops', 'monitors', 'soldiers', 'analysts', 'experts', 'victims', 'casualties', 'observations', 'refugees', 'fighters', 'obs

Some layers from the model checkpoint at bert-large-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-large-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


SS step d) Ranked substitutes in context, including on similarity scores: [('bullet', 0.98448384), ('blast', 0.9811242), ('arrow', 0.9797474), ('serious', 0.97949255), ('fatal', 0.9786947), ('severe', 0.97802144), ('penetrating', 0.97779113), ('multiple', 0.977361), ('splinter', 0.97688437), ('flesh', 0.9761436), ('shotgun', 0.97486144), ('gunshot', 0.9747316), ('two', 0.97354585), ('stabbing', 0.97346896), ('several', 0.97286105), ('battle', 0.9726114), ('inflicted', 0.9712986), ('three', 0.9709559), ('minor', 0.9703641), ('entry', 0.9703451), ('exit', 0.9697292), ('shot', 0.96940994), ('burn', 0.96926177), ('open', 0.9679353), ('internal', 0.96742725), ('knife', 0.966819), ('the', 0.96552926), ('stab', 0.96441793), ('shell', 0.9622123), ('bomb', 0.9489385)]

Ranked substitutes in context only: ['bullet', 'blast', 'arrow', 'serious', 'fatal', 'severe', 'penetrating', 'multiple', 'splinter', 'flesh', 'shotgun', 'gunshot', 'two', 'stabbing', 'several', 'battle', 'inflicted', 'three', 'm

Some layers from the model checkpoint at bert-large-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-large-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


SS step d) Ranked substitutes in context, including on similarity scores: [('masked', 0.99956214), ('dressed', 0.99953103), ('clad', 0.9994428), ('covered', 0.9993106), ('armed', 0.9990921), ('hiding', 0.99907225), ('draped', 0.9990294), ('posing', 0.9990041), ('dressing', 0.9989678), ('concealed', 0.9989608), ('hidden', 0.99892384), ('acting', 0.9987667), ('clothed', 0.9987644), ('armored', 0.99850357), ('undercover', 0.998435), ('seated', 0.9983873), ('armoured', 0.9983237), (',', 0.9979368), ('portrayed', 0.99764407), ('appearing', 0.99753594), ('disguise', 0.9974008), ('depicted', 0.99735093), ('cloak', 0.997258), ('painted', 0.99721193), ('fitted', 0.99710214), ('dress', 0.9970146), ('appeared', 0.9964408), ('guise', 0.9964286), ('identified', 0.99626684)]

Ranked substitutes in context only: ['masked', 'dressed', 'clad', 'covered', 'armed', 'hiding', 'draped', 'posing', 'dressing', 'concealed', 'hidden', 'acting', 'clothed', 'armored', 'undercover', 'seated', 'armoured', ',', 'po

Some layers from the model checkpoint at bert-large-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-large-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


SS step d) Ranked substitutes in context, including on similarity scores: [('extension', 0.9992267), ('echo', 0.99907005), ('arm', 0.99894005), ('aspect', 0.99889594), ('adhere', 0.9987773), ('outpost', 0.99876857), ('affiliate', 0.99872917), ('heir', 0.99865603), ('extreme', 0.9985868), ('isolate', 0.99842954), ('expression', 0.9984244), ('imitation', 0.9982546), ('adaptation', 0.9982225), ('associate', 0.9981582), ('evolution', 0.9981181), ('offspring', 0.99808633), ('approximation', 0.99757326), ('element', 0.9975413), ('adherence', 0.9975263), ('inhibitor', 0.99709), ('ally', 0.99705994), ('opponent', 0.99644923), ('orthodox', 0.99644184), ('enemy', 0.99599284), ('enclave', 0.99546736), ('embrace', 0.995185), ('sect', 0.99505365), ('example', 0.99102676), ('issue', 0.9880693), ('exception', 0.9876683)]

Ranked substitutes in context only: ['extension', 'echo', 'arm', 'aspect', 'adhere', 'outpost', 'affiliate', 'heir', 'extreme', 'isolate', 'expression', 'imitation', 'adaptation', '

Some layers from the model checkpoint at bert-large-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-large-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


SS step d) Ranked substitutes in context, including on similarity scores: [('orchestrated', 0.99220186), ('chamber', 0.9910052), ('symphonies', 0.9898526), ('piano', 0.98832047), ('romantic', 0.98830265), ('sonata', 0.9879669), ('concerto', 0.98691916), ('symphony', 0.9829136), ('orchestral', 0.9807757), ('goldberg', 0.97796786), ('philharmonic', 0.97773), ('music', 0.97559386), ('orchestra', 0.96964294), ('instrumental', 0.9634979), ('classical', 0.96165097), ('scholarly', 0.9544653), ('modern', 0.94865894), ('concert', 0.9468763), ('musical', 0.9448357), ('dramatic', 0.9341782), ('standard', 0.9292915), ('written', 0.9286128), ('operatic', 0.92696166), ('comparative', 0.9182861), ('thematic', 0.9173455), ('choral', 0.8792027), ('continental', 0.8659702), ('poetic', 0.83668303), ('lyrical', 0.83481765)]

Ranked substitutes in context only: ['orchestrated', 'chamber', 'symphonies', 'piano', 'romantic', 'sonata', 'concerto', 'symphony', 'orchestral', 'goldberg', 'philharmonic', 'music',

Some layers from the model checkpoint at bert-large-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-large-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


SS step d) Ranked substitutes in context, including on similarity scores: [('deployment', 0.99490625), ('activate', 0.9942429), ('employ', 0.9909511), ('utilize', 0.9904423), ('ready', 0.9865898), ('operate', 0.9863219), ('dock', 0.9860799), ('dispatch', 0.9855205), ('reinforce', 0.98464274), ('exercise', 0.98400205), ('outfit', 0.98368776), ('acquire', 0.98147714), ('launch', 0.98048663), ('send', 0.9795644), ('install', 0.97683144), ('assemble', 0.9763476), ('use', 0.97628176), ('establish', 0.97609115), ('move', 0.97460824), ('maintain', 0.9732615), ('construct', 0.97157836), ('station', 0.97154474), ('drill', 0.97112155), ('build', 0.9699961), ('withdraw', 0.9599061), ('release', 0.9570528), ('fire', 0.9525608), ('alert', 0.88578594)]

Ranked substitutes in context only: ['deployment', 'activate', 'employ', 'utilize', 'ready', 'operate', 'dock', 'dispatch', 'reinforce', 'exercise', 'outfit', 'acquire', 'launch', 'send', 'install', 'assemble', 'use', 'establish', 'move', 'maintain',

Some layers from the model checkpoint at bert-large-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-large-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


SS step d) Ranked substitutes in context, including on similarity scores: [('agencies', 0.99389577), ('officials', 0.9937345), ('police', 0.9921598), ('government', 0.9901746), ('policemen', 0.9900372), ('prosecutors', 0.98969036), ('officers', 0.98931754), ('ministers', 0.9890667), ('persons', 0.9889973), ('courts', 0.9884672), ('bodies', 0.9864596), ('governments', 0.98577), ('people', 0.98549366), ('residents', 0.98460686), ('organisations', 0.9846039), ('forces', 0.9844201), ('magistrates', 0.98386586), ('subjects', 0.98361236), ('investigators', 0.9833896), ('judges', 0.98330843), ('offences', 0.975519), ('rulers', 0.97472095), ('prisoners', 0.97445524), ('victims', 0.97223413), ('journalists', 0.9672091), ('elements', 0.9667928), ('operators', 0.94491625), ('##s', 0.9427328)]

Ranked substitutes in context only: ['agencies', 'officials', 'police', 'government', 'policemen', 'prosecutors', 'officers', 'ministers', 'persons', 'courts', 'bodies', 'governments', 'people', 'residents'

In [10]:
import tensorflow_hub as hub
import tensorflow_text as text

In [11]:
# # load BERT and the preprocessing layer, this model uses tensorflow
# bert_preprocess = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
# bert_encoder = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4")

In [12]:
# updated model with SS on contextualized embeddings, based on TensorFlow Hub BERT with Keras layers, see https://keras.io/examples/nlp/semantic_similarity_with_bert/
# calculates similarity between the original sentence and the sentences with candidate substitutes.
# model used for similiarity calculations: bert_en_uncased (similar to bert_base_uncased) 

def calculate_similarity_scores(sentence, sentences_with_substitutes):
    bert_preprocess = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
    bert_encoder = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4")
    
    def embed_text(text):
        preprocessed_text = bert_preprocess(text)
        return bert_encoder(preprocessed_text)["pooled_output"]

    original_sentence_embedding = embed_text([sentence])
    substitute_sentence_embeddings = embed_text(sentences_with_substitutes)
    
    similarity_scores = np.inner(original_sentence_embedding, substitute_sentence_embeddings).flatten()

    return similarity_scores



# in each row, for each complex word: 
for index, row in data.iterrows():
       
    # 1. Substitute Generation (SG)
    
    ## print the sentence and the complex word
    sentence, complex_word = row["sentence"], row["complex_word"]
    print(f"Sentence: {sentence}")
    print(f"Complex word: {complex_word}")

    ## in the sentence, replace the complex word with a masked word
    sentence_masked_word = sentence.replace(complex_word, "[MASK]")

    ## concatenate the original sentence and the masked sentence
    tokenizer = fill_mask.tokenizer
    sentences_concat = f"{sentence} {tokenizer.sep_token} {sentence_masked_word}"

    ## generate and rank candidate substitutes for the masked word using the fill_mask pipeline
    top_k = 30
    result = fill_mask(sentences_concat, top_k=top_k)
   
    ## lowercase and print the top-k substitutes
    substitutes = [substitute["token_str"].lower() for substitute in result]
    print(f"SG step: generated substitutes: {substitutes}\n")
    
    
    # 2. Substitute Selection (SS):   
    
    # a) remove duplicates from the substitute list 
    
    substitutes_no_dupl = []
    for sub in substitutes:
        if sub not in substitutes_no_dupl:
            substitutes_no_dupl.append(sub)
    print(f"SS step: a) substitute list without duplicates: {substitutes_no_dupl}\n")

   
    # b) remove duplicates and inflected forms of the complex word from the substitute list
    ## Lemmatize the complex word with spaCy, in order to compare it with the lemmatized substitute later to see if their mutual lemmas are the same (will be deleted)
    doc_complex_word = nlp(complex_word)
    complex_word_lemma = doc_complex_word[0].lemma_
    print(f"complex_word_lemma for complex word '{complex_word}': {complex_word_lemma}\n")


     ## remove duplicates and inflected forms of the complex word from the list with substitutes
    substitutes_no_dupl_complex_word = []
    for substitute in substitutes_no_dupl:
        doc_substitute = nlp(substitute)
        substitute_lemma = doc_substitute[0].lemma_
        if substitute_lemma != complex_word_lemma:
            substitutes_no_dupl_complex_word.append(substitute)
    print(f"SS step: b) substitute list without duplicates and inflected forms of the complex word: {substitutes_no_dupl_complex_word}\n")

    # c) remove antonyms of the complex word from the list with substitutes
    substitutes_no_dupl_complex_word_no_antonym = []
    for substitute in substitutes_no_dupl_complex_word:
        syn = wn.synsets(complex_word_lemma)
        if syn:
            syn = syn[0]
            for lemma in syn.lemmas():
                if lemma.antonyms() and lemma.name() == substitute:
                    print(f"Antonym removed: {lemma.antonyms()[0].name()}")
                    break
            else:
                substitutes_no_dupl_complex_word_no_antonym.append(substitute)
        else:
            substitutes_no_dupl_complex_word_no_antonym.append(substitute)
    print(f"SS step: c): substitute list without antonyms of the complex word: {substitutes_no_dupl_complex_word_no_antonym}\n")
    
    
    # create sentences with the complex word replaced by the substitutes
    sentences_with_substitutes = [sentence.replace(complex_word, sub) for sub in substitutes_no_dupl_complex_word_no_antonym]
    #print(f"sentences with substitutes: {sentences_with_substitutes}\n")
    
    # d) calculate similarity scores, and rank the substitutes based on their similarity score
    similarity_scores = calculate_similarity_scores(sentence, sentences_with_substitutes)
    #print(f"Similarity scores: {similarity_scores}\n")
    ranked_substitutes = sorted(zip(substitutes_no_dupl_complex_word_no_antonym, similarity_scores), key=lambda x: x[1], reverse=True)
    print(f"SS step: d): Ranked substitutes with similarity scores: {ranked_substitutes}\n")
    ranked_substitutes_only = [substitute for substitute, score in ranked_substitutes]
    print(f"SS step: d): Ranked substitutes: {ranked_substitutes_only}\n")
        
    print('-----------------------------------------------------------------------------------------')
    print()

Sentence: A Spanish government source, however, later said that banks able to cover by themselves losses on their toxic property assets will not be forced to remove them from their books while it will be compulsory for those receiving public help.
Complex word: compulsory
SG step: generated substitutes: ['compulsory', 'mandatory', 'obligatory', 'required', 'voluntary', 'optional', 'mandated', 'necessary', 'forbidden', 'permitted', 'prescribed', '##rricular', 'illegal', 'available', 'beneficial', 'prohibited', 'scheduled', 'obliged', 'tertiary', 'secondary', 'statutory', 'legal', 'free', 'used', 'customary', 'primary', 'authorised', 'canonical', 'standard', 'lawful']

SS step: a) substitute list without duplicates: ['compulsory', 'mandatory', 'obligatory', 'required', 'voluntary', 'optional', 'mandated', 'necessary', 'forbidden', 'permitted', 'prescribed', '##rricular', 'illegal', 'available', 'beneficial', 'prohibited', 'scheduled', 'obliged', 'tertiary', 'secondary', 'statutory', 'leg