In [1]:
from transformers import AutoTokenizer, AutoModelForMaskedLM, AutoModelForSequenceClassification
from fitbert import FitBert
import pandas as pd
from nltk.corpus import wordnet as wn
import spacy
from transformers import pipeline

In [2]:
# read the tsv file
filename = "./data/trial/tsar2022_en_trial_none.tsv"
data = pd.read_csv(filename, sep='\t', header=None, names=["sentence", "complex_word"])

In [3]:
# for SS step: use spacy for lemmatizing in steps a-d
nlp = spacy.load("en_core_web_sm")

In [4]:
# # initialize the tokenizer and the models (sc_model only applies to FitBert)
tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased")
lm_model = AutoModelForMaskedLM.from_pretrained("bert-large-uncased")
sc_model = AutoModelForSequenceClassification.from_pretrained("bert-large-uncased")

Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerN

In [5]:
# instantiate the FitBert model
model = FitBert(lm_model, sc_model, tokenizer)


device: cpu
using custom model: ['BertForMaskedLM']


In [None]:
# Fitbert does not generate subsitutes by itself, but needs a list with substitutes and make that list fit better.
# So, will generate SG with bert-large-uncased first

#### Substitute Generation with bert

In [8]:
from transformers import pipeline

In [9]:
# not with concatenated sentence pairs: bad results

# create a fill-mask pipeline using BERT
fill_mask = pipeline("fill-mask", lm_model, tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased"))

# in each row, for each complex word: 
for index, row in data.iterrows():
       
    # 1. Substitute Generation (SG): perform masking and generate substitutes:
    
    ## print the sentence and the complex word
    sentence, complex_word = row["sentence"], row["complex_word"]
    print(f"Sentence: {sentence}")
    print(f"Complex word: {complex_word}")

    ## in the sentence, replace the complex word with a masked word
    sentence_masked_word = sentence.replace(complex_word, "[MASK]")

    ## generate and rank candidate substitutes for the masked word using the fill_mask pipeline
    top_k = 30
    result = fill_mask(sentence_masked_word, top_k=top_k)
   
    ## lowercase and print the top-k substitutes
    substitutes = [substitute["token_str"].lower() for substitute in result]
    print(f"SG step: generated substitutes: {substitutes}\n")

Sentence: A Spanish government source, however, later said that banks able to cover by themselves losses on their toxic property assets will not be forced to remove them from their books while it will be compulsory for those receiving public help.
Complex word: compulsory
SG step: generated substitutes: ['easier', 'possible', 'done', 'free', 'difficult', 'available', 'better', 'beneficial', 'allowed', 'provided', 'easy', 'appropriate', 'waived', 'open', 'impossible', 'mandatory', 'avoided', 'harder', 'granted', 'opened', 'improved', 'continued', 'only', 'good', 'convenient', 'preserved', 'permitted', 'compulsory', 'necessary', 'voluntary']

Sentence: Rajoy's conservative government had instilled markets with a brief dose of confidence by stepping into Bankia, performing a U-turn on its refusal to spend public money to rescue banks.
Complex word: instilled
SG step: generated substitutes: ['provided', 'surprised', 'injected', 'shocked', 'left', 'presented', 'hit', 'supplied', 'delivered'

In [10]:
# with concatenated sentence pairs: better results

# create a fill-mask pipeline using BERT
fill_mask = pipeline("fill-mask", lm_model, tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased"))

# in each row, for each complex word: 
for index, row in data.iterrows():
       
    # 1. Substitute Generation (SG): perform masking and generate substitutes:
    
    ## print the sentence and the complex word
    sentence, complex_word = row["sentence"], row["complex_word"]
    print(f"Sentence: {sentence}")
    print(f"Complex word: {complex_word}")

    ## in the sentence, replace the complex word with a masked word
    sentence_masked_word = sentence.replace(complex_word, "[MASK]")

    ## concatenate the original sentence and the masked sentence
    tokenizer = fill_mask.tokenizer
    sentences_concat = f"{sentence} {tokenizer.sep_token} {sentence_masked_word}"

    ## generate and rank candidate substitutes for the masked word using the fill_mask pipeline
    top_k = 30
    result = fill_mask(sentences_concat, top_k=top_k)
   
    ## lowercase and print the top-k substitutes
    substitutes = [substitute["token_str"].lower() for substitute in result]
    print(f"SG step: generated substitutes: {substitutes}\n")
    
    # 2. Substitute Selection (SS): 
    
    # a) remove duplicates from the substitute list 
    
    substitutes_no_dupl = []
    for sub in substitutes:
        if sub not in substitutes_no_dupl:
            substitutes_no_dupl.append(sub)
    print(f"SS step: a) substitute list without duplicates: {substitutes_no_dupl}\n")

   
    # b) remove duplicates and inflected forms of the complex word from the substitute list
    ## Lemmatize the complex word with spaCy, in order to compare it with the lemmatized substitute later to see if their mutual lemmas are the same (will be deleted)
    doc = nlp(complex_word)
    complex_word_lemma = doc[0].lemma_
    print(f"complex_word_lemma for complex word '{complex_word}': {complex_word_lemma}\n")


    ## remove duplicates and inflected forms of the complex word from the substitute list
    substitutes_no_dupl_complex_word = []
    for substitute in substitutes_no_dupl:
        doc = nlp(substitute)
        if doc[0].lemma_ != complex_word_lemma:
            substitutes_no_dupl_complex_word.append(substitute)
    print(f"SS step: b) substitute list without duplicates and inflected forms of the complex word: {substitutes_no_dupl_complex_word}\n")

    # c) remove antonyms of the complex word from the substitute list
    substitutes_no_dupl_complex_word_no_antonym = []
    for substitute in substitutes_no_dupl_complex_word:
        syn = wn.synsets(complex_word)
        if syn:
            syn = syn[0]
            for lemma in syn.lemmas():
                if lemma.antonyms() and lemma.name() == substitute:
                    print(f"Antonym removed: {lemma.antonyms()[0].name()}")
                    break
            else:
                substitutes_no_dupl_complex_word_no_antonym.append(substitute)
        else:
            substitutes_no_dupl_complex_word_no_antonym.append(substitute)
    print(f"SS step: c): substitute list without antonyms of the complex word: {substitutes_no_dupl_complex_word_no_antonym}\n")
    print('-----------------------------------------------------------------------------------------')
    print()

Sentence: A Spanish government source, however, later said that banks able to cover by themselves losses on their toxic property assets will not be forced to remove them from their books while it will be compulsory for those receiving public help.
Complex word: compulsory
SG step: generated substitutes: ['compulsory', 'mandatory', 'obligatory', 'required', 'voluntary', 'optional', 'mandated', 'necessary', 'forbidden', 'permitted', 'prescribed', '##rricular', 'illegal', 'available', 'beneficial', 'prohibited', 'scheduled', 'obliged', 'tertiary', 'secondary', 'statutory', 'legal', 'free', 'used', 'customary', 'primary', 'authorised', 'canonical', 'standard', 'lawful']

SS step: a) substitute list without duplicates: ['compulsory', 'mandatory', 'obligatory', 'required', 'voluntary', 'optional', 'mandated', 'necessary', 'forbidden', 'permitted', 'prescribed', '##rricular', 'illegal', 'available', 'beneficial', 'prohibited', 'scheduled', 'obliged', 'tertiary', 'secondary', 'statutory', 'leg

#### Substitute Selection with Fitbert

In [11]:
# test with FITBERT in step #2 d) # without FITBERT's access to the complex word. 
import fitbert

# create a fill-mask pipeline using BERT
tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased")
fill_mask = pipeline("fill-mask", lm_model, tokenizer=tokenizer)

# instantiate a FitBert model
from fitbert import FitBert
pretrained_model = AutoModelForMaskedLM.from_pretrained("bert-large-uncased")
fb_model = FitBert(model=pretrained_model)

# in each row, for each complex word: 
for index, row in data.iterrows():
       
    # 1. Substitute Generation (SG): perform masking and generate substitutes:
    
    ## print the sentence and the complex word
    sentence, complex_word = row["sentence"], row["complex_word"]
    print(f"Sentence: {sentence}")
    print(f"Complex word: {complex_word}")

    ## in the sentence, replace the complex word with a masked word
    sentence_masked_word = sentence.replace(complex_word, "[MASK]")

    ## concatenate the original sentence and the masked sentence
    tokenizer = fill_mask.tokenizer
    sentences_concat = f"{sentence} {tokenizer.sep_token} {sentence_masked_word}"

    ## generate and rank candidate substitutes for the masked word using the fill_mask pipeline
    top_k = 30
    result = fill_mask(sentences_concat, top_k=top_k)
   
    ## lowercase and print the top-k substitutes
    substitutes = [substitute["token_str"].lower() for substitute in result]
    print(f"SG step: generated substitutes: {substitutes}\n")
    
    # 2. Substitute Selection (SS): 
    
    # a) remove duplicates from the substitute list 
    
    substitutes_no_dupl = []
    for sub in substitutes:
        if sub not in substitutes_no_dupl:
            substitutes_no_dupl.append(sub)
    print(f"SS step: a) substitute list without duplicates: {substitutes_no_dupl}\n")

   
    # b) remove duplicates and inflected forms of the complex word from the substitute list
    ## Lemmatize the complex word with spaCy, in order to compare it with the lemmatized substitute later to see if their mutual lemmas are the same (will be deleted)
    doc = nlp(complex_word)
    complex_word_lemma = doc[0].lemma_
    print(f"complex_word_lemma for complex word '{complex_word}': {complex_word_lemma}\n")


    ## remove duplicates and inflected forms of the complex word from the substitute list
    substitutes_no_dupl_complex_word = []
    for substitute in substitutes_no_dupl:
        doc = nlp(substitute)
        if doc[0].lemma_ != complex_word_lemma:
            substitutes_no_dupl_complex_word.append(substitute)
    print(f"SS step: b) substitute list without duplicates and inflected forms of the complex word: {substitutes_no_dupl_complex_word}\n")

    # c) remove antonyms of the complex word from the substitute list
    substitutes_no_dupl_complex_word_no_antonym = []
    for substitute in substitutes_no_dupl_complex_word:
        syn = wn.synsets(complex_word)
        if syn:
            syn = syn[0]
            for lemma in syn.lemmas():
                if lemma.antonyms() and lemma.name() == substitute:
                    print(f"Antonym removed: {lemma.antonyms()[0].name()}")
                    break
            else:
                substitutes_no_dupl_complex_word_no_antonym.append(substitute)
        else:
            substitutes_no_dupl_complex_word_no_antonym.append(substitute)
    print(f"SS step: c): substitute list without antonyms of the complex word: {substitutes_no_dupl_complex_word_no_antonym}\n")
    
    
   # d) apply FITBERT to the list of substitutes
    sentence_fitbert_masked = sentence_masked_word.replace("[MASK]", "***mask***")
    ranked_substitutes = fb_model.rank(sentence_fitbert_masked, substitutes_no_dupl_complex_word_no_antonym)
    print(f"SS step: d) ranked substitutes using FitBert: {ranked_substitutes}\n")
    
    print('-----------------------------------------------------------------------------------------')
    print()

Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


device: cpu
using custom model: ['BertForMaskedLM']
Sentence: A Spanish government source, however, later said that banks able to cover by themselves losses on their toxic property assets will not be forced to remove them from their books while it will be compulsory for those receiving public help.
Complex word: compulsory
SG step: generated substitutes: ['compulsory', 'mandatory', 'obligatory', 'required', 'voluntary', 'optional', 'mandated', 'necessary', 'forbidden', 'permitted', 'prescribed', '##rricular', 'illegal', 'available', 'beneficial', 'prohibited', 'scheduled', 'obliged', 'tertiary', 'secondary', 'statutory', 'legal', 'free', 'used', 'customary', 'primary', 'authorised', 'canonical', 'standard', 'lawful']

SS step: a) substitute list without duplicates: ['compulsory', 'mandatory', 'obligatory', 'required', 'voluntary', 'optional', 'mandated', 'necessary', 'forbidden', 'permitted', 'prescribed', '##rricular', 'illegal', 'available', 'beneficial', 'prohibited', 'scheduled', '

In [12]:
# applied FITBERT in step #2 d) with FITBERT's access to the complex word by using the concatenated sentence as input variable.
import fitbert

# create a fill-mask pipeline using BERT
tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased")
fill_mask = pipeline("fill-mask", lm_model, tokenizer=tokenizer)

# instantiate a FitBert model
from fitbert import FitBert
pretrained_model = AutoModelForMaskedLM.from_pretrained("bert-large-uncased")
fb_model = FitBert(model=pretrained_model)

# in each row, for each complex word: 
for index, row in data.iterrows():
       
    # 1. Substitute Generation (SG): perform masking and generate substitutes:
    
    ## print the sentence and the complex word
    sentence, complex_word = row["sentence"], row["complex_word"]
    print(f"Sentence: {sentence}")
    print(f"Complex word: {complex_word}")

    ## in the sentence, replace the complex word with a masked word
    sentence_masked_word = sentence.replace(complex_word, "[MASK]")

    ## concatenate the original sentence and the masked sentence
    tokenizer = fill_mask.tokenizer
    sentences_concat = f"{sentence} {tokenizer.sep_token} {sentence_masked_word}"

    ## generate and rank candidate substitutes for the masked word using the fill_mask pipeline
    top_k = 30
    result = fill_mask(sentences_concat, top_k=top_k)
   
    ## lowercase and print the top-k substitutes
    substitutes = [substitute["token_str"].lower() for substitute in result]
    print(f"SG step: generated substitutes: {substitutes}\n")
    
    # 2. Substitute Selection (SS): 
    
    # a) remove duplicates from the substitute list 
    
    substitutes_no_dupl = []
    for sub in substitutes:
        if sub not in substitutes_no_dupl:
            substitutes_no_dupl.append(sub)
    print(f"SS step: a) substitute list without duplicates: {substitutes_no_dupl}\n")

   
    # b) remove duplicates and inflected forms of the complex word from the substitute list
    ## Lemmatize the complex word with spaCy, in order to compare it with the lemmatized substitute later to see if their mutual lemmas are the same (will be deleted)
    doc = nlp(complex_word)
    complex_word_lemma = doc[0].lemma_
    print(f"complex_word_lemma for complex word '{complex_word}': {complex_word_lemma}\n")


    ## remove duplicates and inflected forms of the complex word from the substitute list
    substitutes_no_dupl_complex_word = []
    for substitute in substitutes_no_dupl:
        doc = nlp(substitute)
        if doc[0].lemma_ != complex_word_lemma:
            substitutes_no_dupl_complex_word.append(substitute)
    print(f"SS step: b) substitute list without duplicates and inflected forms of the complex word: {substitutes_no_dupl_complex_word}\n")

    # c) remove antonyms of the complex word from the substitute list
    substitutes_no_dupl_complex_word_no_antonym = []
    for substitute in substitutes_no_dupl_complex_word:
        syn = wn.synsets(complex_word)
        if syn:
            syn = syn[0]
            for lemma in syn.lemmas():
                if lemma.antonyms() and lemma.name() == substitute:
                    print(f"Antonym removed: {lemma.antonyms()[0].name()}")
                    break
            else:
                substitutes_no_dupl_complex_word_no_antonym.append(substitute)
        else:
            substitutes_no_dupl_complex_word_no_antonym.append(substitute)
    print(f"SS step: c): substitute list without antonyms of the complex word: {substitutes_no_dupl_complex_word_no_antonym}\n")
    
    
   # d) apply FITBERT to the list of substitutes
    sentence_fitbert_masked = sentence_masked_word.replace("[MASK]", "***mask***")
    sentences_concat_fitbert = f"{sentence} {tokenizer.sep_token} {sentence_fitbert_masked}"
    
    ranked_substitutes = fb_model.rank(sentences_concat_fitbert, substitutes_no_dupl_complex_word_no_antonym)
    print(f"SS step: d) ranked substitutes using FitBert: {ranked_substitutes}\n")
    
    print('-----------------------------------------------------------------------------------------')
    print()

Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


device: cpu
using custom model: ['BertForMaskedLM']
Sentence: A Spanish government source, however, later said that banks able to cover by themselves losses on their toxic property assets will not be forced to remove them from their books while it will be compulsory for those receiving public help.
Complex word: compulsory
SG step: generated substitutes: ['compulsory', 'mandatory', 'obligatory', 'required', 'voluntary', 'optional', 'mandated', 'necessary', 'forbidden', 'permitted', 'prescribed', '##rricular', 'illegal', 'available', 'beneficial', 'prohibited', 'scheduled', 'obliged', 'tertiary', 'secondary', 'statutory', 'legal', 'free', 'used', 'customary', 'primary', 'authorised', 'canonical', 'standard', 'lawful']

SS step: a) substitute list without duplicates: ['compulsory', 'mandatory', 'obligatory', 'required', 'voluntary', 'optional', 'mandated', 'necessary', 'forbidden', 'permitted', 'prescribed', '##rricular', 'illegal', 'available', 'beneficial', 'prohibited', 'scheduled', '