### Cosine similarity -semantics

#### importing, loading and reading files

In [1]:
import torch
from transformers import RobertaTokenizer, RobertaForMaskedLM
import pandas as pd
from nltk.corpus import wordnet as wn
import spacy
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

import numpy as np
import gensim.downloader as api

In [2]:
# read the tsv file
filename = "./data/trial/tsar2022_en_trial_none.tsv"
data = pd.read_csv(filename, sep='\t', header=None, names=["sentence", "complex_word"])


# For SG step: load RoBERTa model and tokenizer
model = RobertaForMaskedLM.from_pretrained('roberta-base') # not enough memory for roberta-large
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# for SS step: use spacy for lemmatizing in steps a-d
nlp = spacy.load("en_core_web_sm")

# For SS step: use wiki2vec
from gensim.models import Word2Vec
from gensim.models import KeyedVectors



In [3]:
# word2 vec does not work 
MODEL_FILE= 'C:\\Users\\IrmaT\\Thesis\\My_code\\word-embeddings\\GoogleNews-vectors-negative300.bin.gz'

word2vec = KeyedVectors.load(MODEL_FILE,binary=False)

UnpicklingError: invalid load key, '3'.

In [4]:
# moving to wiki2vec

# path to the local copy of a model built from wikipedia
MODEL_FILE= 'C:\\Users\\IrmaT\\Thesis\\My_code\\word-embeddings\\wiki2vec\\enwiki_20180420_100\\enwiki_20180420_100d.txt'

# Loading the full model can take a while. (appr. 15 minutes).
wiki2vec = KeyedVectors.load_word2vec_format(MODEL_FILE, binary=False)

#### Use wiki2vec instead of glove for semantic similarity:

In [5]:

# in each row, for each complex word: 
for index, row in data.iterrows():
    
    # 1. Substitute Generation (SG): perform masking and generate substitutes:
    
    ## print the sentence and the complex word
    sentence, complex_word = row["sentence"], row["complex_word"]
    print(f"Sentence: {sentence}")
    print(f"Complex word: {complex_word}")

    ## in the sentence, replace the complex word with a masked word
    sentence_masked_word = sentence.replace(complex_word, "<mask>")

    ## concatenate the sentence with the complex word and the sentence with the masked word, by using RoBERTa's separator token to create one string of both sentences
    sentences_concat = f"{sentence} {tokenizer.sep_token} {sentence_masked_word}"

    ## tokenize the concatenated sentence
    sentences_concat_tokenized = tokenizer.encode(sentences_concat, return_tensors='pt')

    ## find the masked word in the tokenized sentence
    mask_location = torch.where(sentences_concat_tokenized == tokenizer.mask_token_id)[1].item()

    ## generate predictions for the masked word (forward pass not needed for predictions, only for training)
    with torch.no_grad():
        outputs = model(sentences_concat_tokenized)
        predictions = outputs.logits

    ## get the top-k substitutes based on the predicted logits
    top_k = 30
    top_tokens = torch.topk(predictions[0, mask_location], top_k).indices

    ## decode the top-k substitutes, lowercase and print them
    substitutes = [tokenizer.decode(token.item()).strip().lower() for token in top_tokens]
    print(f"SG step: generated substitutes: {substitutes}\n")
    
    

    # 2. Substitute Selection (SS): 
    
    # a) remove duplicates from the substitute list 
    
    substitutes_no_dupl = []
    for sub in substitutes:
        if sub not in substitutes_no_dupl:
            substitutes_no_dupl.append(sub)
    print(f"SS step: a) substitute list without duplicates: {substitutes_no_dupl}\n")

   
    # b) remove duplicates and inflected forms of the complex word from the substitute list
    ## Lemmatize the complex word with spaCy, in order to compare it with the lemmatized substitute later to see if their mutual lemmas are the same (will be deleted)
    doc = nlp(complex_word)
    complex_word_lemma = doc[0].lemma_
    print(f"complex_word_lemma for complex word '{complex_word}': {complex_word_lemma}")

    ## remove duplicates and inflected forms of the complex word from the substitute list
    substitutes_no_dupl_complex_word = []
    for substitute in substitutes_no_dupl:
        doc = nlp(substitute)
        if doc[0].lemma_ != complex_word_lemma:
            substitutes_no_dupl_complex_word.append(substitute)
    print(f"SS step: b) substitute list without duplicates and inflected forms of the complex word: {substitutes_no_dupl_complex_word}\n")

    # c) remove antonyms of the complex word from the substitute list
    substitutes_no_dupl_complex_word_no_antonym = []
    for substitute in substitutes_no_dupl_complex_word:
        syn = wn.synsets(complex_word)
        if syn:
            syn = syn[0]
            for lemma in syn.lemmas():
                if lemma.antonyms() and lemma.name() == substitute:
                    print(f"Antonym removed: {lemma.antonyms()[0].name()}")
                    break
            else:
                substitutes_no_dupl_complex_word_no_antonym.append(substitute)
        else:
            substitutes_no_dupl_complex_word_no_antonym.append(substitute)
    print(f"SS step: c): substitute list without antonyms of the complex word: {substitutes_no_dupl_complex_word_no_antonym}\n")
    
    # d) perform ranking of substitutes_no_dupl_complex_word_no_antonym on semantic similarity
    # compute the semantic similarity between the complex word and each substitute
    similarities = []
    for substitute in substitutes_no_dupl_complex_word_no_antonym:
        if substitute.lower() in wiki2vec.key_to_index:
            similarity = wiki2vec.similarity(complex_word.lower(), substitute.lower())
        else:
            print(f"Word not found in embeddings (will be put at the end of the list): {substitute}")
            similarity = -1.0  # set similarity to a negative value to indicate it should be put at the very end of the list
        similarities.append(similarity)

    # rank the substitutes based on their semantic similarity with the complex word
    substitutes_ranked = [x for _, x in sorted(zip(similarities, substitutes_no_dupl_complex_word_no_antonym), reverse=True)]
    print(f"SS step: d) substitute list ranked with wiki2vec on semantic similarity: {substitutes_ranked}\n")
    print('----------------------------------------------------------------------------------------------------------------------')
    print()

Sentence: A Spanish government source, however, later said that banks able to cover by themselves losses on their toxic property assets will not be forced to remove them from their books while it will be compulsory for those receiving public help.
Complex word: compulsory
SG step: generated substitutes: ['compulsory', 'mandatory', 'obligatory', 'voluntary', 'required', 'optional', 'obliged', 'uniform', 'necessary', 'available', 'mandated', 'sufficient', 'routine', 'forced', 'customary', 'prerequisite', 'feasible', 'indispensable', 'forthcoming', 'universal', 'requirement', 'involuntary', 'obligated', 'compelled', 'conditional', 'enforced', 'contingent', 'possible', 'compulsion', 'mandatory']

SS step: a) substitute list without duplicates: ['compulsory', 'mandatory', 'obligatory', 'voluntary', 'required', 'optional', 'obliged', 'uniform', 'necessary', 'available', 'mandated', 'sufficient', 'routine', 'forced', 'customary', 'prerequisite', 'feasible', 'indispensable', 'forthcoming', 'un