In [1]:
import tensorflow as tf
from sensebert import SenseBert
import numpy as np




In [2]:
# Disabled smoke test of the SenseBert API, kept for reference only (nothing below depends on it).
# with tf.compat.v1.Session() as session:
#     sensebert_model = SenseBert("sensebert-base-uncased", session=session)  # or sensebert-large-uncased
#     input_ids, input_mask = sensebert_model.tokenize(["I went to the store to buy some groceries.", "The store was closed."])
#     model_outputs = sensebert_model.run(input_ids, input_mask)
#     contextualized_embeddings, mlm_logits, supersense_logits = model_outputs  # these are NumPy arrays
#     print(model_outputs)
#     print(sensebert_model.tokenizer.convert_ids_to_senses([np.argmax(supersense_logits[0][9])]))
#     print(sensebert_model)
#     print(supersense_logits.shape)
#     print(sensebert_model.tokenize(["I went to the store to buy some groceries.", "The store was closed."]))

In [3]:
import nltk
nltk.download('wordnet')
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import PorterStemmer

# Sanity-check the WordNet lemmatizer on a verb and its past-tense form.
for verb_form in ("belabor", "belabored"):
    print(WordNetLemmatizer().lemmatize(verb_form, pos="v"))

# Porter stemmer instance; the evaluation cell reuses `ps` to match target
# words against tokenizer output.
ps = PorterStemmer()
words = list(map(ps.stem, ["sentence", "sentences", "excuse", "excused", "excuses"]))
print(words)

# Irregular forms are not unified by stemming.
for verb_form in ("buy", "bought"):
    print(ps.stem(verb_form))

[nltk_data] Downloading package wordnet to /home/ubuntu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


belabor
belabor
['sentenc', 'sentenc', 'excus', 'excus', 'excus']
buy
bought


In [4]:
from sklearn.dummy import DummyClassifier

In [5]:
import xml.etree.ElementTree as ET
from nltk.corpus import wordnet as wn

In [6]:
def getDataFromXML(filename):
    """Parse a WSD benchmark ``*.data.xml`` file into a per-instance lookup.

    The tree is walked three levels deep (root -> group -> sentence -> token).
    The tokens of each sentence are glued back into one string, and every
    ``<instance>`` token becomes one entry of the result.

    Args:
        filename: Path (or file object) handed to ``ET.parse``.

    Returns:
        dict mapping instance id -> ``(target_text, sentence, wordnet_pos)``.
        ``wordnet_pos`` is ``wn.VERB``, ``wn.ADJ`` or ``wn.ADV`` for the
        matching ``pos`` attribute, and ``wn.NOUN`` for any other tag.
    """
    closing_punct = ".,!?;:)]}/%"  # glued to the token before it
    opening_punct = "([{/$"        # glued to the token after it
    # ADJ used to be mapped to wn.VERB by mistake.
    pos_by_tag = {"VERB": wn.VERB, "ADJ": wn.ADJ, "ADV": wn.ADV}

    d42_data = dict()
    root = ET.parse(filename).getroot()

    for category in root:
        for sentence in category:
            ids = []
            s = ""
            for word in sentence:
                # Substring membership test, so short punctuation runs such as
                # "!?" are treated like single punctuation marks.
                # Only drop a separator space; never eat a real character
                # (e.g. the "(" in "()" which was followed by no space).
                if word.text in closing_punct and s.endswith(" "):
                    s = s[:-1]
                s += word.text
                if word.text not in opening_punct:
                    s += " "

                if word.tag == "instance":
                    ids.append((word.attrib["id"], word.text, word.attrib["pos"]))

            # Remove the trailing separator, if the last token added one.
            if s.endswith(" "):
                s = s[:-1]

            for instance_id, target_text, pos_tag in ids:
                pos = pos_by_tag.get(pos_tag, wn.NOUN)
                d42_data[instance_id] = (target_text, s, pos)
    return d42_data

d42_data = getDataFromXML("wsd_hard_benchmark/42D/42D.data.xml")

# Gold key file: whitespace-separated fields per line, the first being the
# instance id and the remaining ones the acceptable sense keys.
d42_gold = dict()
with open("wsd_hard_benchmark/42D/42D.gold.key.txt") as file:
    for line in file:
        fields = line.split()
        if not fields:
            continue  # tolerate blank lines (e.g. a trailing newline)
        if len(fields) < 2:
            raise ValueError(f"gold key line has no sense key: {line!r}")
        d42_gold[fields[0]] = fields[1:]


In [38]:
print(len(d42_data))
# Preview a handful of entries instead of dumping the whole dictionary into
# the cell output.
print(list(d42_data.items())[:5])
mykey = d42_gold['42D.d000.s000.t004']
print(mykey)

370
['living%5:00:00:extant:00']


In [33]:
def get_lemmas(lemma, pos):
    """Lemmatize each whitespace-separated word of ``lemma`` with WordNet.

    Args:
        lemma: A word or a space-separated multi-word expression.
        pos: Part-of-speech tag forwarded to ``WordNetLemmatizer.lemmatize``.

    Returns:
        The lemmatized words joined back together with single spaces.
    """
    wordnet_lemmatizer = WordNetLemmatizer()
    pieces = []
    for token in lemma.split():
        pieces.append(wordnet_lemmatizer.lemmatize(token, pos=pos))
    return " ".join(pieces)

In [42]:
# Lookup table from sense-key codes to names.
#   "1".."4"   -> coarse part-of-speech names
#   "00".."44" -> supersense names, keyed by their zero-padded position in
#                 the ordered tuple below
super_sense_dict = {
    "1": "noun",
    "2": "verb",
    "3": "adjective",
    "4": "adverb",
    **{
        f"{file_number:02d}": supersense
        for file_number, supersense in enumerate((
            "adj.all", "adj.pert", "adv.all",
            "noun.Tops", "noun.act", "noun.animal", "noun.artifact",
            "noun.attribute", "noun.body", "noun.cognition",
            "noun.communication", "noun.event", "noun.feeling", "noun.food",
            "noun.group", "noun.location", "noun.motive", "noun.object",
            "noun.person", "noun.phenomenon", "noun.plant", "noun.possession",
            "noun.process", "noun.quantity", "noun.relation", "noun.shape",
            "noun.state", "noun.substance", "noun.time",
            "verb.body", "verb.change", "verb.cognition",
            "verb.communication", "verb.competition", "verb.consumption",
            "verb.contact", "verb.creation", "verb.emotion", "verb.motion",
            "verb.perception", "verb.possession", "verb.social",
            "verb.stative", "verb.weather",
            "adj.ppl",
        ))
    },
}
print(super_sense_dict["23"])

def extract_sense_from_gold(label):
    """Return the supersense name encoded in a gold sense key.

    Args:
        label: A string containing a sense key such as ``"alpine%3:01:00::"``
            (text before the ``%`` is ignored).

    Returns:
        The ``super_sense_dict`` entry for the two characters that start three
        positions after the first ``%``.

    Raises:
        ValueError: If ``label`` contains no ``%``.
        KeyError: If the extracted two-character code is not in
            ``super_sense_dict``.
    """
    # Skip "%", the one-character code after it and the ":" separator.
    code_start = label.index("%") + 3
    code_end = code_start + 2
    return super_sense_dict[label[code_start:code_end]]
    
# Spot-check the extraction on a bare sense key and on two full gold-file lines.
for sample_label in (
    "alpine%3:01:00::",
    "42D.d011.s002.t008 experiment%2:41:02::",
    "42D.d012.s004.t005 foodstuff%1:13:00::",
):
    print(extract_sense_from_gold(sample_label))

noun.quantity
adj.pert
verb.social
noun.food


In [59]:
BATCH_SIZE = 32
# Ceiling division, so a dataset whose size is an exact multiple of BATCH_SIZE
# does not produce an empty trailing batch.
ITERATIONS = (len(d42_data) + BATCH_SIZE - 1) // BATCH_SIZE

# One entry per instance, in `all_ids` order: either the value returned by
# convert_ids_to_senses or the string "skipped".
predictions = []
total_correct = 0
correct_examples = []
incorrect_examples = []
total_skipped = 0

# Fixed instance order shared by batching and scoring.
all_ids = list(d42_data.keys())

with tf.Session() as session:
    sensebert_model = SenseBert("sensebert-large-uncased", session=session)  # or sensebert-base-uncased

    for i in range(ITERATIONS):
        start_idx = i * BATCH_SIZE
        end_idx = min((i + 1) * BATCH_SIZE, len(all_ids))
        batch_ids = all_ids[start_idx:end_idx]

        sentences = [d42_data[batch_id][1] for batch_id in batch_ids]

        input_ids, input_mask = sensebert_model.tokenize(sentences)
        model_outputs = sensebert_model.run(input_ids, input_mask)
        contextualized_embeddings, mlm_logits, supersense_logits = model_outputs  # these are NumPy arrays

        # Locate each target word in its tokenized sentence by comparing
        # Porter stems. Only the FIRST matching token is used, and a target
        # that does not appear as a single matching token is skipped.
        found_indices = []
        skipping_instances = set()
        for j, idd in enumerate(batch_ids):
            curr_sentence_tokenized = sensebert_model.tokenizer.convert_ids_to_tokens(input_ids[j])
            curr_sentence_tokenized_stemmed = [ps.stem(w) for w in curr_sentence_tokenized]

            target_word = ps.stem(d42_data[idd][0].lower())

            try:
                target_index = curr_sentence_tokenized_stemmed.index(target_word)
            except ValueError:
                # list.index found no matching token: mark the instance as skipped.
                target_index = -1
                skipping_instances.add(j)
            found_indices.append(target_index)
        print(len(found_indices), len(skipping_instances))

        for j in range(len(batch_ids)):
            if j in skipping_instances:
                total_skipped += 1
                predictions.append("skipped")
                continue
            target_word_idx = found_indices[j]
            # assumes supersense_logits is indexed [sentence][token][supersense] — TODO confirm
            prediction = sensebert_model.tokenizer.convert_ids_to_senses([np.argmax(supersense_logits[j][target_word_idx])])
            predictions.append(prediction)

print(len(predictions), len(d42_gold))

# Scoring does not need the TF session.
# NOTE(review): only the first gold sense key of each instance is compared;
# instances with several acceptable keys may deserve credit for any of them.
for curr_key, prediction in zip(all_ids, predictions):
    if isinstance(prediction, str):
        # The "skipped" marker: not scored. Previously its first character
        # ("s") was compared and recorded as an incorrect prediction.
        continue
    gold_label = d42_gold[curr_key][0]
    if prediction[0] == extract_sense_from_gold(gold_label):
        correct_examples.append([curr_key, prediction[0]])
        total_correct += 1
    else:
        incorrect_examples.append([curr_key, prediction[0]])

print(total_correct)

Loading the known model 'sensebert-large-uncased'
INFO:tensorflow:Restoring parameters from gs://ai21-public-models/sensebert-large-uncased/variables/variables
Loading the known tokenizer 'sensebert-large-uncased'
32 0
32 1
32 2
32 3
32 4
32 0
32 0
32 3
32 1
32 0
32 1
18 2
370 370
150


In [60]:
# Accuracy over the instances that were actually scored; skipped ones are
# excluded from the denominator. Guard against nothing having been scored.
evaluated = len(d42_gold) - total_skipped
print("Accuracy:", total_correct / evaluated if evaluated else float("nan"))

Accuracy: 0.42492917847025496


In [65]:
def write_predictions_to_file(filename, preds):
    """Write one human-readable record per prediction to ``filename``.

    Args:
        filename: Output path; the file is overwritten.
        preds: Predictions in the same order as the keys of ``d42_data``.

    Raises:
        IndexError: If ``preds`` has more entries than ``d42_data``.
    """
    # Build the key list once instead of once per prediction.
    all_keys = list(d42_data.keys())
    with open(filename, 'w') as file:
        for j, pred in enumerate(preds):
            curr_key = all_keys[j]
            target_word, s, pos = d42_data[curr_key]
            line = f"Target word: {target_word}\nKey: {curr_key}\nSentence: {s}\nPrediction: {pred}\n\n"
            file.write(line)
            
def write_tuples_to_file(filename, examples):
    """Write scored examples, with their gold supersense, to ``filename``.

    Args:
        filename: Output path; the file is overwritten.
        examples: Iterable of ``(instance key, predicted sense)`` pairs.
    """
    with open(filename, 'w') as out:
        for curr_key, pred in examples:
            target_word, s, _pos = d42_data[curr_key]
            # Only the first gold sense key is reported as the actual sense.
            actual_sense = extract_sense_from_gold(d42_gold[curr_key][0])
            out.write(
                f"Target word: {target_word}\nKey: {curr_key}\nSentence: {s}\nPrediction: {pred}\nActual Sense: {actual_sense}\n\n"
            )

In [66]:
# Full prediction dump, one record per instance.
write_predictions_to_file("42d_preds_sensebert_final.txt", predictions)

# Correct and incorrect examples go to separate files.
for out_path, scored_examples in (
    ("42d_correct_sensebert.txt", correct_examples),
    ("42d_incorrect_sensebert.txt", incorrect_examples),
):
    write_tuples_to_file(out_path, scored_examples)