In [4]:
import tensorflow as tf
from sensebert import SenseBert
import numpy as np

In [2]:
# Smoke test: load SenseBERT, push two example sentences through it and print
# the raw outputs plus the top supersense predicted at one token position.
demo_sentences = [
    "I went to the store to buy some groceries.",
    "The store was closed.",
]

with tf.Session() as session:
    sensebert_model = SenseBert("sensebert-base-uncased", session=session)  # or sensebert-large-uncased

    input_ids, input_mask = sensebert_model.tokenize(demo_sentences)
    model_outputs = sensebert_model.run(input_ids, input_mask)
    # The three outputs are NumPy arrays.
    contextualized_embeddings, mlm_logits, supersense_logits = model_outputs
    print(model_outputs)

    # Arg-max supersense at token position 9 of the first sentence
    # (presumably "groceries" if position 0 is a special token — TODO confirm).
    top_sense_id = np.argmax(supersense_logits[0][9])
    print(sensebert_model.tokenizer.convert_ids_to_senses([top_sense_id]))

    print(sensebert_model)
    print(supersense_logits.shape)
    # Show what the tokenizer returns for the same sentences.
    print(sensebert_model.tokenize(demo_sentences))

[array([[[-0.6506447 , -0.16967805,  0.14072387, ..., -0.37985668,
          0.82097167, -0.1802699 ],
        [-0.43151152,  0.04906671,  0.33647287, ..., -0.7424973 ,
          0.86205894,  0.56475866],
        [ 0.11911742,  0.30231696,  0.01322322, ..., -0.9130749 ,
         -0.04722284,  0.52869815],
        ...,
        [-0.27734977, -0.2632196 , -0.42078534, ..., -1.3844373 ,
         -0.25757593,  0.48685086],
        [ 0.3355558 , -0.24686146,  0.09991715, ..., -0.94355595,
          0.23015074,  0.49161816],
        [-0.46015036, -0.2328133 , -0.19267169, ..., -0.4300249 ,
          0.7341133 ,  0.20681112]],

       [[-0.31325674, -0.06467588,  0.38356358, ..., -0.3004378 ,
          0.17605749, -0.11610046],
        [-0.2908376 ,  0.7626818 , -0.16345486, ..., -1.0970807 ,
          0.07459535,  0.1260822 ],
        [-0.189485  , -0.27419227, -0.97271645, ..., -1.5522854 ,
         -0.7494767 ,  0.2617209 ],
        ...,
        [ 0.49235207,  0.49432755,  0.8935569 , ..., 

In [5]:
import nltk
nltk.download('wordnet')
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import PorterStemmer

# Sanity check of the two normalisers used later in the notebook:
# the WordNet lemmatizer (with a verb POS tag) and the Porter stemmer.
for verb_form in ("belabor", "belabored"):
    print(WordNetLemmatizer().lemmatize(verb_form, pos="v"))

# `ps` is reused by the evaluation cells further down.
ps = PorterStemmer()
words = list(map(ps.stem, ["sentence", "sentences", "excuse", "excused", "excuses"]))
print(words)

# Check how the stemmer treats a regular form and an irregular past tense.
for verb_form in ("buy", "bought"):
    print(ps.stem(verb_form))

[nltk_data] Downloading package wordnet to /home/ubuntu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


belabor
belabor
['sentenc', 'sentenc', 'excus', 'excus', 'excus']
buy
bought


In [90]:
# Evaluate sensebert-base-uncased on the WiC dev split.
# Each example is a pair of sentences sharing a target word; the pair is
# predicted "same sense" when the arg-max supersense at the target token is
# identical in both sentences. Prints the confusion counts TP, FP, TN, FN.

# Each data row: target word, POS tag, "loc1-loc2" positions, sentence 1, sentence 2.
dev_data = []
with open("WiC_dataset/dev/dev.data.txt") as file:
    for line in file:
        fields = line.strip().split("\t")
        if fields == [""]:
            continue  # tolerate blank lines (e.g. a trailing newline at end of file)
        key, pos, locs, s1, s2 = fields[0], fields[1], fields[2], fields[3], fields[4]
        loc1, loc2 = locs.split("-")
        dev_data.append((key, s1, s2, loc1, loc2, pos))

# Gold labels are kept as the strings "True"/"False"; "T" marks a same-sense pair.
# strip() (rather than strip("\n")) also copes with CRLF line endings.
with open("WiC_dataset/dev/dev.gold.txt") as file:
    dev_gold = ["True" if label.strip() == "T" else "False" for label in file if label.strip()]

assert len(dev_data) == len(dev_gold), "data and gold files have different numbers of examples"

dev = list(zip(dev_data, dev_gold))

BATCH_SIZE = 32
# Ceiling division: avoids a trailing empty batch when the dataset size is an
# exact multiple of BATCH_SIZE.
ITERATIONS = (len(dev_data) + BATCH_SIZE - 1) // BATCH_SIZE

# Irregular forms the Porter stemmer leaves unchanged, mapped to the stem of
# their base form. Applied to the stemmed tokens of BOTH sentences.
IRREGULAR_STEMS = {"bought": "buy"}
lemmatizer = WordNetLemmatizer()

with tf.Session() as session:
    sensebert_model = SenseBert("sensebert-base-uncased", session=session)  # or sensebert-large-uncased

    TP, FP, TN, FN = 0, 0, 0, 0

    for i in range(ITERATIONS):
        start_idx = i * BATCH_SIZE
        end_idx = min((i + 1) * BATCH_SIZE, len(dev_data))

        # Flatten the batch: entries 2k and 2k+1 belong to the same WiC pair.
        sentences = []
        locations = []
        target_words = []
        poses = []
        for key, s1, s2, loc1, loc2, pos in dev_data[start_idx:end_idx]:
            target_words += [key, key]
            sentences += [s1, s2]
            locations += [loc1, loc2]
            poses += [pos.lower(), pos.lower()]

        # Tokenize once per batch; the ids are reused for the model run below.
        input_ids, input_mask = sensebert_model.tokenize(sentences)
        batch_tokens = [sensebert_model.tokenizer.convert_ids_to_tokens(ids) for ids in input_ids]

        # First guess for the target token: the dataset position shifted by one
        # (presumably to skip a leading special token — TODO confirm).
        found_indices = [int(loc) + 1 for loc in locations]
        found_words = [tokens[idx] for tokens, idx in zip(batch_tokens, found_indices)]

        for j in range(0, len(sentences), 2):
            lemma_1 = lemmatizer.lemmatize(found_words[j], pos=poses[j])
            lemma_2 = lemmatizer.lemmatize(found_words[j + 1], pos=poses[j])
            if lemma_1 == lemma_2:
                continue

            # The two guessed tokens disagree, so at least one position is off.
            # Fall back to searching each sentence for the stem of the target word.
            key_word = ps.stem(target_words[j])
            for k in (j, j + 1):
                stems = [IRREGULAR_STEMS.get(stem, stem) for stem in map(ps.stem, batch_tokens[k])]
                try:
                    position = stems.index(key_word)
                except ValueError:
                    # list.index raises ValueError when the stem is absent; keep the
                    # position-based guess instead of aborting the whole evaluation.
                    print(f"warning: stem {key_word!r} not found in {sentences[k]!r}; "
                          f"keeping token index {found_indices[k]}")
                    continue
                found_indices[k] = position
                found_words[k] = batch_tokens[k][position]

        model_outputs = sensebert_model.run(input_ids, input_mask)
        contextualized_embeddings, mlm_logits, supersense_logits = model_outputs  # these are NumPy arrays

        for j in range(0, len(sentences), 2):
            pred_1 = sensebert_model.tokenizer.convert_ids_to_senses(
                [np.argmax(supersense_logits[j][found_indices[j]])])
            pred_2 = sensebert_model.tokenizer.convert_ids_to_senses(
                [np.argmax(supersense_logits[j + 1][found_indices[j + 1]])])
            same_sense = pred_1[0] == pred_2[0]
            gold = dev_gold[start_idx + j // 2]

            # Show the details for the first batch only.
            if start_idx == 0:
                print(sentences[j], sentences[j + 1])
                print(found_words[j], found_words[j + 1])
                print(pred_1[0], pred_2[0])
                print(f'prediction: {same_sense}')
                print(f'actual: {gold}')
                print()

            if same_sense:
                if gold == "True":
                    TP += 1
                else:
                    FP += 1
            else:
                if gold == "True":
                    FN += 1
                else:
                    TN += 1

    print(TP, FP, TN, FN)
            


Loading the known model 'sensebert-base-uncased'
INFO:tensorflow:Restoring parameters from gs://ai21-public-models/sensebert-base-uncased/variables/variables
Loading the known tokenizer 'sensebert-base-uncased'
Room and board . He nailed boards across the windows .
board boards
noun.substance verb.consumption
prediction: False
actual: False

Circulate a rumor . This letter is being circulated among the faculty .
circulate circulated
verb.communication verb.change
prediction: False
actual: False

Hook a fish . He hooked a snake accidentally , and was so scared he dropped his rod into the water .
hook hooked
verb.contact verb.contact
prediction: True
actual: True

For recreation he wrote poetry and solved crossword puzzles . Drug abuse is often regarded as a form of recreation .
recreation recreation
noun.act noun.act
prediction: True
actual: True

Making a hobby of domesticity . A royal family living in unpretentious domesticity .
domesticity domesticity
noun.act noun.act
prediction: Tr

In [8]:
# Evaluate sensebert-large-uncased on the WiC dev split.
# Each example is a pair of sentences sharing a target word; the pair is
# predicted "same sense" when the arg-max supersense at the target token is
# identical in both sentences. Prints the confusion counts TP, FP, TN, FN.

# Each data row: target word, POS tag, "loc1-loc2" positions, sentence 1, sentence 2.
dev_data = []
with open("WiC_dataset/dev/dev.data.txt") as file:
    for line in file:
        fields = line.strip().split("\t")
        if fields == [""]:
            continue  # tolerate blank lines (e.g. a trailing newline at end of file)
        key, pos, locs, s1, s2 = fields[0], fields[1], fields[2], fields[3], fields[4]
        loc1, loc2 = locs.split("-")
        dev_data.append((key, s1, s2, loc1, loc2, pos))

# Gold labels are kept as the strings "True"/"False"; "T" marks a same-sense pair.
# strip() (rather than strip("\n")) also copes with CRLF line endings.
with open("WiC_dataset/dev/dev.gold.txt") as file:
    dev_gold = ["True" if label.strip() == "T" else "False" for label in file if label.strip()]

assert len(dev_data) == len(dev_gold), "data and gold files have different numbers of examples"

dev = list(zip(dev_data, dev_gold))

BATCH_SIZE = 32
# Ceiling division: avoids a trailing empty batch when the dataset size is an
# exact multiple of BATCH_SIZE.
ITERATIONS = (len(dev_data) + BATCH_SIZE - 1) // BATCH_SIZE

# Irregular forms the Porter stemmer leaves unchanged, mapped to the stem of
# their base form. Applied to the stemmed tokens of BOTH sentences.
IRREGULAR_STEMS = {"bought": "buy"}
lemmatizer = WordNetLemmatizer()

with tf.Session() as session:
    sensebert_model = SenseBert("sensebert-large-uncased", session=session)  # or sensebert-base-uncased

    TP, FP, TN, FN = 0, 0, 0, 0

    for i in range(ITERATIONS):
        start_idx = i * BATCH_SIZE
        end_idx = min((i + 1) * BATCH_SIZE, len(dev_data))

        # Flatten the batch: entries 2k and 2k+1 belong to the same WiC pair.
        sentences = []
        locations = []
        target_words = []
        poses = []
        for key, s1, s2, loc1, loc2, pos in dev_data[start_idx:end_idx]:
            target_words += [key, key]
            sentences += [s1, s2]
            locations += [loc1, loc2]
            poses += [pos.lower(), pos.lower()]

        # Tokenize once per batch; the ids are reused for the model run below.
        input_ids, input_mask = sensebert_model.tokenize(sentences)
        batch_tokens = [sensebert_model.tokenizer.convert_ids_to_tokens(ids) for ids in input_ids]

        # First guess for the target token: the dataset position shifted by one
        # (presumably to skip a leading special token — TODO confirm).
        found_indices = [int(loc) + 1 for loc in locations]
        found_words = [tokens[idx] for tokens, idx in zip(batch_tokens, found_indices)]

        for j in range(0, len(sentences), 2):
            lemma_1 = lemmatizer.lemmatize(found_words[j], pos=poses[j])
            lemma_2 = lemmatizer.lemmatize(found_words[j + 1], pos=poses[j])
            if lemma_1 == lemma_2:
                continue

            # The two guessed tokens disagree, so at least one position is off.
            # Fall back to searching each sentence for the stem of the target word.
            key_word = ps.stem(target_words[j])
            for k in (j, j + 1):
                stems = [IRREGULAR_STEMS.get(stem, stem) for stem in map(ps.stem, batch_tokens[k])]
                try:
                    position = stems.index(key_word)
                except ValueError:
                    # list.index raises ValueError when the stem is absent; keep the
                    # position-based guess instead of aborting the whole evaluation.
                    print(f"warning: stem {key_word!r} not found in {sentences[k]!r}; "
                          f"keeping token index {found_indices[k]}")
                    continue
                found_indices[k] = position
                found_words[k] = batch_tokens[k][position]

        model_outputs = sensebert_model.run(input_ids, input_mask)
        contextualized_embeddings, mlm_logits, supersense_logits = model_outputs  # these are NumPy arrays

        for j in range(0, len(sentences), 2):
            pred_1 = sensebert_model.tokenizer.convert_ids_to_senses(
                [np.argmax(supersense_logits[j][found_indices[j]])])
            pred_2 = sensebert_model.tokenizer.convert_ids_to_senses(
                [np.argmax(supersense_logits[j + 1][found_indices[j + 1]])])
            same_sense = pred_1[0] == pred_2[0]
            gold = dev_gold[start_idx + j // 2]

            # Show the details for the first batch only.
            if start_idx == 0:
                print(sentences[j], sentences[j + 1])
                print(found_words[j], found_words[j + 1])
                print(pred_1[0], pred_2[0])
                print(f'prediction: {same_sense}')
                print(f'actual: {gold}')
                print()

            if same_sense:
                if gold == "True":
                    TP += 1
                else:
                    FP += 1
            else:
                if gold == "True":
                    FN += 1
                else:
                    TN += 1

    print(TP, FP, TN, FN)
            


Loading the known model 'sensebert-large-uncased'
INFO:tensorflow:Restoring parameters from gs://ai21-public-models/sensebert-large-uncased/variables/variables
Loading the known tokenizer 'sensebert-large-uncased'
Room and board . He nailed boards across the windows .
board boards
verb.consumption verb.stative
prediction: False
actual: False

Circulate a rumor . This letter is being circulated among the faculty .
circulate circulated
verb.communication verb.change
prediction: False
actual: False

Hook a fish . He hooked a snake accidentally , and was so scared he dropped his rod into the water .
hook hooked
noun.act adj.all
prediction: False
actual: True

For recreation he wrote poetry and solved crossword puzzles . Drug abuse is often regarded as a form of recreation .
recreation recreation
noun.act noun.act
prediction: True
actual: True

Making a hobby of domesticity . A royal family living in unpretentious domesticity .
domesticity domesticity
noun.act noun.act
prediction: True
actu

In [11]:
# Evaluate sensebert-large-uncased on the WiC test split.
# Each example is a pair of sentences sharing a target word; the pair is
# predicted "same sense" when the arg-max supersense at the target token is
# identical in both sentences. Prints the confusion counts TP, FP, TN, FN.
# NOTE: the dev_* variable names are kept from the dev-split cells but hold
# the TEST data here.

# Each data row: target word, POS tag, "loc1-loc2" positions, sentence 1, sentence 2.
dev_data = []
with open("WiC_dataset/test/test.data.txt") as file:
    for line in file:
        fields = line.strip().split("\t")
        if fields == [""]:
            continue  # tolerate blank lines (e.g. a trailing newline at end of file)
        key, pos, locs, s1, s2 = fields[0], fields[1], fields[2], fields[3], fields[4]
        loc1, loc2 = locs.split("-")
        dev_data.append((key, s1, s2, loc1, loc2, pos))

# Gold labels are kept as the strings "True"/"False"; "T" marks a same-sense pair.
# strip() (rather than strip("\n")) also copes with CRLF line endings.
with open("WiC_dataset/test/test.gold.txt") as file:
    dev_gold = ["True" if label.strip() == "T" else "False" for label in file if label.strip()]

assert len(dev_data) == len(dev_gold), "data and gold files have different numbers of examples"

dev = list(zip(dev_data, dev_gold))

BATCH_SIZE = 32
# Ceiling division: avoids a trailing empty batch when the dataset size is an
# exact multiple of BATCH_SIZE.
ITERATIONS = (len(dev_data) + BATCH_SIZE - 1) // BATCH_SIZE

# Irregular forms (and the contraction piece "ve") that the Porter stemmer
# leaves unchanged, mapped to the stem of their base form. Applied to the
# stemmed tokens of BOTH sentences.
IRREGULAR_STEMS = {
    "bought": "buy",
    "felt": "feel",
    "ve": "have",
    "men": "man",
    "shook": "shake",
    "drank": "drink",
    "sold": "sell",
}
lemmatizer = WordNetLemmatizer()

with tf.Session() as session:
    sensebert_model = SenseBert("sensebert-large-uncased", session=session)  # or sensebert-base-uncased

    TP, FP, TN, FN = 0, 0, 0, 0

    for i in range(ITERATIONS):
        start_idx = i * BATCH_SIZE
        end_idx = min((i + 1) * BATCH_SIZE, len(dev_data))

        # Flatten the batch: entries 2k and 2k+1 belong to the same WiC pair.
        sentences = []
        locations = []
        target_words = []
        poses = []
        for key, s1, s2, loc1, loc2, pos in dev_data[start_idx:end_idx]:
            target_words += [key, key]
            sentences += [s1, s2]
            locations += [loc1, loc2]
            poses += [pos.lower(), pos.lower()]

        # Tokenize once per batch; the ids are reused for the model run below.
        input_ids, input_mask = sensebert_model.tokenize(sentences)
        batch_tokens = [sensebert_model.tokenizer.convert_ids_to_tokens(ids) for ids in input_ids]

        # First guess for the target token: the dataset position shifted by one
        # (presumably to skip a leading special token — TODO confirm).
        found_indices = [int(loc) + 1 for loc in locations]
        found_words = [tokens[idx] for tokens, idx in zip(batch_tokens, found_indices)]

        for j in range(0, len(sentences), 2):
            lemma_1 = lemmatizer.lemmatize(found_words[j], pos=poses[j])
            lemma_2 = lemmatizer.lemmatize(found_words[j + 1], pos=poses[j])
            if lemma_1 == lemma_2:
                continue

            # The two guessed tokens disagree, so at least one position is off.
            # Fall back to searching each sentence for the stem of the target word.
            # Each sentence is handled on its own so that index and word are always
            # updated together, and a miss in one sentence does not block the other.
            key_word = ps.stem(target_words[j])
            for k in (j, j + 1):
                stems = [IRREGULAR_STEMS.get(stem, stem) for stem in map(ps.stem, batch_tokens[k])]
                try:
                    position = stems.index(key_word)
                except ValueError:
                    # Only the "stem not found" case is expected here; print the
                    # debugging context and keep the position-based guess.
                    print(input_ids[k])
                    print(stems)
                    print(key_word)
                    continue
                found_indices[k] = position
                found_words[k] = batch_tokens[k][position]

        model_outputs = sensebert_model.run(input_ids, input_mask)
        contextualized_embeddings, mlm_logits, supersense_logits = model_outputs  # these are NumPy arrays

        for j in range(0, len(sentences), 2):
            pred_1 = sensebert_model.tokenizer.convert_ids_to_senses(
                [np.argmax(supersense_logits[j][found_indices[j]])])
            pred_2 = sensebert_model.tokenizer.convert_ids_to_senses(
                [np.argmax(supersense_logits[j + 1][found_indices[j + 1]])])
            same_sense = pred_1[0] == pred_2[0]
            gold = dev_gold[start_idx + j // 2]

            # Show the details for the first batch only.
            if start_idx == 0:
                print(sentences[j], sentences[j + 1])
                print(found_words[j], found_words[j + 1])
                print(pred_1[0], pred_2[0])
                print(f'prediction: {same_sense}')
                print(f'actual: {gold}')
                print()

            if same_sense:
                if gold == "True":
                    TP += 1
                else:
                    FP += 1
            else:
                if gold == "True":
                    FN += 1
                else:
                    TN += 1

    print(TP, FP, TN, FN)

Loading the known model 'sensebert-large-uncased'
INFO:tensorflow:Restoring parameters from gs://ai21-public-models/sensebert-large-uncased/variables/variables
Loading the known tokenizer 'sensebert-large-uncased'
It was a narrow defeat . The army 's only defeat .
defeat defeat
verb.competition noun.event
prediction: False
actual: True

Groom the dogs . Sheila groomed the horse .
groom groomed
verb.social verb.social
prediction: True
actual: True

The penetration of upper management by women . Any penetration , however slight , is sufficient to complete the offense .
penetration penetration
noun.attribute noun.attribute
prediction: True
actual: True

We hit Detroit at one in the morning but kept driving through the night . An interesting idea hit her .
hit hit
verb.contact verb.motion
prediction: False
actual: False

He was a man of judicial deliberation . A little deliberation would have deterred them .
deliberation deliberation
noun.attribute noun.attribute
prediction: True
actual: F