In [1]:
import os
import pickle
import pandas as pd

import yadisk
import zipfile
import nltk

from for_masked_lm import HypernymMaskedModel, ResultsWrapper
from vector_model import VectorModel
from prompt_utils import basic_prompts



In [2]:
# loading dataset
# NOTE: pickle.load can execute arbitrary code; only open a dataset.pickle that comes from a trusted source.
# The handle has a real name because `_` is IPython's "last output" variable and should not be rebound.
with open('dataset.pickle', 'rb') as dataset_file:
    dataset = pickle.load(dataset_file)

In [6]:
# Show the loaded dataset (the notebook renders it as a truncated table).
dataset

Unnamed: 0,data,base,gold,set
0,maliciousness,Concept,"malevolence,distaste,hatred,hate,malignity",1A.english.test
1,buckler,Concept,body armor,1A.english.test
2,spelunker,Concept,"exploration,adventurer,explorer",1A.english.test
3,quo warranto,Concept,"proceedings,legal proceedings,proceeding,due p...",1A.english.test
4,Jeff Francis,Entity,"thrower,baseball player,jock,person",1A.english.test
...,...,...,...,...
3045,foreshadowing,Concept,"anticipation,prediction,prevision,forecast,for...",1A.english.training
3046,salamander,Concept,"amphibian,animal",1A.english.training
3047,praetor,Concept,"magistrate,judge,person",1A.english.training
3048,endocarditis,Concept,"carditis,inflammation,disorder,disease,sickness",1A.english.training


In [None]:
# load vector model (fasttext)
# Fetches the public fastText archive from Yandex Disk and unpacks it into the working directory.
y = yadisk.YaDisk()
filename = 'fasttext.zip'
url = 'https://disk.yandex.ru/d/dM3Vn2mlExzyZQ'
# Skip the download when the archive is already on disk so re-running the notebook stays cheap;
# delete the archive to force a fresh download.
if not os.path.exists(filename):
    y.download_public(url, filename)
with zipfile.ZipFile(filename, 'r') as zip_ref:
    zip_ref.extractall()

In [3]:
# Download the NLTK WordNet corpus; when it is already installed the call only reports it as up to date.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/jupyter/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# Build the vector model from the fastText binary under fasttext/
# (presumably the file unpacked from fasttext.zip; verify the path after extraction).
fasttext = VectorModel('fasttext/umbc_model_best.bin')

In [4]:
# alternatively, you can load already predicted candidates
# NOTE: pickle.load can execute arbitrary code; only load prediction files you trust.
# The handle has a real name because `_` is IPython's "last output" variable and should not be rebound.
with open('fasttext_predictions.pickle', 'rb') as preds_file:
    preds = pickle.load(preds_file)
# Attach the candidates to the full dataset as the 'pred' column.
dataset['pred'] = preds

In [5]:
# define the subset
subset_name = '1A.english.trial'
# Take an explicit copy: the boolean-mask selection is later handed to code that may add or
# overwrite columns, and an independent frame avoids SettingWithCopyWarning and any
# accidental coupling to `dataset`.
subset = dataset[dataset['set'] == subset_name].copy()

In [None]:
# If you are using the vector model, get predictions from it instead of the pickled ones.
# NOTE(review): this rebinds `subset` to the value returned by predict(); presumably the same rows
# with the candidates stored under col_out ('pred'). Confirm against vector_model.
subset = fasttext.predict(subset, k=15, filter_=True, col_in='data', col_out='pred')

In [9]:
# Peek at the first three rows of the subset.
subset.head(3)

Unnamed: 0,data,base,gold,set,pred
1500,dirham,Concept,monetary unit,1A.english.trial,
1501,sociology department,Concept,"academic department,department,dept",1A.english.trial,department
1502,Burger King,Entity,"eating house,eating place,restaurant,eatery,ch...",1A.english.trial,"King,Burger"


In [6]:
# loading model
# Masked-LM wrapper around the "bert-base-cased" checkpoint, placed on the CPU.
model = HypernymMaskedModel(model_path="bert-base-cased", device='cpu')
# NOTE: pickle.load can execute arbitrary code; only load a stopwords.pickle you trust.
# The handle has a real name because `_` is IPython's "last output" variable and should not be rebound.
with open('stopwords.pickle', 'rb') as stopwords_file:
    STOPWORDS = pickle.load(stopwords_file)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [9]:
# use model to get predictions
# Runs the masked-LM model over `subset`, passing the three prompt groups from `basic_prompts`
# and the previously computed candidates from the 'pred' column.
# NOTE(review): the recorded output of this cell ends in
#   TypeError: HypernymMaskedModel.probable_words() got an unexpected keyword argument 'article'
# so `article=` is not accepted by the installed version of for_masked_lm. Confirm the intended
# parameter name (or drop the argument) against that module before re-running.
output = model.prediction_in_dataset(
        subset,
        hyper_prompts=basic_prompts['hyper_prompts'],
        cohypo_prompts=basic_prompts['best_cohypo_prompt'],
        mixed_prompts=basic_prompts['mixed_prompts'],
        candidates=subset['pred'],
        k_out=15,
        k_hyper=15,
        k_hypo=15,
        col_in='data',
        stopwords=STOPWORDS,
        article=(True, True, False),
        seed=42
)

2it [02:20, 76.88s/it]

TypeError: HypernymMaskedModel.probable_words() got an unexpected keyword argument 'article'

In [19]:
# unpack results
# Wrap the `output` tables together with the subset they were computed for.
results = ResultsWrapper(subset, tables=output, col_in='data')
# Uncomment the next line to call results.save() (presumably persists the tables; confirm in for_masked_lm).
#results.save()

In [7]:
# or load already saved
# NOTE(review): only the first row of `subset` is passed here (`.iloc[:1, :]`), whereas the cell that
# builds results from `output` passes the whole subset. If the metrics are computed against this
# frame, they cover a single item. Confirm whether this slice is intentional.
results = ResultsWrapper(subset.iloc[:1, :], tables=None, col_in='data')
results.load()

In [8]:
# get metric values
# Scores the wrapped results against the 'gold' column and renders the resulting table.
evaluation = results.calculate_metrics(col_gold='gold')
display(evaluation)
# Uncomment to export the table as a tab-separated file:
#evaluation.to_csv('bert_trial_results.tsv', sep='\t')

Unnamed: 0,MAP@15-bert_hyper,MAP@15-bert_hyper_iter,MAP@15-ft+bert_hyper,MAP@15-ft+bert_hyper_iter,MAP@15-ft+bert_cohypo,MAP@15-ft+bert_cohypo_iter,MRR@15-bert_hyper,MRR@15-bert_hyper_iter,MRR@15-ft+bert_hyper,MRR@15-ft+bert_hyper_iter,MRR@15-ft+bert_cohypo,MRR@15-ft+bert_cohypo_iter
<target> is a [MASK].,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
a <target> is a [MASK].,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"[MASK], such as <target>.",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"[MASK], such as a <target>.",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"a [MASK], such as <target>.",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"a [MASK], such as a <target>.",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
<target> is a type of [MASK].,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
a <target> is a type of [MASK].,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
My favorite [MASK] is <target>.,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
My favorite [MASK] is a <target>.,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
