In [None]:
import os
import pickle
import pandas as pd

import yadisk
import zipfile
import nltk

from for_masked_lm import HypernymMaskedModel, ResultsWrapper
from vector_model import VectorModel
from prompt_utils import basic_prompts

In [None]:
# Load the task dataset from a local pickle file.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only open files that come from a trusted source.
# The handle has a real name instead of `_`: in IPython `_` is the
# "last output" variable, and rebinding it stops IPython from updating it.
with open('dataset.pickle', 'rb') as dataset_file:
    dataset = pickle.load(dataset_file)

In [None]:
# Display the loaded dataset; the rendered output shows the columns
# data, base, gold and set, with the middle rows elided.
dataset

Unnamed: 0,data,base,gold,set
0,maliciousness,Concept,"malevolence,distaste,hatred,hate,malignity",1A.english.test
1,buckler,Concept,body armor,1A.english.test
2,spelunker,Concept,"exploration,adventurer,explorer",1A.english.test
3,quo warranto,Concept,"proceedings,legal proceedings,proceeding,due p...",1A.english.test
4,Jeff Francis,Entity,"thrower,baseball player,jock,person",1A.english.test
...,...,...,...,...
3045,foreshadowing,Concept,"anticipation,prediction,prevision,forecast,for...",1A.english.training
3046,salamander,Concept,"amphibian,animal",1A.english.training
3047,praetor,Concept,"magistrate,judge,person",1A.english.training
3048,endocarditis,Concept,"carditis,inflammation,disorder,disease,sickness",1A.english.training


In [None]:
# Fetch and unpack the pretrained fastText vectors from a public Yandex.Disk link.
# The download is skipped when the archive is already on disk, so re-running
# the notebook does not fetch it again.
# NOTE: if an earlier download was interrupted, delete the partial archive
# (ZipFile raises BadZipFile on it) and re-run this cell.
y = yadisk.YaDisk()
filename = 'fasttext.zip'
url = 'https://disk.yandex.ru/d/dM3Vn2mlExzyZQ'
if not os.path.exists(filename):
    y.download_public(url, filename)
with zipfile.ZipFile(filename, 'r') as zip_ref:
    # NOTE(review): extractall() trusts the member paths stored in the archive;
    # only use it with an archive from a trusted source.
    zip_ref.extractall()

In [None]:
# Download the WordNet corpus through NLTK; per the recorded output this
# reports "already up-to-date" when the package is present and returns True.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/jupyter/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# Build the project's VectorModel wrapper from the fastText binary.
# The path is relative to the working directory; presumably it is produced by
# unpacking fasttext.zip -- TODO confirm the archive layout.
fasttext = VectorModel('fasttext/umbc_model_best.bin')

In [None]:
# Alternative to running the vector model: load precomputed fastText candidates
# and attach them to the full dataset as the 'pred' column.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only open files that come from a trusted source.
# NOTE(review): the fasttext.predict cell further down also writes 'pred';
# on a full top-to-bottom run it presumably replaces these values -- TODO confirm.
# The handle has a real name instead of `_`: in IPython `_` is the
# "last output" variable, and rebinding it stops IPython from updating it.
with open('fasttext_predictions.pickle', 'rb') as preds_file:
    preds = pickle.load(preds_file)
dataset['pred'] = preds

In [None]:
# Select the rows of one split by the value of the 'set' column.
subset_name = '1A.english.trial'
# Take an explicit copy: the subset is later handed to code that writes a
# prediction column, and writing into a boolean-mask slice of `dataset`
# can trigger pandas' SettingWithCopyWarning or leave it ambiguous which
# frame is modified.
subset = dataset[dataset['set'] == subset_name].copy()

In [None]:
# If you are using the vector model, generate candidates for the subset.
# Terms are read from the 'data' column and results go to the 'pred' column
# (per col_in / col_out); k and filter_ are passed straight to predict().
# NOTE: this rebinds `subset`, so re-running the cell feeds the previous
# result back in as input.
subset = fasttext.predict(
    subset,
    k=15,
    filter_=True,
    col_in='data',
    col_out='pred',
)

In [None]:
# Show the first three rows of the subset, including the new 'pred' column.
subset.head(3)

Unnamed: 0,data,base,gold,set,pred
1500,dirham,Concept,monetary unit,1A.english.trial,
1501,sociology department,Concept,"academic department,department,dept",1A.english.trial,department
1502,Burger King,Entity,"eating house,eating place,restaurant,eatery,ch...",1A.english.trial,"King,Burger"


In [None]:
# Instantiate the masked-LM wrapper with the bert-base-cased checkpoint on CPU.
model = HypernymMaskedModel(model_path="bert-base-cased", device='cpu')
# Load the stopword collection that is later passed to the prediction call.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only open files that come from a trusted source.
# The handle has a real name instead of `_`: in IPython `_` is the
# "last output" variable, and rebinding it stops IPython from updating it.
with open('stopwords.pickle', 'rb') as stopwords_file:
    STOPWORDS = pickle.load(stopwords_file)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Run the masked-LM model over the subset with the three prompt groups taken
# from basic_prompts, using the 'pred' column as candidate terms.
# This is the slow step: the recorded run shows 50 items at roughly 96 s each
# (about 1 h 20 min in total).
# NOTE(review): `article` is a 3-tuple of flags -- presumably one per prompt
# group; confirm against HypernymMaskedModel.prediction_in_dataset.
output = model.prediction_in_dataset(
    subset,
    hyper_prompts=basic_prompts['hyper_prompts'],
    cohypo_prompts=basic_prompts['best_cohypo_prompt'],
    mixed_prompts=basic_prompts['mixed_prompts'],
    candidates=subset['pred'],
    k_out=15,
    k_hyper=15,
    k_hypo=15,
    col_in='data',
    stopwords=STOPWORDS,
    article=(True, True, False),
    seed=42,
)

50it [1:20:11, 96.23s/it]


In [None]:
# Wrap the model output together with the subset in a ResultsWrapper and
# persist it via save(), so the slow prediction step need not be repeated.
# NOTE(review): where and in what format save() writes is not visible here.
results = ResultsWrapper(subset, tables=output, col_in='data')
results.save()

In [None]:
# Alternative to the cell above: create an empty wrapper (tables=None) and
# fill it from previously saved results via load().
# NOTE: on a full top-to-bottom run this rebinds `results`, replacing the
# wrapper built from `output`.
results = ResultsWrapper(subset, tables=None, col_in='data')
results.load()

In [None]:
import numpy as np
from metrics import *

In [None]:
# Compute the evaluation table against the 'gold' column and render it;
# the recorded output has one row per prompt and MAP@15 / MRR@15 columns.
# NOTE(review): the rendered table also contains a `data` row that is all
# zeros -- it looks like the input column is scored as if it were a prompt;
# confirm in ResultsWrapper.calculate_metrics.
evaluation = results.calculate_metrics(col_gold='gold')
display(evaluation)
# Optional export of the table as TSV (disabled):
# evaluation.to_csv('bert_trial_results.tsv', sep='\t')

Unnamed: 0,MAP@15-bert_hyper,MAP@15-bert_hyper_iter,MAP@15-ft+bert_hyper,MAP@15-ft+bert_hyper_iter,MAP@15-ft+bert_cohypo,MAP@15-ft+bert_cohypo_iter,MRR@15-bert_hyper,MRR@15-bert_hyper_iter,MRR@15-ft+bert_hyper,MRR@15-ft+bert_hyper_iter,MRR@15-ft+bert_cohypo,MRR@15-ft+bert_cohypo_iter
data,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
<target> is a [MASK].,0.08597,0.07948,0.08382,0.07919,0.03523,0.03474,0.19815,0.19704,0.17173,0.19552,0.09087,0.09576
a <target> is a [MASK].,0.09701,0.10012,0.09811,0.09891,0.05731,0.06173,0.24422,0.24573,0.20668,0.24419,0.12887,0.13889
"[MASK], such as <target>.",0.00909,0.00724,0.01334,0.00741,0.00909,0.00909,0.04848,0.044,0.06067,0.04582,0.04848,0.04848
"[MASK], such as a <target>.",0.00831,0.00606,0.00687,0.00624,0.00831,0.00831,0.03708,0.03472,0.04134,0.03672,0.03708,0.03708
"a [MASK], such as <target>.",0.09252,0.08568,0.08953,0.08424,0.09252,0.09298,0.25623,0.24931,0.21708,0.24244,0.25623,0.25632
"a [MASK], such as a <target>.",0.09436,0.08627,0.09221,0.08421,0.09436,0.09483,0.27388,0.2524,0.23068,0.24001,0.27388,0.27398
<target> is a type of [MASK].,0.08288,0.08061,0.0692,0.08036,0.08288,0.08277,0.25639,0.24156,0.17709,0.24072,0.25639,0.25617
a <target> is a type of [MASK].,0.08686,0.0874,0.07292,0.08651,0.08686,0.08666,0.25008,0.23284,0.18162,0.2332,0.25008,0.24968
My favorite [MASK] is <target>.,0.0901,0.08842,0.08819,0.08819,0.0901,0.08908,0.21168,0.21436,0.1802,0.21686,0.21168,0.21018
