In [1]:
from allennlp.data import Vocabulary

vocab = Vocabulary.from_files(directory="https://allennlp.s3-us-west-2.amazonaws.com/knowbert/models/vocabulary_wordnet_wiki.tar.gz")
print(vocab)

Vocabulary with namespaces:
 	Non Padded Namespaces: {'*labels', '*tags'}
 	Namespace: entity_wiki, Size: 470116 
 	Namespace: entity_wordnet, Size: 117663 



In [6]:
print(vocab.get_token_from_index(2, "entity_wordnet"))
print(vocab.get_token_index("able.a.01", "entity_wordnet"))

able.a.01
2


In [2]:
from test_model import load_soldered_kg_wiki

wiki_soldered_kg = load_soldered_kg_wiki(vocab)

0it [00:00, ?it/s]

Loaded wiki embedding
Init Bert Encoder
Loaded wiki entity linker


In [3]:
from test_model import load_soldered_kg_wordnet

wordnet_soldered_kg = load_soldered_kg_wordnet(vocab)

25
tensor([0, 8, 3,  ..., 2, 6, 7])
Loaded wordnet embedding
Init Bert Encoder
Loaded wordnet entity linker


In [4]:
from test_model import load_model

model = load_model(preloaded_wiki_soldered_kg=wiki_soldered_kg, preloaded_wordnet_soldered_kg=wordnet_soldered_kg, preloaded_vocab=vocab)

Loaded Vocabulary
Vocabulary with namespaces:
 	Non Padded Namespaces: {'*labels', '*tags'}
 	Namespace: entity_wiki, Size: 470116 
 	Namespace: entity_wordnet, Size: 117663 

Loaded wiki soldered KG
Loaded wordnet soldered KG


In [6]:
import torch
device = torch.device('cuda:0')

In [7]:
model.to(device=device)

KnowBert(
  (nsp_loss_function): CrossEntropyLoss()
  (lm_loss_function): CrossEntropyLoss()
  (pretrained_bert): BertForPreTraining(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): BertLayerNorm()
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0): BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_f

In [6]:
for name, param in model.named_parameters():
    print(name, ":", param)

pretrained_bert.bert.embeddings.word_embeddings.weight : Parameter containing:
tensor([[-0.0239, -0.0480, -0.0137,  ..., -0.0430, -0.0284, -0.0117],
        [-0.0286, -0.0485, -0.0169,  ..., -0.0304, -0.0246, -0.0132],
        [-0.0282, -0.0523, -0.0279,  ..., -0.0355, -0.0467, -0.0164],
        ...,
        [-0.0287, -0.0492, -0.0217,  ..., -0.0054, -0.0176, -0.0284],
        [-0.0407, -0.0353, -0.0180,  ..., -0.0017, -0.0227, -0.0054],
        [-0.0223, -0.0685, -0.0270,  ..., -0.0316, -0.0430,  0.0405]],
       requires_grad=True)
pretrained_bert.bert.embeddings.position_embeddings.weight : Parameter containing:
tensor([[ 0.0062, -0.0181, -0.0468,  ...,  0.0143,  0.0074,  0.0046],
        [ 0.0040, -0.0042, -0.0177,  ...,  0.0248,  0.0203, -0.0066],
        [-0.0132, -0.0014, -0.0061,  ...,  0.0131,  0.0145, -0.0069],
        ...,
        [ 0.0283,  0.0044, -0.0053,  ...,  0.0186,  0.0102, -0.0332],
        [ 0.0189,  0.0134,  0.0068,  ...,  0.0070, -0.0137, -0.0254],
        [ 0.00

In [5]:
from KBQA.appB.transformer_architectures.kb.knowbert_utils import KnowBertBatchifier

archive_file = 'https://allennlp.s3-us-west-2.amazonaws.com/knowbert/models/knowbert_wiki_wordnet_model.tar.gz'
batcher = KnowBertBatchifier(archive_file)



Building Generators
duplicate_mentions_cnt:  6777
end of p_e_m reading. wall time: 0.9928577423095704  minutes
p_e_m_errors:  0
incompatible_ent_ids:  0
Build Generators
{'_tokenizer_kwargs': {'use_fast': True}, '_model_name': 'bert-base-uncased', 'tokenizer': PreTrainedTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}), '_add_special_tokens': True, '_max_length': None, '_tokenizer_lowercases': True, 'sequence_pair_start_tokens': [[CLS]], 'sequence_pair_mid_tokens': [[SEP]], 'sequence_pair_end_tokens': [[SEP]], 'sequence_pair_first_token_type_id': 0, 'sequence_pair_second_token_type_id': 1, 'single_sequence_start_tokens': [[CLS]], 'single_sequence_end_tokens': [[SEP]], 'single_sequence_token_type_id': 0}
Done building


In [8]:
import torch
sentences = ["Paris is located in France.", "KnowBert is a knowledge enhanced BERT"]
model.eval()

# batcher takes raw untokenized sentences
# and yields batches of tensors needed to run KnowBert
for batch in batcher.iter_batches(sentences, verbose=True):
    # model_output['contextual_embeddings'] is (batch_size, seq_len, embed_dim) tensor of top layer activations
    batch['tokens']['tokens'] = batch['tokens']['tokens']['tokens']
    batch['candidates']['wiki']['candidate_entities']['ids'] = \
        batch['candidates']['wiki']['candidate_entities']['ids']['token_characters']

    candidate_mask = (batch['candidates']['wiki']['candidate_entities']['ids'] > 0).type(torch.uint8)
    batch['candidates']['wiki']['candidate_entities']['ids'] -= candidate_mask
    print(candidate_mask)

    batch['candidates']['wordnet']['candidate_entities']['ids'] = \
        batch['candidates']['wordnet']['candidate_entities']['ids']['token_characters']

    candidate_mask = (batch['candidates']['wordnet']['candidate_entities']['ids'] > 0).type(torch.uint8)
    batch['candidates']['wordnet']['candidate_entities']['ids'] -= candidate_mask
    print(batch)
    model_output = model(**batch)
    print(model_output)

offsets: [1, 2, 3, 4, 5, 6]
word_piece_tokens: [['paris'], ['is'], ['located'], ['in'], ['france'], ['.']]
tokens: ['Paris', 'is', 'located', 'in', 'France', '.']
name wiki
mention_generator <KBQA.appB.transformer_architectures.kb.wiki_linking_util.WikiCandidateMentionGenerator object at 0x7f387c127610>
name wordnet
mention_generator <KBQA.appB.transformer_architectures.kb.wordnet.WordNetCandidateMentionGenerator object at 0x7f387c29ef40>
token_candidates: {'tokens': ['[CLS]', 'paris', 'is', 'located', 'in', 'france', '.', '[SEP]'], 'segment_ids': [0, 0, 0, 0, 0, 0, 0, 0], 'candidates': {'wiki': {'tokenized_text': ['Paris', 'is', 'located', 'in', 'France', '.'], 'candidate_spans': [[1, 1], [3, 3], [5, 5], [6, 6]], 'candidate_entities': [['Paris', 'Paris_(mythology)', 'Paris_Hilton', 'Paris,_Texas', 'Paris,_Kentucky', 'Paris_Las_Vegas', 'Paris_Masters', 'Paris_(Paris_Hilton_album)', 'Paris,_Missouri', 'Paris_Metropolitan_Area', 'Paris_(The_Cure_album)', 'Count_Paris', 'University_of_Par

In [10]:
print(batcher.vocab)
print(batcher.vocab.get_token_index('France', namespace='entity_wiki'))

Vocabulary with namespaces:
 	Non Padded Namespaces: {'*labels', '*tags', 'tokens'}
 	Namespace: tokens, Size: 30522 
 	Namespace: entity_wordnet, Size: 117664 
 	Namespace: entity_wiki, Size: 470117 

625


In [18]:
import torch
test_tensor = torch.tensor([2,0,6,4,0,3])
mask_tensor = (test_tensor > 0).type(torch.uint8)
print(mask_tensor)
print(test_tensor - mask_tensor)

tensor([1, 0, 1, 1, 0, 1], dtype=torch.uint8)
tensor([1, 0, 5, 3, 0, 2])
