# Testing FewRel few-shot relation extraction

* goal is to make it like the demo at http://opennre.thunlp.ai/#/fewshot_re - in the demo it seems to work pretty well
* based on https://github.com/thunlp/FewRel
* use torch 1.3.1
* copy val_wiki.json as test_wiki.json in the data folder to make it work
* python requirements same as opennre
* the checkpoint files are very big, so they aren't in the repository. One checkpoint is at https://drive.google.com/file/d/1yiz3q3xNz-llsY55g5OdodxiH1RThYuz/view?usp=sharing
* can't train on Prof. Song's GPUs: not enough VRAM. The HPC computers have enough VRAM, but I couldn't get PyTorch to work on them because they run a 10-year-old version of Linux.
* however, testing with this code does work on the GPU: you need a CUDA build of PyTorch (refer to pytorch.org), and wherever you use the model you must call "model = model.cuda()", and likewise "tensor = tensor.cuda()" for every input tensor. Refer to test_script.py.
* on my laptop CPU this code takes about 2 seconds per query; on the GPU it is a lot faster and runs in under 1 second. The speed seems fine, but more testing is required to really find out.


In [27]:
# Path of the trained checkpoint that is later read by __load_model__
# (the file name suggests BERT-PAIR, train_wiki/val_wiki, 5-way 1-shot -- confirm).
checkpoint_path = "checkpoint/pair-bert-train_wiki-val_wiki-5-1.pth.tar"
# Name of the pretrained BERT weights handed to BERTPAIRSentenceEncoder.
bert_pretrained_checkpoint = 'bert-base-uncased'
# Maximum sequence length handed to BERTPAIRSentenceEncoder.
max_length = 128

In [80]:
from fewshot_re_kit.data_loader import FewRelDatasetPair, get_loader_pair
from fewshot_re_kit.framework import FewShotREFramework
from fewshot_re_kit.sentence_encoder import BERTPAIRSentenceEncoder
from models.pair import Pair
import os
import torch

from spacy.tokenizer import Tokenizer
from spacy.lang.en import English
import spacy
import neuralcoref


In [3]:
# Build the BERT-PAIR sentence encoder from the configured pretrained weights
# and maximum sequence length.
sentence_encoder = BERTPAIRSentenceEncoder(bert_pretrained_checkpoint, max_length)

I0112 21:57:26.851216 140734809875904 configuration_utils.py:185] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json from cache at /Users/GuruSenthil/.cache/torch/transformers/4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.bf3b9ea126d8c0001ee8a1e8b92229871d06d36d8808208cc2449280da87785c
I0112 21:57:26.852715 140734809875904 configuration_utils.py:199] Model config {
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 2,
  "output_attentions": false,
  "output_hidden_states": false,
  "output_past": 

In [4]:
# Alternative kept for reference: batched loading through get_loader_pair.
# meow_loader = get_loader_pair('val_wiki', sentence_encoder,
#                 N=5, K=1, Q=1, na_rate=0, batch_size=1, encoder_name='bert')

# Iterate the val_wiki episodes straight from the dataset (N=5, K=1, Q=1).
val_data_loader = iter(
    FewRelDatasetPair(
        'val_wiki',
        sentence_encoder,
        N=5,
        K=1,
        Q=1,
        na_rate=0,
        root='./data',
        encoder_name='bert',
    )
)


In [5]:
# Pair model on top of the sentence encoder; hidden_size=768 matches the
# "hidden_size": 768 entry of the BERT config logged when the encoder was built.
model = Pair(sentence_encoder, hidden_size=768)

In [6]:
# Inspect the type of the 'word' field of one sample (the displayed result is `list`).
# Note: next() consumes one item from val_data_loader.
type(next(val_data_loader)[0]['word'])

{'tokens': ['Since', '1962', 'the', 'claimant', 'to', 'the', 'throne', 'has', 'been', 'Taw', 'Phaya', ',', 'the', 'second', 'son', 'of', 'Princess', 'Myat', 'Phaya', '.'], 'h': ['taw phaya', 'Q3601421', [[9, 10]]], 't': ['myat phaya', 'Q6946802', [[17, 18]]]}
{'tokens': ['His', 'wife', 'Nonia', 'Celsa', 'bore', 'him', 'a', 'son', ',', 'Diadumenianus', ',', 'whom', 'he', 'made', 'co', '-', 'Emperor', 'in', '218', ';', 'both', 'were', 'executed', 'by', 'partisans', 'of', '"', 'Elagabalus', '"', '(', 'see', 'below', ')', '.'], 'h': ['diadumenianus', 'Q46840', [[9]]], 't': ['nonia celsa', 'Q2724125', [[2, 3]]]}
{'tokens': ['Major', 'General', 'Robert', 'Maxwell', 'Johnstone', '(', '9', 'March', '1914', '–', '11', 'March', '1990', ')', 'was', 'a', 'senior', 'British', 'Army', 'officer', '.'], 'h': ['robert maxwell johnstone', 'Q23071389', [[2, 3, 4]]], 't': ['major general', 'Q287709', [[0, 1]]]}
{'tokens': ['At', 'the', 'same', 'time', 'the', '11th', '(', 'East', 'Africa', ')', 'Division',

list

In [7]:
def __load_model__(ckpt):
    '''
    Load a checkpoint file from disk with torch.load.

    ckpt: Path of the checkpoint
    return: Checkpoint dict (the object stored in the file)
    raises: FileNotFoundError (a subclass of Exception) if ckpt is not a file
    '''
    if not os.path.isfile(ckpt):
        raise FileNotFoundError("No checkpoint found at '%s'" % ckpt)
    # A checkpoint saved from a GPU run cannot be deserialized on a CPU-only
    # machine unless its storages are remapped, so fall back to the CPU when
    # CUDA is unavailable. With CUDA present the default behaviour is kept.
    map_location = None if torch.cuda.is_available() else 'cpu'
    checkpoint = torch.load(ckpt, map_location=map_location)
    print("Successfully loaded checkpoint '%s'" % ckpt)
    return checkpoint

        
def item(x):
    '''
    Extract the value of x on PyTorch before and after 0.4:
    x[0] on versions older than 0.4, x.item() otherwise.
    '''
    version_parts = torch.__version__.split('.')
    is_pre_0_4 = int(version_parts[0]) == 0 and int(version_parts[1]) < 4
    return x[0] if is_pre_0_4 else x.item()
    
def tokenize(tokens, head_indices, tail_indices):
    '''
    Forward the tokens and the head/tail entity indices to the module-level
    sentence_encoder.tokenize and return its result unchanged.
    '''
    return sentence_encoder.tokenize(tokens, head_indices, tail_indices)

In [74]:
nlp = spacy.load("en_core_web_sm")
# Tokenizer built from the English defaults
# (punctuation rules and exceptions included)
tokenizer = nlp.Defaults.create_tokenizer(nlp)
[str(token) for token in tokenizer("""hello meow. meow is donald trump's friend""")]

['hello', 'meow', '.', 'meow', 'is', 'donald', 'trump', "'s", 'friend']

In [8]:
# Restore the trained weights from the checkpoint into the model.

model.eval()
state_dict = __load_model__(checkpoint_path)['state_dict']
own_state = model.state_dict()
for name, param in state_dict.items():
    # Checkpoint entries the current model does not have are skipped.
    if name in own_state:
        own_state[name].copy_(param)

Successfully loaded checkpoint '/Users/GuruSenthil/Desktop/pair-bert-train_wiki-val_wiki-5-1.pth.tar'


In [None]:
# Evaluate on the wikidata dataset, which is what they have already implemented.

N, K, Q, na_rate = 5, 1, 1, 0
with torch.no_grad():
    for it in range(10):
        batch, label = next(val_data_loader)
        label = torch.tensor(label)
        # torch.stack turns each field's sequence of tensors into one tensor.
        for field in ('word', 'seg', 'mask'):
            batch[field] = torch.stack(batch[field])
        logits, pred = model(batch, N, K, Q * N + Q * na_rate)
        print(pred, label)
        right = model.accuracy(pred, label)
        print(item(right.data))

In [46]:
N, K, Q, na_rate = 5, 2, 1, 0

# Hand-written support set: N relations, each with K example sentences whose
# head and tail entity mentions are given as plain strings.
example_relation_data = [
    {
        'name': 'love',
        'examples': [
            {'sentence': 'meow loves mo', 'head': 'meow', 'tail': 'mo'},
            {'sentence': 'tom is in love with jull', 'head': 'tom', 'tail': 'jull'},
        ],
    },
    {
        'name': 'hate',
        'examples': [
            {'sentence': 'trump hates the mooch', 'head': 'trump', 'tail': 'mooch'},
            {
                'sentence': 'ivanka and jared dislike each other intensely',
                'head': 'ivanka',
                'tail': 'jared',
            },
        ],
    },
    {
        'name': 'spouse',
        'examples': [
            {'sentence': 'trump is married to ivanka', 'head': 'trump', 'tail': 'ivanka'},
            {
                'sentence': 'bill went out with his wife jill on saturday',
                'head': 'bill',
                'tail': 'jill',
            },
        ],
    },
    {
        'name': 'insult',
        'examples': [
            {
                'sentence': 'The president said that michael cohen is a rat',
                'head': 'The president',
                'tail': 'michael',
            },
            {'sentence': 'meow and tom threw jabs at each other', 'head': 'meow', 'tail': 'tom'},
        ],
    },
    {
        'name': 'capital',
        'examples': [
            {'sentence': 'austin is the capital of texas', 'head': 'austin', 'tail': 'texas'},
            {
                'sentence': 'the capital of china is located in beijing',
                'head': 'beijing',
                'tail': 'china',
            },
        ],
    },
]

# Sentences to classify against the support set above.
queries = [
    {
        'sentence': 'furball and fluffy are very loving to each other',
        'head': 'furball',
        'tail': 'fluffy',
    },
    {
        'sentence': "US's capital is washington",
        'head': 'washington',
        'tail': 'US',
    },
]

In [47]:
nlp = spacy.load("en_core_web_sm")

def spacy_tokenize(sentence):
    '''
    Run the module-level spaCy pipeline on sentence and return its tokens
    as a list of strings.
    '''
    return [str(token) for token in nlp(sentence)]

def _entity_token_indices(tokens, entity_tokens):
    '''
    Return the positions in tokens covered by an entity mention.

    tokens: the tokenized sentence (list of str)
    entity_tokens: the tokenized entity mention (list of str)
    return: list of consecutive indices into tokens

    The first contiguous occurrence of the whole mention is preferred, so a
    multi-word entity is not mis-placed when its first word also appears
    earlier in the sentence. If the whole mention is not found (e.g. it
    tokenizes differently on its own), fall back to anchoring on the first
    occurrence of its first token; list.index raises ValueError when even
    that token is absent.
    '''
    width = len(entity_tokens)
    if width:
        for start in range(len(tokens) - width + 1):
            if tokens[start:start + width] == entity_tokens:
                return list(range(start, start + width))
    start = tokens.index(entity_tokens[0])
    return list(range(start, start + width))


def _encode_example(example):
    '''
    Tokenize one {'sentence', 'head', 'tail'} dict with spaCy and return what
    tokenize() produces for it.
    '''
    tokens = spacy_tokenize(example['sentence'])
    head_indices = _entity_token_indices(tokens, spacy_tokenize(example['head']))
    tail_indices = _entity_token_indices(tokens, spacy_tokenize(example['tail']))
    return tokenize(tokens, head_indices, tail_indices)


max_length = 128

# The special-token ids are the same for every pair, so look them up once.
SEP = sentence_encoder.tokenizer.convert_tokens_to_ids(['[SEP]'])
CLS = sentence_encoder.tokenizer.convert_tokens_to_ids(['[CLS]'])

# The support examples do not depend on the query, so encode them once.
support_token_ids = [
    _encode_example(ex)
    for relation in example_relation_data
    for ex in relation['examples']
]

# Inference only: no_grad avoids building an autograd graph for the logits.
with torch.no_grad():
    for q in queries:
        fusion_set = {'word': [], 'mask': [], 'seg': []}
        bert_query_tokens = _encode_example(q)
        for bert_relation_example_tokens in support_token_ids:
            # Pair layout: [CLS] support example [SEP] query [SEP]
            new_word = CLS + bert_relation_example_tokens + SEP + bert_query_tokens + SEP
            used = min(max_length, len(new_word))

            # Token ids, truncated or zero-padded to max_length.
            word_tensor = torch.zeros((max_length)).long()
            word_tensor[:used] = torch.tensor(new_word[:used], dtype=torch.long)
            # 1 on positions holding a token, 0 on padding.
            mask_tensor = torch.zeros((max_length)).long()
            mask_tensor[:used] = 1
            # 0 over [CLS] + support example, 1 everywhere else.
            seg_tensor = torch.ones((max_length)).long()
            seg_tensor[:min(max_length, len(bert_relation_example_tokens) + 1)] = 0

            fusion_set['word'].append(word_tensor)
            fusion_set['mask'].append(mask_tensor)
            fusion_set['seg'].append(seg_tensor)

        fusion_set['word'] = torch.stack(fusion_set['word'])
        fusion_set['seg'] = torch.stack(fusion_set['seg'])
        fusion_set['mask'] = torch.stack(fusion_set['mask'])
        logits, pred = model(fusion_set, N, K, 1)
        print(pred, logits)
    
            
            

tensor([0]) tensor([[[ 6.0336,  5.4629,  4.9938,  2.5218, -3.1381, -4.6057]]],
       grad_fn=<CatBackward>)
tensor([4]) tensor([[[-3.2909, -2.9994, -3.9807, -2.8774,  5.9306, -4.5772]]],
       grad_fn=<CatBackward>)


In [112]:
nlp = spacy.load("en_core_web_sm")
# Tokenizer built from the English defaults
# (punctuation rules and exceptions included)
tokenizer = nlp.Defaults.create_tokenizer(nlp)
# Add the neuralcoref component to the pipeline; doc._.coref_resolved is read below.
neuralcoref.add_to_pipe(nlp)

# ex = "hello Meow. Meow is Donald Trump's friend"
ex = "David is a cool boy, he went to narnia on saturday, and Sally is great, She played with me yesterday."
doc = nlp(ex)
resolved_text = doc._.coref_resolved
print(resolved_text)
# Re-parse the coreference-resolved text so `doc` refers to the rewritten sentence.
doc = nlp(resolved_text)

David is a cool boy, David went to narnia on saturday, and Sally is great, Sally played with David yesterday.


In [113]:
# Show every named entity of the resolved document as a (text, label) pair.
print([(ent.text, ent.label_) for ent in doc.ents])

[('David', 'PERSON'), ('David', 'PERSON'), ('saturday', 'DATE'), ('Sally', 'PERSON'), ('Sally', 'PERSON'), ('David', 'PERSON'), ('yesterday', 'DATE')]


In [114]:
# Tokens of the resolved document as plain strings.
[str(token) for token in doc]

['David',
 'is',
 'a',
 'cool',
 'boy',
 ',',
 'David',
 'went',
 'to',
 'narnia',
 'on',
 'saturday',
 ',',
 'and',
 'Sally',
 'is',
 'great',
 ',',
 'Sally',
 'played',
 'with',
 'David',
 'yesterday',
 '.']