# BioBERT/SciBERT-based event graph embeddings

## Install

In [None]:
! pip install transformers



## Import

In [None]:
import numpy as np
import json
import torch
from transformers import AutoTokenizer, AutoModel

## Load the textual attributes of entities and triggers

In [None]:
# Path prefix prepended to the input sample path and to every output file name
# (empty string: paths are resolved relative to the current working directory).
DIR = ''

In [None]:
# Dataset tag inserted into the input sample file name and the output file names.
fn = 'st09'

In [None]:
# Read the stratified sample and keep, for every graph, the list of its node names.
# JSON is UTF-8 by specification, so the encoding is stated explicitly instead of
# relying on the platform default.
with open(DIR + 'stratified_samples/'+ fn +'_stratified_sample.json', encoding='utf-8') as ff:
    graphs = [[n['name'] for n in g['nodes']] for g in json.load(ff)]

## Load BioBERT or SciBERT model

In [None]:
from transformers import AutoTokenizer, AutoModel
# Load the BioBERT tokenizer and encoder.
tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-v1.1")
# output_hidden_states=True exposes the per-layer hidden states, which
# compute_embeddings reads (outputs[2]) for layer options 1 and 3.
model = AutoModel.from_pretrained("dmis-lab/biobert-v1.1", output_hidden_states=True)

In [None]:
# Load the SciBERT tokenizer and encoder (alternative to the BioBERT cell).
# The Auto* classes are used because they are the ones imported in this notebook;
# BertTokenizer/BertModel were never imported and raised a NameError.
tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')
# output_hidden_states=True exposes the per-layer hidden states, which
# compute_embeddings reads (outputs[2]) for layer options 1 and 3.
model = AutoModel.from_pretrained('allenai/scibert_scivocab_uncased', output_hidden_states=True)

## Embeddings

Which layers should be used?

*   last hidden layer
*   concatenation of the last four hidden layers
*   pooling layer
*   mean of the stacking of the last four layers

How should the token embeddings within a multi-token span be aggregated?

*   Average of all token embeddings excluding SEP and CLS
*   CLS token embedding
*   DeepEventMine strategy


## Mean

In [None]:
def compute_embeddings(graph, layer=0, mode=0):
  """
  Compute the embedding of a graph as the mean of its entity embeddings.

  Every entity name is encoded on its own with the module-level ``tokenizer``
  and ``model``. The token representations selected by ``layer`` are reduced
  to one vector per entity according to ``mode``, and the entity vectors are
  then averaged.

  Parameters:
  graph (list of str): the graph as a list of entity names
  layer (int): 0 to use the last hidden layer, 1 to use the concatenation of
    the last four hidden layers, 2 to use the pooling layer, 3 to use the
    mean of the last four hidden layers
  mode (int): 0 to average the tokens between the first and the last position
    (CLS and SEP excluded), 1 to take the embedding at the first position
    (CLS), 2 to concatenate the first-position embedding, the mean of the
    inner tokens and the last-position embedding (DeepEventMine strategy).
    Ignored when layer is 2, because the pooling layer already yields one
    vector per entity.

  Returns:
  torch tensor: the mean of the entity embeddings

  Raises:
  ValueError: if layer or mode is not one of the supported values, or if
    graph contains no entity
  """
  # Fail early with a clear message instead of an UnboundLocalError later on.
  if layer not in (0, 1, 2, 3):
    raise ValueError(f"layer must be 0, 1, 2 or 3, got {layer!r}")
  if layer != 2 and mode not in (0, 1, 2):
    raise ValueError(f"mode must be 0, 1 or 2, got {mode!r}")

  # Per-layer hidden states (outputs[2]) are only read for layers 1 and 3;
  # requesting them here makes those options work however the model was loaded.
  need_hidden_states = layer in (1, 3)

  emb = []
  for text in graph:
    # tokenize (batch of size 1)
    input_ids = torch.tensor(tokenizer.encode(text)).unsqueeze(0)

    # encode without tracking gradients
    with torch.no_grad():
      outputs = model(input_ids, output_hidden_states=need_hidden_states)

    # the pooling layer already gives one vector for the whole input
    if layer == 2:
      emb.append(outputs[1][0])
      continue

    if layer == 0:
      # last hidden layer
      tokens_emb = outputs[0]
    elif layer == 1:
      # concatenation of the last four hidden layers along the feature axis
      tokens_emb = torch.cat([outputs[2][i] for i in [-1, -2, -3, -4]], dim=-1)
    else:
      # element-wise mean of the last four hidden layers
      tokens_emb = torch.stack(outputs[2][-4:]).mean(0)

    # drop the batch dimension: one row per token
    sequence = tokens_emb[0]

    if mode == 0:
      # mean of the tokens, first and last position excluded
      entity_emb = sequence[1:-1].mean(0)
    elif mode == 1:
      # embedding at the first position (CLS)
      entity_emb = sequence[0]
    else:
      # DeepEventMine strategy: first position, mean of the inner tokens, last position
      entity_emb = torch.cat((sequence[0], sequence[1:-1].mean(0), sequence[-1]))

    emb.append(entity_emb)

  if not emb:
    raise ValueError("graph must contain at least one entity name")

  # aggregate the entity embeddings: stack them and take the mean
  return torch.stack(emb).mean(0)

In [None]:
# Embed every graph with compute_embeddings (default arguments) and keep the
# position of the graph in `graphs` alongside its embedding as a plain list.
data = [
    {'idx': position, 'embedding': compute_embeddings(node_names).tolist()}
    for position, node_names in enumerate(graphs)
]

In [None]:
# Dump the mean-aggregated graph embeddings to a JSON file.
output_path = DIR + 'biobert_mean_1000' + fn + '.json'
with open(output_path, 'w') as handle:
    json.dump(data, handle)

## Sum

In [None]:
def compute_embeddings(graph, layer=0):
  """
  Compute the embedding of a graph as the sum of its entity embeddings.

  Every entity name is encoded on its own with the module-level ``tokenizer``
  and ``model``. The token representations selected by ``layer`` are summed
  into one vector per entity (first and last position, i.e. CLS and SEP,
  excluded), and the entity vectors are then summed.

  Parameters:
  graph (list of str): the graph as a list of entity names
  layer (int): 0 to use the last hidden layer, 1 to use the concatenation of
    the last four hidden layers, 2 to use the pooling layer, 3 to use the
    sum of the last four hidden layers

  Returns:
  torch tensor: the sum of the entity embeddings

  Raises:
  ValueError: if layer is not one of the supported values, or if graph
    contains no entity
  """
  # Fail early with a clear message instead of an UnboundLocalError later on.
  if layer not in (0, 1, 2, 3):
    raise ValueError(f"layer must be 0, 1, 2 or 3, got {layer!r}")

  # Per-layer hidden states (outputs[2]) are only read for layers 1 and 3;
  # requesting them here makes those options work however the model was loaded.
  need_hidden_states = layer in (1, 3)

  emb = []
  for text in graph:
    # tokenize (batch of size 1)
    input_ids = torch.tensor(tokenizer.encode(text)).unsqueeze(0)

    # encode without tracking gradients
    with torch.no_grad():
      outputs = model(input_ids, output_hidden_states=need_hidden_states)

    # the pooling layer already gives one vector for the whole input
    if layer == 2:
      emb.append(outputs[1][0])
      continue

    if layer == 0:
      # last hidden layer
      tokens_emb = outputs[0]
    elif layer == 1:
      # concatenation of the last four hidden layers along the feature axis
      tokens_emb = torch.cat([outputs[2][i] for i in [-1, -2, -3, -4]], dim=-1)
    else:
      # element-wise sum of the last four hidden layers
      tokens_emb = torch.stack(outputs[2][-4:]).sum(0)

    # sum of the tokens, first and last position excluded
    emb.append(tokens_emb[0][1:-1].sum(0))

  if not emb:
    raise ValueError("graph must contain at least one entity name")

  # aggregate the entity embeddings: stack them and take the sum
  return torch.stack(emb).sum(0)

In [None]:
# Embed every graph with compute_embeddings (default arguments) and keep the
# position of the graph in `graphs` alongside its embedding as a plain list.
data = [
    {'idx': position, 'embedding': compute_embeddings(node_names).tolist()}
    for position, node_names in enumerate(graphs)
]

In [None]:
# Dump the sum-aggregated graph embeddings to a JSON file.
output_path = DIR + 'biobert_sum_1000' + fn + '.json'
with open(output_path, 'w') as handle:
    json.dump(data, handle)

## Max


In [None]:
def compute_embeddings(graph, layer=0):
  """
  Compute the embedding of a graph as the element-wise maximum of its
  entity embeddings.

  Every entity name is encoded on its own with the module-level ``tokenizer``
  and ``model``. The token representations selected by ``layer`` are reduced
  to one vector per entity with an element-wise maximum (first and last
  position, i.e. CLS and SEP, excluded), and the entity vectors are then
  reduced with an element-wise maximum as well.

  Parameters:
  graph (list of str): the graph as a list of entity names
  layer (int): 0 to use the last hidden layer, 1 to use the concatenation of
    the last four hidden layers, 2 to use the pooling layer, 3 to use the
    element-wise maximum of the last four hidden layers

  Returns:
  torch tensor: the element-wise maximum of the entity embeddings

  Raises:
  ValueError: if layer is not one of the supported values, or if graph
    contains no entity
  """
  # Fail early with a clear message instead of an UnboundLocalError later on.
  if layer not in (0, 1, 2, 3):
    raise ValueError(f"layer must be 0, 1, 2 or 3, got {layer!r}")

  # Per-layer hidden states (outputs[2]) are only read for layers 1 and 3;
  # requesting them here makes those options work however the model was loaded.
  need_hidden_states = layer in (1, 3)

  emb = []
  for text in graph:
    # tokenize (batch of size 1)
    input_ids = torch.tensor(tokenizer.encode(text)).unsqueeze(0)

    # encode without tracking gradients
    with torch.no_grad():
      outputs = model(input_ids, output_hidden_states=need_hidden_states)

    # the pooling layer already gives one vector for the whole input
    if layer == 2:
      emb.append(outputs[1][0])
      continue

    if layer == 0:
      # last hidden layer
      tokens_emb = outputs[0]
    elif layer == 1:
      # concatenation of the last four hidden layers along the feature axis
      tokens_emb = torch.cat([outputs[2][i] for i in [-1, -2, -3, -4]], dim=-1)
    else:
      # element-wise maximum of the last four hidden layers
      tokens_emb, _ = torch.stack(outputs[2][-4:]).max(0)

    # element-wise maximum over the tokens, first and last position excluded
    entity_emb, _ = torch.max(tokens_emb[0][1:-1], 0)
    emb.append(entity_emb)

  if not emb:
    raise ValueError("graph must contain at least one entity name")

  # aggregate the entity embeddings: stack them and take the element-wise maximum
  return torch.max(torch.stack(emb), 0)[0]

In [None]:
# Embed every graph with compute_embeddings (default arguments) and keep the
# position of the graph in `graphs` alongside its embedding as a plain list.
data = [
    {'idx': position, 'embedding': compute_embeddings(node_names).tolist()}
    for position, node_names in enumerate(graphs)
]

In [None]:
# Dump the max-aggregated graph embeddings to a JSON file.
output_path = DIR + 'biobert_max_1000' + fn + '.json'
with open(output_path, 'w') as handle:
    json.dump(data, handle)