Following:

https://mccormickml.com/2019/05/14/BERT-word-embeddings-tutorial/

Credits:

Chris McCormick and Nick Ryan. (2019, May 14). BERT Word Embeddings Tutorial. Retrieved from http://www.mccormickml.com


In [234]:
import torch
from transformers import BertTokenizer, BertModel
import numpy as np

In [96]:
torch.__version__

'1.6.0'

In [2]:
train_texts = ["Patient loopt wankel en bibbert.",
         "Patient is moe van traplopen",
        "Ze fiets elke dag naar de winkel"]
train_labels = ['l2', 'i2', 'i4']

test_texts = ["Patient is wankel en wiebelt.",
         "Ik ben uitgeput van een rondje op straat.",
        "De man gaat met de fiets naar zijn werk."]
test_labels = ['l2', 'i2', 'i4']

In [60]:
bertje='wietsedv/bert-base-dutch-cased'
#bertje='bert-base-uncased'

In [61]:
# Load pre-trained model tokenizer (vocabulary)
bertje_tokenizer = BertTokenizer.from_pretrained(bertje)
bertje_model = BertModel.from_pretrained(bertje, output_hidden_states = True) # Whether the model returns all hidden-states.)
# Put the model in "evaluation" mode, meaning feed-forward operation.
bertje_model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30000, 768, padding_idx=3)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [62]:
#Checking out BERTje's vocabulary
list(bertje_tokenizer.vocab.keys())[15000:15020]

['levensverhaal',
 'levenswijze',
 'lever',
 'leverancier',
 'leveranciers',
 'leverbaar',
 'leverde',
 'leveren',
 'levering',
 'levert',
 'lezen',
 'lezer',
 'lezers',
 'lezing',
 'lezingen',
 'li',
 'lib',
 'libellen',
 'liberaal',
 'liberale']

In [195]:
#### note the tokenizer is called differently than the documented examples in:
## https://huggingface.co/transformers/model_doc/bert.html#bertmodel
## where it says: 

#>>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
#>>> model = BertModel.from_pretrained('bert-base-uncased', return_dict=True)

#>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
#>>> outputs = model(**inputs)

#>>> last_hidden_states = outputs.last_hidden_state

marked_text = '[CLS] '+"Patient loopt wankel en bibbert."+ ' [SEP]'
tokenized_text = bertje_tokenizer.tokenize(marked_train_text[0])
tokenized_text = tokenizer(marked_text, return_tensors="pt")


token_ids = bertje_tokenizer.convert_tokens_to_ids(tokenized_text)
segment_ids = [1] * len(tokenized_text)

print('marked trained text[0]', marked_train_text[0])
print(type(marked_train_text[0]))
print('marked text', marked_text)
print(type(marked_text))
print('tokenized text', tokenized_text)
print('token ids', token_ids)
print('segment ids', segment_ids)

# Display the words with their indeces.
for tup in zip(tokenized_text, token_ids):
    print('{:<12} {:>6,}'.format(tup[0], tup[1]))

marked trained text[0] [CLS] Patient loopt wankel en bibbert. [SEP]
<class 'str'>
marked text [CLS] Patient loopt wankel en bibbert. [SEP]
<class 'str'>
tokenized text {'input_ids': tensor([[    1,     1,  5512, 26105, 15177, 22227, 11281,  9529,   132,    13,
             2,     2]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
token ids [0, 0, 0]
segment ids [1, 1, 1]
input_ids         0
token_type_ids      0
attention_mask      0


In [196]:
type(tokenized_text)

transformers.tokenization_utils_base.BatchEncoding

In [198]:
# Convert inputs to PyTorch tensors
tokens_tensor = torch.tensor([token_ids])
segments_tensors = torch.tensor([segments_ids])

In [199]:
type(tokens_tensor)

torch.Tensor

In [200]:
print(tokens_tensor.data[0])

tensor([0, 0, 0])


In [201]:
# Run the text through BERT, and collect all of the hidden states produced
# from all 12 layers. 
with torch.no_grad():
    print (tokenized_text)
    #outputs = bertje_model(tokens_tensor, segments_tensors)
    outputs = bertje_model(**tokenized_text)

    # Evaluating the model will return a different number of objects based on 
    # how it's  configured in the `from_pretrained` call earlier. In this case, 
    # becase we set `output_hidden_states = True`, the third item will be the 
    # hidden states from all layers. See the documentation for more details:
    # https://huggingface.co/transformers/model_doc/bert.html#bertmodel
    hidden_states = outputs[2]

{'input_ids': tensor([[    1,     1,  5512, 26105, 15177, 22227, 11281,  9529,   132,    13,
             2,     2]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [202]:
print(hidden_states)

(tensor([[[-0.7453,  0.3119,  1.2152,  ...,  0.5540,  1.7434,  0.2024],
         [-0.8552,  0.1490,  0.8565,  ...,  0.6793,  1.7598,  0.2586],
         [ 1.7831,  1.0930,  0.3459,  ...,  0.5615, -0.3803,  0.3299],
         ...,
         [ 0.3265, -0.0293,  0.4527,  ...,  0.3535, -0.4330, -0.8264],
         [ 0.4432,  0.0589, -0.6798,  ...,  1.1498,  0.4757,  0.4151],
         [ 0.1965, -0.3239, -0.7725,  ...,  1.2099,  0.3377,  0.3097]]]), tensor([[[-0.0350,  0.7342,  0.8716,  ...,  0.2325,  1.6016, -0.1824],
         [-0.2450,  0.4725,  0.7208,  ...,  0.1403,  1.4009, -0.1233],
         [ 2.0377,  1.2377, -0.0117,  ...,  0.4454,  0.0245,  0.4253],
         ...,
         [ 0.5513, -0.3122,  0.3779,  ..., -0.0190,  0.3434, -0.7196],
         [ 0.4867,  0.0281, -0.4082,  ...,  0.2102,  0.6290,  0.2026],
         [ 0.3889, -0.1795, -0.4462,  ...,  0.2124,  0.5301,  0.1593]]]), tensor([[[-0.1514,  0.7004,  0.3314,  ...,  0.3998,  1.2812,  0.1254],
         [-0.1171,  0.4879,  0.1425,  ...,

In [203]:
print ("Number of layers:", len(hidden_states), "  (initial embeddings + 12 BERT layers)")
layer_i = 0

print ("Number of batches:", len(hidden_states[layer_i]))
batch_i = 0

print ("Number of tokens:", len(hidden_states[layer_i][batch_i]))
token_i = 0

print ("Number of hidden units:", len(hidden_states[layer_i][batch_i][token_i]))

Number of layers: 13   (initial embeddings + 12 BERT layers)
Number of batches: 1
Number of tokens: 12
Number of hidden units: 768


In [204]:
# `hidden_states` is a Python list.
print('      Type of hidden_states: ', type(hidden_states))

# Each layer in the list is a torch tensor.
print('Tensor shape for each layer: ', hidden_states[0].size())

      Type of hidden_states:  <class 'tuple'>
Tensor shape for each layer:  torch.Size([1, 12, 768])


In [205]:
# Concatenate the tensors for all layers. We use `stack` here to
# create a new dimension in the tensor.
token_embeddings = torch.stack(hidden_states, dim=0)

token_embeddings.size()

torch.Size([13, 1, 12, 768])

In [206]:
# Remove dimension 1, the "batches".
token_embeddings = torch.squeeze(token_embeddings, dim=1)

token_embeddings.size()

torch.Size([13, 12, 768])

In [207]:
# Swap dimensions 0 and 1. so that the first element are the tokens, 2nd are the layers and 3rd are the dimensions
token_embeddings = token_embeddings.permute(1,0,2)

token_embeddings.size()

torch.Size([12, 13, 768])

In [208]:
def convert_hidden_states_to_token_embeddings(hidden_states):
    token_embeddings = torch.stack(hidden_states, dim=0)
    token_embeddings = torch.squeeze(token_embeddings, dim=1)
    token_embeddings = token_embeddings.permute(1,0,2)
    return token_embeddings

## Word Vectors
To give you some examples, let’s create word vectors two ways.

First, let’s concatenate the last four layers, giving us a single word vector per token. Each vector will have length 4 x 768 = 3,072.

In [209]:
# Stores the token vectors, with shape [22 x 3,072]
token_vecs_cat = []

# `token_embeddings` is a [22 x 12 x 768] tensor.

# For each token in the sentence...
for token in token_embeddings:
    
    # `token` is a [12 x 768] tensor

    # Concatenate the vectors (that is, append them together) from the last 
    # four layers.
    # Each layer vector is 768 values, so `cat_vec` is length 3,072.
    cat_vec = torch.cat((token[-1], token[-2], token[-3], token[-4]), dim=0)
    
    # Use `cat_vec` to represent `token`.
    token_vecs_cat.append(cat_vec)

print ('Shape is: %d x %d' % (len(token_vecs_cat), len(token_vecs_cat[0])))

Shape is: 12 x 3072


As an alternative method, let’s try creating the word vectors by summing together the last four layers.

In [210]:
#Stores the token vectors, with shape [22 x 768]
token_vecs_sum = []

# `token_embeddings` is a [22 x 12 x 768] tensor.

# For each token in the sentence...
for token in token_embeddings:

    # `token` is a [12 x 768] tensor

    # Sum the vectors from the last four layers.
    sum_vec = torch.sum(token[-4:], dim=0)
    
    # Use `sum_vec` to represent `token`.
    token_vecs_sum.append(sum_vec)

print ('Shape is: %d x %d' % (len(token_vecs_sum), len(token_vecs_sum[0])))

Shape is: 12 x 768


## Sentence Vectors
To get a single vector for our entire sentence we have multiple application-dependent strategies, but a simple approach is to average the second to last hiden layer of each token producing a single 768 length vector.

In [211]:
# `hidden_states` has shape [13 x 1 x 22 x 768]

# `token_vecs` is a tensor with shape [22 x 768]
token_vecs = hidden_states[-2][0]

# Calculate the average of all 22 token vectors.
sentence_embedding = torch.mean(token_vecs, dim=0)

In [212]:
print ("Our final sentence embedding vector of shape:", sentence_embedding.size())

Our final sentence embedding vector of shape: torch.Size([768])


## Trying a batch input

In [213]:
def mark_text (texts):
    marked_text_list = []
    for text in texts:
        marked_text = "[CLS] " + text + " [SEP]"
        marked_text_list.append(marked_text)
    return marked_text_list

In [214]:
marked_train_texts = mark_text(train_texts)
marked_test_texts = mark_text(test_texts)

In [218]:
def get_sentence_tokens(texts, bertje_tokenizer, verbose=0):
    tokens = []
    for text in texts:
        #tokenized_text = bertje_tokenizer.tokenize([text])
        tokenized_text = tokenizer(text, return_tensors="pt")
        if verbose:
            print('tokenized text', tokenized_text)
        tokens.append(tokenized_text)
    return tokens

In [219]:
# not needed
def get_sentence_token_ids(tokenized_texts, bertje_tokenizer, verbose=0):
    token_ids = []
    for tokenized_text in tokenized_texts:
        token_ids = bertje_tokenizer.convert_tokens_to_ids(tokenized_text)
        if verbose:
            print('token ids', token_ids)
        token_ids.append(token_ids)
    return token_ids

In [220]:
# not needed
def get_segment_ids(tokenized_texts, verbose=0):
    segment_ids = []
    for tokenized_text in tokenized_texts:
        segment_ids = [1] * len(tokenized_text)
        if verbose:
            print('segment ids', segment_ids)
        segment_ids.append(segment_ids)
    return segment_ids

In [221]:
tokenized_train_texts = get_sentence_tokens(marked_train_texts, bertje_tokenizer, 1)
#token_ids_train_text = get_sentence_token_ids(tokenized_train_texts, bertje_tokenizer, 0)
#segment_ids_train_text = get_segment_ids(tokenized_train_texts)

tokenized_test_texts = get_sentence_tokens(marked_test_texts, bertje_tokenizer, 0)
#token_ids_test_text = get_sentence_token_ids(tokenized_test_texts, bertje_tokenizer, 0)
#segment_ids_test_text = get_segment_ids(tokenized_test_texts, 0)

tokenized text {'input_ids': tensor([[    1,     1,  5512, 26105, 15177, 22227, 11281,  9529,   132,    13,
             2,     2]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
tokenized text {'input_ids': tensor([[    1,     1,  5512, 26105, 13903, 15723, 20722, 20321, 26954,     2,
             2]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
tokenized text {'input_ids': tensor([[    1,     1,  7769, 11572, 11262, 10494, 15892, 10537, 22500,     2,
             2]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [241]:
#torch.no_grad tells PyTorch not to construct the compute graph during this forward pass
#(since we won’t be running backprop here)–this just reduces memory consumption and 
# speeds things up a little.

# Run the text through BERT, and collect all of the hidden states produced
# from all 12 layers.

def get_hidden_states_from_tokenized_text (tokenized_texts, model):
    hidden_states_list =[]

    with torch.no_grad():
        for tokenized_text in tokenized_texts:
            print(type(tokenized_text))
            #print (tokenized_text)
            outputs = bertje_model(**tokenized_text)
            # Evaluating the model will return a different number of objects based on 
            # how it's  configured in the `from_pretrained` call earlier. In this case, 
            # becase we set `output_hidden_states = True`, the third item will be the 
            # hidden states from all layers. See the documentation for more details:
            # https://huggingface.co/transformers/model_doc/bert.html#bertmodel
            hidden_states = outputs[2]
            hidden_states_list.append(hidden_states)
    return hidden_states_list

In [242]:
#### Creating sentence vectors for input

def get_sentence_embeddings (tokenized_text, model):
    sentence_embeddings = []
    hidden_states_list = get_hidden_states_from_tokenized_text(tokenized_text, bertje_model)


    for hidden_states in hidden_states_list:
        convert_hidden_states_to_token_embeddings(hidden_states)
        # `hidden_states` has shape [13 x 1 x 22 x 768]

        # `token_vecs` is a tensor with shape [22 x 768]
        token_vecs = hidden_states[-2][0]  ### change to -4 to get the last 4 layers as used for NERC

        # Calculate the average of all 22 token vectors.
        sentence_embedding = torch.mean(token_vecs, dim=0)
        sentence_embeddings.append(np.array(sentence_embedding))
    return sentence_embeddings


In [243]:
bertje_training_vectors = get_sentence_embeddings(tokenized_train_texts, bertje_model)
bertje_test_vectors = get_sentence_embeddings(tokenized_test_texts, bertje_model)

<class 'transformers.tokenization_utils_base.BatchEncoding'>
<class 'transformers.tokenization_utils_base.BatchEncoding'>
<class 'transformers.tokenization_utils_base.BatchEncoding'>
<class 'transformers.tokenization_utils_base.BatchEncoding'>
<class 'transformers.tokenization_utils_base.BatchEncoding'>
<class 'transformers.tokenization_utils_base.BatchEncoding'>


In [244]:
print(bertje_training_vectors[0])

[ 2.26854280e-01  3.64366889e-01 -2.64939189e-01 -2.16569901e-02
  4.23036218e-01 -3.06041837e-01  7.25228250e-01 -1.34269550e-01
  1.27096102e-01  1.61743760e-01  4.45829965e-02 -4.70153898e-01
 -3.71987037e-02 -2.58300692e-01  2.84657449e-01  1.02166511e-01
  3.29507142e-01 -3.30159180e-02 -4.24469709e-02  1.92498222e-01
  2.21560910e-01 -1.77030399e-01 -2.20597863e-01  6.96013123e-02
  1.62168499e-02 -2.80006558e-01 -2.88313478e-02  2.14737728e-01
  9.42061841e-02 -2.29923546e-01  3.07494015e-01 -2.76333988e-02
  3.78727436e-01 -9.70472302e-03  9.25650299e-02 -3.06998361e-02
  1.50071934e-01  1.40077965e-02  6.63034767e-02 -2.60442317e-01
  1.27322808e-01  8.55716541e-02  2.44976371e-03  1.07875913e-01
 -8.77190009e-02  7.83961639e-02 -1.42496660e-01 -2.05088541e-01
  3.85075003e-01 -1.66923106e-01  3.49197626e-01  2.90625125e-01
  2.15229526e-01  3.59508425e-01  3.64061266e-01  1.26831690e-02
 -2.75044858e-01  2.63085157e-01  2.10178673e-01  1.62515894e-01
 -1.90799654e-01  2.55870

In [245]:
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neural_network import MLPClassifier

BERT_classifier = LinearSVC(random_state=0, tol=1e-5)
BERT_classifier.fit(bertje_training_vectors, train_labels)
SVM_predictions = list(BERT_classifier.predict(bertje_test_vectors))
predicted_test_scores= BERT_classifier.decision_function(bertje_test_vectors) 
print(classification_report(test_labels, SVM_predictions))

              precision    recall  f1-score   support

          i2       0.00      0.00      0.00         1
          i4       0.50      1.00      0.67         1
          l2       1.00      1.00      1.00         1

    accuracy                           0.67         3
   macro avg       0.50      0.67      0.56         3
weighted avg       0.50      0.67      0.56         3



  _warn_prf(average, modifier, msg_start, len(result))


## Clustering and similarity with BERTje

In [1]:
from sentence_transformers import SentenceTransformer, util
from sklearn.cluster import KMeans

num_clusters = 2
clustering_model = KMeans(n_clusters=num_clusters)
clustering_model.fit(bertje_training_vectors)
cluster_assignment = clustering_model.labels_

print(cluster_assignment)

clustered_sentences = [[] for i in range(num_clusters)]
for sentence_id, cluster_id in enumerate(cluster_assignment):
    clustered_sentences[cluster_id].append(train_texts[sentence_id])

for i, cluster in enumerate(clustered_sentences):
    print("Cluster ", i+1)
    print(cluster)
    print("")

NameError: name 'bertje_training_vectors' is not defined

In [None]:
# Query sentences:
queries = ['Fietsen lukt nog niet.', 'Eerste stapjes met lopen.', 'Neemt iedere dag de trap.']


# Find the closest 5 sentences of the corpus for each query sentence based on cosine similarity
top_k = 2
for query in queries:
    query_embedding = get_sentence_embedding_vector_from_layer(query, bertje_model, 0)
    cos_scores = util.pytorch_cos_sim(query_embedding, bertje_training_vectors)[0]
    cos_scores = cos_scores.cpu()

    #We use np.argpartition, to only partially sort the top_k results
    top_results = np.argpartition(-cos_scores, range(top_k))[0:top_k]

    print("\n\n======================\n\n")
    print("Query:", query)
    print("\nTop 5 most similar sentences in corpus:")

    for idx in top_results[0:top_k]:
        print(train_texts[idx].strip(), "(Score: %.4f)" % (cos_scores[idx]))