In [1]:
import torch
from transformers import BertTokenizer, BertModel
import numpy as np

In [5]:
# Toy training corpus: each sentence is paired by position with the label
# at the same index in train_labels.
train_texts = [
    "Patient loopt wankel en bibbert.",
    "Patient is moe van traplopen",
    "Ze fiets elke dag naar de winkel",
]
train_labels = ["l2", "i2", "i4"]

# Test sentences, paired by position with test_labels in the same way.
test_texts = [
    "Patient is wankel en wiebelt.",
    "Ik ben uitgeput van een rondje op straat.",
    "De man gaat met de fiets naar zijn werk.",
]
test_labels = ["l2", "i2", "i4"]

In [6]:
# Hugging Face model identifier passed to from_pretrained() for both the
# tokenizer and the model.
bertje = "wietsedv/bert-base-dutch-cased"

In [7]:
# Load the pretrained model and its tokenizer from the same identifier; the
# two loads are independent of each other.
bertje_model = BertModel.from_pretrained(bertje)
tokenizer = BertTokenizer.from_pretrained(bertje)
# Switch to evaluation mode. eval() returns the module, so keeping it as the
# last expression makes the notebook display the model structure.
bertje_model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30000, 768, padding_idx=3)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [16]:
# Peek at twenty consecutive vocabulary entries (indices 15000-15019).
# Iterating the vocab mapping directly yields its keys.
list(tokenizer.vocab)[15000:15020]

['levensverhaal',
 'levenswijze',
 'lever',
 'leverancier',
 'leveranciers',
 'leverbaar',
 'leverde',
 'leveren',
 'levering',
 'levert',
 'lezen',
 'lezer',
 'lezers',
 'lezing',
 'lezingen',
 'li',
 'lib',
 'libellen',
 'liberaal',
 'liberale']

In [17]:
def get_sentence_embedding_vector_from_layer(text, bertje_model, verbose=1):
    """Return one fixed-size vector for ``text`` taken from the model output.

    The text is encoded with the module-level ``tokenizer`` and passed to
    ``bertje_model``. The first element of the model output is indexed with
    ``[0][0]``, i.e. the first sequence in the batch and the first token
    position (presumably the [CLS] token for a BERT tokenizer - TODO confirm).

    Args:
        text: Input handed to ``tokenizer(text, return_tensors="pt")``.
        bertje_model: Model invoked as ``bertje_model(**encoding)``. Its
            output must support integer indexing (a plain tuple or a
            dict-like output object both work).
        verbose: When truthy, print the encoding, the full output tensor and
            the selected vector.

    Returns:
        numpy.ndarray: an independent copy of the selected vector.
    """
    tokenized_text = tokenizer(text, return_tensors="pt")
    if verbose:
        print(type(tokenized_text))
        print('tokenized_text',tokenized_text)
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        model_output = bertje_model(**tokenized_text)
    # Index rather than tuple-unpack: a dict-like model output iterates over
    # its key names, so ``a, b = output`` would bind strings, not tensors.
    bertje_embeddings = model_output[0]
    if verbose:
        print('bertje_embeddings',bertje_embeddings)
    hidden_states = bertje_embeddings[0][0]
    if verbose:
        print('hidden_states',hidden_states)
    # detach() covers models that return graph-attached tensors; copy() keeps
    # the result from pinning the whole output tensor in memory.
    return hidden_states.detach().cpu().numpy().copy()

In [18]:
##https://huggingface.co/transformers/model_doc/bert.html
def get_embedding_vector_from_layer(texts, bertje_model, verbose=1):
    """Embed every text in ``texts`` and return the vectors as a list.

    Args:
        texts: Iterable of inputs; each is passed to
            ``get_sentence_embedding_vector_from_layer``.
        bertje_model: Model forwarded unchanged to the per-sentence helper.
        verbose: Forwarded unchanged to the per-sentence helper.

    Returns:
        list: one vector per input text, in input order, so the result lines
        up index-for-index with a parallel list of labels. Empty input gives
        an empty list.
    """
    # Every text must be embedded; stopping after the first one would leave
    # the result shorter than the label list it is paired with downstream.
    return [
        get_sentence_embedding_vector_from_layer(text, bertje_model, verbose)
        for text in texts
    ]

In [19]:
# Embed the training corpus first, then the test corpus, with the same model.
# verbose is left at its default, so the helper prints its intermediate values.
bertje_training_vectors, bertje_test_vectors = (
    get_embedding_vector_from_layer(corpus, bertje_model)
    for corpus in (train_texts, test_texts)
)

<class 'transformers.tokenization_utils_base.BatchEncoding'>
tokenized_text {'input_ids': tensor([[    1,  5512, 26105, 15177, 22227, 11281,  9529,   132,    13,     2]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
bertje_embeddings tensor([[[ 0.0226,  0.1662, -0.0553,  ...,  0.0046,  0.3408,  0.1251],
         [ 0.8116,  0.4269, -0.2955,  ...,  0.1448, -0.1442, -0.3056],
         [ 0.4599,  0.9605, -1.1806,  ..., -0.0358, -0.2691,  0.3070],
         ...,
         [ 0.9056,  0.3693,  0.0430,  ...,  0.1419,  0.2632,  0.3169],
         [ 0.9549,  0.8497, -0.1188,  ..., -1.2821,  0.1217,  0.5042],
         [ 0.2007,  0.1240, -0.0035,  ...,  0.2183,  0.1006, -0.0155]]],
       grad_fn=<NativeLayerNormBackward>)
hidden_states tensor([ 2.2616e-02,  1.6616e-01, -5.5318e-02, -2.5050e-01,  2.5351e-01,
        -6.1643e-01,  2.8565e-01, -1.7592e-01,  1.9504e-02,  4.3671e-01,
         3.0322e-01, -1.0086e+00, -2.0590e-01,

In [12]:
# Show the embedding vector stored for the first training sentence.
print(bertje_training_vectors[0])

[ 2.26161405e-02  1.66163564e-01 -5.53178824e-02 -2.50504553e-01
  2.53512800e-01 -6.16430938e-01  2.85651296e-01 -1.75916582e-01
  1.95039138e-02  4.36707139e-01  3.03218156e-01 -1.00856233e+00
 -2.05901951e-01 -3.16312671e-01 -5.76927543e-01  7.16767430e-01
 -5.60092986e-01 -4.60336387e-01 -9.70969737e-01  1.18297851e+00
  3.81742597e-01 -2.19168901e-01  3.49814177e-01  9.08892155e-01
 -3.08154255e-01 -1.87626630e-01  3.51316750e-01  6.72851384e-01
 -6.96464419e-01 -3.26653898e-01  6.32064521e-01  8.62086594e-01
  5.78729749e-01 -1.43359944e-01 -6.98897243e-01 -4.30311471e-01
 -7.17981905e-02  4.00970727e-02 -1.25266403e-01 -9.43717301e-01
  4.54088598e-01  4.68614511e-03  1.15309402e-01  5.83860695e-01
  1.68755591e-01  6.61436796e-01  3.96007657e-01  5.35910800e-02
  2.26860344e-02 -2.37796813e-01 -1.97778374e-01  2.28860781e-01
  1.59032297e+00  2.67556131e-01  1.64299905e+00 -3.45458478e-01
 -5.06569669e-02  3.54866236e-01  3.13925833e-01  4.84945476e-01
  1.58762187e-02 -2.34038

In [10]:
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neural_network import MLPClassifier

# Linear SVM trained on the sentence vectors; random_state is fixed so the
# fit is repeatable.
BERT_classifier = LinearSVC(random_state=0, tol=1e-5)
BERT_classifier.fit(bertje_training_vectors, train_labels)
SVM_predictions = list(BERT_classifier.predict(bertje_test_vectors))
# Per-class decision scores for the test vectors, kept for inspection.
predicted_test_scores = BERT_classifier.decision_function(bertje_test_vectors)
# zero_division=0 reports 0.0 for classes that were never predicted (the same
# numbers as before) without emitting an undefined-metric warning.
print(classification_report(test_labels, SVM_predictions, zero_division=0))

              precision    recall  f1-score   support

          i2       0.00      0.00      0.00         1
          i4       0.50      1.00      0.67         1
          l2       1.00      1.00      1.00         1

    accuracy                           0.67         3
   macro avg       0.50      0.67      0.56         3
weighted avg       0.50      0.67      0.56         3



  _warn_prf(average, modifier, msg_start, len(result))


## Clustering and similarity with BERTje

In [11]:
from sentence_transformers import SentenceTransformer, util
from sklearn.cluster import KMeans

num_clusters = 2
# KMeans starts from random centroids: fix the seed so cluster ids are stable
# across runs, and pin n_init explicitly because its default has changed
# between scikit-learn releases.
clustering_model = KMeans(n_clusters=num_clusters, n_init=10, random_state=0)
clustering_model.fit(bertje_training_vectors)
cluster_assignment = clustering_model.labels_

print(cluster_assignment)

# Bucket the training sentences by assigned cluster id.
clustered_sentences = [[] for _ in range(num_clusters)]
for sentence_id, cluster_id in enumerate(cluster_assignment):
    clustered_sentences[cluster_id].append(train_texts[sentence_id])

# Clusters are reported 1-based.
for i, cluster in enumerate(clustered_sentences):
    print("Cluster ", i+1)
    print(cluster)
    print("")

[1 1 0]
Cluster  1
['Ze fiets elke dag naar de winkel']

Cluster  2
['Patient loopt wankel en bibbert.', 'Patient is moe van traplopen']



In [12]:
# Query sentences:
queries = ['Fietsen lukt nog niet.', 'Eerste stapjes met lopen.', 'Neemt iedere dag de trap.']


# Find the closest top_k sentences of the corpus for each query sentence based on cosine similarity
top_k = 2
# Never ask for more hits than the corpus contains.
num_hits = min(top_k, len(bertje_training_vectors))
# Stack the corpus vectors into one 2-D tensor once, instead of converting the
# list of arrays again for every query.
corpus_embeddings = torch.as_tensor(np.stack(bertje_training_vectors))

for query in queries:
    query_embedding = get_sentence_embedding_vector_from_layer(query, bertje_model, 0)
    cos_scores = util.pytorch_cos_sim(query_embedding, corpus_embeddings)[0]
    cos_scores = cos_scores.cpu()

    # torch.topk returns the num_hits highest scores and their indices,
    # ordered from best to worst.
    top_scores, top_results = torch.topk(cos_scores, k=num_hits)

    print("\n\n======================\n\n")
    print("Query:", query)
    # Report the number of hits actually shown rather than a fixed "5".
    print("\nTop %d most similar sentences in corpus:" % num_hits)

    for score, idx in zip(top_scores, top_results):
        print(train_texts[int(idx)].strip(), "(Score: %.4f)" % float(score))





Query: Fietsen lukt nog niet.

Top 5 most similar sentences in corpus:
Patient is moe van traplopen (Score: 0.6751)
Ze fiets elke dag naar de winkel (Score: 0.6608)




Query: Eerste stapjes met lopen.

Top 5 most similar sentences in corpus:
Ze fiets elke dag naar de winkel (Score: 0.6962)
Patient is moe van traplopen (Score: 0.6672)




Query: Neemt iedere dag de trap.

Top 5 most similar sentences in corpus:
Ze fiets elke dag naar de winkel (Score: 0.7464)
Patient is moe van traplopen (Score: 0.6450)
