In [5]:
import torch
from transformers import BertTokenizer, BertModel
import numpy as np
#from sklearn import svm

In [None]:
# Toy data set: every entry pairs a sentence with its class label, so the
# text and label lists below cannot drift out of alignment.
_train_examples = [
    ("Patient loopt wankel en bibbert.", 'l2'),
    ("Patient is moe van traplopen", 'i2'),
    ("Ze fiets elke dag naar de winkel", 'i4'),
]
train_texts = [sentence for sentence, _label in _train_examples]
train_labels = [label for _sentence, label in _train_examples]

_test_examples = [
    ("Patient is wankel en wiebelt.", 'l2'),
    ("Ik ben uitgeput van een rondje op straat.", 'i2'),
    ("De man gaat met de fiets naar zijn werk.", 'i4'),
]
test_texts = [sentence for sentence, _label in _test_examples]
test_labels = [label for _sentence, label in _test_examples]

In [None]:
# Identifier of the pretrained checkpoint passed to `from_pretrained` below.
bertje = 'wietsedv/bert-base-dutch-cased'

In [None]:
# Load the tokenizer and the pretrained model for the checkpoint named by `bertje`.
tokenizer = BertTokenizer.from_pretrained(bertje)
bertje_model = BertModel.from_pretrained(bertje)
# Put the module in evaluation mode (torch `Module.eval()`). Being the last
# expression of the cell, its return value is also shown as the cell output.
bertje_model.eval()

In [None]:
# Peek at twenty vocabulary entries (positions 15000-15019) as a sanity check.
list(tokenizer.vocab)[15000:15020]

In [None]:
def get_sentence_embedding_vector_from_layer(text, bertje_model, verbose=1, text_tokenizer=None):
    """Embed one sentence as the hidden-state vector of its first token.

    Args:
        text: Sentence to embed.
        bertje_model: Model that is called with the tokenizer output as keyword
            arguments. Its return value must support integer indexing with the
            per-token hidden states at index 0; this holds for both the legacy
            tuple return and the dict-like output object of `BertModel`.
        verbose: When truthy, print the tokenizer output, the hidden states of
            all tokens and the selected vector.
        text_tokenizer: Optional tokenizer callable. When omitted, the
            notebook-level `tokenizer` is used.

    Returns:
        numpy.ndarray: a copy of the hidden state of the first token of the
        first (only) sequence — presumably the `[CLS]` position for a BERT
        tokenizer; confirm against the tokenizer in use.
    """
    # Fall back to the notebook-level tokenizer so existing callers keep working.
    active_tokenizer = tokenizer if text_tokenizer is None else text_tokenizer
    tokenized_text = active_tokenizer(text, return_tensors="pt")
    if verbose:
        print(type(tokenized_text))
        print('tokenized_text', tokenized_text)

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        model_output = bertje_model(**tokenized_text)

    # Index instead of tuple-unpacking: unpacking a dict-like model output
    # yields its key strings, whereas [0] returns the hidden states for both
    # tuple and dict-like outputs.
    bertje_embeddings = model_output[0]
    if verbose:
        print('bertje_embeddings', bertje_embeddings)

    # First sequence in the batch, first token of that sequence.
    hidden_states = bertje_embeddings[0][0]
    if verbose:
        print('hidden_states', hidden_states)

    # Copy so the result does not keep the full model output alive.
    return hidden_states.detach().cpu().numpy().copy()

In [None]:
##https://huggingface.co/transformers/model_doc/bert.html
def get_embedding_vector_from_layer(texts, bertje_model, verbose=1):
    """Embed every sentence in `texts` and return the vectors in input order.

    Args:
        texts: Iterable of sentences.
        bertje_model: Model forwarded to
            `get_sentence_embedding_vector_from_layer` for each sentence.
        verbose: Forwarded unchanged; when truthy the per-sentence helper
            prints its intermediate values.

    Returns:
        list: one embedding per input sentence (empty when `texts` is empty).
    """
    # The loop previously stopped after the first sentence because of a stray
    # `break`, so the result never matched the number of labels.
    return [
        get_sentence_embedding_vector_from_layer(text, bertje_model, verbose)
        for text in texts
    ]

In [None]:
# Embed the training sentences first, then the test sentences, with the same model.
bertje_training_vectors, bertje_test_vectors = [
    get_embedding_vector_from_layer(sentences, bertje_model)
    for sentences in (train_texts, test_texts)
]

In [None]:
# Inspect the embedding computed for the first training sentence.
print(bertje_training_vectors[0])

In [2]:
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neural_network import MLPClassifier

# Train a linear SVM on the sentence vectors; `fit` returns the estimator itself.
BERT_classifier = LinearSVC(random_state=0, tol=1e-5).fit(
    bertje_training_vectors, train_labels
)

# Raw decision-function scores for the test vectors (kept for inspection; not
# used by the report below).
predicted_test_scores = BERT_classifier.decision_function(bertje_test_vectors)

# Predicted labels for the test vectors, compared against the gold labels.
SVM_predictions = list(BERT_classifier.predict(bertje_test_vectors))
print(classification_report(test_labels, SVM_predictions))

ModuleNotFoundError: No module named 'sklearn'

## Clustering and similarity with BERTje

In [None]:
from sentence_transformers import SentenceTransformer, util
from sklearn.cluster import KMeans

num_clusters = 2
# Seed the centroid initialisation and pin `n_init` so the cluster assignment
# is reproducible across runs and does not depend on the library's
# version-specific default.
clustering_model = KMeans(n_clusters=num_clusters, random_state=0, n_init=10)
clustering_model.fit(bertje_training_vectors)
cluster_assignment = clustering_model.labels_

print(cluster_assignment)

# Group the training sentences by the cluster they were assigned to.
clustered_sentences = [[] for _ in range(num_clusters)]
for sentence_id, cluster_id in enumerate(cluster_assignment):
    clustered_sentences[cluster_id].append(train_texts[sentence_id])

# Clusters are reported with 1-based numbers.
for cluster_number, cluster in enumerate(clustered_sentences, start=1):
    print("Cluster ", cluster_number)
    print(cluster)
    print("")

In [None]:
# Query sentences:
queries = ['Fietsen lukt nog niet.', 'Eerste stapjes met lopen.', 'Neemt iedere dag de trap.']


# Find the top_k closest training sentences for each query sentence based on cosine similarity
top_k = 2
# Never ask for more neighbours than there are stored training vectors.
num_results = min(top_k, len(bertje_training_vectors))
for query in queries:
    query_embedding = get_sentence_embedding_vector_from_layer(query, bertje_model, 0)
    cos_scores = util.pytorch_cos_sim(query_embedding, bertje_training_vectors)[0]
    cos_scores = cos_scores.cpu()

    # torch.topk returns the k largest scores, sorted from high to low,
    # together with their positions in the training set.
    top_scores, top_indices = torch.topk(cos_scores, k=num_results)

    print("\n\n======================\n\n")
    print("Query:", query)
    # Report the number of results actually shown instead of a hard-coded 5.
    print("\nTop %d most similar sentences in corpus:" % num_results)

    for score, idx in zip(top_scores.tolist(), top_indices.tolist()):
        print(train_texts[idx].strip(), "(Score: %.4f)" % score)