# k-means em BERT

In [1]:
from transformers import BertTokenizer, BertModel
import torch

# BERT_MODEL='../models/bert-base-cased'
BERT_MODEL='../models/biobert'

# BERT_MODEL='bert-base-cased'
# BERT_MODEL='dmis-lab/biobert-base-cased-v1.1'

tokenizer = BertTokenizer.from_pretrained(BERT_MODEL, do_lower_case=False)
model = BertModel.from_pretrained(BERT_MODEL, output_hidden_states = True)

In [2]:
def get_hidden_states(model, tokenizer, text):
    tokenized_text = tokenizer.tokenize(text)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    
    segments_ids = [1] * len(tokenized_text)

    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])
    
    with torch.no_grad():
        outputs = model(tokens_tensor, segments_tensors)
        hidden_states = outputs[2]
        return hidden_states

In [3]:
def get_embeddings(text):
   
    hidden_states = get_hidden_states(model, tokenizer, text)
    token_embeddings = torch.stack(hidden_states, dim=0)
    token_embeddings = torch.squeeze(token_embeddings, dim=1)
    token_embeddings = token_embeddings.permute(1,0,2)
    
    return token_embeddings

In [4]:
sentence = "A 58-year-old African-American woman presents to the ER with episodic pressing/burning anterior chest pain that began two days earlier for the first time in her life. The pain started while she was walking, radiates to the back, and is accompanied by nausea, diaphoresis and mild dyspnea, but is not increased on inspiration. The latest episode of pain ended half an hour prior to her arrival. She is known to have hypertension and obesity. She denies smoking, diabetes, hypercholesterolemia, or a family history of heart disease. She currently takes no medications. Physical examination is normal. The EKG shows nonspecific changes."

input_sentence = "[CLS] " + sentence + " [SEP]"

tokenized_input = tokenizer.tokenize(input_sentence)

In [5]:
last_embeddings = []

token_embeddings = get_embeddings(input_sentence)

for token_emb in token_embeddings:
    last_embedding_layer = token_emb[12]
    last_embeddings.append(last_embedding_layer)       

## salva os embeddings da sentença

In [6]:
vectors = []
for l in range(len(last_embeddings)):
    vector = last_embeddings[l]
    tsv_row = ''
    for m in range(len(vector)):
        tsv_row += str(vector[m].tolist()) + '\t'

    vectors.append(tsv_row)

with open('tsv_files/sentence_word_embeddings.tsv', "w") as f:
    for e in vectors:
        print (e, file=f)   
        
labels = []
with open('tsv_files/sentence_labels.tsv', "w") as labels_file:
    for label in tokenized_input:
        print(label.replace("\n", ""), file=labels_file)

# Clusterização com NLTK

In [7]:
from nltk.cluster import KMeansClusterer
import nltk

NUM_CLUSTERS=10

kclusterer = KMeansClusterer(NUM_CLUSTERS, distance=nltk.cluster.util.cosine_distance, repeats=25)
assigned_clusters = kclusterer.cluster(last_embeddings, assign_clusters=True)

In [8]:
# # words = list(model.wv.vocab)
# i = 0
# for word in tokenized_input:  
#     print (word + ":" + str(assigned_clusters[i]))
#     i+=1

# Clusterização com scikitlearn

In [9]:
# Transformar de list  de tensores pra ndArray de n_input_tokens x 768
import numpy as np

aux_array = []
for last_embedding in last_embeddings:
    aux_array.append(last_embedding.numpy())
X = np.array(aux_array)

In [10]:
from sklearn import cluster
from sklearn import metrics
kmeans = cluster.KMeans(n_clusters=NUM_CLUSTERS)
kmeans.fit(X)
 
cluster_labels = kmeans.labels_
centroids = kmeans.cluster_centers_
 
print ("Cluster id labels for input data")
print (cluster_labels)

print ("Score (Opposite of the value of X on the K-means objective which is Sum of distances of samples to their closest cluster center):")
print (kmeans.score(X))
 
# silhouette_score = metrics.silhouette_score(X, labels, metric='euclidean')
 
# print ("Silhouette_score: ")
# print (silhouette_score)

Cluster id labels for input data
[1 2 3 9 4 5 0 7 0 6 0 0 0 0 0 0 0 0 0 8 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0]
Score (Opposite of the value of X on the K-means objective which is Sum of distances of samples to their closest cluster center):
-8.293188429571686e+16


## salva os embeddings da sentença e dos clusters da sentença

In [11]:
vectors = []
labels = []

i = 0
for c in centroids:
    tsv_row = ''
    for cc in c:
        tsv_row += str(cc.tolist()) + '\t'
    
    vectors.append(tsv_row)
    labels.append("Centroid cluster " + str(i))
    i+=1

    
for l in range(len(last_embeddings)):
    vector = last_embeddings[l]
    tsv_row = ''
    for m in range(len(vector)):
        tsv_row += str(vector[m].tolist()) + '\t'

    vectors.append(tsv_row)

for t_i in tokenized_input:
    labels.append(t_i)
    
    
with open('tsv_files/sentence_and_clusters_word_embeddings_aux.tsv', "w") as f:
    for e in vectors:
        print (e, file=f)   
with open('tsv_files/sentence_and_clusters_labels_aux.tsv', "w") as labels_file:
    for label in labels:
        print(label.replace("\n", ""), file=labels_file)