<a href="https://colab.research.google.com/github/finardi/tutos/blob/master/CLustering_KMEANS_with_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
! pip install -q transformers

[K     |████████████████████████████████| 2.2MB 9.1MB/s 
[K     |████████████████████████████████| 870kB 49.1MB/s 
[K     |████████████████████████████████| 3.3MB 56.7MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone


In [5]:
from transformers import BertModel, BertTokenizer

from sklearn.cluster import KMeans
import torch
import pandas as pd

In [4]:
text_list = [
        'arroz com feijão e batata frita|1001',
        'arroz salada e mandioca|1010',
        'arroz com feijão e abobrinha|1101',
        'arroz com feijão batata frit|1001',
        'feijão arroz e cenoura|1011',
        'beterraba, chuchu e arroz com feijão|1111',
        'arroz com milho e feijão com couve|2001',
        'batata frita com legumes e soja|2010',
        'arroz com bata frita|2010',
        'madioquinha com sopa de feijão|2010',
        'arroz feijo e batata fita|1001',
        'mesa, cadeira e escrivaninha|2711',
        'cadeira, mesa e escrivaninha|2711',
        'escrivaninha, mesa e cadeira|2711',
        'geladeira e fogão|1711',
        'microondas e geraleira|1711',
        'geladeira e mesa azul|1711',
        'roupeiro, criado-mudo e cama|2804',
        'criado-mudo e cama|2804',
        'roupeiro e criado-mudo|2804',
        'cama e roupeiro|2804',
        'microondas e cama|2805',
        'mochila e roupeiro|2805',
        'ps1, ps2, ps3, ps4 e ps5|1311',
        'n64, snes, ps3, ps4 e ps5|1211',
        'ps1, ps2, n64, ps4 e ps5|1341',
        'ps5, ps4, ps3, ps2 e ps1|1311',
        'ps1, n64, snes, switch|2311',
        'ps6, ps7, ps8, ps9 e ps10|3211',
        'ps10, ps2, ps3, ps4 e ps5|3211',
        'mega-drive, nes, snes|1301',
        'mega drive, n64, snes|1301',
        'megadrive, nes, n64|1301',
        'mega-drive, snes, nes|1311',
        'mega drive, nes, n64|1311',
        'tom, planck, ozzy, mel, nina, pingado e gisele|0000',
        'tom, planck, ozzy, mel, nina, pingado gisele|0000',
        'paulo, tom, planck, ozzy, mel, nina, pingado e gisele|0001',
        'paulo, eli, tom, planck, ozzy, mel, nina, pingado e gisele|0001',
        'jan, paulo, eli, tom, planck, ozzy, mel, nina, pingado e gisele|0001',
]

mydict = {}
for i, item in enumerate(text_list):
    mydict[i] = item.split('|')[0], item.split('|')[1]

df = pd.DataFrame(mydict).T
df = df.rename(columns={0:'text', 1:'doc'})

# BERTaú - Embeddings

In [6]:
%%time

def get_embs(text_list:[], path_model:str):
    tokenizer = BertTokenizer.from_pretrained(path_model)
    model = BertModel.from_pretrained(path_model)
    hidden_size = model.config.hidden_size
    embs = torch.zeros(len(df)*hidden_size).reshape(len(df), hidden_size)

    for i, text in enumerate(text_list):
        tokens = tokenizer(text, return_tensors='pt')
        outs = model(**tokens, return_dict=True)
        last_hidden_state = outs['last_hidden_state']
        # cls = last_hidden_state[:,0, :].squeeze(0).detach().numpy()
        
        # without CLS and SEP
        embs_mean = torch.einsum("ijk -> k", last_hidden_state[:,1:-1,])#.detach().numpy()
        embs[i] = embs_mean

    return embs
# --------------------------------------------------
path_model = 'Itau-Unibanco/BERTau'
embeddings = get_embs(df.text.to_list(), path_model)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=268101.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=288.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=451518251.0, style=ProgressStyle(descri…


CPU times: user 12.5 s, sys: 1.3 s, total: 13.8 s
Wall time: 14.3 s


In [7]:
# Perform kmean clustering
num_clusters = 8
clustering_model = KMeans(n_clusters=num_clusters)
clustering_model.fit(embeddings.detach().numpy())
cluster_assignment = clustering_model.labels_

clustered_sentences = [[] for i in range(num_clusters)]
for sentence_id, cluster_id in enumerate(cluster_assignment):
    clustered_sentences[cluster_id].append(df.text.to_list()[sentence_id])

for i, cluster in enumerate(clustered_sentences):
    print("Cluster ", i+1)
    print(cluster)
    print("")

Cluster  1
['arroz salada e mandioca', 'mesa, cadeira e escrivaninha', 'cadeira, mesa e escrivaninha', 'escrivaninha, mesa e cadeira', 'roupeiro, criado-mudo e cama', 'roupeiro e criado-mudo']

Cluster  2
['tom, planck, ozzy, mel, nina, pingado e gisele', 'tom, planck, ozzy, mel, nina, pingado gisele', 'paulo, tom, planck, ozzy, mel, nina, pingado e gisele']

Cluster  3
['ps1, ps2, ps3, ps4 e ps5', 'n64, snes, ps3, ps4 e ps5', 'ps1, ps2, n64, ps4 e ps5', 'ps5, ps4, ps3, ps2 e ps1', 'ps6, ps7, ps8, ps9 e ps10', 'ps10, ps2, ps3, ps4 e ps5']

Cluster  4
['arroz com feijão e batata frita', 'arroz com feijão e abobrinha', 'arroz com feijão batata frit', 'arroz com milho e feijão com couve', 'batata frita com legumes e soja', 'madioquinha com sopa de feijão', 'arroz feijo e batata fita']

Cluster  5
['paulo, eli, tom, planck, ozzy, mel, nina, pingado e gisele', 'jan, paulo, eli, tom, planck, ozzy, mel, nina, pingado e gisele']

Cluster  6
['ps1, n64, snes, switch', 'mega-drive, nes, snes', '