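# Clustering

Embed song lyrics with an MPNet sentence-embedding model, group the embeddings with agglomerative clustering, and visualise the resulting clusters in a 3-D PCA projection.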

In [11]:
# Shell magic: print the GPU status table (driver, memory usage, utilisation).
!nvidia-smi

Thu Jan 13 09:25:12 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.29.05    Driver Version: 495.29.05    CUDA Version: 11.5     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  On   | 00000000:3B:00.0 Off |                    0 |
| N/A   35C    P0    57W / 300W |  28612MiB / 32510MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------

In [None]:
import os
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from sentence_transformers import SentenceTransformer
from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import PCA
from sklearn.metrics import calinski_harabasz_score, davies_bouldin_score, silhouette_score
from tqdm import tqdm
from transformers import AutoConfig, AutoModel, AutoModelForPreTraining, AutoTokenizer


# Cleaned dataset snapshot to cluster.
# (An earlier snapshot was "df_clean_v1_07122021_py35.pkl".)
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
DATA_PICKLE = "df_clean_v4_14122021_py35.pkl"

df_clean = pd.read_pickle(DATA_PICKLE)
print(df_clean.shape)
df_clean.head()

# load model

In [26]:
# Encoders this notebook has been pointed at. The cell previously assigned each
# of these to `model_str` in turn, so only the last one was ever in effect.
MODEL_CANDIDATES = (
    "all-mpnet-base-v2",
    "all-mpnet-base-v2-finetuned-genre_unfrozen_base-checkpoint-1735/checkpoint-1735",
    "all-mpnet-base-v2-finetuned-genre_unfrozen_base-checkpoint-1041/checkpoint-1041",
)

# Model name or local checkpoint directory used by every cell below.
model_str = MODEL_CANDIDATES[-1]

In [27]:
# Load the encoder.
#   * `model_str` is a local directory -> treat it as a Hugging Face checkpoint
#     and load the bare encoder plus its tokenizer.
#   * otherwise -> treat it as a sentence-transformers model name.
if os.path.isdir(model_str):
    # Point from_pretrained at the checkpoint directory: it reads config.json and
    # the weights from there. (The previous AutoModel.from_config call built a
    # randomly initialised model that was immediately overwritten.)
    config = AutoConfig.from_pretrained(model_str)
    # AutoModel loads only the encoder; the checkpoint's classifier.* weights are
    # dropped, which is what the "weights ... were not used" warning reports.
    model = AutoModel.from_pretrained(model_str, config=config)
    model.eval()   # inference only: disable dropout
    model.cuda()
    tokenizer = AutoTokenizer.from_pretrained(model_str, use_fast=True)
else:
    model = SentenceTransformer(model_str)
    # SentenceTransformer.encode tokenizes internally; clear any tokenizer left
    # over from a previous run with a different model.
    tokenizer = None

Some weights of the model checkpoint at all-mpnet-base-v2-finetuned-genre_unfrozen_base-checkpoint-1041/checkpoint-1041/pytorch_model.bin were not used when initializing MPNetModel: ['classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.weight']
- This IS expected if you are initializing MPNetModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing MPNetModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of MPNetModel were not initialized from the model checkpoint at all-mpnet-base-v2-finetuned-genre_unfrozen_base-checkpoint-1041/checkpoint-1041/pytorch_model.bin and are newly initialized: ['mpnet.pooler.dense.weight', 'mpnet.pooler.dense.

# compute embeddings

In [34]:
# Column lists taken from the cleaned dataset. Only `song_lyrics` is embedded in
# this cell; the other lists are not used here.
song_lyrics = df_clean['lyrics'].tolist()
song_names = df_clean['song_name'].tolist()
song_artists = df_clean['artist'].tolist()
song_genres = df_clean['genre'].tolist()
num_sentences = len(song_lyrics)

# Cache file name: for a local checkpoint path "<run>/checkpoint-N" the run
# directory name is used, otherwise the model name itself.
if os.path.isdir(model_str):
    embedding_fp = f"{os.path.split(model_str)[0]}_embeddings.pt"
else:
    embedding_fp = f"{model_str}_embeddings.pt"
embedding_fp = os.path.join("embeddings", embedding_fp)
print(embedding_fp)

if os.path.exists(embedding_fp):
    print("loading already computed embeddings")
    # NOTE: a cache written before the pooling below was masked still contains
    # embeddings averaged over padding tokens; delete the file to recompute.
    # NOTE(review): the cache is a NumPy array written with torch.save; newer
    # torch versions may need weights_only=False to load it -- confirm.
    corpus_embeddings = torch.load(embedding_fp)
else:
    start_time = time.time()
    if os.path.isdir(model_str):
        batch_size = 8  # songs per forward pass; lower this if GPU memory is tight
        device = next(model.parameters()).device  # follow wherever the model lives

        embed = []
        with torch.no_grad():
            for start in tqdm(range(0, num_sentences, batch_size)):
                # Tokenize per batch so padding only extends to the longest
                # sequence in this batch, not the longest in the whole corpus.
                batch = tokenizer(
                    song_lyrics[start:start + batch_size],
                    max_length=512,
                    padding=True,
                    truncation=True,
                    return_tensors="pt",
                )
                tkin = batch['input_ids'].to(device)
                tkam = batch['attention_mask'].to(device)

                out = model(input_ids=tkin, attention_mask=tkam)['last_hidden_state']

                # Mean-pool over real tokens only: weight each position by the
                # attention mask so padding does not dilute the embedding.
                mask = tkam.unsqueeze(-1).to(out.dtype)
                out = (out * mask).sum(1) / mask.sum(1).clamp(min=1)

                embed.append(out.cpu().numpy())
        corpus_embeddings = np.vstack(embed)

    else:
        # encode() returns a NumPy array when convert_to_numpy=True, so no
        # .cpu()/.numpy() conversion is needed (or valid) afterwards.
        corpus_embeddings = model.encode(song_lyrics, convert_to_numpy=True)
    proc_time = time.time() - start_time
    print(f"Time for computing embeddings : {proc_time} seconds")
    print(f"{proc_time / num_sentences} seconds per song")

    # Create the cache directory on first use so the save cannot fail on it.
    os.makedirs(os.path.dirname(embedding_fp), exist_ok=True)
    torch.save(corpus_embeddings, embedding_fp)


print(corpus_embeddings.shape)

embeddings/all-mpnet-base-v2-finetuned-genre_unfrozen_base-checkpoint-1041_embeddings.pt
loading already computed embeddings
(15863, 768)


# compute clusters

In [39]:
# Clustering hyper-parameters.
n_clusters = 11
affinity = "cosine"   # “euclidean”, “l1”, “l2”, “manhattan”, “cosine”, or “precomputed”. If linkage is “ward”, only “euclidean” is accepted
linkage = "complete"   # {‘ward’, ‘complete’, ‘average’, ‘single’}, default=’ward’

# Normalize every embedding to unit length. The norm is floored at a tiny value
# so an all-zero embedding yields zeros instead of NaNs.
embedding_norms = np.linalg.norm(corpus_embeddings, axis=1, keepdims=True)
corpus_embeddings = corpus_embeddings / np.maximum(embedding_norms, 1e-12)

# Cache file name encodes the model and the clustering hyper-parameters.
if os.path.isdir(model_str):
    clustering_fp = os.path.split(model_str)[0]
else:
    clustering_fp = model_str
clustering_fp += f"_{n_clusters}clusters_affinity={affinity}_linkage={linkage}.npy"
clustering_fp = os.path.join("clustering", clustering_fp)
print(clustering_fp)

if os.path.exists(clustering_fp):
    print("loading already computed cluster assignment")
    cluster_assignment = np.load(clustering_fp)
else:
    print("start clustering")
    start_time = time.time()
    try:
        # Newer scikit-learn names the distance argument `metric`.
        clustering_model = AgglomerativeClustering(n_clusters=n_clusters, metric=affinity, linkage=linkage, distance_threshold=None)
    except TypeError:
        # Older scikit-learn only knows the original `affinity` keyword.
        clustering_model = AgglomerativeClustering(n_clusters=n_clusters, affinity=affinity, linkage=linkage, distance_threshold=None)
    clustering_model.fit(corpus_embeddings)
    cluster_assignment = clustering_model.labels_
    proc_time = time.time() - start_time
    print(f"clustering time : {proc_time} seconds")

    # Create the cache directory on first use so the save cannot fail on it.
    os.makedirs(os.path.dirname(clustering_fp), exist_ok=True)
    np.save(clustering_fp, cluster_assignment)

# print metrics (computed on the unit-normalized embeddings)
score = calinski_harabasz_score(corpus_embeddings, cluster_assignment)
print("calinski_harabasz_score : ", score)

score = davies_bouldin_score(corpus_embeddings, cluster_assignment)
print("davies_bouldin_score : ", score)

score = silhouette_score(corpus_embeddings, cluster_assignment)
print("silhouette_score : ", score)

clustering/all-mpnet-base-v2-finetuned-genre_unfrozen_base-checkpoint-1041_11clusters_affinity=cosine_linkage=complete.npy
loading already computed cluster assignment
calinski_harabasz_score :  366.4661612437651
davies_bouldin_score 4.587638110067143
silhouette_score 0.03756221


# sweep parameters

# PCA for visualization

In [None]:
# Reduce the embeddings to three principal components so they can be drawn
# in a 3-D scatter plot.
N_PCA_COMPONENTS = 3

pca = PCA(n_components=N_PCA_COMPONENTS)
pca_result = pca.fit_transform(corpus_embeddings)
print(pca_result.shape)

In [None]:
# 3-D scatter of the PCA projection, one colour per cluster label.
fig = plt.figure(figsize=(10, 7))
ax = fig.add_subplot(111, projection="3d")
clrs = sns.color_palette('husl', n_colors=n_clusters)  # a list of RGB tuples

for i in range(n_clusters):
    # Select the projected points assigned to cluster i.
    cluster_mask = cluster_assignment == i
    pca_cluster = pca_result[cluster_mask]
    ax.scatter3D(pca_cluster[:, 0], pca_cluster[:, 1], pca_cluster[:, 2], color=clrs[i], label=i)

# Label the figure so it can be read on its own.
ax.set_xlabel("PC 1")
ax.set_ylabel("PC 2")
ax.set_zlabel("PC 3")
ax.set_title(f"PCA projection of lyric embeddings ({n_clusters} clusters)")
ax.legend(title="cluster")
plt.show()