# CLUSTERING WITH KMEDOIDS

- The objective is to find k clusters of similar documents in a given corpus
- The main analyses are:
 - The distribution of each cluster's texts across the known classes (if the texts are already classified)
 - Find the texts closest to the center of each cluster
 - Measure the density of each cluster

In [1]:
# LOAD DATA
# Read the corpus from a CSV located in the working directory. Later cells use
# its "text" column (the documents) and its "class" column (the known labels).
import pandas as pd

df = pd.read_csv("text_class.csv")

In [2]:
import sys
from pathlib import Path

# Put the directory two levels above the current working directory at the
# front of sys.path so that the project-local `src` package can be imported.
sys.path.insert(0, str(Path().absolute().parent.parent))

from src.text_vectorization import hashing_texts

# Vectorize the "text" column.
# NOTE(review): the second argument (2**15 = 32768) presumably sets the number
# of hashed features, matching the 32768 columns reported for X in the next
# cell — confirm against hashing_texts.
X = hashing_texts(df["text"], 2**15)

In [3]:
import numpy as np

# Dimensions of the vectorized corpus as (rows, columns).
np.shape(X)

(97, 32768)

In [4]:
from sklearn_extra.cluster import KMedoids

def closest_n_index(X, n_clusters=10, return_indices=False):
    """Cluster the rows of ``X`` with k-medoids.

    Parameters
    ----------
    X
        Feature matrix, passed unchanged to ``KMedoids.fit``.
    n_clusters : int, default=10
        Number of clusters (and therefore medoids) to compute.
    return_indices : bool, default=False
        Selects what the first returned item is. With the default ``False``
        the function behaves exactly as before and returns
        ``cluster_centers_`` (the feature vectors of the medoids). With
        ``True`` it returns ``medoid_indices_`` (the row positions of the
        medoids in ``X``), which is what is needed to look up the documents
        that sit at the center of each cluster.

    Returns
    -------
    centers
        ``cluster_centers_`` or ``medoid_indices_`` of the fitted model,
        depending on ``return_indices``.
    labels
        ``labels_`` of the fitted model: the cluster id of every row of ``X``.
    """
    # random_state is fixed so repeated runs give the same clustering.
    kmedoids = KMedoids(n_clusters=n_clusters, random_state=0).fit(X)
    if return_indices:
        centers = kmedoids.medoid_indices_
    else:
        centers = kmedoids.cluster_centers_
    return centers, kmedoids.labels_

In [26]:
# Fit k-medoids with two clusters.
# NOTE(review): despite its name, `indices_centers` receives
# kmedoids.cluster_centers_ (the medoid feature vectors), not row indices.
indices_centers, labels = closest_n_index(X, n_clusters=2)

In [27]:
# Cluster id assigned to each row of X.
labels

array([0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0,
       0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1,
       1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)

In [28]:
from sklearn.metrics import silhouette_score as ss

# Mean silhouette coefficient of the clustering (range -1 to 1; values close
# to 0 indicate overlapping clusters).
print("Silhouette score of ", ss(X, labels))

Silhouette score of  0.012571669984514694


In [8]:
# Known class of each document, used below to compare against the cluster ids.
y = df["class"]
y

0     0
1     0
2     0
3     0
4     0
     ..
92    1
93    1
94    1
95    1
96    1
Name: class, Length: 97, dtype: int64

In [9]:
from collections import Counter
# Number of documents assigned to each cluster.
# NOTE(review): the saved output of this cell lists five cluster ids (0-4),
# while the visible closest_n_index call uses n_clusters=2 and the execution
# counts are out of order. The output is stale — restart the kernel and run
# all cells before relying on these numbers.
print(Counter(labels))

Counter({3: 37, 4: 22, 2: 17, 1: 12, 0: 9})


In [10]:
# Build {cluster_id: {class_label: n_documents}}: how the known classes are
# distributed inside each cluster.
assert len(labels) == len(y), "labels and y must describe the same documents"

# Derive the class set from the data instead of hard-coding {0, 1}, so any
# number of classes (or other label values) is counted without a KeyError.
class_labels = sorted(set(y))

dic_distribuicao_classes_cluster = {}
# Pair cluster ids and classes by position: y[index] is a label-based lookup
# and would pick the wrong row (or fail) if df does not have a 0..n-1 index.
for cluster_id, true_class in zip(labels, y):
    if cluster_id not in dic_distribuicao_classes_cluster:
        # Every cluster starts with an explicit zero for each known class.
        dic_distribuicao_classes_cluster[cluster_id] = {c: 0 for c in class_labels}
    dic_distribuicao_classes_cluster[cluster_id][true_class] += 1

In [11]:
# Display the tally as {cluster_id: {class_label: document_count}}.
dic_distribuicao_classes_cluster

{2: {0: 12, 1: 5},
 3: {0: 23, 1: 14},
 0: {0: 8, 1: 1},
 4: {0: 20, 1: 2},
 1: {0: 10, 1: 2}}