# CLUSTERING WITH KMEDOIDS

- The objective is to find k clusters of similar documents in a given corpus
- The main analyses are:
  - Find the texts closest to the center of each cluster and read them manually for insight
  - Measure the density of each cluster and how far apart the clusters are from one another
  - Examine the distribution of classes within each cluster (if the texts are already classified)

### LOAD DATA

In [1]:
import pandas as pd
from pathlib import Path

# Load the corpus from data/text_class.csv, located two directory levels above
# the current working directory.
df = pd.read_csv(Path.cwd().parent.parent / "data/text_class.csv")

In [2]:
import sys
from pathlib import Path

# Put the directory two levels above the current working directory at the
# front of the import search path so the project-local `src` package resolves.
sys.path.insert(0, str(Path().absolute().parent.parent))

# This import relies on the sys.path entry added above (assuming `src` is not
# installed in the environment by other means), so it must stay after it.
from src.text_vectorization import hashing_texts

# Vectorize the "text" column. 2**15 = 32768, which matches the number of
# columns reported for X further down.
# NOTE(review): presumably the second argument is the number of hashing
# features — confirm against src/text_vectorization.
X = hashing_texts(df["text"], 2**15)

In [3]:
import numpy as np

# Sanity check on the vectorized corpus: (rows, columns) of X.
np.shape(X)

(97, 32768)

### CLUSTERING

In [4]:
from sklearn_extra.cluster import KMedoids

def closest_n_index(X, n_clusters=10):
    """Cluster the rows of ``X`` with k-medoids.

    Parameters
    ----------
    X : array-like
        Feature matrix, passed unchanged to ``KMedoids.fit``.
    n_clusters : int, default=10
        Number of clusters to fit.

    Returns
    -------
    tuple
        ``(cluster_centers_, labels_)`` of the fitted estimator.

    Notes
    -----
    Despite the function name, the first returned element is the estimator's
    ``cluster_centers_``, not row indices into ``X``.
    NOTE(review): scikit-learn-extra also exposes ``medoid_indices_`` on the
    fitted estimator; consider using it if the medoid row positions are what
    is actually needed.
    """
    # Fixed seed so repeated runs produce the same clustering.
    model = KMedoids(n_clusters=n_clusters, random_state=0)
    model.fit(X)
    return model.cluster_centers_, model.labels_

In [5]:
# Fit 5 clusters. Note: closest_n_index returns the estimator's
# `cluster_centers_` as its first value, so `indices_centers` holds the centers
# themselves rather than row indices into `df`.
indices_centers, labels = closest_n_index(X, n_clusters=5)

In [6]:
# Cluster label assigned to each row of X.
labels

array([2, 3, 3, 0, 3, 3, 2, 2, 3, 3, 3, 2, 3, 2, 3, 3, 3, 3, 3, 2, 4, 4,
       2, 0, 1, 0, 4, 0, 0, 3, 4, 1, 2, 4, 4, 4, 4, 4, 4, 1, 3, 3, 3, 3,
       3, 2, 4, 1, 0, 1, 4, 4, 3, 1, 0, 4, 0, 4, 1, 2, 2, 3, 1, 3, 4, 4,
       4, 4, 1, 2, 1, 3, 4, 2, 3, 3, 1, 2, 1, 0, 3, 3, 4, 3, 3, 3, 3, 2,
       2, 3, 3, 4, 3, 3, 2, 3, 3], dtype=int64)

In [7]:
from sklearn.metrics import silhouette_score as ss

# Mean silhouette coefficient over all samples (range -1 to 1); values near 0
# indicate overlapping clusters.
print("Silhouette score of ", ss(X, labels))

Silhouette score of  0.0071815754142927485


In [8]:
# Known class of each document, taken from the "class" column; used below to
# compare the clusters against the existing classification.
y = df["class"]
y

0     0
1     0
2     0
3     0
4     0
     ..
92    1
93    1
94    1
95    1
96    1
Name: class, Length: 97, dtype: int64

In [9]:
from collections import Counter
# Number of documents assigned to each cluster label.
print(Counter(labels))

Counter({3: 37, 4: 22, 2: 17, 1: 12, 0: 9})


In [10]:
# Count, for every cluster, how many documents of each known class it contains.
# Result layout: {cluster_label: {class_value: n_documents}}.
assert len(labels) == len(y), "labels and y must describe the same documents"

# Derive the class values from the data (in order of first appearance) instead
# of assuming exactly {0, 1}, so other or additional classes do not raise
# KeyError.
class_values = y.unique().tolist()

classes_clusters = {}
# Pair cluster labels and classes by position. Indexing `y[index]` with the
# enumerate counter is a label-based lookup, which only coincides with the
# position while `y` has a default RangeIndex.
for cluster, doc_class in zip(labels, y.tolist()):
    cluster = int(cluster)  # plain int keys keep the displayed dict readable
    if cluster not in classes_clusters:
        classes_clusters[cluster] = dict.fromkeys(class_values, 0)
    classes_clusters[cluster][doc_class] += 1

In [11]:
# Per-cluster class counts: {cluster_label: {class_value: n_documents}}.
classes_clusters

{2: {0: 12, 1: 5},
 3: {0: 23, 1: 14},
 0: {0: 8, 1: 1},
 4: {0: 20, 1: 2},
 1: {0: 10, 1: 2}}