# CLUSTERING WITH KMEDOIDS

- The objective is to find k clusters of similar documents given a certain corpus
- The main analyses are:
 - Find the texts closest to the center of each cluster and manually read them for insight
 - Measure the density of each cluster and how far apart they are from one another
 - Inspect the distribution of classes within each cluster (if the texts are already classified)

### LOAD DATA

In [22]:
import pandas as pd
import re
from pathlib import Path

# Load the corpus CSV; the path is resolved two directory levels above the
# current working directory, under "data/".
df = pd.read_csv(Path().absolute().parent.parent / "data/text_class_8k.csv")

In [23]:
# Keep only the documents that mention "alienação fiduciária"
# (case-insensitive). The vectorized filter keeps the column schema and dtypes
# even when nothing matches, and treats missing texts as non-matching instead
# of raising.
has_term = df["text"].str.contains(
    r"alienação fiduciária", flags=re.I | re.S, na=False
)
# Reset to a 0..n-1 index so positional and label-based lookups agree downstream.
df = df[has_term].reset_index(drop=True)

In [26]:
import sys
from pathlib import Path

# Put the directory two levels above the working directory at the front of the
# import path so the project-local `src` package can be imported below.
sys.path.insert(0, str(Path().absolute().parent.parent))

from src.text_vectorization import hashing_texts

# Vectorize the texts; the second argument is 2**14 = 16384.
# NOTE(review): presumably the number of hashed features — confirm in hashing_texts.
X = hashing_texts(df["text"], 2**14)

In [27]:
import numpy as np

# Inspect the dimensions of the vectorized data.
np.shape(X)

(268, 16384)

### CLUSTERING

In [28]:
from sklearn_extra.cluster import KMedoids

def closest_n_index(X, n_clusters=10, *, metric="euclidean", random_state=0):
    """Cluster the rows of ``X`` with k-medoids.

    Args:
        X: Feature matrix passed to ``KMedoids.fit``.
        n_clusters: Number of clusters (and therefore medoids) to compute.
        metric: Distance metric forwarded to ``KMedoids``. The default,
            ``"euclidean"``, is the library default, so existing calls
            behave as before.
        random_state: Seed forwarded to ``KMedoids``. The default of ``0``
            keeps the results of existing calls reproducible and unchanged.

    Returns:
        A tuple ``(medoid_indices, labels)``: the row positions in ``X`` of
        the medoid of each cluster, and the cluster label assigned to every
        row of ``X``.
    """
    kmedoids = KMedoids(
        n_clusters=n_clusters, metric=metric, random_state=random_state
    ).fit(X)
    return kmedoids.medoid_indices_, kmedoids.labels_

In [29]:
# Cluster the vectorized texts into 5 groups; keep the medoid row positions and
# the per-document cluster labels.
indices_centers, labels = closest_n_index(X, n_clusters=5)

In [30]:
# Display the cluster label assigned to each document.
labels

array([1, 1, 2, 2, 4, 3, 1, 0, 2, 1, 4, 1, 1, 1, 1, 3, 3, 2, 1, 4, 4, 3,
       3, 1, 0, 1, 1, 1, 3, 2, 0, 3, 2, 0, 4, 0, 0, 0, 0, 1, 4, 1, 3, 0,
       2, 3, 1, 1, 4, 4, 4, 4, 3, 3, 3, 2, 4, 4, 0, 4, 2, 0, 2, 1, 4, 3,
       1, 1, 4, 4, 1, 0, 0, 4, 0, 4, 4, 4, 2, 1, 1, 0, 2, 4, 4, 2, 4, 3,
       2, 2, 2, 4, 0, 4, 3, 3, 4, 1, 0, 0, 3, 2, 4, 1, 2, 3, 3, 3, 4, 1,
       1, 1, 1, 1, 3, 4, 1, 1, 0, 1, 4, 1, 1, 3, 1, 4, 0, 3, 1, 4, 1, 2,
       1, 1, 2, 4, 4, 1, 2, 0, 4, 4, 2, 4, 3, 4, 1, 4, 4, 0, 2, 4, 1, 1,
       1, 4, 0, 1, 2, 2, 3, 1, 1, 4, 2, 4, 1, 2, 0, 0, 0, 4, 3, 4, 3, 2,
       2, 2, 2, 4, 4, 4, 4, 4, 4, 1, 4, 2, 0, 0, 0, 2, 2, 4, 4, 4, 4, 2,
       2, 4, 1, 1, 1, 1, 3, 4, 1, 2, 4, 4, 3, 1, 1, 4, 2, 2, 4, 2, 3, 4,
       1, 0, 0, 2, 2, 1, 1, 1, 1, 4, 1, 4, 1, 1, 2, 3, 3, 2, 0, 1, 3, 4,
       0, 3, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 3, 3, 3, 1, 3, 1, 1,
       1, 4, 4, 3], dtype=int64)

In [31]:
# Select the rows at the medoid positions: one representative document per cluster.
df_texts = df.iloc[indices_centers]

In [34]:
# Export the medoid documents to an Excel file (index column omitted) so they
# can be read manually.
df_texts.to_excel(Path().absolute().parent.parent / "data/text_class_8k_medoids.xlsx", index=False)

In [35]:
# Attach each document's cluster label as the "grupo" column.
df["grupo"] = labels

In [36]:
from sklearn.metrics import silhouette_score as ss

# Mean silhouette coefficient over all samples for the k-medoids labels
# (scikit-learn's default distance here is Euclidean).
print("Silhouette score of ", ss(X, labels))

Silhouette score of  0.028147100620629865


In [64]:
# Each constant is a (name, pattern) pair; index 1 holds the regex pattern.
# Matches the word "mandado".
regex_mato = ("mato", r"mandado")
# Matches " defiro" (with a leading space) or "concedo", followed by at most
# five arbitrary characters and then "liminar".
# NOTE(review): the leading space presumably keeps "indefiro" from matching — confirm.
# NOTE(review): not referenced by the visible cells.
regex_deferimento_liminar = ("liminar_deferida", r"( defiro|concedo).{,5}liminar")
# Matches "intimada" or "intimação".
regex_ato_ordinatorio = ("ato_ordinatorio", r"intimada|intimação")

In [70]:
# Binary target per document: 1 when the text matches both the
# "ato_ordinatorio" pattern and the "mato" pattern (case-insensitive, with
# "." also matching newlines), otherwise 0.
search_flags = re.I | re.S
y = [
    1
    if re.search(regex_ato_ordinatorio[1], text, flags=search_flags)
    and re.search(regex_mato[1], text, flags=search_flags)
    else 0
    for text in df["text"]
]

In [71]:
# Number of documents flagged as positive (y == 1).
sum(y)

46

In [72]:
from collections import Counter
# Cluster sizes: how many documents received each label.
print(Counter(labels))

Counter({1: 81, 4: 70, 2: 44, 3: 39, 0: 34})


In [73]:
# Cross-tabulate clusters against the binary target:
# classes_clusters[cluster][target] counts the documents with that pair.
classes_clusters = {}
for position, cluster_id in enumerate(labels):
    per_class = classes_clusters.setdefault(cluster_id, {0: 0, 1: 0})
    per_class[y[position]] += 1

In [74]:
# Show, for each cluster, the count of documents with each target value.
classes_clusters

{1: {0: 54, 1: 27},
 2: {0: 41, 1: 3},
 4: {0: 61, 1: 9},
 3: {0: 39, 1: 0},
 0: {0: 27, 1: 7}}