# Prova clustering documenti testuali finale PEM
Proviamo a fare un riassunto finale. Vediamo di importare tutti i pacchetti e le funzioni che mi serviranno.

In [1]:
import numpy as np

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.metrics import homogeneity_score, completeness_score, v_measure_score, adjusted_rand_score, silhouette_score

In [2]:
# Importazione dei dati testuali

categories = [
    "alt.atheism",
    "talk.religion.misc",
    "comp.graphics",
    "sci.space",
]

dataset = fetch_20newsgroups(
    remove=("headers", "footers", "quotes"),
    subset="all",
    categories=categories,
    shuffle=True,
    random_state=42,
)

labels = dataset.target
unique_labels, category_sizes = np.unique(labels, return_counts=True)
true_k = unique_labels.shape[0]

print(f"{len(dataset.data)} documents - {true_k} categories")

3387 documents - 4 categories


In [3]:
# Ora creiamo il vettore con le features

vectorizer = TfidfVectorizer(
    max_df=0.5,
    min_df=5,
    stop_words="english",
)
#t0 = time()
X_tfidf = vectorizer.fit_transform(dataset.data)

#print(f"vectorization done in {time() - t0:.3f} s")
print(f"n_samples: {X_tfidf.shape[0]}, n_features: {X_tfidf.shape[1]}")

n_samples: 3387, n_features: 7929


In [4]:
# Funzione per plottare i dendrogrammi

def plot_dendrogram(model, **kwargs):
    # Create linkage matrix and then plot the dendrogram

    # create the counts of samples under each node
    counts = np.zeros(model.children_.shape[0])
    n_samples = len(model.labels_)
    for i, merge in enumerate(model.children_):
        current_count = 0
        for child_idx in merge:
            if child_idx < n_samples:
                current_count += 1  # leaf node
            else:
                current_count += counts[child_idx - n_samples]
        counts[i] = current_count

    linkage_matrix = np.column_stack(
        [model.children_, model.distances_, counts]
    ).astype(float)

    # Plot the corresponding dendrogram
    dendrogram(linkage_matrix, **kwargs)

### K-Means clustering
Dovremmo aver creato tutte le funzioni che ci servivano per il nostro scopo. Proviamo ora a eseguire Kmeans (con 4 cluster) e clustering agglomerativo (sempre con 4 cluster) per poi andare a valutarne le metriche che abbiamo importato. Possiamo ad esempio cercare di usare diverse metriche e dissimilarità per il clustering gerarchico per vedere quale sia il migliore.

In [5]:
kmeans = KMeans(n_clusters = true_k, max_iter = 100, n_init = 20)

model_kmeans = kmeans.fit(X_tfidf)
cluster_ids, cluster_sizes = np.unique(model_kmeans.labels_, return_counts=True)
print(f"Number of elements assigned to each cluster: {cluster_sizes}")

print(
    "True number of documents in each category according to the class labels: "
    f"{category_sizes}"
)

Number of elements assigned to each cluster: [ 297  506  754 1830]
True number of documents in each category according to the class labels: [799 973 987 628]


In [6]:
# Vediamo le varie metriche
labels_pred_kmeans = model_kmeans.predict(X_tfidf)
hom_kmeans = homogeneity_score(labels, labels_pred_kmeans)
comp_kmeans = completeness_score(labels, labels_pred_kmeans)
print(f"Homogeneity: {hom_kmeans} ")
print(f"Completeness: {comp_kmeans} ")

Homogeneity: 0.3550608049315449 
Completeness: 0.4179607709646504 


### Clustering agglomerativo
Proviamo ora con il clustering agglomerativo. Usiamo come metrica quella euclidea perchè quella del coseno dà problemi.

In [7]:
agg_clust1 = AgglomerativeClustering(n_clusters=4,
                                metric = 'euclidean',
                                linkage = 'complete')
labels_pred_agg1 = agg_clust1.fit_predict(X_tfidf.toarray())

In [8]:
cluster_ids_agg1, cluster_sizes_agg1 = np.unique(labels_pred_agg1, return_counts=True)
print(f"Number of elements assigned to each cluster: {cluster_sizes_agg1}")

print(
    "True number of documents in each category according to the class labels: "
    f"{category_sizes}"
)

Number of elements assigned to each cluster: [2977  189  135   86]
True number of documents in each category according to the class labels: [799 973 987 628]


In [9]:
# Vediamo ora le metriche
hom_agg1 = homogeneity_score(labels, labels_pred_agg1)
comp_agg1 = completeness_score(labels, labels_pred_agg1)
print(f"Homogeneity: {hom_agg1} ")
print(f"Completeness: {comp_agg1} ")

Homogeneity: 0.02501046896780025 
Completeness: 0.06909987879550528 


Direi che non risulta molto efficace in questo caso. Proviamo un linkage diverso.

In [10]:
agg_clust2 = AgglomerativeClustering(n_clusters=4,
                                metric = 'euclidean',
                                linkage = 'single')
labels_pred_agg2 = agg_clust2.fit_predict(X_tfidf.toarray())

cluster_ids_agg2, cluster_sizes_agg2 = np.unique(labels_pred_agg2, return_counts=True)
print(f"Number of elements assigned to each cluster: {cluster_sizes_agg2}")

print(
    "True number of documents in each category according to the class labels: "
    f"{category_sizes}"
)

Number of elements assigned to each cluster: [3384    1    1    1]
True number of documents in each category according to the class labels: [799 973 987 628]


In [11]:
# Vediamo ora le metriche
hom_agg2 = homogeneity_score(labels, labels_pred_agg2)
comp_agg2 = completeness_score(labels, labels_pred_agg2)
print(f"Homogeneity: {hom_agg2} ")
print(f"Completeness: {comp_agg2} ")

Homogeneity: 0.0008486690602323611 
Completeness: 0.14390319976072855 


Terribile. Proviamo con l'average.

In [12]:
agg_clust3 = AgglomerativeClustering(n_clusters=4,
                                metric = 'euclidean',
                                linkage = 'average')
labels_pred_agg3 = agg_clust3.fit_predict(X_tfidf.toarray())

cluster_ids_agg3, cluster_sizes_agg3 = np.unique(labels_pred_agg3, return_counts=True)
print(f"Number of elements assigned to each cluster: {cluster_sizes_agg3}")

print(
    "True number of documents in each category according to the class labels: "
    f"{category_sizes}"
)

Number of elements assigned to each cluster: [3381    3    2    1]
True number of documents in each category according to the class labels: [799 973 987 628]


In [13]:
# Vediamo ora le metriche
hom_agg3 = homogeneity_score(labels, labels_pred_agg3)
comp_agg3 = completeness_score(labels, labels_pred_agg3)
print(f"Homogeneity: {hom_agg3} ")
print(f"Completeness: {comp_agg3} ")

Homogeneity: 0.0009849064613946799 
Completeness: 0.09131352582073328 


Proviamo infine la ward (che però non ho spiegato nel documento).

In [14]:
agg_clust4 = AgglomerativeClustering(n_clusters=4,
                                metric = 'euclidean',
                                linkage = 'ward')
labels_pred_agg4 = agg_clust4.fit_predict(X_tfidf.toarray())

cluster_ids_agg4, cluster_sizes_agg4 = np.unique(labels_pred_agg4, return_counts=True)
print(f"Number of elements assigned to each cluster: {cluster_sizes_agg4}")

print(
    "True number of documents in each category according to the class labels: "
    f"{category_sizes}"
)

Number of elements assigned to each cluster: [2100  324   18  945]
True number of documents in each category according to the class labels: [799 973 987 628]


In [15]:
# Vediamo ora le metriche
hom_agg4 = homogeneity_score(labels, labels_pred_agg4)
comp_agg4 = completeness_score(labels, labels_pred_agg4)
print(f"Homogeneity: {hom_agg4} ")
print(f"Completeness: {comp_agg4} ")

Homogeneity: 0.27864739790528836 
Completeness: 0.42212919748123157 


Questo non è male tutto sommato. Dovrei provare magari con altre metriche.

## Codice dopo incontro del 25.03.24
Vediamo di trovare se ci sono effettivamente elementi nulli nella matrice. Le entrate sono tutte non negative: quindi basta che controlliamo la somma. Il codice dopo è poco performante ma almeno dovrebbe restituire un risultato sensato. Ci mette qualche minuto.

In [16]:
prova = np.zeros(len(X_tfidf.toarray()))
zeri = []
#caso.append(1)
for i in range(len(X_tfidf.toarray())):
    a = X_tfidf.toarray()[i].sum()
    if (a==0):
        zeri.append(i)
print(zeri)

[144, 155, 168, 177, 229, 301, 321, 363, 386, 397, 398, 529, 553, 566, 585, 598, 603, 612, 666, 667, 678, 831, 902, 934, 980, 1010, 1020, 1088, 1112, 1137, 1154, 1158, 1182, 1236, 1298, 1353, 1363, 1426, 1438, 1465, 1485, 1494, 1510, 1554, 1561, 1565, 1574, 1598, 1631, 1718, 1719, 1731, 1767, 1802, 1820, 1870, 1924, 1949, 1982, 1987, 2009, 2025, 2028, 2037, 2094, 2095, 2142, 2160, 2249, 2280, 2284, 2363, 2371, 2380, 2390, 2413, 2414, 2473, 2558, 2564, 2616, 2635, 2750, 2769, 2864, 2938, 2958, 2996, 2998, 3007, 3056, 3059, 3084, 3114, 3184, 3194, 3215, 3224, 3226, 3233, 3241, 3293, 3347, 3360]


Copiamo per semplicità la lista degli indici in cui abbiamo solo elementi zero: [144, 155, 168, 177, 229, 301, 321, 363, 386, 397, 398, 529, 553, 566, 585, 598, 603, 612, 666, 667, 678, 831, 902, 934, 980, 1010, 1020, 1088, 1112, 1137, 1154, 1158, 1182, 1236, 1298, 1353, 1363, 1426, 1438, 1465, 1485, 1494, 1510, 1554, 1561, 1565, 1574, 1598, 1631, 1718, 1719, 1731, 1767, 1802, 1820, 1870, 1924, 1949, 1982, 1987, 2009, 2025, 2028, 2037, 2094, 2095, 2142, 2160, 2249, 2280, 2284, 2363, 2371, 2380, 2390, 2413, 2414, 2473, 2558, 2564, 2616, 2635, 2750, 2769, 2864, 2938, 2958, 2996, 2998, 3007, 3056, 3059, 3084, 3114, 3184, 3194, 3215, 3224, 3226, 3233, 3241, 3293, 3347, 3360]

In [17]:
dataset.data[177]

'\n----------'

Ce ne sono alcuni che non contengono parole. Potremmo provare semplicemente a rimuoverli. Proviamo una cosa del genere.

In [18]:
#caso.pop(0)
#dataset.remove
print(zeri)

[144, 155, 168, 177, 229, 301, 321, 363, 386, 397, 398, 529, 553, 566, 585, 598, 603, 612, 666, 667, 678, 831, 902, 934, 980, 1010, 1020, 1088, 1112, 1137, 1154, 1158, 1182, 1236, 1298, 1353, 1363, 1426, 1438, 1465, 1485, 1494, 1510, 1554, 1561, 1565, 1574, 1598, 1631, 1718, 1719, 1731, 1767, 1802, 1820, 1870, 1924, 1949, 1982, 1987, 2009, 2025, 2028, 2037, 2094, 2095, 2142, 2160, 2249, 2280, 2284, 2363, 2371, 2380, 2390, 2413, 2414, 2473, 2558, 2564, 2616, 2635, 2750, 2769, 2864, 2938, 2958, 2996, 2998, 3007, 3056, 3059, 3084, 3114, 3184, 3194, 3215, 3224, 3226, 3233, 3241, 3293, 3347, 3360]


In [19]:
# Proviamo ora a rimuovere gli indici sopra

# Ordiniamo il tutto in senso decrescente
dataset1 = dataset
zeri.sort(reverse=True)

for indice in zeri:
    del dataset1[indice]
    
print(dataset1)

KeyError: 3360

In [22]:
# Proviamo un metodo mio personale

dataset_rem = dataset.data
print(len(dataset_rem))
n_rem = len(zeri)

for i in range(n_rem):
    dataset_rem.pop(zeri[i] - i)
    
print(len(dataset_rem))

3387
3283


In [23]:
X1_tfidf = vectorizer.fit_transform(dataset_rem)

#print(f"vectorization done in {time() - t0:.3f} s")
print(f"n_samples: {X1_tfidf.shape[0]}, n_features: {X1_tfidf.shape[1]}")

n_samples: 3283, n_features: 7718


In [24]:
agg_clust4 = AgglomerativeClustering(n_clusters=4,
                                metric = 'cosine',
                                linkage = 'average')
labels_pred_agg4 = agg_clust4.fit_predict(X1_tfidf.toarray())

cluster_ids_agg4, cluster_sizes_agg4 = np.unique(labels_pred_agg4, return_counts=True)
print(f"Number of elements assigned to each cluster: {cluster_sizes_agg4}")

print(
    "True number of documents in each category according to the class labels: "
    f"{category_sizes}"
)

ValueError: Cosine affinity cannot be used when X contains zero vectors

In [25]:
# Proviamo un metodo mio personale

X1 = X_tfidf.toarray()
print(X1)
#print(len(X1))
n_rem = len(zeri)

for i in range(n_rem):
    X1.remove(zeri[i] - i)
    
#print(len(X1))

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


AttributeError: 'numpy.ndarray' object has no attribute 'remove'

In [26]:
len(X1)

3387

In [27]:
X1 = X_tfidf.toarray()
print(X1)
n_rem = len(zeri)

# Creare una nuova matrice senza gli elementi corrispondenti agli indici in caso
X1_nuovo = [row for index, row in enumerate(X1) if index not in zeri]

print(len(X1_nuovo))

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
3283


In [29]:
agg_clust4 = AgglomerativeClustering(n_clusters=4,
                                metric = 'cosine',
                                linkage = 'average')
labels_pred_agg4 = agg_clust4.fit_predict(X1)

cluster_ids_agg4, cluster_sizes_agg4 = np.unique(labels_pred_agg4, return_counts=True)
print(f"Number of elements assigned to each cluster: {cluster_sizes_agg4}")

print(
    "True number of documents in each category according to the class labels: "
    f"{category_sizes}"
)

ValueError: Cosine affinity cannot be used when X contains zero vectors

In [30]:
X1

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [31]:
prova = np.zeros(len(X1_nuovo))
zeri1 = []
#caso.append(1)
for i in range(len(X1_nuovo)):
    a = X1_nuovo[i].sum()
    if (a==0):
        zeri1.append(i)
print(zeri1)

[]


Errore: in teoria mi dice che la matrice contiene vettori zero, ma li ho rimossi quindi non capisco perchè non me lo esegua.