In [1]:
import sys
from os.path import abspath
sys.path.insert(0, abspath('..'))

from os.path import join

from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn import metrics

import numpy as np


import torch
from torchSTC.data import load_data
from torchSTC.modules import STC
from torchSTC.metrics import SpacePlot, Evaluate
from torchSTC.utils.cluster import SphericalKmeans

from spherecluster import SphericalKMeans, VonMisesFisherMixture

plot = SpacePlot()
eval = Evaluate()

[nltk_data] Downloading package punkt to /home/godwin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to /home/godwin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


>>>>> /home/godwin/Documents/academic/PPD/torchSTC/demos/SearchSnippets
>>>>> data_loader.py cwd:  /home/godwin/Documents/academic/PPD/torchSTC/demos/SearchSnippets


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
cur = abspath("")
dataset = 'SearchSnippets'
data_in_dir=join(cur, "../..", "datasets")
dataset_dir=join(data_in_dir, dataset)

In [3]:
checkpoint_dir=join(dataset_dir, 
                    "artefacts",
                    "STC-d48:2048:512:512:384-epoch30-datSearchSnippets-wdeWord2Vec-scaMinMax-tfeSIF-normNone-initKmeans"
                    )

checkpoint = "STC-datSearchSnippets-wdeWord2Vec-scaMinMax-tfeSIF-normNone-initKmeans.pth"
checkpoint_path = join(checkpoint_dir, checkpoint)
checkpoint_path

'/home/godwin/Documents/academic/PPD/torchSTC/demos/SearchSnippets/../../datasets/SearchSnippets/artefacts/STC-d48:2048:512:512:384-epoch30-datSearchSnippets-wdeWord2Vec-scaMinMax-tfeSIF-normNone-initKmeans/STC-datSearchSnippets-wdeWord2Vec-scaMinMax-tfeSIF-normNone-initKmeans.pth'

In [4]:
x, y = load_data(dataset=dataset_dir, word_emb='Word2Vec', transform='SIF', scaler='MinMax', norm='l2')
n_clusters = len(torch.unique(torch.tensor(y)))

# Division des données en ensembles d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=0)

# conversion des données en tenseurs
X_train = torch.tensor(X_train, dtype=torch.float)
X_test = torch.tensor(X_test, dtype=torch.float)
y_train = torch.tensor(y_train, dtype=torch.long)
y_test = torch.tensor(y_test, dtype=torch.long)

hidden_dims = [torch.Tensor(X_train).shape[-1], 2048, 512, 512, 384]
stc = STC(hidden_dims=hidden_dims, n_clusters=n_clusters)
stc.from_pretrained(checkpoint_path)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

### Embedding started...


Word2Vec words embedding loaded...
#### SIF embedding started...
SIF-Embedding 12339 documents with 48-dimensional word vectors...
SVD decomposition...
### SIF embedding completed...
### Embedding completed...
[embed_docs] XX shape:  (12340, 48)
>>> |  (12340, 48)
MinMax scaling completed...
l2 normalization completed...


(torch.Size([11106, 48]),
 torch.Size([1234, 48]),
 torch.Size([11106]),
 torch.Size([1234]))

In [5]:
z = stc.autoencoder.encoder(X_train)

In [6]:
# comprehension list with 5 runs of kmeans, get average and std of metrics
avg_hgf_mmx_ikm = []
tmp = []
for i in range(5):
    kmeans = KMeans(n_clusters=n_clusters, n_init=50)
    kmeans.fit(z.detach().numpy())
    y_km_pred = kmeans.labels_
    tmp.append(eval.allMetrics(y_train.detach().numpy(), y_km_pred))

avg_hgf_mmx_ikm = np.array(tmp)
np.round(avg_hgf_mmx_ikm.mean(axis=0), 3) * 100, avg_hgf_mmx_ikm.std(axis=0)

(array([53.1, 36.9, 28.8]), array([0., 0., 0.]))

In [7]:
vmf_soft = VonMisesFisherMixture(n_clusters=n_clusters, posterior_type='soft')
vmf_soft.fit(z.detach().numpy())
print()




In [8]:
print('weights: {}'.format(vmf_soft.weights_))
print('concentrations: {}'.format(vmf_soft.concentrations_))

print("-----------------------------")
print("Accuracy: %.3f" % eval.accuracy(y_train.detach().numpy() , vmf_soft.labels_))
print("Normalized Mutual Information: %.3f" % metrics.normalized_mutual_info_score(y_train.detach().numpy(), vmf_soft.labels_))
print("Adjusted Rand-Index: %.3f"
      % metrics.adjusted_rand_score(y_train.detach().numpy(), vmf_soft.labels_))
print("Adjusted Mututal Information: %.3f"
      % metrics.adjusted_mutual_info_score(y_train.detach().numpy(), vmf_soft.labels_))
print("Normalized Mututal Information: %.3f"
      % metrics.normalized_mutual_info_score(y_train.detach().numpy(), vmf_soft.labels_))
print("Silhouette Coefficient (euclidean): %0.3f"
      % metrics.silhouette_score(z.detach().numpy() , vmf_soft.labels_, metric='euclidean'))
print("Silhouette Coefficient (cosine): %0.3f"
      % metrics.silhouette_score(z.detach().numpy(), vmf_soft.labels_, metric='cosine'))
print("Homogeneity: %0.3f" % metrics.homogeneity_score(y_train.detach().numpy(), vmf_soft.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(y_train.detach().numpy(), vmf_soft.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(y_train.detach().numpy(), vmf_soft.labels_))

print()

weights: [0.14898777 0.10127717 0.06715479 0.11608927 0.15468218 0.11753136
 0.14697831 0.14729916]
concentrations: [13853.50674574 15178.15122564 25991.97559776  7301.11641314
  7314.41932353 16301.0791993   6667.5529408  13116.36371907]
-----------------------------
Accuracy: 0.500
Normalized Mutual Information: 0.356
Adjusted Rand-Index: 0.262
Adjusted Mututal Information: 0.355
Normalized Mututal Information: 0.356


Silhouette Coefficient (euclidean): 0.216
Silhouette Coefficient (cosine): 0.349
Homogeneity: 0.361
Completeness: 0.350
V-measure: 0.356



In [9]:
###############################################################################
# Spherical K-Means clustering
skm = SphericalKMeans(n_clusters=n_clusters, n_init=20)

print("Clustering with %s" % skm)
skm.fit(z.detach().numpy())
print()

print("-----------------------------")
print("Accuracy: %.3f" % eval.accuracy(y_train.detach().numpy(), skm.labels_))
print("Normalized Mutual Information: %.3f" % metrics.normalized_mutual_info_score(y_train.detach().numpy(), skm.labels_))
print("Adjusted Rand-Index: %.3f"
      % metrics.adjusted_rand_score(y_train.detach().numpy(), skm.labels_))
print("Adjusted Mututal Information: %.3f"
      % metrics.adjusted_mutual_info_score(y_train.detach().numpy(), skm.labels_))
print("Normalized Mututal Information: %.3f"
      % metrics.normalized_mutual_info_score(y_train.detach().numpy(), skm.labels_))
print("Silhouette Coefficient (euclidean): %0.3f"
      % metrics.silhouette_score(z.detach().numpy(), skm.labels_, metric='euclidean'))
print("Silhouette Coefficient (cosine): %0.3f"
      % metrics.silhouette_score(z.detach().numpy(), skm.labels_, metric='cosine'))
print("Homogeneity: %0.3f" % metrics.homogeneity_score(y_train.detach().numpy(), skm.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(y_train.detach().numpy(), skm.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(y_train.detach().numpy(), skm.labels_))

print()

Clustering with SphericalKMeans(n_init=20)


TypeError: _BaseKMeans._init_centroids() missing 1 required positional argument: 'sample_weight'