In [2]:
import warnings
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")

import sys
from os.path import abspath
sys.path.insert(0, abspath('..'))

from os.path import join

from sklearn import metrics
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from spherecluster import SphericalKMeans, VonMisesFisherMixture

import numpy as np
from tabulate import tabulate

import logging

import torch


from torchSTC.data import load_data
from torchSTC.modules import STC
from torchSTC.metrics import SpacePlot, Evaluate
from torchSTC.utils.cluster import SphericalKmeans

plot = SpacePlot()
eval = Evaluate()

[nltk_data] Downloading package punkt to /home/godwin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to /home/godwin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


>>>>> /home/godwin/Documents/academic/PPD/torchSTC/demos/SearchSnippets
>>>>> data_loader.py cwd:  /home/godwin/Documents/academic/PPD/torchSTC/demos/SearchSnippets


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Display progress logs on stdout
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(message)s')

In [4]:
cur = abspath("")
dataset = 'SearchSnippets'
data_in_dir=join(cur, "../..", "datasets")
dataset_dir=join(data_in_dir, dataset)

In [5]:
checkpoint_dir=join(dataset_dir, 
                    "artefacts",
                    "STC-d384:500:500:2000:20-epoch30-datSearchSnippets-wdeHuggingFace-scaMinMax-tfeNone-norml2-initKmeans"
                    )

checkpoint = "STC-datSearchSnippets-wdeHuggingFace-scaMinMax-tfeNone-norml2-initKmeans.pth"
checkpoint_path = join(checkpoint_dir, checkpoint)
checkpoint_path

'/home/godwin/Documents/academic/PPD/torchSTC/demos/SearchSnippets/../../datasets/SearchSnippets/artefacts/STC-d384:500:500:2000:20-epoch30-datSearchSnippets-wdeHuggingFace-scaMinMax-tfeNone-norml2-initKmeans/STC-datSearchSnippets-wdeHuggingFace-scaMinMax-tfeNone-norml2-initKmeans.pth'

### STC-datstackoverflow-wdeHuggingFace-scaMinMax-tfeNone-normNone-initKmeans + Kmeans

In [6]:
x, y = load_data(dataset=dataset_dir, word_emb='HuggingFace', transform=None, scaler='MinMax', norm='l2')
n_clusters = len(torch.unique(torch.tensor(y)))

# Division des données en ensembles d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=0)

# conversion des données en tenseurs
X_train = torch.tensor(X_train, dtype=torch.float)
X_test = torch.tensor(X_test, dtype=torch.float)
y_train = torch.tensor(y_train, dtype=torch.long)
y_test = torch.tensor(y_test, dtype=torch.long)

stc = STC(hidden_dims=[torch.Tensor(X_train).shape[-1], 500, 500, 2000, 20], n_clusters=n_clusters)
stc.from_pretrained(checkpoint_path)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

MinMax scaling completed...


(torch.Size([11106, 384]),
 torch.Size([1234, 384]),
 torch.Size([11106]),
 torch.Size([1234]))

In [7]:
zt = stc.autoencoder.encoder(X_train)
zy = zt.detach().numpy()
zy.shape

zyn = normalize(zy, norm='l2')

In [8]:
xty = X_train.detach().numpy()
yty = y_train.detach().numpy()

In [25]:
# table for results display
table = []

In [10]:
# movMF-soft
vmf_soft = VonMisesFisherMixture(n_clusters=n_clusters, posterior_type='soft')
vmf_soft.fit(xty)

# vmf_soft.cluster_centers_
# vmf_soft.labels_
# vmf_soft.weights_
# vmf_soft.concentrations_
# vmf_soft.inertia_

In [11]:
print()
print('weights: {}'.format(vmf_soft.weights_))
print('concentrations: {}'.format(vmf_soft.concentrations_))

print("-----------------------------")
print("Accuracy: %.3f" % eval.accuracy(yty, vmf_soft.labels_))
print("Normalized Mutual Information: %.3f" % metrics.normalized_mutual_info_score(yty, vmf_soft.labels_))
print("Adjusted Rand-Index: %.3f"
      % metrics.adjusted_rand_score(yty, vmf_soft.labels_))
print("Adjusted Mututal Information: %.3f"
      % metrics.adjusted_mutual_info_score(yty, vmf_soft.labels_))
print("Normalized Mututal Information: %.3f"
      % metrics.normalized_mutual_info_score(yty, vmf_soft.labels_))
print("Silhouette Coefficient (euclidean): %0.3f"
      % metrics.silhouette_score(xty, vmf_soft.labels_, metric='euclidean'))
print("Silhouette Coefficient (cosine): %0.3f"
      % metrics.silhouette_score(xty, vmf_soft.labels_, metric='cosine'))
print("Homogeneity: %0.3f" % metrics.homogeneity_score(yty, vmf_soft.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(yty, vmf_soft.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(yty, vmf_soft.labels_))

print()


weights: [0.17461282 0.10222068 0.12561772 0.14099895 0.1132711  0.10805085
 0.05966506 0.17556281]
concentrations: [6502.16152862 6569.5254014  6480.73338393 6131.02420901 6699.41793633
 6528.2453156  6989.07034233 6102.28040553]
-----------------------------
Accuracy: 0.736
Normalized Mutual Information: 0.586
Adjusted Rand-Index: 0.536
Adjusted Mututal Information: 0.586
Normalized Mututal Information: 0.586
Silhouette Coefficient (euclidean): 0.035
Silhouette Coefficient (cosine): 0.066
Homogeneity: 0.593
Completeness: 0.579
V-measure: 0.586



In [26]:
table.append([
    'movMF-soft',
    eval.accuracy(yty, vmf_soft.labels_),
    metrics.normalized_mutual_info_score(yty, vmf_soft.labels_),
    metrics.adjusted_rand_score(yty, vmf_soft.labels_),
    metrics.adjusted_mutual_info_score(yty, vmf_soft.labels_),
    metrics.homogeneity_score(yty, vmf_soft.labels_),
    metrics.completeness_score(yty, vmf_soft.labels_),
    metrics.v_measure_score(yty, vmf_soft.labels_),
    metrics.silhouette_score(xty, vmf_soft.labels_, metric='cosine'),
    metrics.silhouette_score(xty, vmf_soft.labels_, metric='euclidean')])

In [13]:
# movMF-hard
from spherecluster import VonMisesFisherMixture
vmf_hard = VonMisesFisherMixture(n_clusters=n_clusters, posterior_type='hard')
vmf_hard.fit(xty)

# vmf_hard.cluster_centers_
# vmf_hard.labels_
# vmf_hard.weights_
# vmf_hard.concentrations_
# vmf_hard.inertia_

In [14]:
print()
print('weights: {}'.format(vmf_hard.weights_))
print('concentrations: {}'.format(vmf_hard.concentrations_))

print("-----------------------------")
print("Accuracy: %.3f" % eval.accuracy(yty, vmf_hard.labels_))
print("Normalized Mutual Information: %.3f" % metrics.normalized_mutual_info_score(yty, vmf_hard.labels_))
print("Adjusted Rand-Index: %.3f"
      % metrics.adjusted_rand_score(yty, vmf_hard.labels_))
print("Adjusted Mututal Information: %.3f"
      % metrics.adjusted_mutual_info_score(yty, vmf_hard.labels_))
print("Normalized Mututal Information: %.3f"
      % metrics.normalized_mutual_info_score(yty, vmf_hard.labels_))
print("Silhouette Coefficient (euclidean): %0.3f"
      % metrics.silhouette_score(xty, vmf_hard.labels_, metric='euclidean'))
print("Silhouette Coefficient (cosine): %0.3f"
      % metrics.silhouette_score(xty, vmf_hard.labels_, metric='cosine'))
print("Homogeneity: %0.3f" % metrics.homogeneity_score(yty, vmf_hard.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(yty, vmf_hard.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(yty, vmf_hard.labels_))

print()


weights: [0.11336215 0.12893931 0.14100486 0.17810193 0.16459571 0.06167837
 0.10867999 0.10363767]
concentrations: [6697.76765084 6456.98091941 6122.77575673 6473.72745141 6157.26310546
 6951.51672984 6524.76913129 6549.29912504]
-----------------------------
Accuracy: 0.742
Normalized Mutual Information: 0.593
Adjusted Rand-Index: 0.550
Adjusted Mututal Information: 0.592
Normalized Mututal Information: 0.593


Silhouette Coefficient (euclidean): 0.035
Silhouette Coefficient (cosine): 0.067
Homogeneity: 0.601
Completeness: 0.585
V-measure: 0.593



In [27]:
table.append([
    'movMF-hard',
    eval.accuracy(yty, vmf_hard.labels_),
    metrics.normalized_mutual_info_score(yty, vmf_hard.labels_),
    metrics.adjusted_rand_score(yty, vmf_hard.labels_),
    metrics.adjusted_mutual_info_score(yty, vmf_hard.labels_),
    metrics.homogeneity_score(yty, vmf_hard.labels_),
    metrics.completeness_score(yty, vmf_hard.labels_),
    metrics.v_measure_score(yty, vmf_hard.labels_),
    metrics.silhouette_score(xty, vmf_hard.labels_, metric='cosine'),
    metrics.silhouette_score(xty, vmf_hard.labels_, metric='euclidean')])

In [28]:

###############################################################################
# Print all results in table
headers = [
    'Accuracy',
    'Norm MI',
    'Adj Rand',
    'Adj MI',
    'Homogeneity',
    'Completeness',
    'V-Measure',
    'Silhouette (cos)',
    'Silhouette (euc)']
print(tabulate(table, headers, tablefmt="fancy_grid"))

╒════════════╤════════════╤═══════════╤════════════╤══════════╤═══════════════╤════════════════╤═════════════╤════════════════════╤════════════════════╕
│            │   Accuracy │   Norm MI │   Adj Rand │   Adj MI │   Homogeneity │   Completeness │   V-Measure │   Silhouette (cos) │   Silhouette (euc) │
╞════════════╪════════════╪═══════════╪════════════╪══════════╪═══════════════╪════════════════╪═════════════╪════════════════════╪════════════════════╡
│ movMF-soft │   0.735728 │  0.586135 │   0.536404 │ 0.58568  │      0.593443 │       0.579005 │    0.586135 │          0.0660351 │          0.0347537 │
├────────────┼────────────┼───────────┼────────────┼──────────┼───────────────┼────────────────┼─────────────┼────────────────────┼────────────────────┤
│ movMF-hard │   0.741761 │  0.592512 │   0.550204 │ 0.592064 │      0.600519 │       0.584716 │    0.592512 │          0.0667392 │          0.0351312 │
╘════════════╧════════════╧═══════════╧════════════╧══════════╧═══════════════╧═══

In [None]:
plot.commonSpace_plot(z2.detach().numpy(), comp=[0, 1], tagLabels=y_km_pred2,
                      data_name="Snippets hgf final Kmeans assign", dimred = 'UMAP')