# URL Embedding Clustering

In [1]:
import os
import numpy as np
import pandas as pd
import sys
sys.path.append(os.path.abspath(".."))

from hdbscan import HDBSCAN
from sklearn import metrics
from sklearn.manifold import TSNE
from gensim.models import Word2Vec
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans

from url2vec.util.plotter import *
from url2vec.util.metrics import *
from url2vec.util.seqmanager import *

import plotly.plotly as py
from plotly.graph_objs import *
from __future__ import print_function
from plotly.tools import FigureFactory as FF

In [2]:
# Available datasets:
#   cs.illinois.edu, cs.stanford.edu, eecs.mit.edu, cs.princeton.edu, cs.ox.ac.uk
# All three settings are strings: they are concatenated into dataset paths below.
site, words, depth = "cs.princeton.edu", "10000", "7"

The crawling process was carried out in two different ways:

- **No constraint**: the crawler follows a random outlink chosen among all of the outlinks in a given page
- **List constraint**: the crawler follows a random outlink, but only among the outlinks that appear in "lists"

## Word2Vec - no-constraint
Word embedding algorithm. Word2vec is a two-layer neural net that processes text. Its input is a text corpus and its output is a set of vectors: feature vectors for words in that corpus. 

Here we're loading all the files that the crawler has generated to train word2vec model.

See the [Dataset README](https://github.com/chrisPiemonte/url2vec/tree/master/dataset "Dataset") for further information.

In [3]:
# Locate the no-constraint crawl output for the selected site/words/depth and
# load the mapping stored in urlsMap.txt.
nocostraint_path = "{}/../dataset/{}/no_constraint/words{}_depth{}/".format(
    os.getcwd(), site, words, depth
)
nocostraint_urlmap_path = "".join([nocostraint_path, "urlsMap.txt"])
nocostraint_seq_path = "".join([nocostraint_path, "sequenceIDs.txt"])

nocostraint_urlmap = get_urlmap(nocostraint_urlmap_path)

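Training the model. No need to keep everything in RAM, so we're passing two generators over the same sequence file: one to build the vocabulary and one to train, since a generator can only be consumed once.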

**PARAMETERS**:

- **min_count**: ignore all words with total frequency lower than this
- **window**: is the maximum distance between the current and predicted word within a sentence
- **negative**: if > 0, negative sampling will be used; the int for negative specifies how many “noise words” should be drawn (usually between 5-20). Default is 5. If set to 0, no negative sampling is used

In [4]:
# because of generator
vocab_sequences_nc = get_sequences(nocostraint_seq_path)
train_sequences_nc = get_sequences(nocostraint_seq_path)

w2v_model_nc = Word2Vec(min_count=1, window=5, negative=5, sg=1)
%time w2v_model_nc.build_vocab(vocab_sequences_nc)
%time w2v_model_nc.train(train_sequences_nc)

CPU times: user 381 ms, sys: 9.44 ms, total: 391 ms
Wall time: 403 ms
CPU times: user 2.63 s, sys: 39.1 ms, total: 2.67 s
Wall time: 1.01 s


601750

In [50]:

# Scratch inspection: collect the embedding of every vocabulary entry and peek
# at one of them.
diz = {i: w2v_model_nc[i] for i in w2v_model_nc.vocab}
diz2 = w2v_model_nc.vocab
# Dict views cannot be indexed on Python 3, so take the first value through an
# iterator; on Python 2 this is the same element as diz.values()[0].
next(iter(diz.values()))

array([-0.18416683, -0.29268011,  0.3756364 , -1.01170123,  0.87275463,
        0.75541896,  0.5868122 ,  0.10207894,  0.64905965,  0.12065787,
       -0.17724983, -0.52417189,  0.06934015, -0.116202  ,  0.55760622,
       -0.81905454,  0.49707201, -0.27901283, -0.65249157, -0.51395857,
       -1.22680342,  0.26579887,  1.03209412,  0.52609581,  0.68975943,
       -0.45319003, -0.2502383 , -0.25482571, -0.12971771,  0.060135  ,
       -0.68587554,  1.01519752, -0.4740569 ,  0.9070183 , -0.20834909,
        0.03024027,  0.14006303,  0.19124502, -0.59924573,  0.71004677,
        0.39372641, -0.60762513,  0.65921474,  0.09642118, -0.26362443,
        0.41781989, -0.40456322,  0.17181772, -1.03776526,  0.10366169,
       -1.12846363,  0.93439329,  0.16849816,  0.33545765, -0.23753636,
       -0.05055157, -0.14717096, -0.00191312, -0.34755304,  0.10337683,
        0.25634915,  0.57596964,  0.01621638,  0.04680486, -0.15129951,
        0.64662731,  0.38645822,  0.68142247, -0.06355645, -0.31

In [5]:
# Remove from the URL map every id that is not in the model vocabulary, so each
# remaining id can be looked up in the model.
url_not_in_sequences = list(
    set(nocostraint_urlmap).difference(w2v_model_nc.vocab)
)
print(len(url_not_in_sequences))
for url in url_not_in_sequences:
    del nocostraint_urlmap[url]

3843


### t-SNE
Applying t-SNE for dimensionality reduction. We need two dimensional vectors for visualization purposes.

In [6]:
print(len(nocostraint_urlmap))
print(len(w2v_model_nc.vocab))
# 100-dim vecs
wordvecs_nc = np.array([w2v_model_nc[key] for key in nocostraint_urlmap], dtype="float64")

# URL labels
urls_nc = [nocostraint_urlmap[key] for key in nocostraint_urlmap]

# 2-dim vecs
tsne = TSNE(n_components=2)
twodim_wordvecs_nc = %time tsne.fit_transform(wordvecs_nc)

10909
10909


KeyboardInterrupt: 

## Clustering - No-Constraint Word2Vec model

### DBSCAN
DBSCAN - Density-Based Spatial Clustering of Applications with Noise. Finds core samples of high density and expands clusters from them. Good for data which contains clusters of similar density.

**PARAMETERS**:

- **eps** : The maximum distance between two samples for them to be considered as in the same neighborhood.
- **min_samples** : The number of samples (or total weight) in a neighborhood for a point to be considered as a core point. This includes the point itself.

In [None]:
dbscan = DBSCAN(eps=0.8, min_samples=7)
%time dbscan.fit(wordvecs_nc)
dbscan_labels_nc = dbscan.labels_

dbscan_colors_nc = [get_color(clust) for clust in dbscan_labels_nc]

print("Clusters found with DBSCAN:", len(set(dbscan_labels_nc)))
print ([label for label in set(dbscan_labels_nc)])
print("\n\n")

### DBSCAN Plot

In [None]:
# Scatter the 2-D vectors, labelled by URL and coloured by DBSCAN cluster.
dbscan_data_nc = scatter_plot(
    twodim_wordvecs_nc,
    urls_nc,
    dbscan_colors_nc,
)
py.iplot(
    dbscan_data_nc,
    filename='Word Vectors Nocostraint - Scatter plot DBSCAN',
)

<div>
    <a href="https://plot.ly/~chrispolo/0" 
        target="_blank" title="y" 
        style="display: none; text-align: center;">
            <img src="../dataset/img/nc_wordvectors_scatter_plot_DBSCAN.png" 
                alt="y" style="max-width: 100%;width: 1121px;"  
                width="100%" onerror="this.onerror=null;this.src='https://plot.ly/404';" />
    </a>
    <script data-plotly="chrispolo:0"  src="https://plot.ly/embed.js" async></script>
</div>

### HDBSCAN
HDBSCAN - Hierarchical Density-Based Spatial Clustering of Applications with Noise. Performs DBSCAN over varying epsilon values and integrates the result to find a clustering that gives the best stability over epsilon. This allows HDBSCAN to find clusters of varying densities (unlike DBSCAN), and be more robust to parameter selection.

**PARAMETERS**:
- **min_cluster_size** : minimum nodes to form a cluster

In [None]:
hdbscan = HDBSCAN(min_cluster_size=10)
%time hdbscan_labels_nc = hdbscan.fit_predict(wordvecs_nc)

hdbscan_colors_nc = [get_color(clust) for clust in hdbscan_labels_nc]

print("Clusters found with HDBSCAN:", len(set(hdbscan_labels_nc)))
print([label for label in set(hdbscan_labels_nc)])
print("\n\n")

### HDBSCAN Plot

In [None]:
# Scatter the 2-D vectors, labelled by URL and coloured by HDBSCAN cluster.
hdbscan_data_nc = scatter_plot(
    twodim_wordvecs_nc,
    urls_nc,
    hdbscan_colors_nc,
)
py.iplot(
    hdbscan_data_nc,
    filename='Word Vectors Nocostraint - Scatter plot HDBSCAN',
)

<div>
    <a href="https://plot.ly/~chrispolo/2" 
        target="_blank" title="y" 
        style="display: none; text-align: center;">
            <img src="../dataset/img/nc_wordvectors_scatter_plot_HDBSCAN.png" 
                alt="y" style="max-width: 100%;width: 1121px;"  
                width="100%" onerror="this.onerror=null;this.src='https://plot.ly/404';" />
    </a>
    <script data-plotly="chrispolo:2"  src="https://plot.ly/embed.js" async></script>
</div>

### K-MEANS
The first step is to initialize the algorithm by choosing K initial cluster centroid locations. This is typically done by randomly choosing K points from the input set. With these initial K centroids, the algorithm proceeds by repeating the following two main steps:

- Cluster Assignment - Here, each observation (e.g., each point in the data set) is assigned to a cluster centroid such that the WCSS objective function is minimized. This can often be translated to assigning each observation to the closest cluster centroid (which coincidentally minimizes WCSS for many distance metrics), though for some distance metrics and spaces this need not be the case.
- Update Centroids - After all of the input observations have been assigned to a cluster centroid, each centroid is re-computed. For each cluster, the new centroid is computed by averaging the observations that were assigned to it (e.g., computing the 'mean' of the observations).

These steps are repeated until the algorithm "converges".

**PARAMETERS**:

- **n_clusters**: number of clusters

In [None]:
kmeans = KMeans(n_clusters=30)
%time kmeans.fit(wordvecs_nc)

kmeans_labels_nc = kmeans.labels_

kmeans_colors_nc = [get_color(clust) for clust in kmeans_labels_nc]

print("Clusters found with K-MEANS:", len(set(kmeans_labels_nc)))
print([label for label in set(kmeans_labels_nc)])

### K-MEANS Plot

In [None]:
# Scatter the 2-D vectors, labelled by URL and coloured by K-Means cluster.
kmeans_data_nc = scatter_plot(
    twodim_wordvecs_nc,
    urls_nc,
    kmeans_colors_nc,
)
py.iplot(
    kmeans_data_nc,
    filename='Word Vectors Nocostraint - Scatter plot K-MEANS',
)

<div>
    <a href="https://plot.ly/~chrispolo/4" 
        target="_blank" title="y" 
        style="display: none; text-align: center;">
            <img src="../dataset/img/nc_wordvectors_scatter_plot_KMEANS.png" 
                alt="y" style="max-width: 100%;width: 1121px;"  
                width="100%" onerror="this.onerror=null;this.src='https://plot.ly/404';" />
    </a>
    <script data-plotly="chrispolo:4"  src="https://plot.ly/embed.js" async></script>
</div>

### GROUND TRUTH

In [None]:
# Look up the ground-truth class of every URL, walking the URL map in the same
# order used to build the word vectors.
gt = GroundTruth(os.getcwd() + "/../dataset/" + site + "/ground_truth/urlToMembership.txt")
ground_truth_nc = []
for key in nocostraint_urlmap:
    ground_truth_nc.append(int(gt.get_groundtruth(nocostraint_urlmap[key])))

real_colors_nc = list(map(get_color, ground_truth_nc))

print("Clusters found manually:", len(set(ground_truth_nc)))
print(list(set(ground_truth_nc)))

### GROUND TRUTH Plot

In [None]:
# Scatter the 2-D vectors, labelled by URL and coloured by ground-truth class.
groundtruth_data_nc = scatter_plot(
    twodim_wordvecs_nc,
    urls_nc,
    real_colors_nc,
)
py.iplot(
    groundtruth_data_nc,
    filename='Word Vectors Nocostraint - Scatter plot Ground Truth',
)

<div>
    <a href="https://plot.ly/~chrispolo/56" 
        target="_blank" title="y" 
        style="display: none; text-align: center;">
            <img src="../dataset/img/nc_wordvectors_scatter_plot_GROUNDTRUTH.png" 
                alt="y" style="max-width: 100%;width: 1121px;"  
                width="100%" onerror="this.onerror=null;this.src='https://plot.ly/404';" />
    </a>
    <script data-plotly="chrispolo:56"  src="https://plot.ly/embed.js" async></script>
</div>

---

## Word2Vec - list-constraint
Word embedding algorithm. Word2vec is a two-layer neural net that processes text. Its input is a text corpus and its output is a set of vectors: feature vectors for words in that corpus. 

Here we're loading all the files that the crawler has generated to train word2vec model.

See the [Dataset README](https://github.com/chrisPiemonte/url2vec/tree/master/dataset "Dataset") for further information.

In [None]:
# Locate the list-constraint crawl output for the selected site/words/depth.
listcostraint_path = "{}/../dataset/{}/list_constraint/words{}_depth{}/".format(
    os.getcwd(), site, words, depth
)
listcostraint_urlmap_path = "".join([listcostraint_path, "urlsMap.txt"])
listcostraint_seq_path = "".join([listcostraint_path, "sequenceIDs.txt"])

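Training the model. No need to keep everything in RAM, so we're passing two generators over the same sequence file: one to build the vocabulary and one to train, since a generator can only be consumed once.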

**PARAMETERS**:

- **min_count**: ignore all words with total frequency lower than this
- **window**: is the maximum distance between the current and predicted word within a sentence
- **negative**: if > 0, negative sampling will be used; the int for negative specifies how many “noise words” should be drawn (usually between 5-20). Default is 5. If set to 0, no negative sampling is used

In [None]:
# because of generator
vocab_seq_lc = get_sequences(listcostraint_seq_path)
train_seq_lc = get_sequences(listcostraint_seq_path)

word2vec_lc = Word2Vec(min_count=1, window=5, negative=5, sg=1)
%time word2vec_lc.build_vocab(vocab_seq_lc)
%time word2vec_lc.train(train_seq_lc)

### t-SNE
Applying t-SNE for dimensionality reduction. We need two dimensional vectors for visualization purposes.

In [None]:
listcostraint_urlmap = get_urlmap(listcostraint_urlmap_path)

url_not_in_sequences = list(set(listcostraint_urlmap) - set(word2vec_lc.vocab))
print(len(url_not_in_sequences))
for url in url_not_in_sequences:
    del listcostraint_urlmap[url]

# 100-dim vecs
wordvecs_lc = np.array([word2vec_lc[key] for key in listcostraint_urlmap], dtype="float64")

# URL labels
urls_lc = [listcostraint_urlmap[key] for key in listcostraint_urlmap]

# 2-dim vecs
tsne = TSNE(n_components=2)
twodim_wordvecs_lc = %time tsne.fit_transform(wordvecs_lc)

### DBSCAN
DBSCAN - Density-Based Spatial Clustering of Applications with Noise. Finds core samples of high density and expands clusters from them. Good for data which contains clusters of similar density.

**PARAMETERS**:

- **eps** : The maximum distance between two samples for them to be considered as in the same neighborhood.
- **min_samples** : The number of samples (or total weight) in a neighborhood for a point to be considered as a core point. This includes the point itself.

In [None]:
dbscan = DBSCAN(eps=0.7, min_samples=5)
%time dbscan.fit(wordvecs_lc)
dbscan_labels_lc = dbscan.labels_

dbscan_colors_lc = [get_color(clust) for clust in dbscan_labels_lc]

print("Clusters found with DBSCAN:", len(set(dbscan_labels_lc)))
print ([label for label in set(dbscan_labels_lc)])
print("\n\n")

### DBSCAN Plot

In [None]:
# Scatter the 2-D vectors, labelled by URL and coloured by DBSCAN cluster.
dbscan_data_lc = scatter_plot(
    twodim_wordvecs_lc,
    urls_lc,
    dbscan_colors_lc,
)
py.iplot(
    dbscan_data_lc,
    filename='Word Vectors Lists - DBSCAN',
)

<div>
    <a href="https://plot.ly/~chrispolo/60" 
        target="_blank" title="y" 
        style="display: none; text-align: center;">
            <img src="../dataset/img/lc_wordvectors_scatter_plot_DBSCAN.png" 
                alt="y" style="max-width: 100%;width: 1121px;"  
                width="100%" onerror="this.onerror=null;this.src='https://plot.ly/404';" />
    </a>
    <script data-plotly="chrispolo:60"  src="https://plot.ly/embed.js" async></script>
</div>

### HDBSCAN
HDBSCAN - Hierarchical Density-Based Spatial Clustering of Applications with Noise. Performs DBSCAN over varying epsilon values and integrates the result to find a clustering that gives the best stability over epsilon. This allows HDBSCAN to find clusters of varying densities (unlike DBSCAN), and be more robust to parameter selection.

**PARAMETERS**:
- **min_cluster_size** : minimum nodes to form a cluster

In [None]:
hdbscan = HDBSCAN(min_cluster_size=8)
%time hdbscan_labels_lc = hdbscan.fit_predict(wordvecs_lc)

hdbscan_colors_lc = [get_color(clust) for clust in hdbscan_labels_lc]

print("Clusters found with HDBSCAN:", len(set(hdbscan_labels_lc)))
print([label for label in set(hdbscan_labels_lc)])
print("\n\n")

### HDBSCAN Plot

In [None]:
# Scatter the 2-D vectors, labelled by URL and coloured by HDBSCAN cluster.
hdbscan_data_lc = scatter_plot(
    twodim_wordvecs_lc,
    urls_lc,
    hdbscan_colors_lc,
)
py.iplot(
    hdbscan_data_lc,
    filename='Word Vectors Lists - Scatter plot HDBSCAN',
)

<div>
    <a href="https://plot.ly/~chrispolo/62" 
        target="_blank" title="y" 
        style="display: none; text-align: center;">
            <img src="../dataset/img/lc_wordvectors_scatter_plot_HDBSCAN.png" 
                alt="y" style="max-width: 100%;width: 1121px;"  
                width="100%" onerror="this.onerror=null;this.src='https://plot.ly/404';" />
    </a>
    <script data-plotly="chrispolo:62"  src="https://plot.ly/embed.js" async></script>
</div>

### K-MEANS
The first step is to initialize the algorithm by choosing K initial cluster centroid locations. This is typically done by randomly choosing K points from the input set. With these initial K centroids, the algorithm proceeds by repeating the following two main steps:

- Cluster Assignment - Here, each observation (e.g., each point in the data set) is assigned to a cluster centroid such that the WCSS objective function is minimized. This can often be translated to assigning each observation to the closest cluster centroid (which coincidentally minimizes WCSS for many distance metrics), though for some distance metrics and spaces this need not be the case.
- Update Centroids - After all of the input observations have been assigned to a cluster centroid, each centroid is re-computed. For each cluster, the new centroid is computed by averaging the observations that were assigned to it (e.g., computing the 'mean' of the observations).

These steps are repeated until the algorithm "converges".

**PARAMETERS**:

- **n_clusters**: number of clusters

In [None]:
kmeans = KMeans(n_clusters=30)
%time kmeans.fit(wordvecs_lc)

kmeans_labels_lc = kmeans.labels_

kmeans_colors_lc = [get_color(clust) for clust in kmeans_labels_lc]

print("Clusters found with K-MEANS:", len(set(kmeans_labels_lc)))
print([label for label in set(kmeans_labels_lc)])

### K-MEANS Plot

In [None]:
# Scatter the 2-D vectors, labelled by URL and coloured by K-Means cluster.
kmeans_data_lc = scatter_plot(
    twodim_wordvecs_lc,
    urls_lc,
    kmeans_colors_lc,
)
py.iplot(
    kmeans_data_lc,
    filename='Word Vectors Lists - Scatter plot K-MEANS',
)

<div>
    <a href="https://plot.ly/~chrispolo/64" 
        target="_blank" title="y" 
        style="display: none; text-align: center;">
            <img src="../dataset/img/lc_wordvectors_scatter_plot_KMEANS.png" 
                alt="y" style="max-width: 100%;width: 1121px;"  
                width="100%" onerror="this.onerror=null;this.src='https://plot.ly/404';" />
    </a>
    <script data-plotly="chrispolo:64"  src="https://plot.ly/embed.js" async></script>
</div>

### GROUND TRUTH

In [None]:
# Look up the ground-truth class of every URL, walking the URL map in the same
# order used to build the word vectors.
gt = GroundTruth(os.getcwd() + "/../dataset/" + site + "/ground_truth/urlToMembership.txt")
ground_truth_lc = []
for key in listcostraint_urlmap:
    ground_truth_lc.append(int(gt.get_groundtruth(listcostraint_urlmap[key])))

real_colors_lc = list(map(get_color, ground_truth_lc))

print("Clusters found manually:", len(set(ground_truth_lc)))
print(list(set(ground_truth_lc)))

### GROUND TRUTH Plot

In [None]:
# Scatter the 2-D vectors, labelled by URL and coloured by ground-truth class.
groundtruth_data_lc = scatter_plot(
    twodim_wordvecs_lc,
    urls_lc,
    real_colors_lc,
)
py.iplot(
    groundtruth_data_lc,
    filename='Word Vectors Lists - Scatter plot Ground Truth',
)

<div>
    <a href="https://plot.ly/~chrispolo/66" 
        target="_blank" title="y" 
        style="display: none; text-align: center;">
            <img src="../dataset/img/lc_wordvectors_scatter_plot_GROUNDTRUTH.png" 
                alt="y" style="max-width: 100%;width: 1121px;"  
                width="100%" onerror="this.onerror=null;this.src='https://plot.ly/404';" />
    </a>
    <script data-plotly="chrispolo:66"  src="https://plot.ly/embed.js" async></script>
</div>

## Using the model

#### Analogies
Training the model allows some operations like the famous "king – man + woman = queen":

Here we are combining the vectors of the System Programming course **(33)**, its instructor **(253)** and another course, Computer Architecture **(32)** (these IDs refer to the cs.illinois.edu dataset)

33:  http://cs.illinois.edu/courses/profile/CS241-120158

253: http://cs.illinois.edu/directory/profile/angrave

32:  http://cs.illinois.edu/courses/profile/CS233-120158

and expecting its instructor: http://cs.illinois.edu/directory/profile/zilles **(251)**

In [None]:
# hand is to palm as foot is to ____ (it's sole if you're wondering)
# HAND : PALM : : FOOT : ____
# NOTE(review): the ids and URL prefixes below belong to the cs.illinois.edu
# crawl; confirm `site` points at that dataset before reading the result.
hand = "33"
palm = "253"
foot = "32"

# A : B :: C : ?  is solved as  B - A + C  (king - man + woman = queen):
# the known answer (palm) and the new question (foot) are added, and the old
# question (hand) is subtracted.
most_similar_list = word2vec_lc.most_similar(positive=[palm, foot], negative=[hand], topn=1)
sole = most_similar_list[0][0]

print(listcostraint_urlmap[hand].replace("http://cs.illinois.edu/courses/profile/", ""), "is to", 
      listcostraint_urlmap[palm].replace("http://cs.illinois.edu/directory/profile/", ""), "as",
      listcostraint_urlmap[foot].replace("http://cs.illinois.edu/courses/profile/", ""), "is to",
      listcostraint_urlmap[sole].replace("http://cs.illinois.edu/directory/profile/", "")
)

print("")
print(hand, listcostraint_urlmap[hand])
print(palm, listcostraint_urlmap[palm])
print(foot, listcostraint_urlmap[foot])
print(sole, listcostraint_urlmap[sole])

#### Doesn't match
Word that doesn't go with the others:

Artificial Intelligence research lab:
- 128 http://cs.illinois.edu/research/artificial-intelligence

Its researchers:
- 304 http://cs.illinois.edu/directory/profile/mrebl
- 271 http://cs.illinois.edu/directory/profile/daf
- 305 http://cs.illinois.edu/directory/profile/juliahmr
- 363 http://cs.illinois.edu/directory/profile/dhoiem

Another guy:
- 361 http://cs.illinois.edu/directory/profile/wgropp


In [None]:
# Ask the model which of these five page ids fits least with the other four.
word2vec_lc.doesnt_match(["304", "271", "305", "361", "363"])

---

## Evaluation
Evaluating the performance of a clustering algorithm is not as trivial as counting the number of errors or the precision and recall of a supervised classification algorithm. In particular, an evaluation metric should not take the absolute values of the cluster labels into account, but rather whether the clustering defines separations of the data similar to some ground-truth set of classes, or satisfies some assumption such that members belonging to the same class are more similar than members of different classes according to some similarity metric.

See the [scikit-learn documentation](http://scikit-learn.org/stable/modules/clustering.html#clustering-performance-evaluation "Clustering performance evaluation") for further information

### Metrics:

- **Homogeneity**: each cluster contains only members of a single class


- **Completeness**: all members of a given class are assigned to the same cluster


- **Adjusted Rand index**: Given the knowledge of the *ground truth* class assignments and our clustering algorithm assignments of the same samples, the adjusted Rand index is a function that measures the similarity of the two assignments, ignoring permutations and with chance normalization


- **V-measure**: The harmonic mean of homogeneity and completeness. It is equivalent to the mutual information (described below) normalized by the sum of the label entropies


- **Mutual Information based scores**: Given the knowledge of the ground truth class assignments and our clustering algorithm assignments of the same samples, the Mutual Information is a function that measures the agreement of the two assignments, ignoring permutations. Two different normalized versions of this measure are available, Normalized Mutual Information (NMI) and Adjusted Mutual Information (AMI). NMI is often used in the literature, while AMI was proposed more recently and is normalized against chance


- **Silhouette**: If the ground truth labels are not known, evaluation must be performed using the model itself. The Silhouette Coefficient is an example of such an evaluation, where a higher Silhouette Coefficient score relates to a model with better defined clusters. The score is bounded between -1 for incorrect clustering and +1 for highly dense clustering. Scores around zero indicate overlapping clusters. The score is higher when clusters are dense and well separated, which relates to a standard concept of a cluster.

In [None]:
def _clustering_scores(truth, labels, vectors):
    """Return the six evaluation scores of one clustering, in column order.

    truth   -- ground-truth class of every sample
    labels  -- cluster label assigned to every sample
    vectors -- the vectors that were clustered (used by the silhouette score)
    """
    return [
        metrics.homogeneity_score(truth, labels),
        metrics.completeness_score(truth, labels),
        metrics.v_measure_score(truth, labels),
        metrics.adjusted_rand_score(truth, labels),
        # shown under the "Mutual Information" column: this is the adjusted (AMI) score
        metrics.adjusted_mutual_info_score(truth, labels),
        metrics.silhouette_score(vectors, labels, metric='euclidean'),
    ]


# One entry per experiment: (row label, ground truth, predicted labels, vectors)
_experiments = [
    ("NoCostraint - DBSCAN", ground_truth_nc, dbscan_labels_nc, wordvecs_nc),
    ("NoCostraint - HDBSCAN", ground_truth_nc, hdbscan_labels_nc, wordvecs_nc),
    ("NoCostraint - K-MEANS", ground_truth_nc, kmeans_labels_nc, wordvecs_nc),
    ("ListCostraint - DBSCAN", ground_truth_lc, dbscan_labels_lc, wordvecs_lc),
    ("ListCostraint - HDBSCAN", ground_truth_lc, hdbscan_labels_lc, wordvecs_lc),
    ("ListCostraint - K-MEANS", ground_truth_lc, kmeans_labels_lc, wordvecs_lc),
]

metrics_df = pd.DataFrame(
    [_clustering_scores(truth, labels, vectors) for _, truth, labels, vectors in _experiments],
    index=[name for name, _, _, _ in _experiments],
    columns=[
        "Homogeneity",
        "Completeness",
        "V-Measure core",
        "Adjusted Rand index",
        "Mutual Information",
        "Silhouette",
    ],
)

metrics_df