In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
import numpy as np


# Path of the pickled DataFrame that the next cell loads with pd.read_pickle.
file_name = "df_vector.pkl"

In [None]:
# NOTE(review): unpickling can execute arbitrary code -- only load pickle
# files that come from a trusted source.
df_vector = pd.read_pickle(file_name)
# Preview the first rows; later cells read the FeatureVector and Theme columns.
df_vector.head()

# Test Train Split For Vectors
There is no need to standardize the data, as the vectors are already standardized.

In [None]:
# Independent copies of the feature column and of the label column, so later
# cells never touch df_vector itself.
x_vector = df_vector["FeatureVector"].copy()
y_vector = df_vector["Theme"].copy()
y_vector.head()

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
x_vec_train, x_vec_test, y_vec_train, y_vec_test = train_test_split(
    x_vector,
    y_vector,
    test_size=0.2,
    random_state=5,
)

In [None]:
# Preview the held-out feature vectors produced by the split.
x_vec_test.head()

In [None]:
# Preview the held-out theme labels produced by the split.
y_vec_test.head()

Check for NaN

In [None]:
# Print one boolean per split (train/test features, then train/test labels):
# True means that split contains at least one missing value.
for split in (x_vec_train, x_vec_test, y_vec_train, y_vec_test):
    has_missing = split.isnull().values.any()
    print(has_missing)

In [None]:
# Convert the training features from a pandas Series to a NumPy array.
# np.asarray gives the same array as Series.to_numpy() but is a no-op when the
# value is already an ndarray, so re-running this cell no longer raises
# AttributeError ("ndarray has no attribute to_numpy").
x_vec_train = np.asarray(x_vec_train)

# K-Means


In [None]:
# Stack the per-sample vectors into one 2-D array with one row per sample,
# which is the layout the scikit-learn estimators below are fitted on.
# numpy is already imported as np in the first cell, so it is not re-imported here.
x_vec_train = np.stack(x_vec_train)

In [None]:
# Baseline K-Means with 11 clusters and a fixed seed.
k = 11
kmeans = KMeans(n_clusters=k, random_state=42)
kmeans.fit(x_vec_train)
# Cluster index assigned to every training sample by the fit above.
y_pred = kmeans.labels_

[Homogeneity](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.homogeneity_score.html): It estimates how many of the clusters predicted contain only members of a single class. 1.0 stands for perfectly homogeneous labeling.
[Completeness](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.completeness_score.html): A clustering result satisfies completeness if all the data points that are members of a given class are elements of the same cluster. 1.0 stands for perfectly complete labeling.
[V-measure](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.v_measure_score.html): The V-measure is the harmonic mean between homogeneity and completeness. 1.0 stands for perfectly complete labeling
[Silhouette Coefficient Mean](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.silhouette_score.html): The best value is 1 and the worst value is -1. Values near 0 indicate overlapping clusters. Negative values generally indicate that a sample has been assigned to the wrong cluster, as a different cluster is more similar.

In [None]:
from sklearn.metrics import silhouette_score, v_measure_score, completeness_score, homogeneity_score

def performance(model, y_pred=None, X=None):
    """Print clustering quality metrics for one clustering result.

    Homogeneity, completeness and V-measure compare the predicted cluster
    labels with the notebook-level ground truth ``y_vec_train``.  When a model
    is supplied, the mean Euclidean silhouette coefficient of the same labels
    is printed as well.

    Parameters
    ----------
    model : fitted clusterer or None
        Source of the labels (``model.labels_``) when ``y_pred`` is omitted.
        Pass ``None`` to print only the three label-based scores.
    y_pred : array-like, optional
        Predicted cluster label per training sample.  Defaults to
        ``model.labels_``.
    X : array-like, optional
        Samples the silhouette is computed on.  Defaults to the notebook-level
        ``x_vec_train``; pass the data the model was actually fitted on
        (e.g. a PCA projection) to score the clustering in that space.

    Returns
    -------
    None
        The scores are printed, not returned.

    Raises
    ------
    ValueError
        If both ``model`` and ``y_pred`` are ``None``.
    """
    if y_pred is None:
        if model is None:
            raise ValueError("performance() needs a fitted model or an explicit y_pred")
        y_pred = model.labels_
    if X is None:
        X = x_vec_train

    print("Homogeneity: %0.5f" % homogeneity_score(y_vec_train, y_pred))
    print("Completeness: %0.5f" % completeness_score(y_vec_train, y_pred))
    print("V-measure: %0.5f" % v_measure_score(y_vec_train, y_pred))
    # Explicit None check: some estimators define __len__, so truthiness is unreliable.
    if model is not None:
        print("Silhouette Score")
        # Score the same labels as above, so models without a ``labels_``
        # attribute also work when y_pred is passed explicitly.
        print("\tEuclidean: %0.010f" % silhouette_score(X, y_pred, metric='euclidean'))

In [None]:
# Print the metrics of the 11-cluster K-Means model fitted above.
performance(kmeans)

In [None]:
def plot_data(X):
    """Draw the first two columns of ``X`` as small black dots on the current axes."""
    xs, ys = X[:, 0], X[:, 1]
    plt.plot(xs, ys, 'k.', markersize=2)

def plot_centroids(centroids, weights=None, circle_color='w', cross_color='k'):
    """Mark each centroid with a filled circle and a cross drawn on top of it.

    Only the first two columns of ``centroids`` are used.  When ``weights`` is
    given, centroids whose weight is not above a tenth of the largest weight
    are left out.
    """
    if weights is not None:
        heavy_enough = weights > weights.max() / 10
        centroids = centroids[heavy_enough]

    xs, ys = centroids[:, 0], centroids[:, 1]
    # Circle first (zorder 10), cross above it (zorder 11).
    plt.scatter(xs, ys, marker='o', s=35, linewidths=8,
                color=circle_color, zorder=10, alpha=0.9)
    plt.scatter(xs, ys, marker='x', s=2, linewidths=12,
                color=cross_color, zorder=11, alpha=1)

def plot_decision_boundaries(clusterer, X, resolution=1000, show_centroids=True,
                             show_xlabels=True, show_ylabels=True):
    """Draw the regions a fitted clusterer assigns to each cluster, over 2-D data.

    A ``resolution`` x ``resolution`` grid spanning the data (padded by 0.1 on
    each side) is labelled with ``clusterer.predict`` and drawn as filled
    contours, with the data points and, optionally, the cluster centres on top.

    Parameters
    ----------
    clusterer : fitted estimator
        Must provide ``predict`` (and ``cluster_centers_`` when
        ``show_centroids`` is true).  It is called on 2-column grid points, so
        it has to have been fitted on 2-feature data.
    X : ndarray with exactly 2 columns
        Data to plot.
    resolution : int
        Number of grid points per axis.
    show_centroids, show_xlabels, show_ylabels : bool
        Toggle the centroid markers and the axis labels.

    Raises
    ------
    ValueError
        If ``X`` does not have exactly two columns.
    """
    # Fail early with an actionable message: the grid below is 2-D, so
    # higher-dimensional data cannot be drawn without reducing it first.
    shape = np.shape(X)
    if len(shape) != 2 or shape[1] != 2:
        raise ValueError(
            "plot_decision_boundaries needs X with exactly 2 columns, got shape %s; "
            "project the data to 2-D (e.g. with PCA) and fit the clusterer on that projection"
            % (shape,)
        )

    mins = X.min(axis=0) - 0.1
    maxs = X.max(axis=0) + 0.1
    xx, yy = np.meshgrid(np.linspace(mins[0], maxs[0], resolution),
                         np.linspace(mins[1], maxs[1], resolution))
    # Label every grid point, then restore the grid layout for contouring.
    Z = clusterer.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    plt.contourf(Z, extent=(mins[0], maxs[0], mins[1], maxs[1]),
                cmap="Pastel2")
    plt.contour(Z, extent=(mins[0], maxs[0], mins[1], maxs[1]),
                linewidths=1, colors='k')
    plot_data(X)
    if show_centroids:
        plot_centroids(clusterer.cluster_centers_)

    if show_xlabels:
        plt.xlabel("$x_1$", fontsize=14)
    else:
        plt.tick_params(labelbottom=False)
    if show_ylabels:
        plt.ylabel("$x_2$", fontsize=14, rotation=0)
    else:
        plt.tick_params(labelleft=False)

# Q1: Any alternatives for Voronoi diagram?
I wasn't able to plot using the code from the lab or using scipy

In [None]:
# plt.figure(figsize=(8, 4))
# plot_decision_boundaries(kmeans, x_vec_train)
# plt.show()

In [None]:
# from scipy.spatial import Voronoi, voronoi_plot_2d
# vor = Voronoi(x_vec_train)
# fig = voronoi_plot_2d(vor, show_vertices=False, line_colors='orange',
#                 line_width=2, line_alpha=0.6, point_size=2)
# plt.show()

Plot the first k-means cluster

In [None]:
# matplotlib.pyplot is already imported as plt in the first cell.

# Training rows that K-Means assigned to cluster 0.
filtered_label0 = x_vec_train[y_pred == 0]

# Scatter the first two feature columns of that cluster.
plt.scatter(filtered_label0[:, 0], filtered_label0[:, 1])
plt.show()

Plot all the k-means clusters

In [None]:
# Distinct cluster indices found by K-Means.
u_labels = np.unique(y_pred)

# One scatter series per cluster, using the first two feature columns.
for i in u_labels:
    members = y_pred == i
    plt.scatter(x_vec_train[members, 0], x_vec_train[members, 1], label=i)
plt.legend()
# plt.show must be called; the bare name `plt.show` only echoed the function object.
plt.show()

Plot all the k-means clusters with the cluster centroids

In [None]:
centroids = kmeans.cluster_centers_

# One scatter series per cluster (first two feature columns), then the
# cluster centres on top as larger black dots.
for i in u_labels:
    members = y_pred == i
    plt.scatter(x_vec_train[members, 0], x_vec_train[members, 1], label=i)
plt.scatter(centroids[:, 0], centroids[:, 1], s=80, color='k')
plt.legend()
# plt.show must be called; the bare name `plt.show` only echoed the function object.
plt.show()

# K-Means for k = 2 to n - 1 clusters

In [None]:
# Exclusive upper bound of the cluster counts tried below: the sweeps use range(2, n).
n = 20

In [None]:
# Fit one seeded K-Means model on the training vectors for every cluster
# count in range(2, n).
kmeans_per_k = [
    KMeans(n_clusters=n_clusters, random_state=42).fit(x_vec_train)
    for n_clusters in range(2, n)
]
# inertia_ of each fitted model, in the same order.
inertias = [fitted.inertia_ for fitted in kmeans_per_k]

In [None]:
# kmeans_per_k[0] was fitted with 2 clusters, so number the models from 2.
# A dedicated loop variable keeps the notebook-level `kmeans` model intact.
for n_clusters, fitted_model in enumerate(kmeans_per_k, start=2):
    print("K =", n_clusters)
    # performance() prints its scores and returns None, so wrapping it in
    # print() only added a stray "None" line.
    performance(fitted_model)
    print()

There are 11 categories in the dataset. However, the scores were best at k = 15, followed by k = 12; k = 11 scored a lot lower.

In [None]:
# matplotlib.pyplot is already imported as plt in the first cell.

# Inertia of each fitted model against its number of clusters (elbow plot).
plt.figure(figsize=(8, 3.5))
plt.plot(range(2, n), inertias, "bo-")
plt.xlabel("$k$", fontsize=14)
plt.ylabel("Inertia", fontsize=14)
# Fixed view window [x_min, x_max, y_min, y_max].
plt.axis([1, 19.5, 9.75, 11.5])
plt.show()

Conclusion: There is no clear elbow; the change in inertia is rather gradual.

## PCA reduction to 2 components

In [None]:
from sklearn.decomposition import PCA

# Project the training vectors onto their first two principal components.
# The seed only matters when PCA picks a randomized/arpack solver, but it
# makes the projection (and every result built on it) repeatable either way.
reduced_data = PCA(n_components=2, random_state=42).fit_transform(x_vec_train)

In [None]:
# Same sweep as before, but fitted on the 2-component PCA projection.
kmeans_per_k_PCA = [
    KMeans(n_clusters=n_clusters, random_state=42).fit(reduced_data)
    for n_clusters in range(2, n)
]
# inertia_ of each fitted model, in the same order.
inertias_PCA = [fitted.inertia_ for fitted in kmeans_per_k_PCA]

In [None]:
# kmeans_per_k_PCA[0] was fitted with 2 clusters, so number the models from 2.
# A dedicated loop variable keeps the notebook-level `kmeans` model intact.
# NOTE: called like this, performance() computes the silhouette on
# x_vec_train, not on reduced_data.
for n_clusters, fitted_model in enumerate(kmeans_per_k_PCA, start=2):
    print("K =", n_clusters)
    # performance() prints its scores and returns None, so wrapping it in
    # print() only added a stray "None" line.
    performance(fitted_model)
    print()

Conclusion: k = 14 and k = 16 scored the best with V-score at .318. However, without reduction the accuracy was higher overall. Without reduction the max was .382

In [None]:
# Elbow plot for the models fitted on the 2-component projection.
plt.figure(figsize=(8, 3.5))
plt.plot(range(2, n), inertias_PCA, "bo-")
plt.xlabel("$k$", fontsize=14)
plt.ylabel("Inertia", fontsize=14)
# Fixed view window, set one axis at a time.
plt.xlim(1, 19.5)
plt.ylim(0, 0.25)
plt.show()

Conclusion: The elbow here is more obvious at 9 or 10.

# PCA reduction to 8 categories from 11

In [None]:
# Project the training vectors onto 8 principal components (seeded so the
# projection is repeatable), then repeat the cluster-count sweep on it.
reduced_data_8 = PCA(n_components=8, random_state=42).fit_transform(x_vec_train)
kmeans_per_k_PCA_8 = [
    KMeans(n_clusters=n_clusters, random_state=42).fit(reduced_data_8)
    for n_clusters in range(2, n)
]
inertias_PCA_8 = [fitted.inertia_ for fitted in kmeans_per_k_PCA_8]

# kmeans_per_k_PCA_8[0] was fitted with 2 clusters, so number the models from 2.
# A dedicated loop variable keeps the notebook-level `kmeans` model intact.
for n_clusters, fitted_model in enumerate(kmeans_per_k_PCA_8, start=2):
    print("K =", n_clusters)
    performance(fitted_model)
    print()

In [None]:
# Elbow plot for the models fitted on the 8-component projection.
plt.figure(figsize=(8, 3.5))
plt.plot(range(2, n), inertias_PCA_8, "bo-")
plt.xlabel("$k$", fontsize=14)
plt.ylabel("Inertia", fontsize=14)
# Fixed view window, set one axis at a time.
plt.xlim(1, 19.5)
plt.ylim(0.25, 1)
plt.show()

Conclusion: Reducing the categories from 11 to 8 improved the V-score a lot. The top 3 V-scores are 0.455 (k = 16), 0.454 (k = 19), 0.448 (k =17). However the change in inertia is more gradual compared to when there was only 2 categories.

Lab Question: *Research some of the existing algorithms to compute the optimal number of clusters. For example, look up: Elbow method, the silhouette method, cluster validity and similarity measures. Can these algorithms help you to find the optimal number of clusters for your data set?*

Algorithms used to compute the optimal number of clusters are :
- Elbow method : The elbow method, involves creating a plot with the number of clusters on the x-axis and the total within sum of squares on the y-axis and then identifying where an “elbow” or bend appears in the plot. The point on the x-axis where the “elbow” occurs tells us the optimal number of clusters to use in the k-means clustering algorithm.
- The silhouette method : The elbow method is very simple but is not well suited to complex problems; the silhouette method is generally considered better. Conceptually, the Silhouette score uses a distance measure to compare how far a point is from its own cluster with how far it is from the centroid of a different cluster. The Average Silhouette Method takes the average Silhouette score of each data point for each cluster. This measure has a range between -1 and 1, 1 being the best; we can find the best number of clusters by calculating the silhouette score for different numbers of clusters.
- Cluster validity : The term cluster validation is used to designate the procedure of evaluating the goodness of clustering algorithm results. This is important to avoid finding patterns in random data, as well as in situations where you want to compare two clustering algorithms. There are 3 principal techniques: (1) internal cluster validation, which uses the internal information of the clustering process to evaluate the goodness of a clustering structure; (2) external cluster validation, which consists of comparing the results of a cluster analysis to an externally known result; (3) relative cluster validation, which evaluates the clustering structure by varying different parameter values for the same algorithm.

Yes, those algorithms can help us find the optimal number of clusters for our data set; the best fit would probably be the silhouette method combined with relative cluster validation.

# Dimensionality reduction to visualize clustering

In [None]:
# matplotlib.pyplot is already imported as plt in the first cell.

# Fit an 11-cluster K-Means on the 2-component PCA projection so that the
# cluster regions can be drawn in the plane.
kmeans = KMeans(init="k-means++", n_clusters=11, random_state=42)
kmeans.fit(reduced_data)

# Step size of the mesh. Decrease to increase the quality of the VQ.
h = 0.02

# Plot the decision boundary. For that, we will assign a color to each
# point in the mesh [x_min, x_max] x [y_min, y_max].
x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

# Obtain labels for each point in the mesh from the model fitted above.
Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])

# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.figure(1)
plt.clf()
plt.imshow(
    Z,
    interpolation="nearest",
    extent=(xx.min(), xx.max(), yy.min(), yy.max()),
    cmap=plt.cm.Paired,
    aspect="auto",
    origin="lower",
)

plt.plot(reduced_data[:, 0], reduced_data[:, 1], ".", markersize=2)
# Plot the centroids as a black X
centroids = kmeans.cluster_centers_
plt.scatter(
    centroids[:, 0],
    centroids[:, 1],
    marker="x",
    s=169,
    linewidths=3,
    color="k",
    zorder=10,
)
# The markers are drawn with color="k", so the title says "black", not "white".
plt.title(
    "K-means clustering on (PCA-reduced data)\n"
    "Centroids are marked with black cross"
)
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.xticks(())
plt.yticks(())
plt.show()

The PCA-reduced K-Means clustering visualization shows well how there are a lot of overlaying categories. The silhouette score was always near 0 which indicates overlapping clusters.

# GMM

In [None]:
from sklearn.mixture import GaussianMixture

# Both mixtures share the same configuration: 11 components and a fixed seed.
gmm_settings = {"n_components": 11, "random_state": 0}

# Mixture on the full training vectors (y_pred now holds these labels).
gm = GaussianMixture(**gmm_settings)
y_pred = gm.fit_predict(x_vec_train)

# Mixture on the 2-component PCA projection.
gm_red = GaussianMixture(**gmm_settings)
y_pred_red = gm_red.fit_predict(reduced_data)

In [None]:
# performance() prints its scores and returns None, so it is called directly;
# wrapping it in print() only added a stray "None" line.  With model=None it
# prints the three label-based scores only.
print("Original Dataset")
performance(None, y_pred)
print()
print("PCA reduced")
performance(None, y_pred_red)

# Hierarchical Clustering

In [None]:
from sklearn.cluster import AgglomerativeClustering

# Full merge trees, kept for the dendrogram cells below (they need
# `distances_`).  With distance_threshold=0 nothing is merged into a flat
# clustering, so these models put every sample in its own cluster and their
# labels are not meaningful to score.
clustering = AgglomerativeClustering(distance_threshold=0, n_clusters=None)
clustering.fit(x_vec_train)

clustering_red = AgglomerativeClustering(distance_threshold=0, n_clusters=None)
clustering_red.fit(reduced_data)

# Flat 11-cluster labelings used for the metrics, matching the cluster count
# used for K-Means and GMM above.  This costs a second fit per data set.
y_pred = AgglomerativeClustering(n_clusters=11).fit_predict(x_vec_train)
y_pred_red = AgglomerativeClustering(n_clusters=11).fit_predict(reduced_data)

# performance() prints its scores and returns None, so it is called directly
# instead of being wrapped in print().
print("Original Dataset")
performance(None, y_pred)
print()
print("PCA reduced")
performance(None, y_pred_red)

In [None]:
from scipy.cluster.hierarchy import dendrogram

def plot_dendrogram(model, **kwargs):
    """Build a SciPy linkage matrix from a fitted hierarchical model and draw it.

    ``model`` must expose ``children_``, ``distances_`` and ``labels_``.
    Extra keyword arguments are forwarded unchanged to
    ``scipy.cluster.hierarchy.dendrogram``.
    """
    merges = model.children_
    leaf_total = len(model.labels_)

    # Number of original samples under each merge node.  A child index below
    # leaf_total is a single sample; any other index refers to an earlier
    # merge, whose size has already been computed.
    subtree_sizes = np.zeros(merges.shape[0])
    for row, pair in enumerate(merges):
        subtree_sizes[row] = sum(
            1 if node < leaf_total else subtree_sizes[node - leaf_total]
            for node in pair
        )

    # Columns: child a, child b, merge distance, samples in the merged node.
    linkage_matrix = np.column_stack(
        [merges, model.distances_, subtree_sizes]
    ).astype(float)

    dendrogram(linkage_matrix, **kwargs)

# Dendrogram of `clustering`, the model fitted on the full training vectors.
plt.title("Hierarchical Clustering Dendrogram")
# plot the top three levels of the dendrogram
plot_dendrogram(clustering, truncate_mode="level", p=3)
plt.xlabel("Number of points in node (or index of point if no parenthesis).")
plt.show()

In [None]:
# Dendrogram of `clustering_red`, the model fitted on the PCA-reduced data.
plt.title("Hierarchical Clustering Dendrogram")
# plot the top three levels of the dendrogram
plot_dendrogram(clustering_red, truncate_mode="level", p=3)
plt.xlabel("Number of points in node (or index of point if no parenthesis).")
plt.show()

In [None]:
# fig2 = plt.figure()
# ax2 = fig2.add_subplot(121, projection="3d", elev=7, azim=0.45)
# ax2.set_position([0, 0, 0.95, 1])
# for l in np.unique(clustering.labels_):
#     ax2.scatter(
#         x_vec_train[clustering.labels_ == l, 0],
#         x_vec_train[clustering.labels_ == l, 1],
#         x_vec_train[clustering.labels_ == l, 2],
#         color=plt.cm.jet(float(l) / np.max(clustering_red.labels_ + 1)),
#         s=20,
#         edgecolor="k",
#     )
#
# plt.show()

# Conclusion: