# Project 2 - Yanfeng, Garvit, Hyosang

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random
import statistics

from scipy.optimize import linear_sum_assignment
from sklearn.cluster import KMeans
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import TruncatedSVD, NMF
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import confusion_matrix, homogeneity_score, completeness_score, v_measure_score, adjusted_rand_score, adjusted_mutual_info_score

# To suppress FutureWarnings. Reference: https://machinelearningmastery.com/how-to-fix-futurewarning-messages-in-scikit-learn/
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)

## Question 1

In [None]:
# Restrict the corpus to two groups of newsgroups: four comp.* and four rec.*
class_0 = [
    "comp.graphics",
    "comp.os.ms-windows.misc",
    "comp.sys.ibm.pc.hardware",
    "comp.sys.mac.hardware",
]
class_1 = [
    "rec.autos",
    "rec.motorcycles",
    "rec.sport.baseball",
    "rec.sport.hockey",
]

# Download each group separately, with headers and footers stripped
documents_0 = fetch_20newsgroups(categories=class_0, remove=("headers", "footers"))
documents_1 = fetch_20newsgroups(categories=class_1, remove=("headers", "footers"))

# Concatenate the raw texts; label 0 marks class_0 documents, label 1 marks class_1
X_0, X_1 = documents_0.data, documents_1.data
X = X_0 + X_1
Y = [0] * len(X_0) + [1] * len(X_1)

# Sanity check: first document (class 0) and last document (class 1) with labels
print("Class 0 example", X[0], Y[0], sep="\n")
print("Class 1 example", X[-1], Y[-1], sep="\n")

In [None]:
# Generate sparse TF-IDF representations
# Counts ignore English stop words and terms found in fewer than 3 documents
cv = CountVectorizer(stop_words='english', min_df=3)
train_count = cv.fit_transform(X)  # learn the vocabulary and build the count matrix
# Read the shape straight from the sparse matrix: calling .toarray() first would
# allocate a dense copy of the whole document-term matrix just to print two numbers.
print(train_count.shape)

# Re-weight the raw counts into TF-IDF scores
tfidf = TfidfTransformer()
train_tfidf = tfidf.fit_transform(train_count)
print(train_tfidf.shape)

## Question 2

In [None]:
# Cluster the TF-IDF vectors into two groups with K-means
kmeans = KMeans(
    n_clusters=2,
    n_init=50,
    max_iter=5000,
    random_state=0,
)

# fit() returns the estimator itself, so the labels can be read off directly
Y_pred = kmeans.fit(train_tfidf).labels_

In [None]:
import itertools
import matplotlib.colors as colors

# Visualize contingency matrix using helper function

def plot_mat(mat, xticklabels = None, yticklabels = None, pic_fname = None, size=(-1,-1), if_show_values = True,
             colorbar = True, grid = 'k', xlabel = None, ylabel = None, title = None, vmin=None, vmax=None):
    """Draw a 2-D matrix as a blue heat map, optionally writing each cell's value.

    Parameters
    ----------
    mat : 2-D array
        Matrix to display; row 0 is drawn at the top.
    xticklabels, yticklabels : sequence, optional
        Tick labels for columns / rows. Default to the column / row indices.
    pic_fname : str, optional
        When given, the figure is also saved to this path (300 dpi, transparent).
    size : tuple
        Figure size in inches. ``(-1, -1)`` derives it from the matrix shape
        (one third of an inch per cell).
    if_show_values : bool
        Write the value of every cell at its centre.
    colorbar : bool
        Attach a colour bar to the plot.
    grid : colour
        Edge colour of the cells.
    xlabel, ylabel, title : str, optional
        Axis labels and figure title.
    vmin, vmax : float, optional
        Colour scale limits forwarded to ``pcolor``.
    """
    if size == (-1, -1):
        size = (mat.shape[1] / 3, mat.shape[0] / 3)

    fig = plt.figure(figsize=size)
    ax = fig.add_subplot(1,1,1)
    im = ax.pcolor(mat, cmap=plt.cm.Blues, linestyle='-', linewidth=0.5, edgecolor=grid, vmin=vmin, vmax=vmax)

    if colorbar:
        plt.colorbar(im,fraction=0.046, pad=0.06)

    num_rows = mat.shape[0]
    num_cols = mat.shape[1]
    yticks = np.arange(num_rows)
    xticks = np.arange(num_cols)
    # Shift by half a cell so that ticks sit at the cell centres
    ax.set_xticks(xticks + 0.5)
    ax.set_yticks(yticks + 0.5)
    if xticklabels is None:
        xticklabels = [str(i) for i in xticks]
    if yticklabels is None:
        yticklabels = [str(i) for i in yticks]
    ax.set_xticklabels(xticklabels)
    ax.set_yticklabels(yticklabels)

    ax.tick_params(labelright = True, labeltop = False)

    if ylabel:
        plt.ylabel(ylabel, fontsize=15)
    if xlabel:
        plt.xlabel(xlabel, fontsize=15)
    if title:
        plt.title(title, fontsize=15)

    # Put row 0 at the top, as in the usual matrix layout
    ax.invert_yaxis()

    def show_values(pc, fmt="%d", **kw):
        """Write every cell's value at the centre of its patch."""
        pc.update_scalarmappable()
        axes = pc.axes
        # Default font size; a caller-supplied fontsize no longer collides with it
        kw.setdefault("fontsize", 10)
        # Depending on the Matplotlib version, get_array() may come back with the
        # matrix's 2-D shape; flatten it so that values pair up one-to-one with
        # the per-cell paths and face colours.
        values = np.ravel(pc.get_array())
        for path, facecolor, value in zip(pc.get_paths(), pc.get_facecolors(), values):
            x, y = path.vertices[:-2, :].mean(0)
            # Dark text on light cells, white text on dark cells
            if np.all(facecolor[:3] > 0.5):
                text_color = (0.0, 0.0, 0.0)
            else:
                text_color = (1.0, 1.0, 1.0)
            axes.text(x, y, fmt % value, ha="center", va="center", color=text_color, **kw)

    if if_show_values:
        show_values(im)
    plt.tight_layout()
    if pic_fname:
        plt.savefig(pic_fname, dpi=300, transparent=True)
    plt.show()
    plt.close()

# Build the class-vs-cluster count matrix, pair classes with clusters, and plot it
def print_contingency_matrix(Y, Y_pred):
    """Plot the count matrix of true labels (rows) against cluster labels (columns).

    Rows and columns are reordered according to the assignment that maximises
    the total count of the matched class/cluster pairs, and the tick labels
    show the original row / column indices.
    """
    counts = confusion_matrix(Y, Y_pred)
    class_order, cluster_order = linear_sum_assignment(counts, maximize=True)
    # Outer indexing: pick the matched rows and columns in assignment order
    reordered = counts[np.ix_(class_order, cluster_order)]
    plot_mat(
        reordered,
        xlabel="Cluster",
        ylabel="Class",
        xticklabels=cluster_order,
        yticklabels=class_order,
        size=(8, 8),
    )

# Contingency matrix of the ground-truth labels Y against the K-means labels Y_pred
print_contingency_matrix(Y, Y_pred)

## Question 3

In [None]:
# Report 5 clustering measures

def print_clustering_metrics(Y, Y_pred, display=True):
    """Compute five clustering scores for predicted labels against true labels.

    Returns a tuple ``(homogeneity, completeness, v_measure, adjusted_rand,
    adjusted_mutual_info)``. When ``display`` is true, each score is also
    printed with five decimals.
    """
    # Output template paired with its scoring function, in return order
    measures = (
        ("Homogeneity: %0.5f", homogeneity_score),
        ("Completeness: %0.5f", completeness_score),
        ("V-measure: %0.5f", v_measure_score),
        ("Adjusted rand index: %0.5f", adjusted_rand_score),
        ("Adjusted mutual info score: %0.5f", adjusted_mutual_info_score),
    )
    scores = tuple(score_fn(Y, Y_pred) for _, score_fn in measures)

    if display:
        for (template, _), score in zip(measures, scores):
            print(template % score)

    return scores

# Print the five scores for the current cluster labels in Y_pred
print_clustering_metrics(Y, Y_pred)

## Question 4

In [None]:
# Plot percentage of variance retained by r components from 1 to 1000
rs = [1, 5, 10, 50, 100, 250, 500, 750, 1000]

# Fit a single decomposition with the largest r instead of refitting for every
# value: the variance retained by the top r components is the running sum of
# the per-component explained-variance ratios.
svd = TruncatedSVD(n_components = max(rs), random_state = 42)
svd.fit(train_tfidf)
cumulative_variance_ratio = np.cumsum(svd.explained_variance_ratio_)

# Component counts are 1-based, array positions 0-based
percentage_variance = [cumulative_variance_ratio[r - 1] * 100 for r in rs]

plt.plot(rs, percentage_variance)
plt.xlabel("Number of top principal components")
plt.ylabel("Percentage of variance retained in data")
plt.show()

## Question 5, 6

In [None]:
# Evaluate K-means on the leading r SVD components for a range of r
rs = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 50, 100, 300]

# One score history per metric, filled in the order of rs
homogeneity_scores, completeness_scores, v_measure_scores = [], [], []
adjusted_rand_scores, adjusted_mutual_info_scores = [], []

# A single 1000-component decomposition is computed once; each smaller r is
# obtained by keeping only its first r columns.
svd = TruncatedSVD(n_components = 1000, random_state = 42)
reduced_train_tfidf = svd.fit_transform(train_tfidf)

for r in rs:
    # Keep the first r components and re-cluster with the shared K-means estimator
    r_components = reduced_train_tfidf[:, :r]
    Y_pred = kmeans.fit(r_components).labels_

    # Record the scores of this r for the plots and summaries
    homogeneity, completeness, v_measure, rand, mutual_info = print_clustering_metrics(Y, Y_pred, display = False)
    homogeneity_scores += [homogeneity]
    completeness_scores += [completeness]
    v_measure_scores += [v_measure]
    adjusted_rand_scores += [rand]
    adjusted_mutual_info_scores += [mutual_info]

In [None]:
# Line plot of the five clustering scores against the number of components
def plot_clustering_metrics(title, rs, homogeneity_scores, completeness_scores, v_measure_scores, adjusted_rand_scores, adjusted_mutual_info_scores):
    """Plot one curve per clustering metric over the component counts in ``rs``.

    The values of ``rs`` are used as tick labels at evenly spaced positions, so
    the x axis is categorical rather than to scale.
    """
    positions = list(range(len(rs)))

    # Score series paired with its legend entry, in drawing order
    curves = (
        (homogeneity_scores, 'Homogeneity'),
        (completeness_scores, 'Completeness'),
        (v_measure_scores, 'V-measure'),
        (adjusted_rand_scores, 'Adjusted Rand Index'),
        (adjusted_mutual_info_scores, 'Adjusted mutual information score'),
    )
    for series, legend_name in curves:
        plt.plot(positions, series, '-', label=legend_name)

    plt.xticks(positions, rs)
    plt.xlabel("Number of principal components, r")
    plt.ylabel("Score")
    plt.legend(loc="lower right")
    plt.title(title)
    plt.show()

# Mean of every metric over all tested r (SVD)
(
    average_svd_homogeneity,
    average_svd_completeness,
    average_svd_v_measure,
    average_svd_rand,
    average_svd_mutual_info,
) = (
    statistics.mean(series)
    for series in (homogeneity_scores, completeness_scores, v_measure_scores,
                   adjusted_rand_scores, adjusted_mutual_info_scores)
)

# Highest value of every metric over all tested r (SVD)
(
    best_svd_homogeneity,
    best_svd_completeness,
    best_svd_v_measure,
    best_svd_rand,
    best_svd_mutual_info,
) = (
    max(series)
    for series in (homogeneity_scores, completeness_scores, v_measure_scores,
                   adjusted_rand_scores, adjusted_mutual_info_scores)
)

# Plot the SVD score curves collected above
plot_clustering_metrics(
    "Clustering metrics for different number of SVD components",
    rs,
    homogeneity_scores,
    completeness_scores,
    v_measure_scores,
    adjusted_rand_scores,
    adjusted_mutual_info_scores,
)

In [None]:
# Evaluate K-means on NMF-reduced data for a range of component counts
rs = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 50, 100, 300]

# One score history per metric, filled in the order of rs
homogeneity_scores, completeness_scores, v_measure_scores = [], [], []
adjusted_rand_scores, adjusted_mutual_info_scores = [], []

for r in rs:
    # A separate factorization is fitted for every r
    nmf = NMF(n_components = r, random_state = 42)
    r_components = nmf.fit_transform(train_tfidf)
    Y_pred = kmeans.fit(r_components).labels_

    # Record the scores of this r for the plots and summaries
    homogeneity, completeness, v_measure, rand, mutual_info = print_clustering_metrics(Y, Y_pred, display = False)
    homogeneity_scores += [homogeneity]
    completeness_scores += [completeness]
    v_measure_scores += [v_measure]
    adjusted_rand_scores += [rand]
    adjusted_mutual_info_scores += [mutual_info]

In [None]:
# Mean of every metric over all tested r (NMF)
(
    average_nmf_homogeneity,
    average_nmf_completeness,
    average_nmf_v_measure,
    average_nmf_rand,
    average_nmf_mutual_info,
) = (
    statistics.mean(series)
    for series in (homogeneity_scores, completeness_scores, v_measure_scores,
                   adjusted_rand_scores, adjusted_mutual_info_scores)
)

# Highest value of every metric over all tested r (NMF)
(
    best_nmf_homogeneity,
    best_nmf_completeness,
    best_nmf_v_measure,
    best_nmf_rand,
    best_nmf_mutual_info,
) = (
    max(series)
    for series in (homogeneity_scores, completeness_scores, v_measure_scores,
                   adjusted_rand_scores, adjusted_mutual_info_scores)
)

# Plot the NMF score curves collected above
plot_clustering_metrics(
    "Clustering metrics for different number of NMF components",
    rs,
    homogeneity_scores,
    completeness_scores,
    v_measure_scores,
    adjusted_rand_scores,
    adjusted_mutual_info_scores,
)

## Question 7

In [None]:
# Average scores (r = 1 - 300) followed by best scores, for each reduction method.
# Within every group the order is: homogeneity, completeness, V-measure,
# adjusted Rand index, adjusted mutual information.
score_report = (
    ("Average SVD", (average_svd_homogeneity, average_svd_completeness, average_svd_v_measure,
                     average_svd_rand, average_svd_mutual_info)),
    ("Average NMF", (average_nmf_homogeneity, average_nmf_completeness, average_nmf_v_measure,
                     average_nmf_rand, average_nmf_mutual_info)),
    ("Best SVD", (best_svd_homogeneity, best_svd_completeness, best_svd_v_measure,
                  best_svd_rand, best_svd_mutual_info)),
    ("Best NMF", (best_nmf_homogeneity, best_nmf_completeness, best_nmf_v_measure,
                  best_nmf_rand, best_nmf_mutual_info)),
)

for heading, values in score_report:
    print(heading)
    for value in values:
        print(value)

## Question 8, 9

In [None]:
# Optimal choices based on analysis of graphs
optimal_svd_r = 9  # number of SVD components chosen for the visualization
optimal_nmf_r = 2  # number of NMF components chosen for the visualization

# Helper to create the plot
def create_cluster_plot(title, reduced_matrix, Y, use_clustering_labels = False):
    """Scatter-plot the first two columns of ``reduced_matrix``, coloured by label.

    Parameters
    ----------
    title : str
        Title of the figure.
    reduced_matrix : 2-D array
        Reduced data; column 0 is the x coordinate and column 1 the y coordinate.
    Y : sequence
        Ground-truth labels, used as colours unless ``use_clustering_labels``.
    use_clustering_labels : bool
        When true, the module-level ``kmeans`` estimator is fitted on
        ``reduced_matrix`` and its labels are used as colours instead of ``Y``.
    """
    if use_clustering_labels:
        # Only run K-means when its labels are actually plotted; fitting it for
        # the ground-truth plot would be discarded work.
        point_labels = kmeans.fit(reduced_matrix).labels_
    else:
        point_labels = Y
    plt.scatter(reduced_matrix[:,0], reduced_matrix[:,1], c = point_labels)
    plt.title(title)
    plt.show()

In [None]:
# Reduce the TF-IDF matrix to the chosen number of SVD components
optimal_svd = TruncatedSVD(n_components = optimal_svd_r, random_state = 42)
reduced_train_tfidf = optimal_svd.fit_transform(train_tfidf)

# A second SVD brings the reduced data down to two dimensions for plotting
two_d_svd = TruncatedSVD(n_components = 2, random_state = 42)
projected_train_tfidf = two_d_svd.fit_transform(reduced_train_tfidf)

# Same projection twice: coloured by ground truth first, then by K-means labels
for plot_title, color_by_cluster in (
    ("SVD visualization by ground truth class label", False),
    ("SVD visualization by clustering label", True),
):
    create_cluster_plot(plot_title, projected_train_tfidf, Y, use_clustering_labels = color_by_cluster)

In [None]:
# Reduce the TF-IDF matrix to the chosen number of NMF components
optimal_nmf = NMF(n_components = optimal_nmf_r, random_state = 42)
reduced_train_tfidf = optimal_nmf.fit_transform(train_tfidf)

# The reduced data is plotted directly (its first two columns are the axes):
# coloured by ground truth first, then by K-means labels
for plot_title, color_by_cluster in (
    ("NMF visualization by ground truth class label", False),
    ("NMF visualization by clustering label", True),
):
    create_cluster_plot(plot_title, reduced_train_tfidf, Y, use_clustering_labels = color_by_cluster)

## Question 10

In [None]:
# Load the training split of all 20 newsgroups, without headers and footers
documents = fetch_20newsgroups(remove=("headers", "footers"))

# Raw texts, integer targets, and the newsgroup name of each target id
X, Y, classes = documents.data, documents.target, documents.target_names

# Show the last document, its label, and the list of class names
print(X[-1], Y[-1], classes, sep="\n")

In [None]:
# Bag-of-words counts: English stop words removed, terms must occur in at least 3 documents
cv = CountVectorizer(stop_words='english', min_df=3)
train_count = cv.fit_transform(X)

# Re-weight the counts into TF-IDF: learn the IDF weights, then apply them to the same matrix
tfidf = TfidfTransformer()
train_tfidf = tfidf.fit(train_count).transform(train_count)

In [None]:
# Baseline: K-means with 20 clusters on the full TF-IDF matrix, no reduction
kmeans = KMeans(
    n_clusters=20,
    n_init=50,
    max_iter=5000,
    random_state=0,
)
Y_pred = kmeans.fit(train_tfidf).labels_

# Scores first, then the contingency matrix
print_clustering_metrics(Y, Y_pred)
print_contingency_matrix(Y, Y_pred)

In [None]:
# Fresh 20-cluster K-means estimator, reused for every r below
kmeans = KMeans(n_clusters = 20, n_init = 50, max_iter = 5000, random_state=0)

# Evaluate K-means on the leading r SVD components
rs = [5, 20, 200]

# One score history per metric, filled in the order of rs
homogeneity_scores, completeness_scores, v_measure_scores = [], [], []
adjusted_rand_scores, adjusted_mutual_info_scores = [], []

# One 500-component decomposition; each r keeps only its first r columns
svd = TruncatedSVD(n_components = 500, random_state=42)
reduced_train_tfidf = svd.fit_transform(train_tfidf)

for r in rs:
    r_components = reduced_train_tfidf[:, :r]
    Y_pred = kmeans.fit(r_components).labels_

    # Record the scores of this r for the plot and the averages
    homogeneity, completeness, v_measure, rand, mutual_info = print_clustering_metrics(Y, Y_pred, display = False)
    homogeneity_scores += [homogeneity]
    completeness_scores += [completeness]
    v_measure_scores += [v_measure]
    adjusted_rand_scores += [rand]
    adjusted_mutual_info_scores += [mutual_info]
    print("r = {0} completed".format(r))

# Mean of every metric over the tested r
(
    average_svd_homogeneity,
    average_svd_completeness,
    average_svd_v_measure,
    average_svd_rand,
    average_svd_mutual_info,
) = (
    statistics.mean(series)
    for series in (homogeneity_scores, completeness_scores, v_measure_scores,
                   adjusted_rand_scores, adjusted_mutual_info_scores)
)

# Plot the SVD score curves collected above
plot_clustering_metrics(
    "Clustering metrics for different number of SVD components",
    rs,
    homogeneity_scores,
    completeness_scores,
    v_measure_scores,
    adjusted_rand_scores,
    adjusted_mutual_info_scores,
)

In [None]:
# Reduce the TF-IDF matrix to 50 SVD components
from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=50, random_state=42)
train_SVD = svd.fit_transform(train_tfidf)
print("Reduced the dimensionalities")

# Cluster the reduced data into 20 groups
kmeans_SVD = KMeans(
    n_clusters=20,
    n_init=50,
    max_iter=5000,
    random_state=0,
)
kmeans_SVD.fit(train_SVD)

print("Computed K-Means")

# Contingency matrix first, then the five scores
print_contingency_matrix(Y, kmeans_SVD.labels_)
print_clustering_metrics(Y, kmeans_SVD.labels_)

In [None]:
# Fresh 20-cluster K-means estimator, reused for every r below
kmeans = KMeans(n_clusters = 20, n_init = 50, max_iter = 5000, random_state=0)

# Evaluate K-means on NMF-reduced data
rs = [5, 20, 200]

# One score history per metric, filled in the order of rs
homogeneity_scores, completeness_scores, v_measure_scores = [], [], []
adjusted_rand_scores, adjusted_mutual_info_scores = [], []

for r in rs:
    # A separate factorization is fitted for every r
    nmf = NMF(n_components = r, random_state=42)
    r_components = nmf.fit_transform(train_tfidf)
    Y_pred = kmeans.fit(r_components).labels_

    # Record the scores of this r for the plot and the averages
    homogeneity, completeness, v_measure, rand, mutual_info = print_clustering_metrics(Y, Y_pred, display = False)
    homogeneity_scores += [homogeneity]
    completeness_scores += [completeness]
    v_measure_scores += [v_measure]
    adjusted_rand_scores += [rand]
    adjusted_mutual_info_scores += [mutual_info]

    print("r = {0} completed".format(r))

# Mean of every metric over the tested r
(
    average_nmf_homogeneity,
    average_nmf_completeness,
    average_nmf_v_measure,
    average_nmf_rand,
    average_nmf_mutual_info,
) = (
    statistics.mean(series)
    for series in (homogeneity_scores, completeness_scores, v_measure_scores,
                   adjusted_rand_scores, adjusted_mutual_info_scores)
)

# Plot the NMF score curves collected above
plot_clustering_metrics(
    "Clustering metrics for different number of NMF components",
    rs,
    homogeneity_scores,
    completeness_scores,
    v_measure_scores,
    adjusted_rand_scores,
    adjusted_mutual_info_scores,
)

In [None]:
# Reduce the TF-IDF matrix to 20 NMF components
from sklearn.decomposition import NMF
nmf = NMF(n_components=20, random_state=42)
train_NMF = nmf.fit_transform(train_tfidf)
print("Reduced the dimensionalities")

# Cluster the reduced data into 20 groups
kmeans_NMF = KMeans(
    n_clusters=20,
    n_init=50,
    max_iter=5000,
    random_state=0,
)
kmeans_NMF.fit(train_NMF)
print("Computed K-Means")

# Contingency matrix first, then the five scores
print_contingency_matrix(Y, kmeans_NMF.labels_)
print_clustering_metrics(Y, kmeans_NMF.labels_)

## Question 11, 12, 13

In [None]:
# Reduce with UMAP under several settings and cluster each result with K-means
import umap.umap_ as umap

kmeans_UMAP = KMeans(n_clusters = 20, n_init = 50, max_iter = 5000, random_state = 0)

# Cluster labels of every hyperparameter combination, in grid order
y_preds = []

# Metric-major grid: all n_components for 'euclidean' first, then for 'cosine'
hyperparameter_grid = [
    (candidate_metric, candidate_size)
    for candidate_metric in ['euclidean', 'cosine']
    for candidate_size in [5, 20, 200]
]

for metric, n_components in hyperparameter_grid:
    reducer = umap.UMAP(n_components=n_components, metric=metric, random_state = 42)
    umap_reduce = reducer.fit_transform(train_tfidf)
    # Keep the labels of this combination for the per-setting reports
    y_preds.append(kmeans_UMAP.fit(umap_reduce).labels_)

In [None]:
# Euclidean, n_components = 5
# y_preds[0] is the first combination produced by the UMAP loop (metric-major order)
print_contingency_matrix(Y, y_preds[0])
print_clustering_metrics(Y, y_preds[0])

In [None]:
# Euclidean, n_components = 20
# y_preds[1] is the second combination produced by the UMAP loop (metric-major order)
print_contingency_matrix(Y, y_preds[1])
print_clustering_metrics(Y, y_preds[1])

In [None]:
# Euclidean, n_components = 200
# y_preds[2] is the third combination produced by the UMAP loop (metric-major order)
print_contingency_matrix(Y, y_preds[2])
print_clustering_metrics(Y, y_preds[2])

In [None]:
# Cosine, n_components = 5
# y_preds[3] is the fourth combination produced by the UMAP loop (metric-major order)
print_contingency_matrix(Y, y_preds[3])
print_clustering_metrics(Y, y_preds[3])

In [None]:
# Cosine, n_components = 20
# y_preds[4] is the fifth combination produced by the UMAP loop (metric-major order)
print_contingency_matrix(Y, y_preds[4])
print_clustering_metrics(Y, y_preds[4])

In [None]:
# Cosine, n_components = 200
# y_preds[5] is the sixth combination produced by the UMAP loop (metric-major order)
print_contingency_matrix(Y, y_preds[5])
print_clustering_metrics(Y, y_preds[5])

In [None]:
import umap.umap_ as umap
import umap.plot

# UMAP counterpart of the SVD / NMF scatter plots: rebuild the two-group data set first
class_0 = [
    "comp.graphics",
    "comp.os.ms-windows.misc",
    "comp.sys.ibm.pc.hardware",
    "comp.sys.mac.hardware",
]
class_1 = [
    "rec.autos",
    "rec.motorcycles",
    "rec.sport.baseball",
    "rec.sport.hockey",
]

# Download each group separately, with headers and footers stripped
documents_0 = fetch_20newsgroups(categories=class_0, remove=("headers", "footers"))
documents_1 = fetch_20newsgroups(categories=class_1, remove=("headers", "footers"))

# Concatenate the raw texts; label 0 marks class_0 documents, label 1 marks class_1
X_0, X_1 = documents_0.data, documents_1.data
X = X_0 + X_1
Y = [0] * len(X_0) + [1] * len(X_1)

# Counts without English stop words / rare terms, then TF-IDF weighting
cv = CountVectorizer(stop_words='english', min_df=3)
train_count = cv.fit_transform(X)

tfidf = TfidfTransformer()
train_tfidf = tfidf.fit_transform(train_count)

# Fit a 2-component UMAP model; the fitted model itself is passed to the plot helper
umap_model = umap.UMAP(n_components=2, metric='cosine', random_state = 42)
umap_reduce = umap_model.fit(train_tfidf)

# Embedded points coloured by ground-truth label
umap.plot.points(umap_reduce, labels=np.array(Y))
plt.title('UMAP visualization by ground truth class label')
plt.show()

## Question 14

In [None]:
# Load the training split of all 20 newsgroups, without headers and footers
documents = fetch_20newsgroups(remove=("headers", "footers"))

# Raw texts, integer targets, and the newsgroup name of each target id
X, Y, classes = documents.data, documents.target, documents.target_names

# Show the last document, its label, and the list of class names
print(X[-1], Y[-1], classes, sep="\n")

# Counts without English stop words and without terms seen in fewer than 3 documents
cv = CountVectorizer(stop_words='english', min_df=3)
train_count = cv.fit_transform(X)

# TF-IDF weighting of the counts
tfidf = TfidfTransformer()
train_tfidf = tfidf.fit_transform(train_count)

In [None]:
# Embed the TF-IDF matrix into 200 dimensions with UMAP (cosine metric)

import umap.umap_ as umap
import umap.plot
from sklearn.cluster import AgglomerativeClustering

umap_model = umap.UMAP(n_components=200, metric='cosine', random_state=42)
umap_reduce = umap_model.fit_transform(train_tfidf)
print(umap_reduce.shape)


In [None]:
# Ward linkage

# Ward linkage works on Euclidean distances, which is already the estimator's
# default, so no distance argument is passed: the `affinity` keyword is
# deprecated and no longer accepted by recent scikit-learn releases.
agg_cluster = AgglomerativeClustering(n_clusters=20, linkage='ward')
clustering = agg_cluster.fit(umap_reduce)
print(clustering.labels_.shape)
print_clustering_metrics(Y, clustering.labels_)

In [None]:
# Single linkage

# Cosine distance between points. Recent scikit-learn releases take the
# distance through `metric`; older ones only know the deprecated `affinity`
# keyword, so fall back to it when `metric` is rejected.
try:
    agg_cluster_single = AgglomerativeClustering(n_clusters=20, metric='cosine', linkage='single')
except TypeError:
    agg_cluster_single = AgglomerativeClustering(n_clusters=20, affinity='cosine', linkage='single')
clustering = agg_cluster_single.fit(umap_reduce)
print_clustering_metrics(Y, clustering.labels_)

## Question 15

In [None]:
# Cluster the UMAP embedding with HDBSCAN for several minimum cluster sizes
import hdbscan

# Fitted model per min_cluster_size, kept for later inspection
clusterings = {}

for min_cluster_size in (20, 100, 200):
    c_hdbscan = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size)
    clustering = c_hdbscan.fit(umap_reduce)
    clusterings[min_cluster_size] = clustering

    # Report the scores of this setting under its own banner
    print("****** min_cluster_size = %d ******" % min_cluster_size)
    print_clustering_metrics(Y, clustering.labels_)
    print("\n")

## Question 16

In [None]:
# Contingency matrix for the HDBSCAN model stored under min_cluster_size = 100
print_contingency_matrix(Y, clusterings[100].labels_)

## Question 17

In [None]:
import pickle

def writedb(db, filename):
    """Serialize ``db`` to ``filename`` with pickle, using the highest protocol.

    An existing file at ``filename`` is overwritten.
    """
    with open(filename, 'wb') as out_file:
        pickler = pickle.Pickler(out_file, protocol=pickle.HIGHEST_PROTOCOL)
        pickler.dump(db)

def readdb(filename):
    """Load and return the object pickled in ``filename``.

    NOTE: unpickling can execute arbitrary code, so only read files this
    notebook wrote itself.
    """
    with open(filename, 'rb') as in_file:
        stored = pickle.Unpickler(in_file).load()
    return stored

In [None]:
# Dimensionality reduction of the full data set
# Run this cell to (re)generate dimensionality_reduction.pkl

# Reduced representations keyed by "<method>_<r>"; "None" holds the raw TF-IDF matrix
db = {}

# Load the training split of all 20 newsgroups, without headers and footers
documents = fetch_20newsgroups(remove=("headers", "footers"))

# Raw texts, integer targets, and the newsgroup name of each target id
X, Y, classes = documents.data, documents.target, documents.target_names

# Show the last document, its label, and the list of class names
print(X[-1], Y[-1], classes, sep="\n")

# Counts without English stop words / rare terms, then TF-IDF weighting
cv = CountVectorizer(stop_words='english', min_df=3)
train_count = cv.fit_transform(X)

tfidf = TfidfTransformer()
train_tfidf = tfidf.fit_transform(train_count)

db["None"] = train_tfidf

print("TFIDF Stored")
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
import umap.umap_ as umap
import umap.plot

# For every target dimensionality, store the SVD, NMF and UMAP representations
for r in (5, 20, 200):
    svd = TruncatedSVD(n_components=r, random_state=42)
    train_SVD = svd.fit_transform(train_tfidf)
    db["svd_%d" % r] = train_SVD
    print("SVD (r = %d) stored" % r)

    nmf = NMF(n_components=r, random_state=42)
    train_NMF = nmf.fit_transform(train_tfidf)
    db["nmf_%d" % r] = train_NMF
    print("NMF (r = %d) stored" % r)

    umap_reduce = umap.UMAP(n_components=r, metric='cosine', random_state=42)
    train_umap = umap_reduce.fit_transform(train_tfidf)
    db["umap_%d" % r] = train_umap
    print("UMAP (r = %d) stored" % r)

# Persist every representation for the clustering step
writedb(db, 'dimensionality_reduction.pkl')

In [None]:
# Apply Clustering
# Run this cell to (re)generate clustering.pkl; it reads dimensionality_reduction.pkl

reductions = readdb("dimensionality_reduction.pkl")
print(reductions.keys())

# Cluster labels keyed by "<reduction>_<method>[_<hyperparameter>]"
results = {}

# K-Means
for k in [10, 20, 50]:
    for reduction in reductions:
        reduction_results = reductions[reduction]
        kmeans = KMeans(n_clusters=k, n_init=50, max_iter=5000, random_state=0)
        labels = kmeans.fit(reduction_results).labels_
        results["{0}_kmeans_{1}".format(reduction, str(k))] = labels
        print("{0}: Computed clustering K-Means (k = {1})".format(reduction, str(k)))
print("\n")

# Agglomerative Clustering
from sklearn.cluster import AgglomerativeClustering
for reduction in reductions:
    reduction_results = reductions[reduction]
    # Ward linkage works on Euclidean distances, which is already the
    # estimator's default; the deprecated `affinity` keyword is not passed
    # because recent scikit-learn releases no longer accept it.
    agg_cluster = AgglomerativeClustering(n_clusters=20, linkage='ward')
    if isinstance(reduction_results, np.ndarray):
        labels = agg_cluster.fit(reduction_results).labels_
    else:
        # Anything that is not already a dense array (the stored TF-IDF matrix)
        # is densified before fitting
        labels = agg_cluster.fit(reduction_results.toarray()).labels_
    results["{0}_agg_cluster".format(reduction)] = labels
    print("{0}: Computed clustering Agglomerative Clustering".format(reduction))
print("\n")

# HDBSCAN
import hdbscan
for min_cluster_size in [100, 200]:
    for reduction in reductions:
        reduction_results = reductions[reduction]
        c_hdbscan = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size)
        labels = c_hdbscan.fit(reduction_results).labels_
        results["{0}_hdbscan_{1}".format(reduction, str(min_cluster_size))] = labels
        print("{0}: Computed clustering HDBSCAN (min_cluster_size = {1})".format(reduction, str(min_cluster_size)))
print("\n")

# Persist all label arrays for the evaluation step
print(results.keys())
writedb(results, "clustering.pkl")


In [None]:
# Plot the clustering metrics for each clustering method

# Clustering of entire 20 classes
documents = fetch_20newsgroups(remove=("headers", "footers"))

# Save data and assign labels
X = documents.data
Y = documents.target
classes = documents.target_names

results = readdb("clustering.pkl")
# One row per clustering: its name followed by the five scores.
# Mixing the name with the numbers makes this a string array.
overall_results = np.empty([0, 6])

for result in results:
    curr_result = results[result]
    if not isinstance(curr_result, np.ndarray):
        curr_result = curr_result.toarray()
    h, c, vm, ri, mis = print_clustering_metrics(Y, curr_result, display=False)
    # Clamp (numerically) non-positive mutual information to exactly 0
    mis = 0 if mis < 1e-14 else mis
    overall_results = np.vstack((overall_results, np.array([result, h, c, vm, ri, mis])))

# After the transpose: row 0 holds the names, rows 1-5 hold one metric each
overall_results = overall_results.T

metrics = ["Homogeneity", "Completeness", "V-measure", "Adjusted rand index", "Adjusted mutual info score"]

from matplotlib.ticker import StrMethodFormatter
for i, metric_name in enumerate(metrics):
    sorted_score = overall_results[[0, i+1]]
    # The scores are stored as strings, so convert them to floats before
    # ranking: sorting the text itself orders values lexicographically, which
    # misplaces negative numbers and numbers in scientific notation.
    order = sorted_score[1, :].astype(float).argsort()
    sorted_score = sorted_score[:, order]
    x, y = sorted_score
    plt.figure(figsize=(5, 15), dpi=80)
    plt.barh(x, [float(j) for j in y])
    plt.yticks(fontsize=10)
    plt.title(metric_name)
    plt.show()