In [21]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from scipy.sparse import csr_matrix

# Load the vectorized data
import pickle

import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

In [22]:
with open('title_vectors.pkl', 'rb') as f:
    title_vectors = pickle.load(f)

with open('abstract_vectors.pkl', 'rb') as f:
    abstract_vectors = pickle.load(f)

In [23]:
# Combine the title and abstract vectors
combined_vectors = np.vstack((title_vectors, abstract_vectors))  # Fixed the typo in vstack


In [24]:
def visualize_clusters(combined_vectors, labels):
    # Perform t-SNE dimensionality reduction
    tsne = TSNE(n_components=2)
    tsne_vectors = tsne.fit_transform(combined_vectors)

    # Visualize the clusters using a scatter plot
    plt.scatter(tsne_vectors[:, 0], tsne_vectors[:, 1], c=labels)
    plt.title("Cluster Visualization")
    plt.show()

def create_elbow_plot(combined_vectors):
    # Calculate the within-cluster sum of squares (WCSS) for a range of k values
    k_range = range(2, 100)
    wcss = []

    for k in k_range:
        kmeans = KMeans(n_clusters=k)
        kmeans.fit(combined_vectors)
        wcss.append(kmeans.inertia_)

    # Plot the elbow plot
    plt.plot(k_range, wcss)
    plt.title("Elbow Plot")
    plt.xlabel("Number of Clusters (k)")
    plt.ylabel("Within-Cluster Sum of Squares (WCSS)")
    plt.show()
    
def calculate_cohesion(data, labels):
    cohesion_scores = []
    for cluster_label in np.unique(labels):
        cluster_data = data[labels == cluster_label]
        pairwise_distances = np.linalg.norm(cluster_data[:, np.newaxis] - cluster_data, axis=2)
        average_distance = np.mean(pairwise_distances)
        cohesion_scores.append(average_distance)
    return np.mean(cohesion_scores)  # Fixed the calculation of cohesion

def calculate_separation(data, labels):
    separation_scores = []
    for cluster1_label in np.unique(labels):
        cluster1_data = data[labels == cluster1_label]
        for cluster2_label in np.unique(labels):
            if cluster1_label != cluster2_label:
                cluster2_data = data[labels == cluster2_label]
                pairwise_distances = np.linalg.norm(cluster1_data[:, np.newaxis] - cluster2_data, axis=2)
                average_distance = np.mean(pairwise_distances)
                separation_scores.append(average_distance)
    return np.mean(separation_scores)  # Fixed the calculation of separation


In [25]:
# Perform iterative clustering with silhouette score as its metric
silhouettes = []
cluster_centers_list = []
ks = range(2, 11)  # Specify the range of k values to evaluate

# Check if it's a sparse matrix
if isinstance(combined_vectors, csr_matrix):
    dense_combined_vectors = combined_vectors.toarray()  # Convert sparse matrix to dense array for visualization
else:
    dense_combined_vectors = combined_vectors  # It's already a dense array

for k in ks:
    kmeans = KMeans(n_clusters=k)
    kmeans.fit(combined_vectors)
    silhouettes.append(silhouette_score(combined_vectors, kmeans.labels_))
    cluster_centers_list.append(kmeans.cluster_centers_)

# Find the optimal number of clusters based on silhouette score
best_k = ks[silhouettes.index(max(silhouettes))]

# Obtain y_predicted and cluster_centers for the optimal number of clusters
kmeans = KMeans(n_clusters=best_k)
kmeans.fit(combined_vectors)
y_predicted = kmeans.labels_
cluster_centers = kmeans.cluster_centers_

# Visualize the clusters using the dense array for t-SNE
visualize_clusters(dense_combined_vectors, y_predicted)

ValueError: setting an array element with a sequence.

In [None]:
# Evaluate the clustering using silhouette score, cohesion, and separation
silhouette_avg = silhouette_score(combined_vectors, kmeans.labels_)
cohesion_avg = calculate_cohesion(combined_vectors, kmeans.labels_)
separation_avg = calculate_separation(combined_vectors, kmeans.labels_)

print("Silhouette score:", silhouette_avg)
print("Cohesion:", cohesion_avg)
print("Separation:", separation_avg)

In [None]:
# Visualize the clusters
visualize_clusters(combined_vectors, kmeans.labels_)

# Create the elbow plot
create_elbow_plot(combined_vectors)