In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build TF-IDF features: the vectorizer is fitted on the training documents
# only, and the test documents are transformed with that fitted vectorizer.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(preprocessed_train_data)
X_test_tfidf = vectorizer.transform(preprocessed_test_data)

# Show the first raw document next to its preprocessed form, followed by the
# shapes of the two feature matrices.
for _caption, _value in (
    ("Original Sample:", newsgroups_train.data[0]),
    ("Preprocessed Sample:", preprocessed_train_data[0]),
    ("TF-IDF Vectorized Shape (Training):", X_train_tfidf.shape),
    ("TF-IDF Vectorized Shape (Testing):", X_test_tfidf.shape),
):
    print(_caption, _value)

In [None]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Elbow method: fit K-Means once for every k from 1 to 25 and record the
# inertia of each fit so the bend in the curve can be read off the plot.
cluster_range = range(1, 26)
inertia = []
for k in cluster_range:
    # fit() returns the estimator itself, so the fitted model stays bound to `kmeans`.
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10).fit(X_train_tfidf)
    inertia.append(kmeans.inertia_)

# Inertia as a function of the number of clusters.
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(cluster_range, inertia, marker='o')
ax.set_title('Elbow Method for Optimal K')
ax.set_xlabel('Number of Clusters (k)')
ax.set_ylabel('Inertia')
plt.show()

In [None]:
from sklearn.metrics import silhouette_score

# Silhouette analysis: cluster with K-Means for every k from 10 to 25 and
# score each partition on the training TF-IDF matrix.
cluster_range = range(10, 26)
silhouette_scores = []
for k in cluster_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    labels = kmeans.fit_predict(X_train_tfidf)
    score = silhouette_score(X_train_tfidf, labels)
    silhouette_scores += [score]

# Silhouette score as a function of the number of clusters.
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(cluster_range, silhouette_scores, marker='o')
ax.set_title('Silhouette Score for Optimal K')
ax.set_xlabel('Number of Clusters (k)')
ax.set_ylabel('Silhouette Score')
plt.show()

# Pick the k with the highest score. list.index returns the first match, so
# on a tie the smallest such k is selected.
best_score = max(silhouette_scores)
best_position = silhouette_scores.index(best_score)
optimal_clusters = cluster_range[best_position]
print("Optimal number of clusters based on silhouette score:", optimal_clusters)

In [None]:
# Cluster count for the final model. This assignment replaces whatever value
# `optimal_clusters` held before this cell ran.
optimal_clusters = 20

# Fit the final K-Means model on the training TF-IDF matrix and keep the
# cluster labels it assigned.
kmeans = KMeans(
    n_clusters=optimal_clusters,
    random_state=42,
    n_init=10,
).fit(X_train_tfidf)
labels = kmeans.labels_

# Short summary of the result.
print("K-Means clustering performed with", optimal_clusters, "clusters.")
print("Cluster Labels for Training Data (first 10):", labels[:10])

In [None]:
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler  # not used in this cell any more; kept because a later cell relies on this import

# Pairwise cosine similarity between all training documents.
cosine_sim_matrix = cosine_similarity(X_train_tfidf)

# Bound the similarities to [0, 1] by clipping, in place.
# MinMaxScaler is the wrong tool for this: it rescales every column with that
# column's own min/max, so entry (i, j) and entry (j, i) can end up different
# and the matrix stops being a valid symmetric similarity matrix.
# NOTE(review): TF-IDF cosine similarities are expected to lie in [0, 1]
# already, so clipping should only remove floating-point overshoot - confirm.
np.clip(cosine_sim_matrix, 0.0, 1.0, out=cosine_sim_matrix)

# Convert similarity to distance and force self-distances to exactly zero
# (rounding error, or a row with zero self-similarity, would otherwise leave a
# non-zero diagonal).
cosine_distance_matrix = 1.0 - cosine_sim_matrix
np.fill_diagonal(cosine_distance_matrix, 0.0)

# Apply DBSCAN using the precomputed distance matrix.
dbscan = DBSCAN(metric='precomputed', eps=0.4, min_samples=5)
labels = dbscan.fit_predict(cosine_distance_matrix)

# Print cluster labels
print("DBSCAN Clustering Labels:", labels)

# Count clusters and noise points; DBSCAN marks noise with the label -1.
noise_mask = labels == -1
num_noise = int(noise_mask.sum())
num_clusters = int(np.unique(labels[~noise_mask]).size)
print("Number of Clusters:", num_clusters)
print("Number of Noise Points:", num_noise)

In [None]:
import numpy as np
from sklearn.metrics import silhouette_score

# Silhouette score of the current clustering, computed only on points that
# were assigned to a cluster (label -1 marks noise and is excluded).
if num_clusters > 1:
    valid_indices = labels != -1  # Exclude noise points

    # Extract the clustered-points sub-matrix in one step. Advanced indexing
    # returns a copy, so the edit below does not touch cosine_distance_matrix.
    clustered_distances = cosine_distance_matrix[np.ix_(valid_indices, valid_indices)]

    # silhouette_score(metric='precomputed') raises a ValueError when the
    # diagonal is not zero, so force self-distances to exactly zero first.
    np.fill_diagonal(clustered_distances, 0)

    silhouette_avg = silhouette_score(
        clustered_distances,
        labels[valid_indices],
        metric='precomputed',
    )
    print("Silhouette Score:", silhouette_avg)
else:
    print("Silhouette Score cannot be calculated with less than 2 clusters.")

In [None]:
# numpy is needed here; without this import the cell raises NameError on a
# fresh kernel because no earlier cell imports it.
import numpy as np


def _report_diagonal(caption, matrix):
    """Print the diagonal of `matrix` under `caption`, say whether it is all zeros, and return it.

    The returned array is a copy, so later in-place edits to `matrix` do not
    change it.
    """
    # np.diag may hand back a view of the matrix; copy to get a stable snapshot.
    diagonal = np.diag(matrix).copy()
    print(caption, diagonal)
    if np.all(diagonal == 0):
        print("The diagonal is exactly set to zero.")
    else:
        print("There are non-zero values in the diagonal.")
    return diagonal


# Compute cosine similarity
cosine_sim_matrix = cosine_similarity(X_train_tfidf)

# Bound the similarities to [0, 1] by clipping, in place. MinMaxScaler would
# rescale each column independently and break the symmetry of the matrix.
np.clip(cosine_sim_matrix, 0.0, 1.0, out=cosine_sim_matrix)

# Convert cosine similarity to distance (1 - similarity)
cosine_distance_matrix = 1.0 - cosine_sim_matrix

# Inspect the diagonal as computed, before any correction.
diagonal_values_before = _report_diagonal("Diagonal Values Before Fix:", cosine_distance_matrix)

# Ensure diagonal values (self-distances) are exactly zero
np.fill_diagonal(cosine_distance_matrix, 0)

# Inspect the diagonal again after the correction.
diagonal_values_after = _report_diagonal("Diagonal Values After Fix:", cosine_distance_matrix)

In [None]:
from sklearn.cluster import HDBSCAN
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler  # not used in this cell any more; import kept so nothing that relies on it breaks

# Pairwise cosine similarity between all training documents.
cosine_sim_matrix = cosine_similarity(X_train_tfidf)

# Bound the similarities to [0, 1] by clipping, in place. MinMaxScaler would
# rescale each column with its own min/max, making the matrix asymmetric;
# HDBSCAN expects a symmetric precomputed distance matrix.
np.clip(cosine_sim_matrix, 0.0, 1.0, out=cosine_sim_matrix)

# Convert cosine similarity to distance (1 - similarity) and zero the
# self-distances BEFORE clustering, so the clusterer sees the corrected matrix.
cosine_distance_matrix = 1.0 - cosine_sim_matrix
np.fill_diagonal(cosine_distance_matrix, 0.0)

# Apply HDBSCAN to the precomputed cosine distance matrix.
# copy=True keeps cosine_distance_matrix intact: per the scikit-learn docs a
# dense precomputed matrix may otherwise be modified in place during fit.
# This costs one extra n x n array of memory.
hdbscan_clusterer = HDBSCAN(
    metric='precomputed',
    min_cluster_size=5,
    min_samples=5,
    copy=True,
)
labels = hdbscan_clusterer.fit_predict(cosine_distance_matrix)

# Print cluster labels
print("HDBSCAN Clustering Labels:", labels)

# Count clusters and noise points; -1 is the noise label.
noise_mask = labels == -1
num_noise = int(noise_mask.sum())
num_clusters = int(np.unique(labels[~noise_mask]).size)
print("Number of Clusters:", num_clusters)
print("Number of Noise Points:", num_noise)

# Check if the entire diagonal is exactly zero
if np.all(np.diag(cosine_distance_matrix) == 0):
    print("The diagonal is exactly set to zero.")
else:
    print("There are non-zero values in the diagonal.")

In [None]:
from sklearn.neighbors import kneighbors_graph
from sklearn.metrics.pairwise import cosine_similarity

# Dense pairwise cosine similarity of the training documents.
# NOTE(review): the graph construction below does not read this matrix;
# confirm whether a later cell needs it before removing the computation.
cosine_sim_matrix = cosine_similarity(X_train_tfidf)

# k-nearest-neighbour connectivity graph over the TF-IDF rows, converted from
# the sparse result to a dense array.
k = 10  # neighbours per document
knn_graph = kneighbors_graph(X_train_tfidf, n_neighbors=k, mode='connectivity')
cosine_similarity_matrix = knn_graph.toarray()

In [None]:
from scipy.sparse.csgraph import connected_components


# Label every node of the neighbour graph with its connected component,
# treating edges as undirected, and report how many components there are.
n_connected_components, labels = connected_components(
    cosine_similarity_matrix,
    directed=False,
)
print(f"Number of connected components: {n_connected_components}")