# CLUSTERING MODELS

### Clustering Workflow Overview:
1. **Define Clustering Methods**  
   Define functions for the following 6 clustering methods:
   - K-Means
   - K-Medoids (Euclidean, Manhattan, and Cosine distance)
   - Spectral Clustering (nearest-neighbors and RBF affinity)

2. **Set Number of Clusters**  
   Calculate the number of original categories and set the clustering count to match this number.

3. **Execute Clustering and Save Results**  
   Perform clustering using each method and store results in dictionary format.  
   Save each method's result as its own pickle file under `./output/clustering/<embedding_model>/`.

In [1]:
import pandas as pd
import numpy as np
import warnings
import os

warnings.filterwarnings("ignore")

## 1 K-MEANS

In [2]:
from sklearn.cluster import KMeans

def Kmeans_Raw(data,n_clusters,state=520):
    """Cluster `data` with K-Means and return the label assigned to each sample.

    Args:
        data: Samples to cluster; passed unchanged to `KMeans.fit`.
        n_clusters: Number of clusters to form.
        state: Value forwarded as `random_state` (default 520).

    Returns:
        The `labels_` attribute of the fitted estimator.
    """
    model = KMeans(n_clusters=n_clusters, random_state=state).fit(data)
    return model.labels_

## 2 KMedoids (with distance matrix)
- This method operates on pairwise distances between samples; the distance metric is selected through the `metric` argument of `KMedoids`.

In [3]:
try:
    from sklearn_extra.cluster import KMedoids
except Exception as e:
    print('installing scikit-learn-extra package')
    
    !pip install scikit-learn-extra
    
    from sklearn_extra.cluster import KMedoids

### 2.1 Euclidean Distance

In [4]:
def Kmedoids_Euc(data,n_clusters,state=520):
    """Cluster `data` with K-Medoids using the 'euclidean' metric.

    Args:
        data: Samples to cluster; passed unchanged to `KMedoids.fit`.
        n_clusters: Number of clusters to form.
        state: Value forwarded as `random_state` (default 520).

    Returns:
        The `labels_` attribute of the fitted estimator.
    """
    model = KMedoids(n_clusters=n_clusters, metric='euclidean', random_state=state).fit(data)
    return model.labels_

### 2.2 Manhattan Distance

In [5]:
def Kmedoids_Man(data,n_clusters,state=520):
    """Cluster `data` with K-Medoids using the 'manhattan' metric.

    Args:
        data: Samples to cluster; passed unchanged to `KMedoids.fit`.
        n_clusters: Number of clusters to form.
        state: Value forwarded as `random_state` (default 520).

    Returns:
        The `labels_` attribute of the fitted estimator.
    """
    model = KMedoids(n_clusters=n_clusters, metric='manhattan', random_state=state).fit(data)
    return model.labels_

### 2.3 Cosine Distance

In [6]:
def Kmedoids_Cos(data,n_clusters,state=520):
    """Cluster `data` with K-Medoids using the 'cosine' metric.

    Args:
        data: Samples to cluster; passed unchanged to `KMedoids.fit`.
        n_clusters: Number of clusters to form.
        state: Value forwarded as `random_state` (default 520).

    Returns:
        The `labels_` attribute of the fitted estimator.
    """
    model = KMedoids(n_clusters=n_clusters, metric='cosine', random_state=state).fit(data)
    return model.labels_

## 3 Spectral Clustering (with similarity matrix)
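- This method operates on a similarity (affinity) matrix; the affinity type is selected through the `affinity` argument of `SpectralClustering`.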

In [7]:
from sklearn.cluster import SpectralClustering

### 3.1 Nearest Neighbors

In [8]:
def Spectral_Nn(data,n_clusters,state=520):
    """Run spectral clustering on `data` with the 'nearest_neighbors' affinity.

    Args:
        data: Samples to cluster; passed unchanged to `fit_predict`.
        n_clusters: Number of clusters to form.
        state: Value forwarded as `random_state` (default 520).

    Returns:
        The labels returned by `SpectralClustering.fit_predict`.
    """
    model = SpectralClustering(
        n_clusters=n_clusters,
        affinity='nearest_neighbors',
        random_state=state,
    )
    return model.fit_predict(data)

### 3.2 Radial Basis Function

In [9]:
def Spectral_Rbf(data,n_clusters,state=520):
    """Run spectral clustering on `data` with the 'rbf' affinity.

    Args:
        data: Samples to cluster; passed unchanged to `fit_predict`.
        n_clusters: Number of clusters to form.
        state: Value forwarded as `random_state` (default 520).

    Returns:
        The labels returned by `SpectralClustering.fit_predict`.
    """
    model = SpectralClustering(
        n_clusters=n_clusters,
        affinity='rbf',
        random_state=state,
    )
    return model.fit_predict(data)

## 4. Calculate the number of original categories

In [10]:
# The number of clusters must equal the number of original categories,
# so the category table is loaded first.
file_path = './/dataset//wikispeedia_paths-and-graph//categories.tsv'
category_df = pd.read_csv(
    file_path,
    sep='\t',
    skiprows=12,  # the first 12 lines are skipped (presumably a non-data preamble — TODO confirm)
    header=None,  # the file has no header row, so column names are assigned below
)
category_df.columns = ['concept', 'category']

In [11]:
# category_df is a pandas DataFrame; report how many rows it holds.
# NOTE(review): this is a row count — confirm whether a concept can appear on several rows.
row_total = category_df.shape[0]
print('The total number of articles in the category_dataset is: {}'.format(row_total))

The total number of articles in the category_dataset is: 5204


In [12]:
# The primary category is the second dot-separated component of `category`
category_df['primary_category'] = [
    full_category.split('.')[1] for full_category in category_df['category']
]

# Compute the distinct primary categories once and reuse them below
primary_categories = category_df['primary_category'].unique()
print('The orginal primary categories are:')
print(primary_categories)

# The number of clusters equals the number of distinct primary categories
n_clusters = len(primary_categories)
print('The clustering number we need to set is: {}'.format(n_clusters))

The orginal primary categories are:
['History' 'People' 'Countries' 'Geography' 'Business_Studies' 'Science'
 'Everyday_life' 'Design_and_Technology' 'Music' 'IT'
 'Language_and_literature' 'Mathematics' 'Religion' 'Art' 'Citizenship']
The clustering number we need to set is: 15


## 5. Perform Clustering and Save results

### 5.1 Read the embedding data

In [13]:
import pickle

# Input embedding file and output directory for the all_mpnet_base_v2 clustering results
file_path = './/output//embeddings//all_mpnet_base_v2//20241109_150434//embeddings.pkl'
saved_path = './/output//clustering//all_mpnet_base_v2'

# Load the pickled embeddings.
# NOTE: pickle.load can execute arbitrary code; only load files produced by a trusted source.
with open(file_path, 'rb') as file:
    embedding = pickle.load(file)

# `.keys()` / `.values()` are used on the loaded object, so it is expected to be dict-like.
# The values are collected into a list so they can be handed to the clustering functions.
embedding_values = list(embedding.values())

# Report the size of the embedding data (the previous message mislabeled it as the category dataset)
print('The total number of articles in the embedding file is: {}'.format(len(embedding_values)))

The total number of articles in the category_dataset is: 4604


In [20]:
# Input embedding file and output directory for the all_MiniLM_L6_v2 clustering results.
# This cell reassigns file_path, saved_path, embedding and embedding_values, which the
# preceding embedding cell also sets; the clustering cells below use whichever ran last.
file_path = './/output//embeddings//all_MiniLM_L6_v2//20241109_104244//embeddings.pkl'
saved_path = './/output//clustering//all_MiniLM_L6_v2'

# Load the pickled embeddings.
# NOTE: pickle.load can execute arbitrary code; only load files produced by a trusted source.
with open(file_path, 'rb') as file:
    embedding = pickle.load(file)

# `.keys()` / `.values()` are used on the loaded object, so it is expected to be dict-like.
# The values are collected into a list so they can be handed to the clustering functions.
embedding_values = list(embedding.values())

# Report the size of the embedding data (the previous message mislabeled it as the category dataset)
print('The total number of articles in the embedding file is: {}'.format(len(embedding_values)))

The total number of articles in the category_dataset is: 5232


### 5.2 Perform K-MEANS Clustering

In [21]:
# Cluster the embeddings with K-Means
Kmeans_Clustering = Kmeans_Raw(embedding_values, n_clusters)

# Pair every embedding key with its cluster label
Kmeans_result = dict(zip(embedding.keys(), Kmeans_Clustering))
print("K-means Cluster labels:")
print(Kmeans_result)  # prints the full mapping, one entry per embedding key

# Count how many keys were assigned to each cluster
value_counts = pd.Series([label for label in Kmeans_result.values()]).value_counts()
print(value_counts)

# Persist the mapping; the output directory is created if it does not exist yet
os.makedirs(saved_path, exist_ok=True)
with open(saved_path+'//KMeans.pkl', 'wb') as f:
    pickle.dump(Kmeans_result, f)


K-means Cluster labels:
{'Lilongwe': 2, 'West_Virginia': 10, 'Magnet': 8, 'Scrooge_McDuck': 1, 'Juniper_berry': 11, 'Gerald_Durrell': 4, 'Henry_IV_of_England': 4, 'Central_African_Republic': 2, 'Lebanon_A': 3, 'Avro_Lancaster': 1, 'Clifton_Suspension_Bridge': 6, 'Speech_synthesis': 7, 'Number': 7, 'James_Garfield': 4, 'Summer': 9, 'Indian_Standard_Time': 10, 'Religious_Society_of_Friends': 7, 'Iron_Maiden': 1, 'Gliese_876_c': 0, 'Ununoctium': 8, 'Military_history_of_the_Soviet_Union': 1, 'Rockall': 9, 'Engineering': 7, 'Butter': 11, 'Dune': 9, 'Myco-heterotrophy': 11, 'Maize': 11, 'Celtic_mythology': 6, 'Telephone_exchange': 7, 'Medal_of_Honor': 1, 'University_of_Texas_at_Austin': 1, 'Cyril_Clarke': 4, 'Rosetta_Stone': 12, 'Felix_Mendelssohn': 4, 'Jacobite_rising': 6, 'Paris': 10, 'West_Flemish': 10, 'Catholic_social_teaching': 12, 'Borage': 11, 'Demographics_of_Libya': 2, 'Creation-evolution_controversy': 7, 'Grand_Central_Station_%28Chicago%29': 1, 'Syria_Ep_News_270706': 3, 'Darmsta

### 5.3 Perform K-Medoids Clustering with Euclidean Distance

In [22]:
# Cluster the embeddings with K-Medoids (Euclidean distance)
Kmedoids_Euc_Clustering = Kmedoids_Euc(embedding_values, n_clusters)

# Pair every embedding key with its cluster label
Kmedoids_Euc_result = dict(zip(embedding.keys(), Kmedoids_Euc_Clustering))
print("Kmedoids_Euc Cluster labels:")
print(Kmedoids_Euc_result)  # prints the full mapping, one entry per embedding key

# Count how many keys were assigned to each cluster
value_counts = pd.Series([label for label in Kmedoids_Euc_result.values()]).value_counts()
print(value_counts)

# Persist the mapping; the output directory is created if it does not exist yet
os.makedirs(saved_path, exist_ok=True)
with open(saved_path+'//Kmedoids_Euc.pkl', 'wb') as f:
    pickle.dump(Kmedoids_Euc_result, f)


Kmedoids_Euc Cluster labels:
{'Lilongwe': 11, 'West_Virginia': 5, 'Magnet': 9, 'Scrooge_McDuck': 10, 'Juniper_berry': 9, 'Gerald_Durrell': 1, 'Henry_IV_of_England': 14, 'Central_African_Republic': 6, 'Lebanon_A': 10, 'Avro_Lancaster': 7, 'Clifton_Suspension_Bridge': 10, 'Speech_synthesis': 9, 'Number': 7, 'James_Garfield': 11, 'Summer': 1, 'Indian_Standard_Time': 7, 'Religious_Society_of_Friends': 13, 'Iron_Maiden': 14, 'Gliese_876_c': 5, 'Ununoctium': 7, 'Military_history_of_the_Soviet_Union': 7, 'Rockall': 1, 'Engineering': 8, 'Butter': 6, 'Dune': 9, 'Myco-heterotrophy': 7, 'Maize': 6, 'Celtic_mythology': 7, 'Telephone_exchange': 5, 'Medal_of_Honor': 6, 'University_of_Texas_at_Austin': 6, 'Cyril_Clarke': 10, 'Rosetta_Stone': 7, 'Felix_Mendelssohn': 10, 'Jacobite_rising': 1, 'Paris': 6, 'West_Flemish': 14, 'Catholic_social_teaching': 8, 'Borage': 9, 'Demographics_of_Libya': 6, 'Creation-evolution_controversy': 7, 'Grand_Central_Station_%28Chicago%29': 5, 'Syria_Ep_News_270706': 10, 'D

### 5.4 Perform K-Medoids Clustering with Manhattan Distance

In [23]:
# Cluster the embeddings with K-Medoids (Manhattan distance)
Kmedoids_Man_Clustering = Kmedoids_Man(embedding_values, n_clusters)

# Pair every embedding key with its cluster label
Kmedoids_Man_result = dict(zip(embedding.keys(), Kmedoids_Man_Clustering))
print("Kmedoids_Man Cluster labels:")
print(Kmedoids_Man_result)  # prints the full mapping, one entry per embedding key

# Count how many keys were assigned to each cluster
value_counts = pd.Series([label for label in Kmedoids_Man_result.values()]).value_counts()
print(value_counts)

# Persist the mapping; the output directory is created if it does not exist yet
os.makedirs(saved_path, exist_ok=True)
with open(saved_path+'//Kmedoids_Man.pkl', 'wb') as f:
    pickle.dump(Kmedoids_Man_result, f)

Kmedoids_Man Cluster labels:
{'Lilongwe': 9, 'West_Virginia': 0, 'Magnet': 10, 'Scrooge_McDuck': 5, 'Juniper_berry': 13, 'Gerald_Durrell': 11, 'Henry_IV_of_England': 3, 'Central_African_Republic': 9, 'Lebanon_A': 1, 'Avro_Lancaster': 9, 'Clifton_Suspension_Bridge': 0, 'Speech_synthesis': 12, 'Number': 9, 'James_Garfield': 8, 'Summer': 4, 'Indian_Standard_Time': 8, 'Religious_Society_of_Friends': 10, 'Iron_Maiden': 9, 'Gliese_876_c': 2, 'Ununoctium': 3, 'Military_history_of_the_Soviet_Union': 9, 'Rockall': 4, 'Engineering': 1, 'Butter': 9, 'Dune': 2, 'Myco-heterotrophy': 3, 'Maize': 9, 'Celtic_mythology': 9, 'Telephone_exchange': 2, 'Medal_of_Honor': 12, 'University_of_Texas_at_Austin': 8, 'Cyril_Clarke': 7, 'Rosetta_Stone': 14, 'Felix_Mendelssohn': 9, 'Jacobite_rising': 9, 'Paris': 0, 'West_Flemish': 0, 'Catholic_social_teaching': 10, 'Borage': 8, 'Demographics_of_Libya': 14, 'Creation-evolution_controversy': 9, 'Grand_Central_Station_%28Chicago%29': 9, 'Syria_Ep_News_270706': 12, 'Dar

### 5.5 Perform K-Medoids Clustering with Cosine Distance

In [24]:
# Cluster the embeddings with K-Medoids (Cosine distance)
Kmedoids_Cos_Clustering = Kmedoids_Cos(embedding_values, n_clusters)

# Pair every embedding key with its cluster label
Kmedoids_Cos_result = dict(zip(embedding.keys(), Kmedoids_Cos_Clustering))
print("Kmedoids_Cos Cluster labels:")
print(Kmedoids_Cos_result)  # prints the full mapping, one entry per embedding key

# Count how many keys were assigned to each cluster
value_counts = pd.Series([label for label in Kmedoids_Cos_result.values()]).value_counts()
print(value_counts)

# Persist the mapping; the output directory is created if it does not exist yet
os.makedirs(saved_path, exist_ok=True)
with open(saved_path+'//Kmedoids_Cos.pkl', 'wb') as f:
    pickle.dump(Kmedoids_Cos_result, f)

Kmedoids_Cos Cluster labels:
{'Lilongwe': 10, 'West_Virginia': 1, 'Magnet': 5, 'Scrooge_McDuck': 11, 'Juniper_berry': 11, 'Gerald_Durrell': 11, 'Henry_IV_of_England': 4, 'Central_African_Republic': 8, 'Lebanon_A': 6, 'Avro_Lancaster': 7, 'Clifton_Suspension_Bridge': 5, 'Speech_synthesis': 12, 'Number': 12, 'James_Garfield': 11, 'Summer': 2, 'Indian_Standard_Time': 7, 'Religious_Society_of_Friends': 1, 'Iron_Maiden': 4, 'Gliese_876_c': 11, 'Ununoctium': 7, 'Military_history_of_the_Soviet_Union': 7, 'Rockall': 2, 'Engineering': 12, 'Butter': 8, 'Dune': 12, 'Myco-heterotrophy': 11, 'Maize': 8, 'Celtic_mythology': 9, 'Telephone_exchange': 12, 'Medal_of_Honor': 11, 'University_of_Texas_at_Austin': 12, 'Cyril_Clarke': 11, 'Rosetta_Stone': 7, 'Felix_Mendelssohn': 10, 'Jacobite_rising': 9, 'Paris': 8, 'West_Flemish': 1, 'Catholic_social_teaching': 5, 'Borage': 12, 'Demographics_of_Libya': 9, 'Creation-evolution_controversy': 12, 'Grand_Central_Station_%28Chicago%29': 12, 'Syria_Ep_News_270706'

### 5.6 Perform Spectral Clustering with Nearest Neighbors

In [25]:
# Cluster the embeddings with spectral clustering (nearest-neighbors affinity)
Spectral_Nn_Clustering = Spectral_Nn(embedding_values, n_clusters)

# Pair every embedding key with its cluster label
Spectral_Nn_result = dict(zip(embedding.keys(), Spectral_Nn_Clustering))
print("Spectral_Nn Cluster labels:")
print(Spectral_Nn_result)  # prints the full mapping, one entry per embedding key

# Count how many keys were assigned to each cluster
value_counts = pd.Series([label for label in Spectral_Nn_result.values()]).value_counts()
print(value_counts)

# Persist the mapping; the output directory is created if it does not exist yet
os.makedirs(saved_path, exist_ok=True)
with open(saved_path+'//Spectral_Nn.pkl', 'wb') as f:
    pickle.dump(Spectral_Nn_result, f)

Spectral_Nn Cluster labels:
{'Lilongwe': 14, 'West_Virginia': 1, 'Magnet': 6, 'Scrooge_McDuck': 3, 'Juniper_berry': 2, 'Gerald_Durrell': 3, 'Henry_IV_of_England': 3, 'Central_African_Republic': 14, 'Lebanon_A': 12, 'Avro_Lancaster': 3, 'Clifton_Suspension_Bridge': 1, 'Speech_synthesis': 13, 'Number': 13, 'James_Garfield': 1, 'Summer': 14, 'Indian_Standard_Time': 14, 'Religious_Society_of_Friends': 3, 'Iron_Maiden': 10, 'Gliese_876_c': 9, 'Ununoctium': 6, 'Military_history_of_the_Soviet_Union': 3, 'Rockall': 14, 'Engineering': 13, 'Butter': 2, 'Dune': 14, 'Myco-heterotrophy': 2, 'Maize': 2, 'Celtic_mythology': 3, 'Telephone_exchange': 13, 'Medal_of_Honor': 3, 'University_of_Texas_at_Austin': 14, 'Cyril_Clarke': 3, 'Rosetta_Stone': 3, 'Felix_Mendelssohn': 3, 'Jacobite_rising': 3, 'Paris': 14, 'West_Flemish': 14, 'Catholic_social_teaching': 3, 'Borage': 2, 'Demographics_of_Libya': 14, 'Creation-evolution_controversy': 3, 'Grand_Central_Station_%28Chicago%29': 1, 'Syria_Ep_News_270706': 12

### 5.7 Perform Spectral Clustering with RBF

In [26]:
# Cluster the embeddings with spectral clustering (RBF affinity)
Spectral_Rbf_Clustering = Spectral_Rbf(embedding_values, n_clusters)

# Pair every embedding key with its cluster label
Spectral_Rbf_result = dict(zip(embedding.keys(), Spectral_Rbf_Clustering))
print("Spectral_Rbf Cluster labels:")
print(Spectral_Rbf_result)  # prints the full mapping, one entry per embedding key

# Count how many keys were assigned to each cluster
value_counts = pd.Series([label for label in Spectral_Rbf_result.values()]).value_counts()
print(value_counts)

# Persist the mapping; the output directory is created if it does not exist yet
os.makedirs(saved_path, exist_ok=True)
with open(saved_path+'//Spectral_Rbf.pkl', 'wb') as f:
    pickle.dump(Spectral_Rbf_result, f)

Spectral_Rbf Cluster labels:
{'Lilongwe': 11, 'West_Virginia': 10, 'Magnet': 3, 'Scrooge_McDuck': 12, 'Juniper_berry': 5, 'Gerald_Durrell': 8, 'Henry_IV_of_England': 8, 'Central_African_Republic': 11, 'Lebanon_A': 4, 'Avro_Lancaster': 8, 'Clifton_Suspension_Bridge': 10, 'Speech_synthesis': 9, 'Number': 9, 'James_Garfield': 12, 'Summer': 1, 'Indian_Standard_Time': 13, 'Religious_Society_of_Friends': 9, 'Iron_Maiden': 12, 'Gliese_876_c': 7, 'Ununoctium': 3, 'Military_history_of_the_Soviet_Union': 0, 'Rockall': 10, 'Engineering': 9, 'Butter': 9, 'Dune': 10, 'Myco-heterotrophy': 9, 'Maize': 9, 'Celtic_mythology': 8, 'Telephone_exchange': 9, 'Medal_of_Honor': 12, 'University_of_Texas_at_Austin': 12, 'Cyril_Clarke': 12, 'Rosetta_Stone': 2, 'Felix_Mendelssohn': 2, 'Jacobite_rising': 8, 'Paris': 10, 'West_Flemish': 0, 'Catholic_social_teaching': 2, 'Borage': 5, 'Demographics_of_Libya': 11, 'Creation-evolution_controversy': 9, 'Grand_Central_Station_%28Chicago%29': 12, 'Syria_Ep_News_270706': 1