<a href="https://colab.research.google.com/github/femketenharkel/Predicting_Ratings/blob/main/models/clustering/Hierarchical_clustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Hierarchical clustering**

In [None]:
import pandas as pd
from sklearn.cluster import KMeans
from google.colab import drive
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
from joblib import Parallel, delayed
import joblib

In [None]:
# Mount Google Drive, then read the prepared dataset from it
drive.mount('/content/drive')
data_path = '/content/drive/My Drive/Thesis/Data/df_final.csv'
df_final = pd.read_csv(data_path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Clustering only uses the predictors: leave out the target variable (Rating)
# and the unique identifiers (MovieID, UserID).
non_feature_columns = ['Rating', 'MovieID', 'UserID']
df = df_final.drop(columns=non_feature_columns)

In [None]:
# Min-Max scale the numerical columns; the remaining columns are left as they are
features_to_scale = [
    'Year',
    'Month',
    'Day',
    'Hour',
    'Age',
    'Release_year',
    'Time_release_to_rating',
    'Total_ratings_per_movie',
    'Total_ratings_per_user',
]
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df[features_to_scale])
df[features_to_scale] = scaled_values

## Original HC, too complex

In [None]:
# Hierarchical clustering might be too complex -> way too computationally expensive

import pandas as pd
from sklearn.preprocessing import StandardScaler
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import silhouette_score, davies_bouldin_score


# 1. Hierarchical clustering with Ward linkage.
# `df` is the feature frame whose numerical columns were Min-Max scaled in place
# earlier in this notebook. No variable called `df_scaled` is defined anywhere,
# so the linkage is computed on `df` (cast to a plain float array).
Z = linkage(df.to_numpy(dtype=float), method='ward')

# 2. Plot the outcome
plt.figure(figsize=(10, 7))
dendrogram(Z)
plt.title("Dendrogram")
plt.xlabel("Data points")
plt.ylabel("Euclidean distances")
plt.show()

# 3. Determine the optimal number of clusters using the elbow method:
# column 2 of Z holds the merge distances; take the last 10 merges and
# reverse them so the largest distance comes first.
last = Z[-10:, 2]
last_rev = last[::-1]
indexes = np.arange(1, len(last) + 1)
plt.plot(indexes, last_rev)
plt.title("Elbow Method for Optimal Clusters")
plt.xlabel("Number of clusters")
plt.ylabel("Distance")
plt.show()

In [None]:
# Assuming the elbow point is at 3 clusters (adjust based on your plot)
optimal_clusters = 3

# 4. Cut the linkage tree Z into at most `optimal_clusters` flat clusters
clusters = fcluster(Z, optimal_clusters, criterion='maxclust')

# 5. Evaluate the clustering with the silhouette score and Davies-Bouldin index.
# The metrics are computed on the scaled feature frame `df`; `df_scaled` is not
# defined in this notebook.
features = df.to_numpy(dtype=float)
silhouette_avg = silhouette_score(features, clusters)
davies_bouldin_avg = davies_bouldin_score(features, clusters)

print(f"Optimal number of clusters: {optimal_clusters}")
print(f"Silhouette Score: {silhouette_avg}")
print(f"Davies-Bouldin Index: {davies_bouldin_avg}")

# Add the cluster labels to the original dataframe (df_final), not to the
# feature frame `df`: later cells cluster on `df`, and a 'Cluster' column in
# there would be treated as one more input feature.
df_final['Cluster'] = clusters

# Show the first few rows of the dataframe with cluster labels
df_final.head()


## HC with AgglomerativeClustering -> Way too computationally expensive

HC with AgglomerativeClustering (is better for large datasets, still scalable)
- still start with each point = cluster
- merges clusters
- creates a linkage matrix

-> merges clusters based on the chosen linkage criterion
-> Dendrogram still used


In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from scipy.cluster.hierarchy import linkage
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score, davies_bouldin_score

# Calculate the inertia (within-cluster sum of squared distances to the cluster
# centroid) for different numbers of clusters using AgglomerativeClustering
cluster_range = range(1, 11)

# Work on a plain float array so every reduction below yields one scalar.
# Reducing a DataFrame with np.sum can instead give one value per column,
# depending on the pandas version, which turns the elbow curve into one line
# per feature.
features = df.to_numpy(dtype=float)

inertia = []
for k in cluster_range:
    model = AgglomerativeClustering(n_clusters=k, linkage='complete')
    clusters = model.fit_predict(features)
    sum_of_squares = 0.0
    for cluster_id in np.unique(clusters):
        cluster_points = features[clusters == cluster_id]
        centroid = cluster_points.mean(axis=0)
        sum_of_squares += ((cluster_points - centroid) ** 2).sum()
    inertia.append(sum_of_squares)

# Plot the results to find the elbow point
plt.figure(figsize=(12, 9))
plt.plot(cluster_range, inertia, marker='o')
plt.title('Elbow Method For Optimal k')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.show()

In [None]:
# From the elbow plot, determine the optimal number of clusters.
# TODO: replace None with the value read from the plot; this cell cannot run
# until a number is filled in.
optimal_clusters = None
if optimal_clusters is None:
    raise ValueError(
        "Set optimal_clusters to the elbow value read from the inertia plot "
        "before running this cell."
    )

# Perform clustering again with the optimal number of clusters
model_optimal = AgglomerativeClustering(n_clusters=optimal_clusters,
                                        linkage='complete')
clusters = model_optimal.fit_predict(df)

# Evaluate the clustering with the silhouette score and Davies-Bouldin index
silhouette = silhouette_score(df, clusters)
davies_bouldin = davies_bouldin_score(df, clusters)

print(f"Optimal number of clusters: {optimal_clusters}")
print(f"Silhouette Score: {silhouette}")
print(f"Davies-Bouldin Index: {davies_bouldin}")

# Add the cluster labels to the original dataframe
df_final['Cluster'] = clusters

In [None]:
# Write df_final to Google Drive as CSV, without the index column.
# The other save cells in this notebook use this same path, so each run
# overwrites the previous file.
df_final.to_csv('/content/drive/My Drive/Thesis/Data/df_final_Hierarchical_Clustering.csv', index=False)

# Try 3 -> simplified, no hyperparameter tuning, regular hierarchical clustering -> Too much RAM needed, 334 GB is not enough


In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AgglomerativeClustering
import matplotlib.pyplot as plt
import scipy.cluster.hierarchy as shc
import numpy as np


# Plot dendrogram to visualize the hierarchical clustering
plt.figure(figsize=(10, 7))
plt.title("Dendrogram (Complete Linkage)")
complete_linkage = shc.linkage(df, method='complete')
# Keep the returned plot data under its own name: binding it to `dendrogram`
# would overwrite the scipy `dendrogram` function imported earlier in this
# notebook and break any cell that calls it afterwards.
dendrogram_data = shc.dendrogram(complete_linkage)
plt.show()


In [None]:
# Apply Agglomerative Clustering using complete linkage on the scaled feature
# frame `df` (`df_scaled` is not defined in this notebook)
model = AgglomerativeClustering(n_clusters=5, linkage='complete')
clusters = model.fit_predict(df)

# Add cluster labels back to the original DataFrame
df_final['Cluster'] = clusters

# Show only the first rows of the labelled DataFrame instead of the whole frame
print(df_final.head())

# Plot the clusters on two of the scaled features.
# NOTE(review): 'Feature1'/'Feature2' were placeholders that do not exist in
# `df`; the two columns below are an arbitrary choice - pick the pair that is
# most informative for the analysis.
x_feature = 'Age'
y_feature = 'Total_ratings_per_user'
plt.figure(figsize=(10, 7))
plt.scatter(df[x_feature], df[y_feature], c=clusters, cmap='viridis')
plt.title("Clusters Visualization")
plt.xlabel(x_feature)
plt.ylabel(y_feature)
plt.show()

# Cluster in chunks to save memory (50000)

In [None]:
import pandas as pd
from sklearn.cluster import AgglomerativeClustering
import numpy as np

# Settings for the chunked clustering run
n_clusters = 500     # clusters requested for each chunk
chunk_size = 50000   # rows passed to a single clustering call

# Result frame that the labelled chunks are gathered into; starts out empty
clustered_data = pd.DataFrame()

# Define function for chunk clustering
def cluster_chunk(chunk, n_clusters):
    """Cluster one chunk of rows and return a labelled copy of it.

    Fits a complete-linkage AgglomerativeClustering model with `n_clusters`
    clusters on `chunk` and returns a new DataFrame holding the chunk's
    columns plus a 'Cluster' column with the predicted labels. The frame
    passed in is not modified.
    """
    clusterer = AgglomerativeClustering(n_clusters=n_clusters, linkage='complete')
    chunk_labels = clusterer.fit_predict(chunk)
    return chunk.assign(Cluster=chunk_labels)


# Clustering the chunks.
# The labelled chunks are collected in a list and concatenated once at the end;
# growing a DataFrame with pd.concat inside the loop re-copies all earlier rows
# on every iteration.
clustered_chunks = []
for start in range(0, len(df), chunk_size):
    chunk = df.iloc[start:start + chunk_size]

    # Ensure n_clusters does not exceed the number of samples in the chunk
    n_clusters_chunk = min(n_clusters, len(chunk))

    clustered_chunks.append(cluster_chunk(chunk, n_clusters_chunk))

# NOTE(review): every chunk is clustered by its own, independent fit, so the
# same label value in two different chunks does not refer to the same cluster.
# Confirm this is acceptable before using 'Cluster' as one global feature.

# Concatenate once and renumber the rows 0..n-1
if clustered_chunks:
    clustered_data = pd.concat(clustered_chunks, axis=0, ignore_index=True)
else:
    # No rows in df: keep the empty result frame
    clustered_data = pd.DataFrame()

clustered_data.head()


Unnamed: 0,Year,Month,Day,Hour,Age,Release_year,Female,Male,Academic/educator,Artist,...,Favourite_Mystery,Favourite_Romance,Favourite_Sci-Fi,Favourite_Thriller,Favourite_War,Favourite_Western,Time_release_to_rating,Total_ratings_per_movie,Total_ratings_per_user,Cluster
0,0.0,0.909091,0.733333,0.521739,0.166667,0.91358,True,False,False,False,...,False,False,False,False,False,False,0.085366,0.147123,0.246995,5
1,0.0,0.727273,0.633333,0.869565,0.166667,0.925926,False,True,False,False,...,False,False,False,False,False,False,0.073171,0.350328,0.091803,59
2,0.0,0.909091,0.6,0.826087,0.166667,0.777778,False,True,False,False,...,False,False,False,False,False,False,0.219512,0.158776,0.292896,106
3,0.0,0.909091,0.833333,1.0,0.833333,0.975309,False,True,False,False,...,True,False,False,False,False,False,0.02439,0.060452,0.030601,394
4,0.0,0.909091,0.0,0.434783,0.5,0.444444,False,True,False,True,...,False,False,False,False,False,False,0.54878,0.115805,0.034973,95


Silhouette score calculation is too memory-intensive

In [None]:
# Calculate silhouette score
# Extract features and cluster labels
X = clustered_data.drop(columns=['Cluster']).to_numpy(dtype=float)
labels = clustered_data['Cluster'].to_numpy()

# The exact score compares every row with every other row and did not finish
# on the full dataset, so it is estimated on a fixed-size random subsample.
# random_state makes the estimate reproducible.
# NOTE(review): the labels come from independent per-chunk fits, so a score
# over all chunks mixes unrelated clusters that share a label value - confirm
# this is the quantity you want to report.
silhouette_sample_size = min(10000, len(X))
sil_score = silhouette_score(X, labels,
                             sample_size=silhouette_sample_size,
                             random_state=42)
print(f'Silhouette Score: {sil_score:.4f}')

# Display the DataFrame with clusters
clustered_data.head()


KeyboardInterrupt: 

In [None]:
# Copy the chunk-wise cluster labels onto the original dataframe.
# pandas aligns the two on the row index; clustered_data was re-indexed from 0
# after concatenation.
# NOTE(review): assumes df_final still has its default 0..n-1 index - confirm.
df_final['Cluster'] = clustered_data['Cluster']

In [None]:
# Preview the first rows of df_final, including the new 'Cluster' column
df_final.head()

Unnamed: 0,UserID,MovieID,Rating,Year,Month,Day,Hour,Age,Release_year,Female,...,Favourite_Mystery,Favourite_Romance,Favourite_Sci-Fi,Favourite_Thriller,Favourite_War,Favourite_Western,Time_release_to_rating,Total_ratings_per_movie,Total_ratings_per_user,Cluster
0,1645,485,3,2000,11,23,12,1,1993,True,...,False,False,False,False,False,False,7,203,228,5
1,3112,357,4,2000,9,20,20,1,1994,False,...,False,False,False,False,False,False,6,482,86,59
2,2077,2792,2,2000,11,19,19,1,1982,False,...,False,False,False,False,False,False,18,219,270,106
3,868,2279,4,2000,11,26,23,5,1998,False,...,True,False,False,False,False,False,2,84,30,394
4,2773,1103,4,2000,11,1,10,3,1955,False,...,False,False,False,False,False,False,45,160,34,95


The dataset now carries the cluster labels produced by the code above; the silhouette score could not be calculated for it.

In [None]:
# Write df_final to Google Drive as CSV, without the index column.
# The other save cells in this notebook use this same path, so each run
# overwrites the previous file.
df_final.to_csv('/content/drive/My Drive/Thesis/Data/df_final_Hierarchical_Clustering.csv', index=False)

# Bigger chunks and silhouette score calculated right away -> gives error, needs fixing

In [None]:
import pandas as pd
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_samples, silhouette_score
import numpy as np

# Settings for the chunked clustering run
n_clusters = 500     # clusters requested for each chunk
chunk_size = 50000   # rows passed to a single clustering call

# Containers for the results: the labelled rows and one silhouette score per chunk
silhouette_scores = []
clustered_data = pd.DataFrame()

# Define function for chunk clustering and silhouette score calculation
def cluster_chunk(chunk, n_clusters):
    """Cluster one chunk of rows; return the labelled copy and the labels.

    Fits a complete-linkage AgglomerativeClustering model with `n_clusters`
    clusters on a copy of `chunk`. Returns a tuple `(labelled, labels)`:
    `labelled` is the copy with an added 'Cluster' column and `labels` is the
    label array returned by the model. The frame passed in is not modified.
    """
    labelled = chunk.copy()
    labels = AgglomerativeClustering(
        n_clusters=n_clusters,
        linkage='complete',
    ).fit_predict(labelled)
    labelled['Cluster'] = labels
    return labelled, labels

# Clustering the chunks and calculating silhouette scores.
# Labelled chunks are collected in a list and concatenated once afterwards.
clustered_chunks = []
for i in range(0, len(df), chunk_size):
    chunk = df.iloc[i:i + chunk_size]

    # Ensure n_clusters does not exceed the number of samples in the chunk
    n_clusters_chunk = min(n_clusters, len(chunk))

    if n_clusters_chunk < 2:
        print(f"Skipping chunk {i}-{i+chunk_size} due to insufficient samples for clustering.")
        continue

    clustered_chunk, labels_chunk = cluster_chunk(chunk, n_clusters_chunk)
    clustered_chunks.append(clustered_chunk)

    # Calculate silhouette score for the current chunk.
    # silhouette_score needs 2 <= number of labels <= n_samples - 1. A short
    # final chunk gets as many clusters as it has rows (one row per cluster),
    # which caused "ValueError: Number of labels is ...", so such chunks are
    # skipped here.
    X_chunk = clustered_chunk.drop(columns=['Cluster']).values
    n_labels = len(np.unique(labels_chunk))
    if 1 < n_labels < len(X_chunk):
        sil_score = silhouette_score(X_chunk, labels_chunk)
        silhouette_scores.append(sil_score)
    else:
        print(f"Skipping silhouette score calculation for chunk {i}-{i+chunk_size} due to insufficient clusters or samples.")

# Concatenate once and renumber the rows from 0
if clustered_chunks:
    clustered_data = pd.concat(clustered_chunks, axis=0, ignore_index=True)
else:
    clustered_data = pd.DataFrame()

# Compute the average silhouette score if any scores are available
if silhouette_scores:
    average_silhouette_score = np.mean(silhouette_scores)
    print(f'Average Silhouette Score: {average_silhouette_score:.4f}')
else:
    print("No valid silhouette scores calculated.")

# Display the DataFrame with clusters
print(clustered_data.head())


ValueError: Number of labels is 83. Valid values are 2 to n_samples - 1 (inclusive)

In [None]:
# Copy the chunk-wise cluster labels onto the original dataframe.
# pandas aligns the two on the row index; clustered_data was re-indexed from 0
# after concatenation.
# NOTE(review): rows of a chunk that was skipped during clustering have no
# label in clustered_data - check for NaN in 'Cluster' after this assignment.
df_final['Cluster'] = clustered_data['Cluster']

In [None]:
# Write df_final to Google Drive as CSV, without the index column.
# The other save cells in this notebook use this same path, so each run
# overwrites the previous file.
df_final.to_csv('/content/drive/My Drive/Thesis/Data/df_final_Hierarchical_Clustering.csv', index=False)

# New try

In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import AgglomerativeClustering
from joblib import Parallel, delayed
import matplotlib.pyplot as plt

# Define the chunk size
chunk_size = 50000

# Define the range of clusters to test
cluster_range = range(2, 501, 20)

# Split the feature frame into chunks once; the split does not depend on the
# number of clusters being tested. A float-array view of each chunk is kept so
# the SSD below is a single number per chunk.
num_chunks = int(np.ceil(len(df) / chunk_size))
chunks = [df.iloc[i*chunk_size:(i+1)*chunk_size] for i in range(num_chunks)]
chunk_arrays = [chunk.to_numpy(dtype=float) for chunk in chunks]

# Store the average SSD for each number of clusters
ssds = []

# Perform clustering for each number of clusters and compute the average SSD
for n_clusters in cluster_range:
    # Cluster all chunks in parallel (n_jobs=-1 uses all available cores).
    # As in the earlier chunked cells, the cluster count is capped at the
    # number of rows in the chunk.
    results = Parallel(n_jobs=-1)(
        delayed(
            AgglomerativeClustering(
                n_clusters=min(n_clusters, len(points)),
                linkage='complete',
            ).fit_predict
        )(points)
        for points in chunk_arrays
    )

    # Sum of squared distances of every point to its cluster centroid, per chunk
    ssds_chunks = []
    for points, labels in zip(chunk_arrays, results):
        ssd = 0.0
        for cluster in np.unique(labels):
            cluster_points = points[labels == cluster]
            centroid = cluster_points.mean(axis=0)
            ssd += ((cluster_points - centroid) ** 2).sum()
        ssds_chunks.append(ssd)

    avg_ssd = np.mean(ssds_chunks)
    ssds.append(avg_ssd)
    print(f'Average SSD for {n_clusters} clusters: {avg_ssd:.2f}')

# Plot the SSDs to find the optimal number of clusters
plt.figure(figsize=(12, 6))
plt.plot(cluster_range, ssds, marker='o')
plt.title('SSD for Optimal Number of Clusters')
plt.xlabel('Number of clusters')
plt.ylabel('Sum of Squared Distances')
plt.grid(axis='y', linewidth=0.5)
plt.show()


In [None]:
# The optimal number of clusters is read from the elbow plot.
# TODO: replace None with that value; this cell cannot run until a number is
# filled in.
optimal_n_clusters = None
if optimal_n_clusters is None:
    raise ValueError(
        "Set optimal_n_clusters to the elbow value read from the SSD plot "
        "before running this cell."
    )

# Run the model with the optimal number of clusters
num_chunks = int(np.ceil(len(df) / chunk_size))
chunks = [df.iloc[i*chunk_size:(i+1)*chunk_size] for i in range(num_chunks)]

# Cluster all chunks in parallel (n_jobs=-1 uses all available cores); the
# cluster count is capped at the number of rows in the chunk.
results = Parallel(n_jobs=-1)(
    delayed(
        AgglomerativeClustering(
            n_clusters=min(optimal_n_clusters, len(chunk)),
            linkage='complete',
        ).fit_predict
    )(chunk)
    for chunk in chunks
)

# Attach each chunk's labels to a copy of that chunk
clustered_chunks = []
for chunk, labels in zip(chunks, results):
    chunk_copy = chunk.copy()
    chunk_copy['cluster'] = labels
    clustered_chunks.append(chunk_copy)

df_clustered = pd.concat(clustered_chunks)

# Display the resulting DataFrame with cluster assignments
print(f'Optimal number of clusters: {optimal_n_clusters}')
print(df_clustered.head())