<a href="https://colab.research.google.com/github/femketenharkel/Predicting_Ratings/blob/main/models/clustering/Hierarchical_clustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Hierarchical clustering**

In [1]:
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from google.colab import drive
from joblib import Parallel, delayed
from scipy.cluster.hierarchy import fcluster, linkage
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Mount Google Drive, then read df_final.csv from the Thesis/Data folder
drive.mount('/content/drive')
df_final = pd.read_csv(
    filepath_or_buffer='/content/drive/My Drive/Thesis/Data/df_final.csv'
)

Mounted at /content/drive


In [3]:
# Clustering input: every column except the target (Rating) and the
# unique identifiers (MovieID, UserID). `df_final` itself is not modified.
df = df_final.drop(labels=['Rating', 'MovieID', 'UserID'], axis=1)

In [4]:
# Preview the first five rows of the clustering feature frame
df.head(n=5)

Unnamed: 0,Year,Month,Day,Hour,Age,Release_year,Female,Male,Academic/educator,Artist,...,Favourite_Musical,Favourite_Mystery,Favourite_Romance,Favourite_Sci-Fi,Favourite_Thriller,Favourite_War,Favourite_Western,Time_release_to_rating,Total_ratings_per_movie,Total_ratings_per_user
0,2000,11,23,12,1,1993,True,False,False,False,...,True,False,False,False,False,False,False,7,203,228
1,2000,9,20,20,1,1994,False,True,False,False,...,False,False,False,False,False,False,False,6,482,86
2,2000,11,19,19,1,1982,False,True,False,False,...,False,False,False,False,False,False,False,18,219,270
3,2000,11,26,23,5,1998,False,True,False,False,...,False,True,False,False,False,False,False,2,84,30
4,2000,11,1,10,3,1955,False,True,False,True,...,False,False,False,False,False,False,False,45,160,34


In [5]:
# Min-max normalise the numerical columns in place; columns that are not
# listed here are left exactly as they are.
features_to_scale = [
    'Year', 'Month', 'Day', 'Hour', 'Age', 'Release_year',
    'Time_release_to_rating', 'Total_ratings_per_movie',
    'Total_ratings_per_user',
]
# Learn the per-column min/max first, then apply the rescaling.
scaler = MinMaxScaler().fit(df[features_to_scale])
df[features_to_scale] = scaler.transform(df[features_to_scale])

In [None]:
# OLD CODE!!!!!
# Elbow analysis on a SciPy Ward linkage: for each candidate k, cut the tree
# into k flat clusters and measure the within-cluster sum of squares.

# NOTE(review): this cell used to read `df_scaled`, which is never defined in
# this notebook; the min-max scaled features are stored in `df`.
features = df.to_numpy(dtype=float)

# Determine the range of clusters to evaluate
cluster_range = range(1, 11)
linkage_matrix = linkage(features, method='ward')

# Calculate the within-cluster sum of squares for each number of clusters.
# The previous expression (min of the merge distances) did not depend on k,
# so every point of the curve had the same value.
inertia = []
for k in cluster_range:
    clusters = fcluster(linkage_matrix, k, criterion='maxclust')
    within_ss = 0.0
    for cluster_id in np.unique(clusters):
        members = features[clusters == cluster_id]
        within_ss += float(np.sum((members - members.mean(axis=0)) ** 2))
    inertia.append(within_ss)

# Plot the results to find the elbow point
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(cluster_range, inertia, marker='o')
ax.set_title('Elbow Method For Optimal k')
ax.set_xlabel('Number of clusters')
ax.set_ylabel('Inertia')
plt.show()

## Original HC, might be too complex

In [None]:
# Hierarchical clustering might be too complex -> way too computationally expensive

import pandas as pd
from sklearn.preprocessing import StandardScaler
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import silhouette_score, davies_bouldin_score


# 1. Hierarchical clustering with Ward linkage.
# NOTE(review): this cell used to read `df_scaled`, which is never defined in
# this notebook; the min-max scaled features are stored in `df`.
features = df.to_numpy(dtype=float)
Z = linkage(features, method='ward')

# 2. Plot the outcome as a dendrogram
plt.figure(figsize=(10, 7))
dendrogram(Z)
plt.title("Dendrogram")
plt.xlabel("Data points")
plt.ylabel("Euclidean distances")
plt.show()

# 3. Determine the optimal number of clusters using the elbow method:
# plot the distances of the last (up to) 10 merges, largest merge first.
last = Z[-10:, 2]
last_rev = last[::-1]
indexes = np.arange(1, len(last) + 1)
plt.plot(indexes, last_rev)
plt.title("Elbow Method for Optimal Clusters")
plt.xlabel("Number of clusters")
plt.ylabel("Distance")
plt.show()

In [None]:
# Assuming the elbow point is at 3 clusters (adjust based on your plot)
optimal_clusters = 3

# 4. Cut the linkage tree `Z` into the chosen number of flat clusters
clusters = fcluster(Z, optimal_clusters, criterion='maxclust')

# 5. Evaluate the clustering with the silhouette score and Davies-Bouldin index.
# NOTE(review): this cell used to read `df_scaled`, which is never defined in
# this notebook; the min-max scaled features are stored in `df`.
features = df.to_numpy(dtype=float)
silhouette_avg = silhouette_score(features, clusters)
davies_bouldin_avg = davies_bouldin_score(features, clusters)

print(f"Optimal number of clusters: {optimal_clusters}")
print(f"Silhouette Score: {silhouette_avg}")
print(f"Davies-Bouldin Index: {davies_bouldin_avg}")

# Store the labels on `df_final`, not on `df`: `df` is the feature frame that
# other cells cluster on, so a 'Cluster' column there would be picked up as an
# input feature on the next run.
df_final['Cluster'] = clusters

# Show the first few rows of the dataframe with cluster labels
df_final.head()


## HC with AgglomerativeClustering -> Run tonight

HC with AgglomerativeClustering (better for large datasets, still scalable)
- still starts with each point as its own cluster
- merges clusters
- creates a linkage matrix

-> merges clusters based on the chosen linkage criterion
-> Dendrogram still used


In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from scipy.cluster.hierarchy import linkage
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score, davies_bouldin_score

# Requires `df` (the scaled feature frame) from the cells above; running this
# cell on a fresh kernel raises NameError.
# A plain float array keeps the sums below scalar: np.sum on a DataFrame
# reduces per column and would append a Series instead of a number.
features = df.to_numpy(dtype=float)

# Calculate the inertia (within-cluster sum of squares) for different numbers
# of clusters using AgglomerativeClustering
cluster_range = range(1, 11)
inertia = []
for k in cluster_range:
    # Ward linkage uses Euclidean distances, which is the default. The old
    # `affinity` keyword is deprecated since scikit-learn 1.2 and removed in
    # 1.4, so it is not passed.
    model = AgglomerativeClustering(n_clusters=k, linkage='ward')
    clusters = model.fit_predict(features)
    sum_of_squares = 0.0
    for cluster_id in np.unique(clusters):
        cluster_points = features[clusters == cluster_id]
        centroid = cluster_points.mean(axis=0)
        sum_of_squares += float(np.sum((cluster_points - centroid) ** 2))
    inertia.append(sum_of_squares)

# Plot the results to find the elbow point
plt.figure(figsize=(8, 6))
plt.plot(cluster_range, inertia, marker='o')
plt.title('Elbow Method For Optimal k')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.show()

NameError: name 'df' is not defined

In [None]:
# From this plot, determine the optimal number of clusters (adjust based on your plot).
# Each inertia entry may be a single number or a vector of per-column sums;
# collapse every entry to one total per k before differentiating.
inertia_curve = np.asarray(inertia, dtype=float).reshape(len(inertia), -1).sum(axis=1)

# The elbow of a decreasing, flattening curve is where the second derivative
# is largest (argmin would select the flat tail instead).
curvature = np.gradient(np.gradient(inertia_curve))

# Only k >= 2 is eligible: the silhouette score and Davies-Bouldin index
# below are undefined for a single cluster.
candidate_ks = np.asarray(cluster_range)
scorable = candidate_ks >= 2
optimal_clusters = int(candidate_ks[scorable][np.argmax(curvature[scorable])])

# Perform clustering again with the optimal number of clusters.
# Ward linkage uses Euclidean distances by default; the `affinity` keyword is
# deprecated since scikit-learn 1.2 and removed in 1.4, so it is not passed.
model_optimal = AgglomerativeClustering(n_clusters=optimal_clusters, linkage='ward')
clusters = model_optimal.fit_predict(df)

# Evaluate the clustering with the silhouette score and Davies-Bouldin index
silhouette_avg = silhouette_score(df, clusters)
davies_bouldin_avg = davies_bouldin_score(df, clusters)

print(f"Optimal number of clusters: {optimal_clusters}")
print(f"Silhouette Score: {silhouette_avg}")
print(f"Davies-Bouldin Index: {davies_bouldin_avg}")

# Add the cluster labels to the original dataframe
df_final['Cluster'] = clusters

In [None]:
# Write df_final to Drive as CSV, leaving out the row index
df_final.to_csv(
    path_or_buf='/content/drive/My Drive/Thesis/Data/df_final_Hierarchical_Clustering.csv',
    index=False,
)