<a href="https://colab.research.google.com/github/femketenharkel/Predicting_Ratings/blob/main/models/clustering/Hierarchical_clustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Hierarchical clustering**

In [2]:
import pandas as pd
from sklearn.cluster import KMeans
from google.colab import drive
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
from joblib import Parallel, delayed
import joblib

In [3]:
# Load in the data
drive.mount('/content/drive')
df_final = pd.read_csv('/content/drive/My Drive/Thesis/Data/df_final.csv')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Exclude target variable and unique identifiers for clustering.
df = df_final.drop(columns= ['Rating', 'MovieID', 'UserID'])

In [6]:
df.head()

Unnamed: 0,Year,Month,Day,Hour,Age,Release_year,Female,Male,Academic/educator,Artist,...,Favourite_Musical,Favourite_Mystery,Favourite_Romance,Favourite_Sci-Fi,Favourite_Thriller,Favourite_War,Favourite_Western,Time_release_to_rating,Total_ratings_per_movie,Total_ratings_per_user
0,0.0,0.909091,0.733333,0.521739,0.166667,0.91358,True,False,False,False,...,True,False,False,False,False,False,False,0.085366,0.147123,0.246995
1,0.0,0.727273,0.633333,0.869565,0.166667,0.925926,False,True,False,False,...,False,False,False,False,False,False,False,0.073171,0.350328,0.091803
2,0.0,0.909091,0.6,0.826087,0.166667,0.777778,False,True,False,False,...,False,False,False,False,False,False,False,0.219512,0.158776,0.292896
3,0.0,0.909091,0.833333,1.0,0.833333,0.975309,False,True,False,False,...,False,True,False,False,False,False,False,0.02439,0.060452,0.030601
4,0.0,0.909091,0.0,0.434783,0.5,0.444444,False,True,False,True,...,False,False,False,False,False,False,False,0.54878,0.115805,0.034973


In [5]:
# Normalizing the numerical variables
scaler = MinMaxScaler()
features_to_scale = ['Year', 'Month', 'Day', 'Hour', 'Age', 'Release_year',
                       'Time_release_to_rating', 'Total_ratings_per_movie',
                       'Total_ratings_per_user' ]
df[features_to_scale] = scaler.fit_transform(df[features_to_scale])

## Originele HC, too complex

In [None]:
# Hierarchical clustering might be too complex -> way to computationally expensive

import pandas as pd
from sklearn.preprocessing import StandardScaler
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import silhouette_score, davies_bouldin_score


# Hierarchical clustering
Z = linkage(df_scaled, method='ward')

# Plot the outcome
plt.figure(figsize=(10, 7))
dendrogram(Z)
plt.title("Dendrogram")
plt.xlabel("Data points")
plt.ylabel("Euclidean distances")
plt.show()

# 3. Determine the optimal number of clusters using the elbow method
last = Z[-10:, 2]
last_rev = last[::-1]
indexes = np.arange(1, len(last) + 1)
plt.plot(indexes, last_rev)
plt.title("Elbow Method for Optimal Clusters")
plt.xlabel("Number of clusters")
plt.ylabel("Distance")
plt.show()

In [None]:
# Assuming the elbow point is at 3 clusters (adjust based on your plot)
optimal_clusters = 3

# 4. Cluster the data using the optimal number of clusters
clusters = fcluster(Z, optimal_clusters, criterion='maxclust')

# Add the cluster labels to the original dataframe
df['Cluster'] = clusters

# 5. Evaluate the clustering with the silhouette score and Davies-Bouldin index
silhouette_avg = silhouette_score(df_scaled, clusters)
davies_bouldin_avg = davies_bouldin_score(df_scaled, clusters)

print(f"Optimal number of clusters: {optimal_clusters}")
print(f"Silhouette Score: {silhouette_avg}")
print(f"Davies-Bouldin Index: {davies_bouldin_avg}")

# Print the first few rows of the dataframe with cluster labels
print(df.head())


## HC with AgglomerativeClustering -> Way too computationally expensive

HC with AgglomerativeClustering (is better for large datasets, still scalable)
- still start with each point = cluster
- merges clusters
- creates a linkage matrix

-> merges clusters based on the chosen linkage criterion
-> Dendogram still used
->


In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from scipy.cluster.hierarchy import linkage
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score, davies_bouldin_score

# Calculate the inertia for different number of clusters using AgglomerativeClustering
cluster_range = range(1, 11)
inertia = []
for k in cluster_range:
    model = AgglomerativeClustering(n_clusters=k, linkage='complete')
    clusters = model.fit_predict(df)
    sum_of_squares = 0
    for cluster_id in np.unique(clusters):
        cluster_points = df[clusters == cluster_id]
        centroid = cluster_points.mean(axis=0)
        sum_of_squares += np.sum((cluster_points - centroid) ** 2)
    inertia.append(sum_of_squares)

# Plot the results to find the elbow point
plt.figure(figsize=(12, 9))
plt.plot(cluster_range, inertia, marker='o')
plt.title('Elbow Method For Optimal k')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.show()

In [None]:
# From this plot, determine the optimal number of clusters
optimal_clusters =

# Perform clustering again with the optimal number of clusters
model_optimal = AgglomerativeClustering(n_clusters=optimal_clusters,
                                        linkage='complete')
clusters = model_optimal.fit_predict(df)

# Evaluate the clustering with the silhouette score and Davies-Bouldin index
silhouette = silhouette_score(df, clusters)
davies_bouldin = davies_bouldin_score(df, clusters)

print(f"Optimal number of clusters: {optimal_clusters}")
print(f"Silhouette Score: {silhouette}")
print(f"Davies-Bouldin Index: {davies_bouldin}")

# Add the cluster labels to the original dataframe
df_final['Cluster'] = clusters

In [None]:
df_final.to_csv('/content/drive/My Drive/Thesis/Data/df_final_Hierarchical_Clustering.csv', index=False)

# Try 3 -> simplified, no hyperparameter tuning, regular hierarchical clustering -> Too much RAM needed, 334 GB is not enough


In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AgglomerativeClustering
import matplotlib.pyplot as plt
import scipy.cluster.hierarchy as shc
import numpy as np


# Plot dendrogram to visualize the hierarchical clustering
plt.figure(figsize=(10, 7))
plt.title("Dendrogram (Complete Linkage)")
dendrogram = shc.dendrogram(shc.linkage(df, method='complete'))
plt.show()


In [None]:
# Apply Agglomerative Clustering using complete linkage
model = AgglomerativeClustering(n_clusters=5, linkage='complete')
clusters = model.fit_predict(df_scaled)

# Add cluster labels back to the DataFrame
df_final['Cluster'] = clusters

# Display the DataFrame with clusters
print(df)

# Plot the clusters
plt.figure(figsize=(10, 7))
plt.scatter(df['Feature1'], df['Feature2'], c=df['Cluster'], cmap='viridis')
plt.title("Clusters Visualization")
plt.xlabel('Feature1')
plt.ylabel('Feature2')
plt.show()

# With agglomeratice hierarchical clustering, stops when ceratin amount of clusters is met -> Nog niet gerund

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AgglomerativeClustering
import numpy as np

# Apply Agglomerative Clustering with a specified number of clusters
n_clusters = 500  # Specify the desired number of clusters
model = AgglomerativeClustering(n_clusters=n_clusters, linkage='complete')
clusters = model.fit_predict(df)

# Add cluster labels back to the DataFrame
df_final['Cluster'] = clusters


In [None]:
# Apply Agglomerative Clustering using complete linkage
model = AgglomerativeClustering(n_clusters=5, linkage='complete')
clusters = model.fit_predict(df_scaled)

# Add cluster labels back to the DataFrame
df_final['Cluster'] = clusters

# Display the DataFrame with clusters
print(df)

# Plot the clusters
plt.figure(figsize=(10, 7))
plt.scatter(df['Feature1'], df['Feature2'], c=df['Cluster'], cmap='viridis')
plt.title("Clusters Visualization")
plt.xlabel('Feature1')
plt.ylabel('Feature2')
plt.show()

# Clustering in chunks to save memory

In [8]:
import pandas as pd
from sklearn.cluster import AgglomerativeClustering
import numpy as np

# Define function
def cluster_chunk(chunk, n_clusters):
    model = AgglomerativeClustering(n_clusters=n_clusters, linkage='complete')
    chunk['Cluster'] = model.fit_predict(chunk)
    return chunk

# Parameters and resulting dataframe
chunk_size = 2000
n_clusters = 200
df_final = pd.DataFrame()

# Clustering the chunks
for i in range(0, len(df), chunk_size):
    chunk = df.iloc[i:i + chunk_size]
    clustered_chunk = cluster_chunk(chunk, n_clusters)
    df_final = pd.concat([df_final, clustered_chunk], axis=0)

# Reset index
df_final.reset_index(drop=True, inplace=True)

df_final.head()


[1;30;43mStreaminguitvoer ingekort tot de laatste 5000 regels.[0m
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['Cluster'] = model.fit_predict(chunk)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['Cluster'] = model.fit_predict(chunk)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['Cluster'] = model.fit_predict(chunk)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See t

ValueError: Cannot extract more clusters than samples: 200 clusters were given for a tree with 83 leaves.

In [11]:
import pandas as pd
from sklearn.cluster import AgglomerativeClustering
import numpy as np

# Function to perform clustering on a chunk
def cluster_chunk(chunk, n_clusters):
    chunk_copy = chunk.copy()
    model = AgglomerativeClustering(n_clusters=n_clusters, linkage='complete')
    chunk_copy['Cluster'] = model.fit_predict(chunk_copy)
    return chunk_copy

# Parameters
chunk_size = 10000  # Adjust chunk size based on available memory
n_clusters = 300  # Adjust the number of clusters as needed

# Initialize an empty DataFrame to store results
clustered_data = pd.DataFrame()

# Iterate over chunks and perform clustering
for i in range(0, len(df), chunk_size):
    chunk = df.iloc[i:i + chunk_size]

    # Ensure n_clusters does not exceed the number of samples in the chunk
    n_clusters_chunk = min(n_clusters, len(chunk))

    clustered_chunk = cluster_chunk(chunk, n_clusters_chunk)
    clustered_data = pd.concat([clustered_data, clustered_chunk], axis=0)

# Reset index after concatenation
clustered_data.reset_index(drop=True, inplace=True)

clustered_data.head()


Unnamed: 0,Year,Month,Day,Hour,Age,Release_year,Female,Male,Academic/educator,Artist,...,Favourite_Mystery,Favourite_Romance,Favourite_Sci-Fi,Favourite_Thriller,Favourite_War,Favourite_Western,Time_release_to_rating,Total_ratings_per_movie,Total_ratings_per_user,Cluster
0,0.0,0.909091,0.733333,0.521739,0.166667,0.91358,True,False,False,False,...,False,False,False,False,False,False,0.085366,0.147123,0.246995,103
1,0.0,0.727273,0.633333,0.869565,0.166667,0.925926,False,True,False,False,...,False,False,False,False,False,False,0.073171,0.350328,0.091803,186
2,0.0,0.909091,0.6,0.826087,0.166667,0.777778,False,True,False,False,...,False,False,False,False,False,False,0.219512,0.158776,0.292896,174
3,0.0,0.909091,0.833333,1.0,0.833333,0.975309,False,True,False,False,...,True,False,False,False,False,False,0.02439,0.060452,0.030601,109
4,0.0,0.909091,0.0,0.434783,0.5,0.444444,False,True,False,True,...,False,False,False,False,False,False,0.54878,0.115805,0.034973,44
