<a href="https://colab.research.google.com/github/femketenharkel/Predicting_Ratings/blob/main/models/clustering/Hierarchical_clustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Hierarchical clustering**

In [1]:
import pandas as pd
from sklearn.cluster import KMeans
from google.colab import drive
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
from joblib import Parallel, delayed
import joblib

In [2]:
# Load in the data
drive.mount('/content/drive')
df_final = pd.read_csv('/content/drive/My Drive/Thesis/Data/df_final_2.csv')

Mounted at /content/drive


In [3]:
# Exclude target variable and unique identifiers for clustering.
df = df_final.drop(columns= ['Rating', 'MovieID', 'UserID'])

In [4]:
# Normalizing the numerical variables
scaler = MinMaxScaler()
features_to_scale = ['Year', 'Month', 'Day', 'Hour', 'Age', 'Release_year',
                       'Time_release_to_rating', 'Total_ratings_per_movie',
                       'Total_ratings_per_user' ]
df[features_to_scale] = scaler.fit_transform(df[features_to_scale])

# Try 3 -> simplified, no hyperparameter tuning, regular hierarchical clustering -> Too much RAM needed, 334 GB is not enough


In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AgglomerativeClustering
import matplotlib.pyplot as plt
import scipy.cluster.hierarchy as shc
import numpy as np


# Plot dendrogram to visualize the hierarchical clustering
plt.figure(figsize=(10, 7))
plt.title("Dendrogram (Complete Linkage)")
dendrogram = shc.dendrogram(shc.linkage(df, method='complete'))
plt.show()


In [None]:
# Apply Agglomerative Clustering using complete linkage
model = AgglomerativeClustering(n_clusters=5, linkage='complete')
clusters = model.fit_predict(df_scaled)

# Add cluster labels back to the DataFrame
df_final['Cluster'] = clusters

# Display the DataFrame with clusters
print(df)

# Plot the clusters
plt.figure(figsize=(10, 7))
plt.scatter(df['Feature1'], df['Feature2'], c=df['Cluster'], cmap='viridis')
plt.title("Clusters Visualization")
plt.xlabel('Feature1')
plt.ylabel('Feature2')
plt.show()

# Bigger chunks and silhouette score calculated right away -> gives error, needs fixing

In [None]:
import pandas as pd
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_samples, silhouette_score
import numpy as np

# Define parameters and empty DataFrame for results
chunk_size = 50000
n_clusters = 500
clustered_data = pd.DataFrame()
silhouette_scores = []

# Define function for chunk clustering and silhouette score calculation
def cluster_chunk(chunk, n_clusters):
    chunk_copy = chunk.copy()
    model = AgglomerativeClustering(n_clusters=n_clusters, linkage='complete')
    labels = model.fit_predict(chunk_copy)
    chunk_copy['Cluster'] = labels
    return chunk_copy, labels

# Clustering the chunks and calculating silhouette scores
for i in range(0, len(df), chunk_size):
    chunk = df.iloc[i:i + chunk_size]

    # Ensure n_clusters does not exceed the number of samples in the chunk
    n_clusters_chunk = min(n_clusters, len(chunk))

    if n_clusters_chunk >= 2:
        clustered_chunk, labels_chunk = cluster_chunk(chunk, n_clusters_chunk)
        clustered_data = pd.concat([clustered_data, clustered_chunk], axis=0)

        # Calculate silhouette score for the current chunk
        X_chunk = clustered_chunk.drop(columns=['Cluster']).values
        unique_labels = np.unique(labels_chunk)
        if len(unique_labels) > 1 and len(X_chunk) > 1:
            sil_score = silhouette_score(X_chunk, labels_chunk)
            silhouette_scores.append(sil_score)
        else:
            print(f"Skipping silhouette score calculation for chunk {i}-{i+chunk_size} due to insufficient clusters or samples.")
    else:
        print(f"Skipping chunk {i}-{i+chunk_size} due to insufficient samples for clustering.")

# Reset index after concatenation
clustered_data.reset_index(drop=True, inplace=True)

# Compute the average silhouette score if any scores are available
if silhouette_scores:
    average_silhouette_score = np.mean(silhouette_scores)
    print(f'Average Silhouette Score: {average_silhouette_score:.4f}')
else:
    print("No valid silhouette scores calculated.")

# Display the DataFrame with clusters
print(clustered_data.head())


ValueError: Number of labels is 83. Valid values are 2 to n_samples - 1 (inclusive)

In [None]:
df_final['Cluster'] = clustered_data['Cluster']

In [None]:
df_final.to_csv('/content/drive/My Drive/Thesis/Data/df_final_Hierarchical_Clustering.csv', index=False)

# Hierarchical clustering

In [5]:
import pandas as pd
from sklearn.cluster import AgglomerativeClustering
import numpy as np

# Define parameters and empty Dataframe for results
chunk_size = 200000
n_clusters = 55000
clustered_data = pd.DataFrame()

# Define function for chunk clustering
def cluster_chunk(chunk, n_clusters):
    chunk_copy = chunk.copy()
    model = AgglomerativeClustering(n_clusters=n_clusters, linkage='complete')
    chunk_copy['Cluster'] = model.fit_predict(chunk_copy)
    return chunk_copy


# Clustering the chunks
for i in range(0, len(df), chunk_size):
    chunk = df.iloc[i:i + chunk_size]

    # Ensure n_clusters does not exceed the number of samples in the chunk
    n_clusters_chunk = min(n_clusters, len(chunk))

    clustered_chunk = cluster_chunk(chunk, n_clusters_chunk)
    clustered_data = pd.concat([clustered_data, clustered_chunk], axis=0)

# Reset index after concatenation
clustered_data.reset_index(drop=True, inplace=True)

clustered_data.head()


Unnamed: 0,Age,Year,Month,Day,Hour,Release_year,Time_release_to_rating,Total_ratings_per_movie,Total_ratings_per_user,Female,...,Favourite_Film-Noir,Favourite_Horror,Favourite_Musical,Favourite_Mystery,Favourite_Romance,Favourite_Sci-Fi,Favourite_Thriller,Favourite_War,Favourite_Western,Cluster
0,0.666667,0.0,0.363636,0.766667,0.086957,0.679012,0.317073,0.024511,0.047951,False,...,False,False,False,False,False,False,False,False,True,5947
1,0.333333,0.0,0.636364,0.133333,0.652174,0.938272,0.060976,0.39539,0.102877,False,...,False,False,False,False,False,False,False,False,True,42618
2,0.166667,0.0,0.909091,0.7,0.695652,1.0,0.0,0.168077,0.244115,True,...,True,False,False,False,False,False,False,False,False,10227
3,0.5,0.0,0.636364,0.866667,0.956522,0.938272,0.060976,0.121681,0.309939,False,...,True,False,False,False,False,False,False,False,False,8939
4,0.5,0.0,0.636364,0.466667,0.782609,0.777778,0.219512,0.422235,0.361378,False,...,True,False,False,False,False,False,False,False,False,9357


In [6]:
df_final['Cluster'] = clustered_data['Cluster']

In [8]:
df_final.to_csv('/content/drive/My Drive/Thesis/Data/df_final_Hierarchical_Clustering2.csv', index=False)

# Tijdelijke code voor model opslaan:

In [None]:
# Save the model to a file in a specific directory
joblib_file = "/path/to/your/directory/random_forest_model.pkl"
joblib.dump(model, joblib_file)

# Load the model from the specific directory
loaded_model = joblib.load("/path/to/your/directory/random_forest_model.pkl")
