<a href="https://colab.research.google.com/github/femketenharkel/Predicting_Ratings/blob/main/models/clustering/Hierarchical_clustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Hierarchical clustering**

In [1]:
import pandas as pd
from sklearn.cluster import KMeans
from google.colab import drive
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
from joblib import Parallel, delayed
import joblib

In [2]:
# Load in the data
drive.mount('/content/drive')
df_final = pd.read_csv('/content/drive/My Drive/Thesis/Data/df_final_2.csv')

Mounted at /content/drive


In [3]:
# Exclude target variable and unique identifiers for clustering.
df = df_final.drop(columns= ['Rating', 'MovieID', 'UserID'])

In [4]:
# Normalizing the numerical variables
scaler = MinMaxScaler()
features_to_scale = ['Year', 'Month', 'Day', 'Hour', 'Age', 'Release_year',
                       'Time_release_to_rating', 'Total_ratings_per_movie',
                       'Total_ratings_per_user' ]
df[features_to_scale] = scaler.fit_transform(df[features_to_scale])

# **16/11 FINAL hierarchical with k = 80.000**

In [5]:
import pandas as pd
from sklearn.cluster import AgglomerativeClustering
import numpy as np

# Define parameters and empty Dataframe for results
chunk_size = 200000
n_clusters = 80000
clustered_data = pd.DataFrame()

# Define function for chunk clustering
def cluster_chunk(chunk, n_clusters):
    chunk_copy = chunk.copy()
    model = AgglomerativeClustering(n_clusters=n_clusters, linkage='complete')
    chunk_copy['Cluster'] = model.fit_predict(chunk_copy)
    return chunk_copy


# Clustering the chunks
for i in range(0, len(df), chunk_size):
    chunk = df.iloc[i:i + chunk_size]

    # Ensure n_clusters does not exceed the number of samples in the chunk
    n_clusters_chunk = min(n_clusters, len(chunk))

    clustered_chunk = cluster_chunk(chunk, n_clusters_chunk)
    clustered_data = pd.concat([clustered_data, clustered_chunk], axis=0)

# Reset index after concatenation
clustered_data.reset_index(drop=True, inplace=True)

clustered_data.head()

Unnamed: 0,Age,Year,Month,Day,Hour,Release_year,Time_release_to_rating,Total_ratings_per_movie,Total_ratings_per_user,Female,...,Favourite_Film-Noir,Favourite_Horror,Favourite_Musical,Favourite_Mystery,Favourite_Romance,Favourite_Sci-Fi,Favourite_Thriller,Favourite_War,Favourite_Western,Cluster
0,0.666667,0.0,0.363636,0.766667,0.086957,0.679012,0.317073,0.024511,0.047951,False,...,False,False,False,False,False,False,False,False,True,39857
1,0.333333,0.0,0.636364,0.133333,0.652174,0.938272,0.060976,0.39539,0.102877,False,...,False,False,False,False,False,False,False,False,True,21308
2,0.166667,0.0,0.909091,0.7,0.695652,1.0,0.0,0.168077,0.244115,True,...,True,False,False,False,False,False,False,False,False,61902
3,0.5,0.0,0.636364,0.866667,0.956522,0.938272,0.060976,0.121681,0.309939,False,...,True,False,False,False,False,False,False,False,False,20195
4,0.5,0.0,0.636364,0.466667,0.782609,0.777778,0.219512,0.422235,0.361378,False,...,True,False,False,False,False,False,False,False,False,10042


In [6]:
# Assign clusters to original dataframe
df_final['Cluster'] = clustered_data['Cluster']

In [8]:
# Save dataframe
df_final.to_csv('/content/drive/My Drive/Thesis/Data/df_final_Hierarchical_Clustering_80k.csv', index=False)

In [9]:
# Calculate silhouette score
features = clustered_data.drop(columns=['Cluster'])
labels = clustered_data['Cluster']
silhouette_score_80k = silhouette_score(features, labels)

# Display the silhouette score
print(f"Silhouette Score: {silhouette_score_80k}")

Silhouette Score: -0.23502749539509138


# OUDE CODE

In [None]:
import pandas as pd
from sklearn.cluster import AgglomerativeClustering
import numpy as np

# Define parameters and empty Dataframe for results
chunk_size = 200000
n_clusters = 55000
clustered_data = pd.DataFrame()

# Define function for chunk clustering
def cluster_chunk(chunk, n_clusters):
    chunk_copy = chunk.copy()
    model = AgglomerativeClustering(n_clusters=n_clusters, linkage='complete')
    chunk_copy['Cluster'] = model.fit_predict(chunk_copy)
    return chunk_copy


# Clustering the chunks
for i in range(0, len(df), chunk_size):
    chunk = df.iloc[i:i + chunk_size]

    # Ensure n_clusters does not exceed the number of samples in the chunk
    n_clusters_chunk = min(n_clusters, len(chunk))

    clustered_chunk = cluster_chunk(chunk, n_clusters_chunk)
    clustered_data = pd.concat([clustered_data, clustered_chunk], axis=0)

# Reset index after concatenation
clustered_data.reset_index(drop=True, inplace=True)

clustered_data.head()

Unnamed: 0,Age,Year,Month,Day,Hour,Release_year,Time_release_to_rating,Total_ratings_per_movie,Total_ratings_per_user,Female,...,Favourite_Film-Noir,Favourite_Horror,Favourite_Musical,Favourite_Mystery,Favourite_Romance,Favourite_Sci-Fi,Favourite_Thriller,Favourite_War,Favourite_Western,Cluster
0,0.666667,0.0,0.363636,0.766667,0.086957,0.679012,0.317073,0.024511,0.047951,False,...,False,False,False,False,False,False,False,False,True,5947
1,0.333333,0.0,0.636364,0.133333,0.652174,0.938272,0.060976,0.39539,0.102877,False,...,False,False,False,False,False,False,False,False,True,42618
2,0.166667,0.0,0.909091,0.7,0.695652,1.0,0.0,0.168077,0.244115,True,...,True,False,False,False,False,False,False,False,False,10227
3,0.5,0.0,0.636364,0.866667,0.956522,0.938272,0.060976,0.121681,0.309939,False,...,True,False,False,False,False,False,False,False,False,8939
4,0.5,0.0,0.636364,0.466667,0.782609,0.777778,0.219512,0.422235,0.361378,False,...,True,False,False,False,False,False,False,False,False,9357


In [None]:
df_final['Cluster'] = clustered_data['Cluster']
df_final.to_csv('/content/drive/My Drive/Thesis/Data/df_final_Hierarchical_Clustering2.csv', index=False)

### Silhouette score calculation with df_final

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import silhouette_score
from sklearn.utils import shuffle

# Define the percentage of data to sample
n_samples = int(len(df_final) * 0.25)
n_iterations = 4

silhouette_scores = []

# Perform the silhouette score calculation multiple times
for i in range(n_iterations):
    sample_df = shuffle(df_final).iloc[:n_samples]
    # Extract features and cluster labels
    X_sample = sample_df.drop(columns=['Cluster'])
    labels_sample = sample_df['Cluster']
    # Calculate silhouette score for the sample
    score = silhouette_score(X_sample, labels_sample)
    silhouette_scores.append(score)
    print(f"Iteration {i+1}: Silhouette Score = {score:.4f}")

# Calculate the average silhouette score
average_silhouette_score = np.mean(silhouette_scores)
print(f"\nAverage Silhouette Score: {average_silhouette_score:.4f}")


Iteration 1: Silhouette Score = -0.7678
Iteration 2: Silhouette Score = -0.7699
Iteration 3: Silhouette Score = -0.7682
Iteration 4: Silhouette Score = -0.7686

Average Silhouette Score: -0.7686


### Silhouette score calculation with clustered_data



In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import silhouette_score
from sklearn.utils import shuffle

# Define the percentage of data to sample
n_samples = int(len(clustered_data) * 0.25)
n_iterations = 4

silhouette_scores = []

# Perform the silhouette score calculation multiple times
for i in range(n_iterations):
    sample_df = shuffle(clustered_data).iloc[:n_samples]
    # Extract features and cluster labels
    X_sample = sample_df.drop(columns=['Cluster'])
    labels_sample = sample_df['Cluster']
    # Calculate silhouette score for the sample
    score = silhouette_score(X_sample, labels_sample)
    silhouette_scores.append(score)
    print(f"Iteration {i+1}: Silhouette Score = {score:.4f}")

# Calculate the average silhouette score
average_silhouette_score = np.mean(silhouette_scores)
print(f"\nAverage Silhouette Score: {average_silhouette_score:.4f}")


Iteration 1: Silhouette Score = -0.1868
Iteration 2: Silhouette Score = -0.1878
Iteration 3: Silhouette Score = -0.1898
Iteration 4: Silhouette Score = -0.1884

Average Silhouette Score: -0.1882
