<a href="https://colab.research.google.com/github/femketenharkel/Predicting_Ratings/blob/main/models/clustering/DBSCAN_clustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
pip install gower

Collecting gower
  Downloading gower-0.1.2-py3-none-any.whl.metadata (3.7 kB)
Downloading gower-0.1.2-py3-none-any.whl (5.2 kB)
Installing collected packages: gower
Successfully installed gower-0.1.2


In [3]:
from google.colab import drive
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import numpy as np
from sklearn.cluster import DBSCAN
import gower
import matplotlib.pyplot as plt

In [4]:

from google.colab import drive
import pandas as pd

# Locations on the mounted Google Drive.
MOUNT_POINT = '/content/drive'
RATINGS_CSV = "/content/drive/My Drive/Thesis/Data/df_final.csv"

# Mount the drive first so the CSV path resolves, then load the data.
drive.mount(MOUNT_POINT)
df_final = pd.read_csv(RATINGS_CSV)

Mounted at /content/drive


In [None]:
# Data-preprocessing step: still required to prepare the data, but not part of
# the clustering itself.
#
# The stored 'Total_ratings_per_user' was counted over the 100% dataset, so it
# is dropped and recomputed over the rows that are loaded here (the 40% dataset).
df_final = (
    df_final
    .drop(columns=['Total_ratings_per_user'])
    .assign(
        # Time between the movie's release year and the year of the rating
        Time_release_to_rating=lambda frame: frame['Year'] - frame['Release_year'],
        # Number of rows (ratings) per movie
        Total_ratings_per_movie=lambda frame: frame.groupby('MovieID')['MovieID'].transform('count'),
        # Number of rows (ratings) per user
        Total_ratings_per_user=lambda frame: frame.groupby('UserID')['UserID'].transform('count'),
    )
)


In [5]:
# Preview the first five rows of the loaded frame
df_final.head(n=5)

Unnamed: 0,UserID,MovieID,Rating,Year,Month,Day,Hour,Age,Release_year,Female,...,Favourite_Musical,Favourite_Mystery,Favourite_Romance,Favourite_Sci-Fi,Favourite_Thriller,Favourite_War,Favourite_Western,Time_release_to_rating,Total_ratings_per_movie,Total_ratings_per_user
0,1645,485,3,2000,11,23,12,1,1993,True,...,True,False,False,False,False,False,False,7,203,228
1,3112,357,4,2000,9,20,20,1,1994,False,...,False,False,False,False,False,False,False,6,482,86
2,2077,2792,2,2000,11,19,19,1,1982,False,...,False,False,False,False,False,False,False,18,219,270
3,868,2279,4,2000,11,26,23,5,1998,False,...,False,True,False,False,False,False,False,2,84,30
4,2773,1103,4,2000,11,1,10,3,1955,False,...,False,False,False,False,False,False,False,45,160,34


In [7]:
# Build the clustering input: everything except the target ('Rating') and the
# unique identifiers.
non_feature_columns = ['UserID', 'MovieID', 'Rating']
df = df_final.drop(non_feature_columns, axis='columns')

In [8]:
# Min-max scale the numeric columns listed below; all other columns are left as-is.
# NOTE(review): 'Age' is not in this list -- confirm that is intentional.
features_to_scale = [
    'Year', 'Month', 'Day', 'Hour',
    'Release_year', 'Time_release_to_rating',
    'Total_ratings_per_movie', 'Total_ratings_per_user',
]
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df[features_to_scale])
df[features_to_scale] = scaled_values


In [9]:
# WITH THE GOWER DISTANCE -> too expensive on the full frame.
# The pairwise distance matrix has len(df) x len(df) entries, so memory and
# run time grow quadratically with the number of rows.

# Calculate Gower distance matrix
gower_distances = gower.gower_matrix(df)

# Define the eps values to test. Gower distances are bounded to [0, 1], so any
# eps above 1 makes every point a neighbour of every other point and cannot
# change the result; sweeping up to 2.0 only adds redundant DBSCAN fits.
# np.linspace is used instead of np.arange because a fractional step in
# np.arange is subject to floating-point drift at the end of the range.
eps_values = np.linspace(0.05, 1.0, 20)

# Store the number of clusters for each eps value
num_clusters = []

# min_samples follows the heuristic "2 * number of features"
for eps in eps_values:
    dbscan = DBSCAN(eps=eps, min_samples=140, metric='precomputed')
    clusters = dbscan.fit_predict(gower_distances)
    # Label -1 marks noise points and is not counted as a cluster
    num_clusters.append(len(set(clusters)) - (1 if -1 in clusters else 0))

# Plot the elbow graph
plt.figure(figsize=(8, 6))
plt.plot(eps_values, num_clusters, marker='o')
plt.title('Elbow Method for DBSCAN with Gower Distance')
plt.xlabel('Eps value')
plt.ylabel('Number of Clusters')
plt.show()

KeyboardInterrupt: 

In [None]:
# Assume optimal_eps is chosen based on the elbow method
optimal_eps = 0.5  # Replace with the optimal eps value from the graph

# Apply DBSCAN with the chosen eps value. min_samples must be the same value
# that was used while sweeping eps (140): the number of clusters found for a
# given eps depends on min_samples, so an eps read off that graph is only
# meaningful for the same min_samples.
dbscan = DBSCAN(eps=optimal_eps, min_samples=140, metric='precomputed')
clusters = dbscan.fit_predict(gower_distances)

# Add cluster labels back to the DataFrame
df_final['Cluster'] = clusters

# Summarise the result: size of each cluster (label -1 is noise), followed by
# a preview of the labelled rows instead of printing the whole frame.
print(df_final['Cluster'].value_counts().sort_index())
df_final.head()

In [None]:
# Optimized code with chunks
import gower
import pandas as pd
import numpy as np
from sklearn.cluster import DBSCAN
import matplotlib.pyplot as plt
from joblib import Parallel, delayed

def calculate_gower_distance_chunk(chunk):
    """Return the Gower distance matrix computed over the rows of ``chunk``.

    Thin wrapper around ``gower.gower_matrix`` so the call can be dispatched
    through ``joblib.delayed``.
    """
    chunk_distances = gower.gower_matrix(chunk)
    return chunk_distances

def fit_dbscan_chunk(eps, chunk_distances, min_samples):
    """Run DBSCAN on one precomputed distance matrix and count its clusters.

    Parameters
    ----------
    eps : neighbourhood radius passed to DBSCAN.
    chunk_distances : precomputed pairwise distance matrix for the chunk.
    min_samples : ``min_samples`` value passed to DBSCAN.

    Returns
    -------
    int
        Number of distinct cluster labels, not counting the noise label -1.
    """
    model = DBSCAN(eps=eps, min_samples=min_samples, metric='precomputed')
    labels = model.fit_predict(chunk_distances)
    distinct_labels = set(labels)
    distinct_labels.discard(-1)  # -1 marks noise, not a cluster
    return len(distinct_labels)

# Define subsample fraction and chunk size
sample_fraction = 0.1  # Adjust this as needed
chunk_size = 10000  # Adjust this as needed
min_samples = 140  # Heuristic based on 2 * number of features

# Sample the prepared feature frame `df`, not `df_final`: `df_final` still
# contains the target ('Rating') and the identifiers ('UserID', 'MovieID'),
# and its numeric columns have not been scaled.
df_sample = df.sample(frac=sample_fraction, random_state=42)

# Process the DataFrame in chunks
num_chunks = int(np.ceil(len(df_sample) / chunk_size))

# Calculate one Gower distance matrix per chunk. Distances between rows that
# fall into different chunks are never computed.
gower_distances_chunks = Parallel(n_jobs=-1)(
    delayed(calculate_gower_distance_chunk)(
        df_sample.iloc[i * chunk_size:(i + 1) * chunk_size]
    )
    for i in range(num_chunks)
)

# Define the eps values to test. Gower distances are bounded to [0, 1], so any
# eps above 1 makes every point a neighbour of every other point and cannot
# change the result. np.linspace avoids the floating-point drift np.arange has
# with a fractional step.
eps_values = np.linspace(0.05, 1.0, 20)

# Store the number of clusters for each eps value
num_clusters = []

# Fit DBSCAN on each chunk for each eps value and record the mean cluster
# count across chunks.
for eps in eps_values:
    clusters_per_chunk = Parallel(n_jobs=-1)(
        delayed(fit_dbscan_chunk)(eps, chunk_distances, min_samples)
        for chunk_distances in gower_distances_chunks
    )
    num_clusters.append(np.mean(clusters_per_chunk))

# Plot the elbow graph (the y value is the mean number of clusters per chunk)
plt.figure(figsize=(8, 6))
plt.plot(eps_values, num_clusters, marker='o')
plt.title('Elbow Method for DBSCAN with Gower Distance')
plt.xlabel('Eps value')
plt.ylabel('Number of Clusters')
plt.show()


## Old code

In [None]:
# Not correct yet: the number of clusters is roughly the number of instances
# -> still needs to be adjusted.
# NOTE(review): presumably caused by min_samples=2 with a fixed eps; confirm
# before reusing this cell.

import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import DBSCAN

# Fit DBSCAN with Euclidean distance on the feature frame
dbscan = DBSCAN(eps=0.5, min_samples=2, metric='euclidean').fit(df)
clusters = dbscan.labels_

# Attach the cluster label of every row to the original DataFrame
df_final['Cluster'] = clusters
