<a href="https://colab.research.google.com/github/femketenharkel/Predicting_Ratings/blob/main/models/clustering/GMM_clustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# GMM clustering

In [13]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from google.colab import drive
from sklearn.metrics import silhouette_score
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import MinMaxScaler

In [14]:
# Mount Google Drive so the CSV stored under "My Drive" is reachable,
# then load it into df_final.
drive.mount('/content/drive')
data_path = '/content/drive/My Drive/Thesis/Data/df_final.csv'
df_final = pd.read_csv(data_path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [15]:
# The target variable and the unique identifiers must not take part in the
# clustering, so they are removed from the working frame.
non_feature_columns = ['Rating', 'MovieID', 'UserID']
df = df_final.drop(columns=non_feature_columns)

In [16]:
# Min-max normalise the listed columns; the scaled values overwrite the
# original columns of df.
features_to_scale = [
    'Year', 'Month', 'Day', 'Hour', 'Age', 'Release_year',
    'Time_release_to_rating', 'Total_ratings_per_movie',
    'Total_ratings_per_user',
]
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df[features_to_scale])
df[features_to_scale] = scaled_values

In [12]:
# Candidate numbers of mixture components for the sweep: 1, 6, 11 and 16
n_components_range = range(1, 20, 5)
bic_values = []

# Fit one mixture per candidate and record the BIC it reaches on df
for n_components in n_components_range:
    gmm = GaussianMixture(n_components=n_components, random_state=42).fit(df)
    score = gmm.bic(df)
    bic_values.append(score)

# BIC against the number of components
plt.figure(figsize=(12, 9))
plt.plot(n_components_range, bic_values, marker='o')
plt.xlabel('Number of Clusters')
plt.ylabel('BIC')
plt.title('Elbow Method for Optimal Number of Clusters (BIC)')
plt.show()

KeyboardInterrupt: 

In [None]:
# Choose the candidate with the lowest BIC from the sweep in the previous cell
optimal_n_components = n_components_range[np.argmin(bic_values)]
print(f"Optimal number of clusters: {optimal_n_components}")

# Feature matrix for the final fit. `features_normalized` was never defined in
# this notebook; the normalised features live in `df` itself. A 'Cluster'
# column left over from an earlier run is dropped so that previously assigned
# labels are never fed back into the model as an input feature.
features = df.drop(columns=['Cluster'], errors='ignore')

# Apply GMM with the optimal number of clusters
gmm = GaussianMixture(n_components=optimal_n_components, random_state=42)
clusters = gmm.fit_predict(features)

# Add cluster labels back to the DataFrame
df['Cluster'] = clusters

# Show the first rows with their cluster labels (same preview style as the
# other cells, instead of printing the whole frame)
print(df.head())

# Attempt 2: BIC scores

In [None]:

# 2. Apply GMM clustering and determine the optimal number of clusters using the elbow method

# `df` was min-max scaled in place in the normalisation cell, so it already is
# the scaled feature matrix. Any 'Cluster' column written by an earlier run is
# excluded so that labels are never used as an input feature.
df_scaled = df.drop(columns=['Cluster'], errors='ignore')

cluster_range = range(1, 11)
bic_scores = []

for k in cluster_range:
    gmm = GaussianMixture(n_components=k, covariance_type='full', random_state=42)
    gmm.fit(df_scaled)
    bic_scores.append(gmm.bic(df_scaled))

# Plot the BIC scores to find the elbow point
plt.figure(figsize=(8, 6))
plt.plot(cluster_range, bic_scores, marker='o')
plt.title('Elbow Method For Optimal k using BIC')
plt.xlabel('Number of clusters')
plt.ylabel('BIC Score')
plt.show()

# The candidate with the lowest BIC is taken as the optimal number of clusters
optimal_clusters = cluster_range[np.argmin(bic_scores)]

# 3. Perform clustering again with the optimal number of clusters
gmm_optimal = GaussianMixture(n_components=optimal_clusters, covariance_type='full', random_state=42)
gmm_optimal.fit(df_scaled)
clusters = gmm_optimal.predict(df_scaled)

# 4. Evaluate the clustering with the silhouette score.
# NOTE(review): the silhouette is computed on every row of df_scaled; if this
# is too slow on the full dataset, consider the `sample_size` argument.
silhouette_avg = silhouette_score(df_scaled, clusters)

print(f"Optimal number of clusters: {optimal_clusters}")
print(f"Silhouette Score: {silhouette_avg}")

# Add the cluster labels to the working dataframe (done after scoring, so the
# label column does not influence the metric)
df['Cluster'] = clusters

# Print the first few rows of the dataframe with cluster labels
print(df.head())


# Attempt 3: BIC scores with a manually chosen elbow


In [None]:
import pandas as pd
import numpy as np
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score

# `df` was min-max scaled in place in the normalisation cell, so it already is
# the scaled feature matrix. Any 'Cluster' column written by an earlier run is
# excluded so that labels are never used as an input feature.
df_scaled = df.drop(columns=['Cluster'], errors='ignore')

# Define range for number of components (clusters)
n_components_range = range(1, 11)

# Store BIC values for each number of components
bic_values = []

# Fit GMM and calculate BIC for each number of components
for n_components in n_components_range:
    gmm = GaussianMixture(n_components=n_components, covariance_type='full', random_state=42)
    gmm.fit(df_scaled)
    bic_values.append(gmm.bic(df_scaled))

# Plot the BIC values
plt.figure(figsize=(10, 7))
plt.plot(n_components_range, bic_values, marker='o')
plt.title('Elbow Method for GMM Clustering')
plt.xlabel('Number of Components (Clusters)')
plt.ylabel('BIC')
plt.show()

# User inputs the optimal number of clusters based on the elbow plot
optimal_n_components = int(input("Enter the optimal number of clusters based on the Elbow plot: "))
print(f'Optimal number of clusters: {optimal_n_components}')

# Fit GMM with the optimal number of components
gmm_optimal = GaussianMixture(n_components=optimal_n_components, covariance_type='full', random_state=42)
labels_optimal = gmm_optimal.fit_predict(df_scaled)
df['Cluster'] = labels_optimal

# Calculate silhouette score for the best performing model (on the features
# only; df_scaled does not contain the label column)
sil_score = silhouette_score(df_scaled, labels_optimal)
print(f'Silhouette Score for the optimal model: {sil_score:.4f}')

# Display the DataFrame with clusters
print(df.head())

# Visualize the clusters. The original 'Feature1'/'Feature2' names were
# placeholders that do not exist in df; two of the normalised columns are used
# as axes instead. Change these two names to inspect another projection.
x_feature, y_feature = 'Age', 'Total_ratings_per_user'
plt.figure(figsize=(10, 7))
plt.scatter(df[x_feature], df[y_feature], c=df['Cluster'], cmap='viridis', s=10)
plt.title('GMM Clustering with Optimal Number of Clusters')
plt.xlabel(x_feature)
plt.ylabel(y_feature)
plt.show()


# Attempt 4: elbow method with the within-cluster sum of squares (WSS)

In [None]:
import pandas as pd
import numpy as np
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score


# Candidate numbers of mixture components: 1 through 10
n_components_range = range(1, 11)

# Collects one sum-of-squared-distances value per candidate
ssd_values = list()

# Function to calculate sum of squared distances
def calculate_ssd(data, labels, centers):
    """Return the total within-cluster sum of squared distances as a float.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
        The clustered points (a DataFrame or a NumPy array).
    labels : array-like of shape (n_samples,)
        Cluster index assigned to each row of ``data``.
    centers : array-like of shape (n_clusters, n_features)
        Cluster centers; row ``i`` is the center for label ``i``.

    Returns
    -------
    float
        Sum over all clusters of the squared differences between each member
        point and its cluster center, summed over all features.
    """
    # Work on plain arrays: on a DataFrame, `.sum()` reduces per column and
    # would turn the running total into a Series instead of a scalar.
    points = np.asarray(data, dtype=float)
    assignments = np.asarray(labels)

    ssd = 0.0
    for i, center in enumerate(centers):
        residuals = points[assignments == i] - np.asarray(center, dtype=float)
        # A cluster without members contributes an empty sum, i.e. 0.0
        ssd += float((residuals ** 2).sum())
    return ssd

# Feature matrix for the sweep. Any 'Cluster' column written by an earlier
# cell is excluded so that labels are never used as an input feature.
features = df.drop(columns=['Cluster'], errors='ignore')
# Plain-array view for the distance computation: with a DataFrame the
# per-column reduction inside calculate_ssd yields a Series per candidate
# instead of one number, and the elbow plot would draw one line per feature.
feature_array = np.asarray(features)

# Fit GMM and calculate SSD for each number of components
for n_components in n_components_range:
    gmm = GaussianMixture(n_components=n_components, covariance_type='full',
                          random_state=42)
    gmm.fit(features)
    labels = gmm.predict(features)
    centers = gmm.means_
    ssd = calculate_ssd(feature_array, labels, centers)
    ssd_values.append(ssd)

# Plot the SSD values
plt.figure(figsize=(10, 7))
plt.plot(n_components_range, ssd_values, marker='o')
plt.title('Elbow Method for GMM Clustering')
plt.xlabel('Number of Components (Clusters)')
plt.ylabel('Sum of Squared Distances')
plt.show()


In [None]:

# User inputs the optimal number of clusters based on the elbow plot
optimal_n_components = int(input("Enter the optimal number of clusters based on the Elbow plot: "))
print(f'Optimal number of clusters: {optimal_n_components}')

# Feature matrix for fitting and scoring. The 'Cluster' column is excluded:
# it is written to df below, and scoring on df afterwards would let the labels
# themselves count as a feature and distort the silhouette score.
features = df.drop(columns=['Cluster'], errors='ignore')

# Fit GMM with the optimal number of components
gmm_optimal = GaussianMixture(n_components=optimal_n_components, covariance_type='full', random_state=42)
labels_optimal = gmm_optimal.fit_predict(features)
df['Cluster'] = labels_optimal

# Calculate silhouette score for the best performing model (features only)
sil_score = silhouette_score(features, labels_optimal)
print(f'Silhouette Score for the optimal model: {sil_score:.4f}')

# Display the DataFrame with clusters
print(df.head())

# Visualize the clusters. The original 'Feature1'/'Feature2' names were
# placeholders that do not exist in df; two of the normalised columns are used
# as axes instead. Change these two names to inspect another projection.
x_feature, y_feature = 'Age', 'Total_ratings_per_user'
plt.figure(figsize=(10, 7))
plt.scatter(df[x_feature], df[y_feature], c=df['Cluster'], cmap='viridis', s=10)
plt.title('GMM Clustering with Optimal Number of Clusters')
plt.xlabel(x_feature)
plt.ylabel(y_feature)
plt.show()