# Imports

In [2]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from sklearn.datasets import load_iris
from sklearn.cluster import DBSCAN, KMeans
from sklearn.metrics import silhouette_score, adjusted_rand_score, adjusted_mutual_info_score
from sklearn.preprocessing import StandardScaler
from sklearn.mixture import GaussianMixture

# Data

In [3]:
# Location of the synthetic "basic5" CSV that the rest of the notebook loads.
basic = (
    "https://storage.googleapis.com/edulabs-public-datasets/synthetic/"
    "basic5.csv"
)


In [4]:
# Read the remote CSV into a DataFrame (needs network access to the URL in `basic`).
basic_df = pd.read_csv(filepath_or_buffer=basic)

In [5]:
# Feature matrix for clustering: only the two coordinate columns.
X = basic_df.loc[:, ["x", "y"]]

In [6]:
# Standardise the features before fitting the mixture models.
# StandardScaler is already imported in the notebook's import cell, so it is
# not re-imported here.
scaler = StandardScaler()
X_scaled = pd.DataFrame(
    scaler.fit_transform(X),
    columns=X.columns,
    index=X.index,  # keep row labels aligned with X instead of resetting them
)

# GMM

In [22]:
# Two-component mixture. random_state is pinned (same seed as the BIC sweep
# further down) so the fit, and therefore the 0/1 label numbering, is the same
# on every Restart & Run All.
gmm = GaussianMixture(n_components=2, random_state=0)

In [23]:
# Fit on the scaled features, then read back the hard assignment for each row
# and the per-component membership probabilities.
labels = gmm.fit(X_scaled).predict(X_scaled)
probs = gmm.predict_proba(X_scaled)

In [24]:
# Hard cluster index assigned to each row by gmm.predict.
labels

array([0, 0, 1, ..., 1, 1, 1])

In [25]:
# Per-row membership probabilities from gmm.predict_proba (one column per component).
probs

array([[1.00000000e+00, 2.56548860e-12],
       [9.97595824e-01, 2.40417555e-03],
       [5.57149446e-17, 1.00000000e+00],
       ...,
       [2.65005612e-13, 1.00000000e+00],
       [1.31370574e-08, 9.99999987e-01],
       [5.64697821e-10, 9.99999999e-01]])

In [26]:

# Plotting frame: the scaled coordinates plus the GMM label stored as a string,
# so Plotly colours the points with the discrete palette rather than a gradient.
df = pd.DataFrame(X_scaled, columns=['x', 'y']).assign(
    Cluster=labels.astype(str)
)

# One point per row, coloured by its assigned cluster.
fig = px.scatter(
    data_frame=df,
    x="x",
    y="y",
    color="Cluster",
    color_discrete_sequence=px.colors.qualitative.Set1,
    title="Gaussian Mixture Model Clustering",
    width=800,
    height=600,
)
fig.show()

# Select the optimal number of components

To select the optimal number of components (n_components) in a Gaussian Mixture Model (GMM), the most common and principled approach is to use model selection criteria that balance goodness of fit with model complexity.

### BIC (Bayesian Information Criterion)


- Penalizes complex models more strongly than AIC (the Akaike Information Criterion).
- Lower BIC = Better model
- Use it to find the sweet spot where the model explains the data well without overfitting.

In [18]:
# Fit one seeded GMM per candidate component count (1 through 10) and record
# the BIC of each fit on the scaled data.
n_components_range = range(1, 11)
bics = []

for n in n_components_range:
    gmm = GaussianMixture(n_components=n, random_state=0).fit(X_scaled)
    bics.append(gmm.bic(X_scaled))

# BIC against number of components; the lowest point is the preferred model.
fig = go.Figure(
    data=go.Scatter(
        x=list(n_components_range),
        y=bics,
        mode='lines+markers',
        marker={'color': 'red', 'size': 8},
        line={'color': 'red'},
        name='BIC',
    )
)

fig.update_layout(
    title="BIC for GMM Model Selection",
    xaxis_title="Number of Components",
    yaxis_title="BIC",
    width=700,
    height=450,
)

fig.show()

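### Silhouette Score (visual validation for hard cluster assignments)

The silhouette score ranges from -1 to 1; higher values mean points sit closer to their own cluster than to neighbouring ones. It is only defined for two or more clusters.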

In [21]:
# Silhouette score for each candidate component count.
# The sweep starts at 2 because the silhouette score needs at least two clusters.
# The same range object drives both the loop and the x-axis, so every score is
# plotted at the component count that produced it. The earlier version plotted
# against the 1..10 BIC range, which shifted every point one component too low.
silhouette_range = range(2, 11)

scores = []
for n in silhouette_range:
    # Fixed seed (same as the BIC sweep) so the curve is reproducible.
    gmm = GaussianMixture(n_components=n, random_state=0)
    labels = gmm.fit_predict(X_scaled)
    scores.append(silhouette_score(X_scaled, labels))
# NOTE: after this loop `gmm` and `labels` refer to the last fit (n=10), not the
# 2-component model from the GMM section; re-run that section before reusing them.

# Plot using Plotly
fig = go.Figure()
fig.add_trace(go.Scatter(
    x=list(silhouette_range),
    y=scores,
    mode='lines+markers',
    marker=dict(color='green', size=8),
    line=dict(color='green'),
    name='Silhouette Score'
))

fig.update_layout(
    title="Silhouette Score for GMM",
    xaxis_title="Number of Components",
    yaxis_title="Silhouette Score",
    width=700,
    height=450
)

fig.show()