In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm
from sklearn.mixture import GaussianMixture


---
**1: Gaussian Distributions**
* Generate and plot a single Gaussian distribution

In [None]:
# Draw 1000 samples from a normal distribution with mean 0 and standard deviation 1
mean, stddev = 0, 1
data = np.random.normal(loc=mean, scale=stddev, size=1000)

# Exact density of the same distribution, evaluated on a fixed grid over [-4, 4]
x_values = np.linspace(-4, 4, 150)
y_values = norm.pdf(x_values, loc=mean, scale=stddev)

# Density-normalised histogram of the samples with the exact PDF drawn on top
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(data, bins=50, density=True, alpha=0.6, color='g')
ax.plot(x_values, y_values, linewidth=2, color='r')

ax.set_title('Single Gaussian Distribution')
ax.set_xlabel('Value')
ax.set_ylabel('Probability Density')
ax.grid(True)

plt.show()


---
**2: Mixture of Gaussians**
Generate and plot a mixture of two or more Gaussian distributions.


In [None]:
# Parameters of the two Gaussian components and their mixing weights
mean1, stddev1 = -2, 1
mean2, stddev2 = 2, 1.5
mix_p1 = 0.6
mix_p2 = 0.4
n_samples = 1000  # total number of points in the mixed sample

# Split the sample budget between the components.
# round() is used instead of int(): int() truncates, and a float product can
# land just below the intended integer (e.g. 100 * 0.29 == 28.999999999999996).
# The second component takes the remainder so the total is always n_samples.
n1 = round(n_samples * mix_p1)
n2 = n_samples - n1

# Generate random data from the two Gaussian distributions
data1 = np.random.normal(mean1, stddev1, n1)
data2 = np.random.normal(mean2, stddev2, n2)

# Combine the two datasets into one
data = np.concatenate([data1, data2])

# Density-normalised histogram of the mixed data
plt.figure(figsize=(10, 6))
plt.hist(data, bins=30, density=True, alpha=0.6, color='b', label='Samples')

# Component PDFs evaluated over the observed data range
# (ndarray.min/max instead of the Python builtins, which iterate element by element)
x = np.linspace(data.min(), data.max(), 100)
p1 = norm.pdf(x, mean1, stddev1)
p2 = norm.pdf(x, mean2, stddev2)

# The mixture PDF is the weighted sum of the component PDFs
p_mixed = mix_p1 * p1 + mix_p2 * p2

# The component curves are the unweighted PDFs, so only the dashed mixture
# curve is expected to follow the histogram.
plt.plot(x, p1, 'k', linewidth=2, label='Component 1 PDF')
plt.plot(x, p2, 'k', linewidth=2, linestyle='-.', label='Component 2 PDF')
plt.plot(x, p_mixed, 'r', linewidth=2, linestyle='--', label='Mixture PDF')

plt.title('Histogram and PDFs of a Mixture of Gaussian Distributions')
plt.xlabel('Value')
plt.ylabel('Probability Density')
plt.legend()
plt.show()


**3: Implementing GMM with Scikit-learn**

Use `sklearn.mixture.GaussianMixture` to fit a GMM on synthetic data.


In [None]:
# Add a trailing axis so the samples form a single-feature column,
# the (n_samples, n_features) layout passed to the estimator below
X = data[:, None]

# Two-component mixture with a fixed random_state; fit() returns the estimator itself
gmm = GaussianMixture(n_components=2, random_state=0).fit(X)

# Report the fitted component means and covariances, flattened to 1-D for printing
print(f"Means: {gmm.means_.flatten()}")
print(f"Variances: {gmm.covariances_.flatten()}")


**4: Clustering with GMM**

Visualize the clusters formed by the GMM on synthetic data.

In [None]:
# Assign every sample to a mixture component
labels = gmm.predict(X)

# Strip plot: samples laid out along the x-axis at height 0, coloured by component
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(data, np.zeros_like(data), c=labels, cmap='viridis', s=30)
ax.set_title('Data points clustered by GMM')
ax.set_xlabel('Value')
ax.set_yticks([])  # the y position carries no information
plt.show()


**5: Anomaly Detection using GMM**

Use the fitted GMM to detect outliers in the data.


In [None]:
# Per-sample log-likelihood under the fitted mixture
# (score_samples returns log densities, not raw probability densities)
densities = gmm.score_samples(X)

# Flag the samples whose log-likelihood falls below the 4th percentile as outliers
threshold = np.percentile(densities, 4)  # adjust the percentile as needed
is_outlier = densities < threshold
outliers = X[is_outlier]
inliers = X[~is_outlier]

# Plot the two groups separately so the 'Inliers' series does not also
# contain the flagged points
plt.figure(figsize=(10, 6))
plt.scatter(inliers, np.zeros_like(inliers), c='blue', label='Inliers')
plt.scatter(outliers, np.zeros_like(outliers), c='red', label='Outliers')
plt.title('Anomaly Detection with GMM')
plt.xlabel('Value')
plt.yticks([])  # the y position carries no information in this 1-D strip plot
plt.legend()
plt.show()


## 2D Example

In [None]:
## Generate data

# Define parameters for a GMM with K components
K = 3  # Number of components
N = 1000  # Number of data points to generate
SEED = 42  # Fixed seed so the synthetic data set is identical on every run

# Parameters for each component
mus = [np.array([4, 3]), np.array([5, 4]), np.array([8, 2])]  # Means
sigmas = [np.array([[1, 0.85], [0.85, 1]]), np.array([[1, -0.5], [-0.5, 1]]), np.array([[1, 0], [0, 1]])]  # Base matrices
pis = [0.3, 0.5, 0.2]  # Mixing coefficients

# The covariances used for sampling are sigma @ sigma.T, which is always
# symmetric positive semi-definite. Note that this REPLACES the matrices
# listed above: the sampled covariances are the products, not the literals.
sigmas = [sigma @ sigma.T for sigma in sigmas]

rng = np.random.default_rng(SEED)

# Draw every component label in one call, then fill in the samples one
# component at a time (K vectorised draws instead of N single-sample draws).
truth = rng.choice(K, size=N, p=pis)  # component index of each sample, shape (N,)
data = np.empty((N, len(mus[0])))
for k in range(K):
    members = truth == k
    data[members] = rng.multivariate_normal(mus[k], sigmas[k], size=int(members.sum()))


In [None]:
# Raw view of the generated samples: first column on x, second column on y
plt.scatter(*data.T)
plt.show()

In [None]:
# Three-component mixture model; n_init=10 requests 10 initialisations of the fit
gm = GaussianMixture(n_components=3, n_init=10)

In [None]:
# Fit the mixture to the generated 2-D samples
gm.fit(data)

In [None]:
# Same scatter as before, now coloured by the component the fitted model predicts
plt.scatter(*data.T, c=gm.predict(data))
plt.show()

In [None]:
# Fitted mixing weights, one per component
gm.weights_

In [None]:
# Fitted component means
gm.means_

In [None]:
# Fitted component covariance matrices
gm.covariances_

### AIC and BIC

In [None]:
# Akaike information criterion of the fitted model on `data` (lower is better)
gm.aic(data)

In [None]:
# Bayesian information criterion of the fitted model on `data` (lower is better)
gm.bic(data)

In [None]:
# Fit mixtures with 1 through 10 components and record both information criteria.
# `gm` is rebound on every iteration, so it ends up as the 10-component model.
ks = list(range(1, 11))
aic, bic = [], []
for k in ks:
    gm = GaussianMixture(n_components=k, n_init=10).fit(data)
    aic.append(gm.aic(data))
    bic.append(gm.bic(data))

In [None]:
# AIC and BIC as a function of the number of mixture components
fig, ax = plt.subplots(figsize=(8, 3))
ax.plot(ks, aic, "bo-", label='AIC')
ax.plot(ks, bic, "r^-", label='BIC')
ax.legend()
ax.set_xlabel("$k$")
plt.show()

### Anomaly Detection

In [None]:
# Refit a 3-component model (rebinding `gm`) and compute the
# per-sample log-likelihood of every point under it
gm = GaussianMixture(n_components=3, n_init=10).fit(data)
densities = gm.score_samples(data)

In [None]:
# Flag the samples whose log-likelihood falls below the 1st percentile as outliers
threshold = np.percentile(densities, 1)  # adjust the percentile as needed
is_outlier = densities < threshold
outliers = data[is_outlier]
inliers = data[~is_outlier]

# Plot the two groups separately so the 'Inliers' series does not also
# contain the flagged points
plt.figure(figsize=(10, 6))
plt.scatter(inliers[:, 0], inliers[:, 1], c='blue', label='Inliers')
plt.scatter(outliers[:, 0], outliers[:, 1], c='red', label='Outliers', s=100)
plt.title('Anomaly Detection with GMM')
# Both axes are real features here, so each is labelled and the y ticks stay
# visible (the 1-D strip plot hid them because its y position was a dummy 0).
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.legend()
plt.show()

In [None]:
### Plot a mixture density with two features

import numpy as np
import plotly.graph_objects as go
from scipy.stats import multivariate_normal

# Parameters of a 3-component, 2-feature mixture.
# NOTE(review): these values look like they were copied from a GMM fit of the
# 2D example above — confirm before relying on them.
means = [
    np.array([4.95, 4.08]),
    np.array([7.93, 1.94]),
    np.array([4.03, 3.01]),
]
covariances = [
    np.array([[1.28, -.99], [-.99, 1.19]]),
    np.array([[1.19, -0.14], [-0.14, 1.08]]),
    np.array([[2.03, 1.99], [1.99, 2]]),
]
coefficients = [.5, .22, .28]  # mixing coefficients, one per component

# 100 x 100 evaluation grid over [0, 10] x [0, 10];
# `pos` stacks the coordinates into shape (100, 100, 2)
x = np.linspace(0, 10, 100)
y = np.linspace(0, 10, 100)
X, Y = np.meshgrid(x, y)
pos = np.stack((X, Y), axis=-1)

# Mixture density = sum over components of (weight * component density)
Z = np.zeros_like(X)
for mean, cov, coef in zip(means, covariances, coefficients):
    Z += coef * multivariate_normal(mean, cov).pdf(pos)

# Interactive 3-D surface of the mixture density
surface = go.Surface(x=X, y=Y, z=Z)
fig = go.Figure(data=[surface])
fig.update_layout(
    autosize=False,
    width=700,
    height=700,
    margin=dict(l=65, r=50, b=65, t=90),
    scene=dict(zaxis_title='PDF'),
)

fig.show()


## Real Data

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.mixture import GaussianMixture
from sklearn.decomposition import PCA

In [None]:
# Load the baseball CSV straight from the esnt/Data GitHub repository (needs network access)
df = pd.read_csv('https://raw.githubusercontent.com/esnt/Data/refs/heads/main/CleanData/baseball.csv')