# GMM Assignment

In [2]:
# Maths
import numpy as np
import pandas as pd

# Plotting
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse

# ML
from sklearn.preprocessing import StandardScaler
from sklearn.mixture import GaussianMixture
from sklearn.decomposition import PCA
from sklearn import  metrics

# SQL
from sqlalchemy import create_engine

# Ignore Warn
import warnings
warnings.filterwarnings('ignore')

In [3]:
postgres_user = 'dsbc_student'
postgres_pw = '7*.8G9QH21'
postgres_host = '142.93.121.174'
postgres_port = '5432'
postgres_db = 'heartdisease'

engine = create_engine('postgresql://{}:{}@{}:{}/{}'.format(
    postgres_user, postgres_pw, postgres_host, postgres_port, postgres_db))
df = pd.read_sql_query('select * from heartdisease',con=engine)

# no need for an open connection, as we're only doing a single query
engine.dispose()

In [4]:
# Define the features and the outcome
X = df.iloc[:, :13]
y = df.iloc[:, 13]

# Replace missing values (marked by ?) with a 0
X = X.replace(to_replace='?', value=0)

# Binarize y so that 1 means heart disease diagnosis and 0 means no diagnosis
y = np.where(y > 0, 0, 1)

In [5]:
# Standardizing
scaler = StandardScaler()
X_std = scaler.fit_transform(X)


###   1. Apply GMM to the heart disease data by setting n_components=2. Get ARI and silhoutte scores for your solution and compare it with those of the k-means and hierarchical clustering solutions that you implemented in the assignments of the previous checkpoints. Which algorithm does perform better?

In [8]:
gmm = GaussianMixture(n_components=2)
clusters = gmm.fit_predict(X_std)

print(f'ARI Score: {metrics.adjusted_rand_score(y,clusters)}')
print(f'Silhouette Score: {metrics.silhouette_score(X_std,clusters, metric="euclidean")}')

ARI Score: 0.4207322145049338
Silhouette Score: 0.16118591340148433


K-means seems to perform slightly better on both metrics. GMM performs better than Hierarchical.

### 2. GMM implementation of scikit-learn has a parameter called covariance_type. This parameter determines the type of covariance parameters to use. Specifically, there are four types you can specify:

   * full: This is the default. Each component has its own general covariance matrix.
   * tied: All components share the same general covariance matrix.
   * diag: Each component has its own diagonal covariance matrix.
   * spherical: Each component has its own single variance.
Try all of these. Which one does perform better in terms of ARI and silhouette scores?

In [10]:
covariance_types = ['full','tied','diag','spherical']

for i in range(len(covariance_types)):
    gmm = GaussianMixture(n_components=2, covariance_type=covariance_types[i])
    clusters = gmm.fit_predict(X_std)
    print(('-'*50)+(f'\nCovariance Type: {covariance_types[i]}\n')+('-'*50))
    print(f'ARI Score: {metrics.adjusted_rand_score(y,clusters)}')
    print(f'Silhouette Score: {metrics.silhouette_score(X_std,clusters, metric="euclidean")}')
    

--------------------------------------------------
Covariance Type: full
--------------------------------------------------
ARI Score: 0.18389186035089963
Silhouette Score: 0.13628813153331445
--------------------------------------------------
Covariance Type: tied
--------------------------------------------------
ARI Score: 0.46482432589803474
Silhouette Score: 0.16607012124631088
--------------------------------------------------
Covariance Type: diag
--------------------------------------------------
ARI Score: 0.37878842718089933
Silhouette Score: 0.15836933745078682
--------------------------------------------------
Covariance Type: spherical
--------------------------------------------------
ARI Score: 0.20765243525722465
Silhouette Score: 0.12468753110276873


In [None]:
Tied seems to have the best performance