# Unsupervised -> DBSCAN

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sqlalchemy import create_engine

In [3]:
postgres_user = 'dsbc_student'
postgres_pw = '7*.8G9QH21'
postgres_host = '142.93.121.174'
postgres_port = '5432'
postgres_db = 'heartdisease'

# Initialize the postgres engine
engine = create_engine(f'postgresql://{postgres_user}:{postgres_pw}@{postgres_host}:{postgres_port}/{postgres_db}')

# Read the data from a sql query to the engine
heartdisease_df = pd.read_sql('SELECT * from {}'.format(postgres_db), con=engine)

# Dispose of the engine
engine.dispose()

# Define the features and the outcome
X = heartdisease_df.iloc[:, :13]
y = heartdisease_df.iloc[:, 13]

# Replace missing values (marked by ?) with a 0
X = X.replace(to_replace='?', value=0)

# Binarize y so that 1 means heart disease diagnosis and 0 means no diagnosis
y = np.where(y > 0, 0, 1)

### 1. Apply DBSCAN to the heart disease data by trying different values for eps and min_samples parameters. You'll realize that it's really hard to get a two cluster solution using DBSCAN if not impossible.

In [5]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_std = scaler.fit_transform(X)

In [23]:
from sklearn.cluster import DBSCAN

eps = [.2, .5, .7, 1]
samps = [1, 2, 3, 4]

for e in eps:
    for s in samps:

        dbscan = DBSCAN(eps=e, min_samples=s)
        preds = dbscan.fit_predict(X_std)

        preds_s = pd.Series(preds)
        num_clusts = preds_s.loc[preds_s != -1].nunique()
        print('epsilon:', e)
        print('minPts:', s)
        print('Number of clusters:', num_clusts)
        print('-------------------------------------------------')

epsilon: 0.2
minPts: 1
Number of clusters: 303
-------------------------------------------------
epsilon: 0.2
minPts: 2
Number of clusters: 0
-------------------------------------------------
epsilon: 0.2
minPts: 3
Number of clusters: 0
-------------------------------------------------
epsilon: 0.2
minPts: 4
Number of clusters: 0
-------------------------------------------------
epsilon: 0.5
minPts: 1
Number of clusters: 301
-------------------------------------------------
epsilon: 0.5
minPts: 2
Number of clusters: 2
-------------------------------------------------
epsilon: 0.5
minPts: 3
Number of clusters: 0
-------------------------------------------------
epsilon: 0.5
minPts: 4
Number of clusters: 0
-------------------------------------------------
epsilon: 0.7
minPts: 1
Number of clusters: 301
-------------------------------------------------
epsilon: 0.7
minPts: 2
Number of clusters: 2
-------------------------------------------------
epsilon: 0.7
minPts: 3
Number of clusters: 0

Solutions that yielded 2 clusters:

Epsilon: .5, minPts: 2
Epsilon: .7, minPts: 2

### 2. Apply DBSCAN by setting parameters eps=1, min_samples=1, metric="euclidean". Then, increase the value of min_samples. What's the effect of increasing min_samples on the number of clusters DBSCAN identifies?

In [28]:
samps = [1, 2, 3]

for s in samps:
    dbscan = DBSCAN(eps=1, min_samples=s, metric='euclidean')

    preds = dbscan.fit_predict(X_std)

    preds_s = pd.Series(preds)
    num_clusts = preds_s.loc[preds_s != -1].nunique()
    print('epsilon:', e)
    print('minPts:', s)
    print('Number of clusters:', num_clusts)
    print('----------------------------')

epsilon: 1
minPts: 1
Number of clusters: 294
epsilon: 1
minPts: 2
Number of clusters: 8
epsilon: 1
minPts: 3
Number of clusters: 1


Increasing min_samples decreases the total number of clusters predicted by the model

3. Apply DBSCAN by setting parameters eps=1, min_samples=1, metric="euclidean". Then, increase the value of eps. What's the effect of increasing eps on the number of clusters DBSCAN identifies?

In [32]:
eps = [1, 2, 3]

for e in eps:
    dbscan = DBSCAN(eps=e, min_samples=1, metric='euclidean')

    preds = dbscan.fit_predict(X_std)

    preds_s = pd.Series(preds)
    num_clusts = preds_s.loc[preds_s != -1].nunique()
    print('epsilon:', e)
    print('minPts:', s)
    print('Number of clusters:', num_clusts)
    print('----------------------------')

epsilon: 1
minPts: 3
Number of clusters: 294
----------------------------
epsilon: 2
minPts: 3
Number of clusters: 178
----------------------------
epsilon: 3
minPts: 3
Number of clusters: 34
----------------------------


Increasing epsilon also decreases the number of clusters predicted by the model