In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import seaborn as sns
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score

In [2]:
# Shared standardizer; set_output(transform='pandas') makes its transforms
# return DataFrames, so column names survive the scaling step.
scaler = (
    StandardScaler()
    .set_output(transform='pandas')
)

In [3]:
# Read the milk-composition table and standardize every column so that
# DBSCAN's distance threshold treats all features on the same scale.
milk = pd.read_csv(
    "C:/Python/Datasets/milk.csv",
    index_col=0,  # first CSV column becomes the row index
)
milk_scaled = scaler.fit(milk).transform(milk)

In [4]:
# Baseline DBSCAN fit on the standardized data. fit() returns the estimator
# itself, so construction and fitting can be chained.
clust = DBSCAN(eps=0.6, min_samples=2).fit(milk_scaled)
# One label per row; -1 marks points DBSCAN treats as noise.
clust.labels_

array([ 0,  0,  0,  0, -1,  1,  1,  2, -1, -1,  2,  1,  0, -1,  1,  2, -1,
       -1, -1, -1,  3,  3, -1, -1, -1], dtype=int64)

In [10]:
# Silhouette score of the current fit, computed on clustered points only:
# rows labelled -1 (noise) are removed before scoring.
labelled = milk_scaled.assign(label=clust.labels_)
inliers = labelled[labelled['label'] != -1]
silhouette_score(inliers.drop(columns='label'), inliers['label'])

0.5934459505692155

In [30]:
def dbscan_silhouette_grid(data, eps_values, min_samples_values):
    """Score DBSCAN over a grid of ``eps`` / ``min_samples`` settings.

    For every combination, DBSCAN is fitted on ``data``, noise points
    (label ``-1``) are dropped, and the silhouette score of the remaining
    points is computed. Combinations that form fewer than two clusters are
    skipped, since a silhouette score is only computed here when at least
    two cluster labels remain.

    All work happens on local variables, so running the search does not
    overwrite any notebook-level model or frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature matrix to cluster (already scaled by the caller).
    eps_values : iterable of float
        Candidate ``eps`` values.
    min_samples_values : iterable of int
        Candidate ``min_samples`` values.

    Returns
    -------
    list of [eps, min_samples, score]
        One row per combination that produced at least two clusters,
        in grid order (outer loop over ``eps_values``).
    """
    rows = []
    for eps in eps_values:
        for min_samples in min_samples_values:
            labels = DBSCAN(eps=eps, min_samples=min_samples).fit(data).labels_
            clustered = labels != -1  # boolean mask of non-noise rows
            # Number of distinct labels left = number of clusters formed.
            if len(np.unique(labels[clustered])) >= 2:
                score = silhouette_score(data.loc[clustered], labels[clustered])
                rows.append([eps, min_samples, score])
    return rows


# Grid of DBSCAN hyper-parameters to compare.
epsilons = [0.2, 0.4, 0.6, 0.8, 1, 1.2]
min_pcts = [2, 3, 4, 5]  # candidate min_samples values

scores = dbscan_silhouette_grid(milk_scaled, epsilons, min_pcts)
df_scores = pd.DataFrame(scores, columns=['eps','min','score'])
# Best-scoring combinations first.
df_scores.sort_values('score', ascending=False)

Unnamed: 0,eps,min,score
0,0.4,2,0.651894
9,1.0,3,0.647387
2,0.6,2,0.593446
10,1.2,2,0.552889
4,0.6,4,0.551975
1,0.4,3,0.538518
3,0.6,3,0.534443
6,0.8,3,0.533038
5,0.8,2,0.464674
7,0.8,4,0.457151


In [23]:
# Refit with a stricter density requirement and report what was found.
clust = DBSCAN(eps=0.6, min_samples=4)
clust.fit(milk_scaled)
# Count clusters by explicitly excluding the noise label (-1). Subtracting 1
# from the number of unique labels is only right when noise is present and
# would under-count by one for a fit in which every point is clustered.
cluster_ids = np.unique(clust.labels_[clust.labels_ != -1])
print(len(cluster_ids))
print( clust.labels_ )

2
[ 0  0  0  0 -1  1  1 -1 -1 -1 -1  1  0 -1  1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1]


In [15]:
# Restore the eps=0.6 / min_samples=2 model as the notebook's current fit.
clust = DBSCAN(eps=0.6, min_samples=2).fit(milk_scaled)
# Number of distinct label values; note that this count includes the
# noise label -1 whenever any point is flagged as noise.
np.unique(clust.labels_).size

5

In [5]:
# Attach the current model's cluster labels to the original (unscaled)
# measurements; assign() returns a new frame, leaving `milk` untouched.
df_copy = milk.assign(clust=clust.labels_)
df_copy

Unnamed: 0_level_0,water,protein,fat,lactose,ash,clust
Animal,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
HORSE,90.1,2.6,1.0,6.9,0.35,0
ORANGUTAN,88.5,1.4,3.5,6.0,0.24,0
MONKEY,88.4,2.2,2.7,6.4,0.18,0
DONKEY,90.3,1.7,1.4,6.2,0.4,0
HIPPO,90.4,0.6,4.5,4.4,0.1,-1
CAMEL,87.7,3.5,3.4,4.8,0.71,1
BISON,86.9,4.8,1.7,5.7,0.9,1
BUFFALO,82.1,5.9,7.9,4.7,0.78,2
GUINEA PIG,81.9,7.4,7.2,2.7,0.85,-1
CAT,81.6,10.1,6.3,4.4,0.75,-1


Usage with Supervised Learning

In [35]:
from sklearn.model_selection import train_test_split

In [36]:
# Load the Kyphosis data, separate the target column from the predictors,
# and make a reproducible 70/30 split stratified on the target.
kyph = pd.read_csv("C:/Python/Cases/Kyphosis/Kyphosis.csv")
X = kyph.drop(columns='Kyphosis')
y = kyph['Kyphosis']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=25,
    stratify=y,
)

In [38]:
# (rows, columns) of the full predictor frame, before splitting.
X.shape

(81, 3)

In [37]:
# (rows, columns) of the training portion produced by train_test_split.
X_train.shape

(56, 3)