# 비원형 데이터의 군집화 - DBSCAN
---
- 분할적 군집화 방식
- 밀도(데이터의 밀집)기반 군집화 ==> 미리 군집수 지정 필요 없음
- 다양한 형태의 데이터에서 군집화 가능
- 이상치(노이즈) 데이터 탐지 및 제거도 가능함 (어떤 군집에도 속하지 않는 점은 레이블 -1로 표시됨)
- 군집을 정하는 기준
    * 임의의 점(Point)에서 지정된 거리(eps) 이내의 영역 안에 지정된 개수(min_samples) 이상의 데이터가 존재하는지 여부
    * 하이퍼파라미터 => 거리(eps), 데이터 수(min_samples)

In [78]:
from sklearn.datasets import load_iris
from sklearn.cluster import DBSCAN
import numpy as np

## [1] 데이터 로딩

In [79]:
# Load iris as pandas objects: passing return_X_y=True together with
# as_frame=True yields the features as a DataFrame and the target as a Series.
X, y=load_iris(return_X_y=True, as_frame=True)

In [80]:
# Sanity check: container types and shapes of the features and the target.
type(X), type(y), X.shape, y.shape

(pandas.core.frame.DataFrame, pandas.core.series.Series, (150, 4), (150,))

## [2] 데이터 전처리

In [81]:
from sklearn.preprocessing import StandardScaler

In [82]:
# Scaler used to standardize the features before clustering; DBSCAN works on
# distances, so features are put on a comparable scale first.
scaler=StandardScaler()

In [83]:
# Learn the per-feature scaling statistics from X.
scaler.fit(X)

StandardScaler()

In [84]:
# Apply the fitted scaling to X; the result is indexed below as a 2-D array.
X_scaled=scaler.transform(X)

In [85]:
# Peek at the third column (index 2) of the scaled data.
X_scaled[:,2]

array([-1.34022653, -1.34022653, -1.39706395, -1.2833891 , -1.34022653,
       -1.16971425, -1.34022653, -1.2833891 , -1.34022653, -1.2833891 ,
       -1.2833891 , -1.22655167, -1.34022653, -1.51073881, -1.45390138,
       -1.2833891 , -1.39706395, -1.34022653, -1.16971425, -1.2833891 ,
       -1.16971425, -1.2833891 , -1.56757623, -1.16971425, -1.05603939,
       -1.22655167, -1.22655167, -1.2833891 , -1.34022653, -1.22655167,
       -1.22655167, -1.2833891 , -1.2833891 , -1.34022653, -1.2833891 ,
       -1.45390138, -1.39706395, -1.34022653, -1.39706395, -1.2833891 ,
       -1.39706395, -1.39706395, -1.39706395, -1.22655167, -1.05603939,
       -1.34022653, -1.22655167, -1.34022653, -1.2833891 , -1.34022653,
        0.53540856,  0.42173371,  0.64908342,  0.13754657,  0.47857113,
        0.42173371,  0.53540856, -0.26031542,  0.47857113,  0.08070915,
       -0.14664056,  0.25122143,  0.13754657,  0.53540856, -0.08980313,
        0.36489628,  0.42173371,  0.194384  ,  0.42173371,  0.08

## [3] 군집화
---

In [86]:
# Create the clustering object: eps is the neighborhood radius and min_samples
# the number of points required inside that radius.
# NOTE(review): the recorded outputs of the following cells show every sample
# labelled 0, i.e. with eps=3 on the standardized features all 150 points end
# up in a single cluster. A smaller eps is presumably needed to separate
# groups -- TODO tune.
dbscan = DBSCAN(eps=3, min_samples=10)

In [87]:
# Cluster the scaled iris features (unsupervised clustering, not
# classification). scikit-learn's fit returns the estimator itself, so irisDBS
# refers to the same fitted object as dbscan.
irisDBS=dbscan.fit(X_scaled)

In [88]:
# Cluster label assigned to each sample, plus the shape of the label array.
irisDBS.labels_, irisDBS.labels_.shape

(array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64),
 (150,))

In [89]:
# Keep the per-sample cluster labels for joining with the targets later.
dbscan_labels = irisDBS.labels_

In [90]:
# components_ holds a copy of each core sample found during fit; its first
# dimension is therefore the number of core samples.
irisDBS.components_.shape

(150, 4)

In [91]:
# fit_predict runs the clustering on X_scaled again and returns the labels
# directly (the recorded output matches labels_ from the earlier fit).
irisDBS.fit_predict(X_scaled)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)

In [92]:
## Combine the scaled features with the ground-truth labels
import pandas as pd

# Keep the original feature names and row index from X: without them the
# columns are anonymous integers 0..3 (mixed with the string-named columns
# added afterwards), and a shared index guarantees that the later
# `irisDF['target'] = y` assignment aligns row-for-row.
irisDF = pd.DataFrame(X_scaled, columns=X.columns, index=X.index)

In [93]:
# Attach the DBSCAN cluster id and the true target to each row so the two can
# be compared side by side.
irisDF['dbscan_cluster'] = dbscan_labels
irisDF['target'] = y

In [94]:
# For every true target value, count how many samples landed in each DBSCAN
# cluster, then display the resulting table.
iris_result = (
    irisDF
    .groupby(['target'])['dbscan_cluster']
    .value_counts()
)
print(iris_result)

target  dbscan_cluster
0       0                 50
1       0                 50
2       0                 50
Name: dbscan_cluster, dtype: int64
