# **KDE**

In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

sns.set(color_codes=True)

# Fix the global NumPy RNG so the 30 normal(0, 1) draws are identical on every run.
np.random.seed(0)
x = np.random.normal(0, 1, size=30)

# `sns.distplot` is deprecated since seaborn 0.11; `histplot` with a KDE overlay
# on the density scale is the documented axes-level replacement. `cut=3` lets
# the KDE curve extend past the extreme data points, as distplot's curve did.
sns.histplot(x, kde=True, stat="density", kde_kws=dict(cut=3));

In [None]:
# Histogram + KDE on the density scale, plus a rug of ticks marking where each
# individual observation lies. Replaces the deprecated `sns.distplot(x, rug=True)`.
ax = sns.histplot(x, kde=True, stat="density", kde_kws=dict(cut=3))
sns.rugplot(x, ax=ax);

In [None]:
from scipy import stats

# Rule-of-thumb bandwidth for a Gaussian kernel: h = 1.06 * std * n^(-1/5).
bandwidth = 1.06 * x.std() * x.size ** -0.2
# Evaluation grid on which every kernel (and later the summed density) is sampled.
support = np.linspace(-4, 4, 200)

# One Gaussian pdf per observation: the observation is the pdf's mean and the
# bandwidth is its standard deviation.
kernels = [stats.norm.pdf(support, loc=x_i, scale=bandwidth) for x_i in x]

# Draw each individual kernel in red.
for kernel in kernels:
    plt.plot(support, kernel, color="r")

# Mark the raw observations along the x-axis.
sns.rugplot(x, color=".2", linewidth=3);

In [None]:
# `scipy.integrate.trapz` was removed in SciPy 1.14. Prefer the current name
# `trapezoid` (available since SciPy 1.6) and fall back to the legacy alias
# only on older installations.
try:
    from scipy.integrate import trapezoid
except ImportError:
    from scipy.integrate import trapz as trapezoid

# Sum the per-observation kernels point-wise over the evaluation grid.
density = np.sum(kernels, axis=0)
# Normalize so the curve integrates to 1 over `support` (trapezoidal rule).
density /= trapezoid(density, support)
# The result is a hand-built KDE; the original author notes it should match
# the KDE curve seaborn draws for the same data.
plt.plot(support, density);

In [None]:
# Compare seaborn's default bandwidth with a much narrower and a much wider one.
# The `bw` keyword is deprecated in seaborn >= 0.11 and scheduled to become an
# error; `bw_method` is its replacement (a scalar is forwarded to scipy's
# gaussian_kde as the smoothing factor).
sns.kdeplot(x)
sns.kdeplot(x, bw_method=.2, label="bw: 0.2")
sns.kdeplot(x, bw_method=2, label="bw: 2")
plt.legend();

# **Mean_shift (Bandwidth에 너무 민감)**

In [None]:
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.cluster import MeanShift

# Synthetic 2-feature data set: 200 samples generated around 3 blob centres.
X, y = make_blobs(
    n_samples=200,
    n_features=2,
    centers=3,
    cluster_std=0.8,
    random_state=0,
)

# Note that the call is fit_predict: it fits the model and returns the labels
# in one step.
meanshift = MeanShift(bandwidth=0.9)
cluster_labels = meanshift.fit_predict(X)

# Original author's note: with this bandwidth 8 clusters are produced
# (not verified here; check the printed labels).
print('cluster labels 유형:', np.unique(cluster_labels))

In [None]:
# Re-cluster the same data with a larger bandwidth to see how the set of
# discovered cluster labels changes.
meanshift = MeanShift(bandwidth=1)
cluster_labels = meanshift.fit_predict(X)
distinct_labels = np.unique(cluster_labels)
print('cluster labels 유형:', distinct_labels)

In [None]:
from sklearn.cluster import estimate_bandwidth

# Let scikit-learn propose a bandwidth from the data instead of hand-picking one.
# Original author's heuristic: consider a larger `quantile` when there is more data.
bandwidth = estimate_bandwidth(X, quantile=0.25)
rounded_bandwidth = round(bandwidth, 3)
print('bandwidth 값:', rounded_bandwidth)

In [None]:
import pandas as pd

# Keep the raw features and the ground-truth blob id side by side so the
# clustering result can be compared against the generating labels later.
clusterDF = pd.DataFrame(data=X, columns=['ftr1', 'ftr2'])
clusterDF['target'] = y

# Data-driven bandwidth estimate used for the final clustering.
best_bandwidth = estimate_bandwidth(X, quantile=0.25)

# Pass `bandwidth` by keyword: MeanShift's constructor parameters are
# keyword-only in scikit-learn >= 1.0, so the positional form raises TypeError.
meanshift = MeanShift(bandwidth=best_bandwidth)
cluster_labels = meanshift.fit_predict(X)
print('cluster labels 유형:', np.unique(cluster_labels))

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# Attach the MeanShift cluster assignment to each sample.
clusterDF['meanshift_label'] = cluster_labels
centers = meanshift.cluster_centers_
unique_labels = np.unique(cluster_labels)
markers = ['o', 's', '^', 'x', '*']

for label in unique_labels:
    label_cluster = clusterDF[clusterDF['meanshift_label'] == label]
    center_x_y = centers[label]
    # Cycle through the marker list so that a run producing more clusters than
    # there are markers (e.g. with a small bandwidth) does not raise IndexError.
    marker = markers[label % len(markers)]

    # Samples belonging to this cluster.
    plt.scatter(x=label_cluster['ftr1'], y=label_cluster['ftr2'], edgecolor='k', marker=marker)

    # Cluster centre: a large white marker with the cluster number drawn on top.
    plt.scatter(x=center_x_y[0], y=center_x_y[1], s=200, color='white', edgecolor='k', alpha=0.9, marker=marker)
    plt.scatter(x=center_x_y[0], y=center_x_y[1], s=70, color='k', edgecolor='k', marker='$%d$' % label)

plt.xlabel('ftr1')
plt.ylabel('ftr2')
plt.show()

In [None]:
# For every ground-truth blob id, count how many samples landed in each
# MeanShift cluster.
label_counts_by_target = clusterDF.groupby('target')['meanshift_label'].value_counts()
print(label_counts_by_target)