## 유사한 얼굴끼리 군집화 하기 
1. 얼굴 Feature Vector 값 로드 
2. 군집화 수 K결정 → 
$$ K \approx  \sqrt{\frac{N}{2}} $$
3. K-Means로 군집화
4. 군집화 결과 보기 
5. Centroid와 가장 유사한 얼굴 찾아 보기  : 중심 얼굴
6. K의 값을 변경해 가면 적절한 값을 찾기 

In [204]:
## 1. 얼굴 Feature Vector 값 로드 

import pandas as pd 
landmark_ds = pd.DataFrame.from_csv('/Users/goodvc/data/fastcampus/week2/resource/landmark.csv')
landmark_ds.head()

Unnamed: 0,eye_width,lip_to_nose,mouth_width,face_width,face_height,lip_to_chin,eye_to_eye,nose_width,eyebrow_width
김상범:1,0.65332,0.18036,0.42371,0.27891,1,0.27125,0.46367,0.28989,0.07309
김상범:2,0.45774,0.17712,0.30499,0.1713,1,0.25579,0.31862,0.20503,0.07676
김수현:1,0.45107,0.20541,0.22673,0.17002,1,0.25671,0.31702,0.19088,0.06413
김수현:2,0.38241,0.1979,0.20191,0.15363,1,0.2429,0.27322,0.16854,0.0468
김수현:3,0.70781,0.18121,0.40702,0.28452,1,0.21859,0.50424,0.328,0.05449


---
## K-Means로 클러스터링하기

* 군집화 수 결정
$$ K \approx  \sqrt{\frac{N}{2}} $$


In [217]:
## 2. 군집화 수 K결정 sqrt(N/2)
import numpy as np
import math

K = math.ceil(np.sqrt(landmark_ds.count()[0]/2))
print ("K=", K)


K= 4


In [218]:
from sklearn.cluster import KMeans
X = landmark_ds.values
cluster = KMeans(n_clusters=K)
cluster.fit(X)

KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=4, n_init=10,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0)

In [220]:
## cluster 결과 
clustered = pd.DataFrame( [[a,b] for a,b in zip(landmark_ds.index, cluster.labels_)], \
                         columns=['name','group-id'], \
                         index=landmark_ds.index )
clustered.groupby(['group-id']).groups

{0: ['아기:1', '황영식:1', '황영식:2'],
 1: ['김상범:1', '김수현:3', '양정길:1', '원빈:1', '원빈:2', '황전식:1', '황전식:2', '황전식:3'],
 2: ['김상범:2', '김수현:1', '김수현:2', '원빈:3', '최규민:1', '최규민:2', '최규민:3'],
 3: ['양정길:2', '양정길:3', '와이프:1', '전명훈:1', '전명훈:2']}

In [221]:
cluster.cluster_centers_





array([[ 1.24233333,  0.20926667,  0.68006667,  0.47554   ,  1.        ,
         0.21665667,  0.85434333,  0.54325667,  0.06132   ],
       [ 0.63088875,  0.18807125,  0.36962   ,  0.25897625,  1.        ,
         0.23419875,  0.44586875,  0.2851675 ,  0.06321125],
       [ 0.44821286,  0.19528571,  0.25528571,  0.17713429,  1.        ,
         0.25135143,  0.31894286,  0.19491286,  0.05940429],
       [ 0.86043   ,  0.199106  ,  0.486568  ,  0.328156  ,  1.        ,
         0.228484  ,  0.60881   ,  0.395158  ,  0.064618  ]])

## 군집화의 적정성 평가  ( 주관적 평가)
* 동일한 사람이 동일한 클러스터 많이 묶일수록 군집화가 적정하다. 
* 클러스터의 평균 이름수가 작을 수록 군집화가 적절하다고 판단

In [215]:
clustered['name_only'] = clustered['name'].apply(lambda x: x.split(':')[0])
clustered.groupby('group-id').name_only.nunique().mean()

3.75

Unnamed: 0,name,group-id,name_only
김상범:1,김상범:1,3,김상범
김상범:2,김상범:2,2,김상범
김수현:1,김수현:1,2,김수현
김수현:2,김수현:2,2,김수현
김수현:3,김수현:3,3,김수현
아기:1,아기:1,0,아기
양정길:1,양정길:1,3,양정길
양정길:2,양정길:2,1,양정길
양정길:3,양정길:3,1,양정길
와이프:1,와이프:1,1,와이프


---
## Hierarchical Clustering으로 군집화 하기 

In [209]:
from sklearn.cluster import AgglomerativeClustering
X = landmark_ds.values
cluster = AgglomerativeClustering(n_clusters=K,affinity='euclidean',linkage='ward')
cluster.fit(X)

AgglomerativeClustering(affinity='euclidean', compute_full_tree='auto',
            connectivity=None, linkage='ward',
            memory=Memory(cachedir=None), n_clusters=4, n_components=None,
            pooling_func=<function mean at 0x105188048>)

In [210]:
## cluster 결과 
clustered = pd.DataFrame( [[a,b] for a,b in zip(landmark_ds.index, cluster.labels_)], \
                         columns=['name','group-id'], \
                         index=landmark_ds.index )
clustered.groupby(['group-id']).groups

{0: ['아기:1', '황영식:1', '황영식:2'],
 1: ['양정길:2', '양정길:3', '와이프:1', '전명훈:1', '전명훈:2'],
 2: ['김상범:2', '김수현:1', '김수현:2', '원빈:3', '최규민:1', '최규민:2', '최규민:3', '황전식:1'],
 3: ['김상범:1', '김수현:3', '양정길:1', '원빈:1', '원빈:2', '황전식:2', '황전식:3']}

In [201]:
clustered['name_only'] = clustered['name'].apply(lambda x: x.split(':')[0])
clustered.groupby('group-id').name_only.nunique().mean()

3.75

name    5.75
dtype: float64