### 각 칼럼별 unique value 확인
* 칼럼별로 어떤 value를 가지고 있는 지
* 그리고 각 비중(%) 어떻게 되는 지 확인

In [None]:
# count 파라미터는 unique value를 몇 개까지만 볼꺼냐?
# 예) Sex는 남, 여 2개
# 예2) 연령대는 10대, 20대, ~ , 60대 6개이므로 count가 5일때는 표시되지 않음

def col_value_check(df, count):
    """Print the number of unique values of every column in *df*.

    For columns whose unique-value count is strictly below *count*, the
    relative frequency of each value (rounded to two decimals) is printed
    as well, framed by ``=`` rulers.

    Args:
        df: DataFrame to inspect.
        count: Exclusive upper bound on the unique-value count for which the
            frequency table is shown.

    Returns:
        None. All results are written to stdout.
    """
    for column_name in df.columns:
        series = df[column_name]
        n_unique = series.nunique()
        print("{}'s Nunique: {}\n".format(column_name, n_unique))

        # Columns with too many distinct values only get the summary line.
        if n_unique >= count:
            continue

        frequencies = series.value_counts()
        ratios = frequencies / frequencies.sum()
        print("{:=^20}".format(column_name))
        print(round(ratios, 2))
        print("{:=^20}".format("="))
    return None
# Example run: show value ratios for columns with fewer than 5 unique values.
# NOTE(review): `test` is presumably a DataFrame loaded earlier in the notebook.
col_value_check(test,5)

### NaN Table
* 적용 상황
    - train, test의 각 칼럼별 NaN 값을 확인하고자 할 때


In [None]:
# train 및 test에서 각각의 na 값을 DF형태로 반환

def make_na_table(train, test):
    """Return the per-column missing-value counts of *train* and *test*.

    Args:
        train: Training DataFrame.
        test: Test DataFrame.

    Returns:
        DataFrame indexed by column name with two columns, ``'train'`` and
        ``'test'``. A column that exists in only one of the inputs gets NaN
        on the other side (outer join).
    """
    na_counts = [frame.isna().sum() for frame in (train, test)]
    return pd.concat(na_counts, axis=1, join='outer', sort=False, keys=('train', 'test'))
# Example run: build the NaN table and display it as the cell output.
# NOTE(review): `train` / `test` are presumably DataFrames loaded earlier.
na_table = make_na_table(train,test)
na_table

* kaggle에서 발췌한 missing value df 만들기 함수

In [None]:
def check_missing_data(df):
    """Summarise the missing values of *df* (adapted from a Kaggle kernel).

    Args:
        df: DataFrame to inspect.

    Returns:
        ``False`` when *df* contains no missing value at all. Otherwise a
        transposed summary DataFrame whose columns are the columns of *df*
        and whose rows are ``'Total'`` (number of missing values),
        ``'Percent'`` (share of missing values in percent, 0-100) and
        ``'Types'`` (dtype name).
    """
    total = df.isnull().sum()
    if not total.any():
        return False

    # Percentage of missing rows per column. The previous expression divided
    # by (row_count * 100), which made every value 10,000 times too small.
    percent = total / len(df) * 100
    output = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
    # written by MJ Bahmani
    output['Types'] = [str(dtype) for dtype in df.dtypes]
    return output.transpose()

### Unique value Table
* 적용 상황
    - train, test의 각 칼럼별 unique value를 확인하고자 할 때
    - columns이 index로 들어감


In [None]:
def make_unique_value_table(train, test):
    """Return the number of unique values per column for *train* and *test*.

    Args:
        train: Training DataFrame.
        test: Test DataFrame.

    Returns:
        DataFrame indexed by column name with the columns ``'train'`` and
        ``'test'``; a column missing from one input is NaN on that side.
    """
    train_counts = pd.Series(
        [train[name].nunique() for name in train.columns],
        index=list(train.columns),
    )
    test_counts = pd.Series(
        [int(test[name].nunique()) for name in test.columns],
        index=list(test.columns),
    )
    return pd.concat([train_counts, test_counts], axis=1, sort=False, keys=['train', 'test'])
# Example run: build the unique-value table and display it as the cell output.
# NOTE(review): `train` / `test` are presumably DataFrames loaded earlier.
unique_table = make_unique_value_table(train, test)
unique_table

### IQR 로 Outlier 찾아내는 함수

In [None]:
# Outlier 찾아내는 함수
def get_outlier(df, col, weight=1.5):
    """Return the index labels of the IQR outliers of ``df[col]``.

    A value is an outlier when it lies below ``Q1 - weight * IQR`` or above
    ``Q3 + weight * IQR``.

    Args:
        df: DataFrame holding the column.
        col: Name of the column to examine.
        weight: Multiplier applied to the inter-quartile range.

    Returns:
        pandas Index with the labels of the outlying rows.
    """
    series = df[col]
    q1, q3 = np.percentile(series.values, [25, 75])

    # Widen the inter-quartile range by `weight` to get the accepted band.
    margin = (q3 - q1) * weight
    lower_bound = q1 - margin
    upper_bound = q3 + margin

    is_outlier = (series < lower_bound) | (series > upper_bound)
    return series[is_outlier].index

# Usage example: print the number of IQR outliers for each column.
# NOTE(review): `train2`, `ride_col` and `off_col` are not defined in this
# section; presumably a DataFrame and two lists of its column names.

for i in ride_col :
    outlier_index = get_outlier(train2, i)
    print("{}: {}".format(i,train2.loc[outlier_index][i].shape[0]))
print()
for i in off_col :
    outlier_index = get_outlier(train2, i)
    print("{}: {}".format(i,train2.loc[outlier_index][i].shape[0]))
print()

# Same count for the single column '18~20_ride'.
outlier_index = get_outlier(train2, '18~20_ride')
print("{}: {}".format('18~20_ride',train2.loc[outlier_index]['18~20_ride'].shape[0]))

### 이상치 제거 함수

In [None]:
# 이상치 제거 함수

def del_outlier(df, col_list):
    """Return a copy of *df* with IQR outliers removed column by column.

    The columns are processed in order, and each column's outliers are
    computed on the rows that survived the previous columns. The name of
    every processed column is printed.

    Args:
        df: Source DataFrame (left untouched).
        col_list: Iterable of column names to clean.

    Returns:
        New DataFrame without the outlying rows.
    """
    cleaned = df.copy()
    for column in col_list:
        cleaned = cleaned.drop(index=get_outlier(cleaned, column))
        print(column)
    return cleaned

### PCA 및 Randomforest 산출 함수
* 적용 상황
    - PCA를 통해 차원 축소가 필요한 데, 몇 개의 component가 필요한 지 모를 때
    - components 수에 따른 설명된 분산 비율(explained variance ratio)의 합과 RandomForest의 score를 산출함

In [None]:
# 컬럼 최대개수에 맞게 components 수를 변화시키면서 설명력을 확인함
# 

def pca_rcf(train_x, train_y, test_x, test_y):
    """Compare random-forest accuracy across PCA component counts.

    For 2, 3 and 4 principal components the training features are projected
    with PCA, a RandomForestClassifier is fitted on the projected data and
    scored on the identically projected test data. The summed explained
    variance ratio and the score are printed for every setting.

    Args:
        train_x: Training features.
        train_y: Training labels.
        test_x: Test features.
        test_y: Test labels.

    Returns:
        Tuple ``(model, predicted)``: the classifier with the best test score
        and its predictions for *test_x*. The classifier was trained on
        PCA-projected features, so it expects input projected the same way;
        the fitted PCA object itself is not returned.
    """
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.decomposition import PCA

    max_sum, max_com, max_result = 0.0, 0, 0.0
    final_rcf = RandomForestClassifier()
    best_predicted = None

    # Try 2, 3 and 4 components (range end is exclusive).
    for i in range(2, 5):
        pca = PCA(n_components=i)
        pca_train_x = pca.fit_transform(train_x)
        # The test set must go through the projection fitted on the train set.
        pca_test_x = pca.transform(test_x)
        imp = [round(p, 4) for p in pca.explained_variance_ratio_]
        explained = sum(imp)

        print("components:",i,"\tTot:",explained)
        rcf = RandomForestClassifier(n_estimators=1000, random_state=2442)
        # Fit and score on the projected data; fitting on the raw features
        # would yield the same model for every component count.
        rcf.fit(pca_train_x, train_y)
        predicted = rcf.predict(pca_test_x)
        result = rcf.score(pca_test_x, test_y)
        print("\tScore: {:.3f}".format(result))

        # Keep the model together with *its own* predictions.
        if best_predicted is None or result > max_result:
            max_result = result
            final_rcf = rcf
            best_predicted = predicted

        if explained > max_sum:
            max_sum = explained
            max_com = i

    print("최종 >> 컴포넌트수: {}, R2: {:.3f}, 예측 정확도: {:.3f}".format(max_com, max_sum, max_result))
    # Returns the best model and its predictions (adapt as needed).
    return final_rcf, best_predicted

# Example run; the train/test splits must already exist in the notebook.
rcf, predicted = pca_rcf(train_x, train_y, test_x, test_y)


<hr>

### 모델 성능 평가

* classification 평가
    * classification report
    * roc_auc
    * confusion matrix

In [None]:
def get_model_train_eval(model, train_x,  test_x, train_y, test_y) :
    """Fit *model* and print classification metrics for the test split.

    Prints the confusion matrix, the classification report (3 digits) and
    the ROC-AUC computed from the predicted labels.

    Args:
        model: Estimator exposing ``fit`` and ``predict``.
        train_x: Training features.
        test_x: Test features.
        train_y: Training labels.
        test_y: Test labels.

    Returns:
        None. The model is fitted in place.
    """
    from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix

    model.fit(train_x, train_y)
    y_hat = model.predict(test_x)

    matrix = confusion_matrix(test_y, y_hat)
    print("==Confusion_matrix below==\n", matrix)

    report = classification_report(test_y, y_hat, digits=3)  # three decimals
    print(report)
    print()

    # NOTE(review): AUC is computed from hard labels, not probabilities.
    auc = roc_auc_score(test_y, y_hat)
    print("{:>12}\t{:.3f}".format("ROC-AUC", auc))

### GridSearchCV - 교차 검증과 최적 하이퍼 파라미터 튜닝을 한 번에

In [None]:
# Fixed the misspelled class name (was `GridSerchCV`, which fails on import).
from sklearn.model_selection import GridSearchCV

# Template: replace the empty keys/lists with the hyper-parameter names and
# candidate values of the estimator being tuned.
params = {
    "" : [],
    "" : []
}

# `model`, `train_x` and `train_y` must be defined before running this cell.
grid_cv = GridSearchCV(model, param_grid = params, cv = 2, n_jobs = -1)
grid_cv.fit(train_x, train_y)
print("Best Params:\n", grid_cv.best_params_)
print("Best predict score: {:.4f}".format(grid_cv.best_score_))

<hr>

### 전화번호, URL, PRICE 필터링

In [None]:
def phone_number_filter(text):
    """Replace phone numbers in *text* with the token ``'tel'``.

    Two patterns are applied in order: plain digit groups optionally
    separated by ``-``, ``.`` or whitespace, then numbers whose first group
    is wrapped in parentheses.

    Args:
        text: Input string.

    Returns:
        The string with every matched phone number replaced.
    """
    patterns = (
        r'\d{2,3}[-\.\s]*\d{3,4}[-\.\s]*\d{4}(?!\d)',
        r'\(\d{3}\)\s*\d{4}[-\.\s]??\d{4}',
    )
    masked = text
    for pattern in patterns:
        masked = re.sub(pattern, 'tel', masked)
    return masked
    
    
def url_filter(text):
    """Replace every http/https URL in *text* with the token ``'url'``.

    Args:
        text: Input string.

    Returns:
        The string with the matched URLs replaced.
    """
    url_regex = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),|]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    )
    return url_regex.sub('url', text)


def price_filter(text):
    """Replace price expressions in *text* with the token ``'money'``.

    Three patterns are applied in sequence: digit amounts followed by the
    won character, Korean numeral amounts followed by the won character,
    and bare numeric amounts.

    Args:
        text: Input string.

    Returns:
        The string with the matched prices replaced.
    """
    patterns = (
        r'\d{1,3}[,\.]\d{1,3}[만\천]?\s?[원]|\d{1,5}[만\천]?\s?[원]',
        r'[일/이/삼/사/오/육/칠/팔/구/십/백][만\천]\s?[원]',
        r'(?!-)\d{2,4}[0]{2,4}(?!년)(?!.)|\d{1,3}[,/.]\d{3}',
    )
    masked = text
    for pattern in patterns:
        masked = re.sub(pattern, 'money', masked)
    return masked

<hr>

### 샘플링
* Imbalanced Data Set에 적용
* 모든 알고리즘을 적용한 것은 아님
* [imblearn site](https://imbalanced-learn.readthedocs.io/en/stable/api.html)

In [1]:
def make_sampling(X_train, y_train):
    """Resample an imbalanced training set with five imbalanced-learn samplers.

    See https://imbalanced-learn.readthedocs.io/en/stable/api.html
    Only five samplers are wired in at the moment, applied in this order:
    EditedNearestNeighbours, NeighbourhoodCleaningRule, SMOTEENN,
    TomekLinks and SVMSMOTE.

    Args:
        X_train: Training features.
        y_train: Training labels.

    Returns:
        Flat list of ten items, ``[x, y]`` pairs in the sampler order above:
        ``[x_enn, y_enn, x_ncr, y_ncr, x_s_enn, y_s_enn, x_t, y_t, x_s_s, y_s_s]``.
    """
    from imblearn.under_sampling import EditedNearestNeighbours, NeighbourhoodCleaningRule, TomekLinks
    from imblearn.over_sampling import SVMSMOTE
    from imblearn.combine import SMOTEENN

    samplers = [
        # Edited Nearest Neighbours: kind_sel is either 'all' or 'mode'.
        EditedNearestNeighbours(sampling_strategy='auto', n_neighbors=3,
                                kind_sel='all', n_jobs=-1),
        # threshold_cleaning is the cleaning threshold.
        NeighbourhoodCleaningRule(sampling_strategy='auto', n_neighbors=3,
                                  kind_sel='all', threshold_cleaning=0.5, n_jobs=-1),
        # SMOTE + ENN (original author's note: binary targets only).
        SMOTEENN(sampling_strategy='auto', random_state=None, n_jobs=-1),
        # Tomek's link
        TomekLinks(n_jobs=-1),
        SVMSMOTE(n_jobs=-1),
    ]

    sampling_list = []
    for sampler in samplers:
        # `fit_sample` was removed from imbalanced-learn; `fit_resample` is
        # the supported name for the same operation.
        x_resampled, y_resampled = sampler.fit_resample(X_train, y_train)
        sampling_list.extend([x_resampled, y_resampled])

    return sampling_list

# Show the docstring of make_sampling.
help(make_sampling)

###################

def get_eval(y_test, pred):
    """Compute the classification metrics for a set of predictions.

    Args:
        y_test: True labels.
        pred: Predicted labels.

    Returns:
        Tuple ``(cm, acc, pre, recall, f1, roc_auc)``: the confusion matrix
        followed by accuracy, precision, recall, F1 and ROC-AUC, each rounded
        to three decimals.
    """
    from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

    cm = confusion_matrix(y_test, pred)
    scorers = (accuracy_score, precision_score, recall_score, f1_score, roc_auc_score)
    acc, pre, recall, f1, roc_auc = [np.round(scorer(y_test, pred), 3) for scorer in scorers]

    return cm, acc, pre, recall, f1, roc_auc
######################

def make_series(y_test, pred, index_name):
    """Pack the metrics from ``get_eval`` into a named pandas Series.

    Args:
        y_test: True labels.
        pred: Predicted labels.
        index_name: Name given to the Series.

    Returns:
        Series with the entries ``c_m``, ``acc``, ``precision``, ``recall``,
        ``f1`` and ``roc_auc``.
    """
    labels = ('c_m', 'acc', 'precision', 'recall', 'f1', 'roc_auc')
    metrics = dict(zip(labels, get_eval(y_test, pred)))
    return pd.Series(metrics, name=index_name)

Help on function make_sampling in module __main__:

make_sampling(X_train, y_train)
    https://imbalanced-learn.readthedocs.io/en/stable/api.html
    현재 5개만 추가된 상태



In [None]:
# 위에서 샘플링된 것으로 모델에 넣는 것
# 현재는 LGBM을 기본 모델로 채택

def use_sampling(sampling_list, x_test, y_test):
    """Train LightGBM on each resampled set and collect the test metrics.

    Args:
        sampling_list: Flat list of ``x, y`` pairs as returned by
            ``make_sampling`` (ENN, NCR, SMOTEENN, Tomek, SVMSMOTE order).
        x_test: Test features used for early stopping and evaluation.
        y_test: Test labels.

    Returns:
        DataFrame with one row per sampler (indexed by sampler name) and the
        columns ``c_m``, ``acc``, ``precision``, ``recall``, ``f1``,
        ``roc_auc``.
    """
    # Sampler names, in the same order as the pairs in sampling_list.
    api = ['ENN', 'NCR', 'SMOTEENN', 'Tomek', 'SVMSMOTE']

    from lightgbm import LGBMClassifier

    lgbm = LGBMClassifier(n_estimators=200, n_jobs=-1, random_state=1, learning_rate=0.1)
    evals = [(x_test, y_test)]
    columns = ['c_m','acc','precision','recall','f1','roc_auc']
    scores = []

    # Step by two: even positions hold features, odd positions hold labels.
    for i in range(0, len(sampling_list), 2):
        # NOTE(review): `early_stopping_rounds` / `verbose` as fit() keywords
        # are not accepted by recent lightgbm releases; confirm the installed
        # version or switch to callbacks.
        lgbm.fit(sampling_list[i], sampling_list[i+1], early_stopping_rounds=50, eval_metric="auc", eval_set=evals, verbose=False)
        pred = lgbm.predict(x_test)
        scores.append(make_series(y_test, pred, api[i // 2]))

    # Build the frame in one go; DataFrame.append no longer exists in pandas 2.
    return pd.DataFrame(scores, columns=columns)
        

* 실사용 예시 (x_train, y_train, x_test, y_test 모두 있어야함)

In [None]:
# Resample the training data with every configured sampler.
sampling_list = make_sampling(x_train, y_train)
print(len(sampling_list))  # two entries (x, y) per sampler, so 10 in total

# Score each resampled set on the test split and display the result.
score_df = use_sampling(sampling_list, x_test, y_test)
score_df

<hr>

### Scaled 및 선형 회귀모델에 따른 회귀계수, RMSE 차이 확인
* 사용된 회귀 : Linear, Ridge, Lasso, ElasticNet
* 평가지표 : RMSE

**1. Ridge 고정 | 스케일별 & alpha별 RMSE df 산출** 

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

# get_scaled_data 함수
# method는 표준 정규 분포 변환(Standard), 최대값/최소값 정규화(MinMax), 로그변환(Log) 결정
# p_degree는 다향식 특성을 추가할 때 적용. p_degree는 2이상 부여하지 않음
def get_scaled_data(method='None', p_degree=None, input_data=None):
    """Scale *input_data* and optionally add polynomial features.

    Args:
        method: ``'Standard'`` (StandardScaler), ``'MinMax'`` (MinMaxScaler)
            or ``'Log'`` (``np.log1p``). A suffix after an underscore is
            ignored, so labels such as ``'Standard_Poly'`` select the same
            scaler as ``'Standard'``. Any other value (including ``None``
            and ``'None'``) leaves the data unscaled.
        p_degree: Degree passed to PolynomialFeatures; ``None`` skips the
            polynomial expansion.
        input_data: Feature matrix to transform.

    Returns:
        The transformed data.
    """
    # Callers label combined transforms as '<Scaler>_Poly'; only the part
    # before the underscore selects the scaler. Previously such labels fell
    # through to the "no scaling" branch.
    base_method = method.split('_', 1)[0] if isinstance(method, str) else method

    if base_method == 'Standard':
        scaled_data = StandardScaler().fit_transform(input_data)
    elif base_method == 'MinMax':
        scaled_data = MinMaxScaler().fit_transform(input_data)
    elif base_method == 'Log':
        scaled_data = np.log1p(input_data)
    else:
        scaled_data = input_data

    if p_degree is not None:
        scaled_data = PolynomialFeatures(degree=p_degree,
                                         include_bias=False).fit_transform(scaled_data)

    return scaled_data
#############################################

# alpha값에 따른 회귀 모델의 폴드 평균 RMSE를 출력하고 RMSE값을 DataFrame으로 반환 
def get_linear_reg_eval_1000(rmse_df, model_name, params=None, X_data_n=None, y_target_n=None, method=None):
    """Add the 5-fold average RMSE per alpha as a new column of *rmse_df*.

    Args:
        rmse_df: DataFrame collecting the results; column *method* is set.
        model_name: ``'Ridge'``, ``'Lasso'`` or ``'ElasticNet'``.
        params: Iterable of alpha values to evaluate.
        X_data_n: Feature matrix.
        y_target_n: Target values.
        method: Label of the data transformation, used as the column name.

    Returns:
        *rmse_df* with the new column, indexed by alpha.

    Raises:
        ValueError: If *model_name* is not one of the supported models.
    """
    if model_name not in ('Ridge', 'Lasso', 'ElasticNet'):
        # Previously an unknown name surfaced as an unbound `model` variable.
        raise ValueError(
            "Unsupported model_name: {!r} (expected 'Ridge', 'Lasso' or 'ElasticNet')".format(model_name)
        )

    rmse_list = []
    print('{:=^30}'.format(model_name))
    for param in params:
        if model_name == 'Ridge':
            model = Ridge(alpha=param)
        elif model_name == 'Lasso':
            model = Lasso(alpha=param)
        else:
            model = ElasticNet(alpha=param, l1_ratio=0.7)
        neg_mse_scores = cross_val_score(model, X_data_n,
                                         y_target_n, scoring="neg_mean_squared_error", cv=5)
        # The scorer returns negated MSE; flip the sign before the root.
        avg_rmse = np.mean(np.sqrt(-1 * neg_mse_scores))
        print('alpha {0}일 때 5 폴드 세트의 평균 RMSE: {1:.3f} '.format(param, avg_rmse))
        rmse_list.append(np.round(avg_rmse, 2))
    rmse_series = pd.Series(rmse_list, index=params, name=method)
    rmse_df[method] = rmse_series
    return rmse_df
##############################################

# Evaluate Ridge over several alpha values for each data transformation.
alphas = [0.1, 1, 10, 100]
# Six transformations: raw data, standardisation, standardisation + polynomial
# features, min-max scaling, min-max scaling + polynomial features, log.
scale_methods=[(None, None), ('Standard', None), ('Standard_Poly', 2),
               ('MinMax', None), ('MinMax_Poly', 2), ('Log', None)]
rmse_df = pd.DataFrame()
for scale_method in scale_methods:
    X_data_scaled = get_scaled_data(method=scale_method[0], p_degree=scale_method[1],
                                    input_data=X_data)
    print('\n## 변환 유형:{0}, Polynomial Degree:{1}'.format(scale_method[0], scale_method[1]))
    # `verbose` is not a parameter of get_linear_reg_eval_1000; passing it
    # raised a TypeError, so it is no longer supplied.
    rmse_df = get_linear_reg_eval_1000(rmse_df, 'Ridge', params=alphas, X_data_n=X_data_scaled, y_target_n=y_target,
                             method=scale_method[0])

In [None]:
# Show the results as a table: one row per transformation, one column per alpha.
rmse_df = rmse_df.transpose().reset_index()
rmse_df.rename({'index':'method'},inplace=True, axis=1)
# The first transformation was registered under the label None (raw data).
rmse_df.iloc[0,0] = 'Default'
rmse_df

In [None]:
# Plot the RMSE of every alpha across the transformations.
plt.figure(figsize=(8,6))
plt.plot(rmse_df.iloc[:,1:])
# Six tick positions, one per row of rmse_df (six transformations).
plt.xticks(ticks=[0,1,2,3,4,5], labels=rmse_df['method'])
plt.legend(rmse_df.columns[1:])
plt.show()

**2. 선형모델에 따른 평가 및 회귀계수 그래프**

In [None]:
def get_linear_eval_coeff(model_name, params=None, x_train=None, y_train=None, graph=True):
    """Evaluate a regularised linear model per alpha and collect coefficients.

    For every alpha the 5-fold average RMSE is printed, the model is fitted
    on the full data and its coefficients are sorted in descending order.

    Args:
        model_name: ``'Ridge'``, ``'Lasso'`` or ``'ElasticNet'``
            (ElasticNet uses ``l1_ratio=0.7``).
        params: Iterable of alpha values.
        x_train: Feature DataFrame (its columns label the coefficients).
        y_train: Target values.
        graph: When true, draw one bar plot per alpha showing the 20 largest
            coefficients.

    Returns:
        With ``graph`` true: ``(coeff_df, coeff_list, fig)`` where
        ``coeff_df`` has one ``'alpha:<value>'`` column per alpha (top 20
        coefficients), ``coeff_list`` holds ``[values, index]`` pairs and
        ``fig`` is the matplotlib figure.
        With ``graph`` false: ``coeff_df`` only, built from all coefficients.

    Raises:
        ValueError: If *model_name* is not one of the supported models.
    """
    if model_name not in ('Ridge', 'Lasso', 'ElasticNet'):
        # Previously an unknown name surfaced as an unbound `model` variable.
        raise ValueError(
            "Unsupported model_name: {!r} (expected 'Ridge', 'Lasso' or 'ElasticNet')".format(model_name)
        )
    params = [] if params is None else list(params)

    def _build_model(alpha):
        # Instantiate the requested estimator for one alpha.
        if model_name == 'Ridge':
            return Ridge(alpha=alpha)
        if model_name == 'Lasso':
            return Lasso(alpha=alpha)
        return ElasticNet(alpha=alpha, l1_ratio=0.7)

    def _fit_and_rank(alpha):
        # Print the CV RMSE, fit on all data, return coefficients sorted desc.
        model = _build_model(alpha)
        neg_mse_scores = cross_val_score(model, x_train,
                                         y_train, scoring="neg_mean_squared_error", cv=5)
        avg_rmse = np.mean(np.sqrt(-1 * neg_mse_scores))
        print('alpha {0}일 때 5 폴드 세트의 평균 RMSE: {1:.4f} '.format(param, avg_rmse))
        model.fit(x_train, y_train)
        coeff = pd.Series(data=model.coef_, index=x_train.columns)
        return coeff.sort_values(ascending=False)

    coeff_df = pd.DataFrame()
    print('{:=^30}'.format(model_name))

    if not graph:
        for param in params:
            coeff_df['alpha:' + str(param)] = _fit_and_rank(param)
        return coeff_df

    coeff_list = []
    # One subplot per alpha instead of a fixed five; squeeze=False keeps the
    # axes two-dimensional even for a single alpha.
    n_plots = max(len(params), 1)
    fig, axs = plt.subplots(figsize=(4 * n_plots, 5), nrows=1, ncols=n_plots, squeeze=False)
    for ax, param in zip(axs[0], params):
        coeff = _fit_and_rank(param)[:20]
        coeff_list.append([coeff.values, coeff.index])
        colname = 'alpha:' + str(param)
        coeff_df[colname] = coeff
        ax.set_title(colname)
        ax.set_xlim(-2, 2)
        sns.barplot(x=coeff.values, y=coeff.index, ax=ax)
    plt.tight_layout()
    plt.show()
    return coeff_df, coeff_list, fig

In [None]:
# Run the evaluation for each model; `df` and `coeff_list` are overwritten by
# every call, only the figures are kept under separate names.
# NOTE(review): ridge_alphas / lasso_alphas / elastic_alphas are not defined
# in this section; presumably lists of alpha values.
df, coeff_list, fig1 = get_linear_eval_coeff('Ridge', ridge_alphas, X_data, y_target, True)
df, coeff_list, fig2 = get_linear_eval_coeff('Lasso', lasso_alphas, X_data, y_target, True)
df, coeff_list, fig3 = get_linear_eval_coeff('ElasticNet', elastic_alphas, X_data, y_target, True)

In [None]:
# To view a single model's chart, evaluate its figure in a cell of its own.
fig1

In [None]:
fig2

In [None]:
fig3

<hr>

## 차원 축소
* PCA, LDA(Linear Discriminant Analysis), Truncated SVD, NMF(Non-negative Matrix Factorization)   
* 기본 scaler는 책에서 추천한 대로 Standard Scaler를 사용함   
    * 그랬더니 TSVD와 PCA가 거의 동일한 형태임

In [None]:
def get_dimension_reduction(feature, components, target=None):
    """Reduce *feature* with every applicable dimensionality-reduction method.

    PCA and TruncatedSVD are always applied. NMF is applied only when the
    data contains no negative value, and LDA only when *target* is supplied.
    No scaling is performed here; standardise *feature* beforehand if needed.

    Args:
        feature: Feature matrix.
        components: Number of components to keep.
        target: Class labels for LDA; ``None`` (or empty) skips LDA.

    Returns:
        Tuple whose length depends on the inputs:
        ``(pca, tsvd, nmf, lda)`` with target and non-negative data,
        ``(pca, tsvd, lda)`` with target and negative values,
        ``(pca, tsvd, nmf)`` without target and non-negative data,
        ``(pca, tsvd)`` without target and negative values.
    """
    from sklearn.decomposition import PCA, TruncatedSVD, NMF
    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

    def get_pca(feature, components):
        pca = PCA(n_components=components)
        pca.fit(feature)
        feature_pca = pca.transform(feature)
        print("PCA : {}\nSum: {:.3f}".format(pca.explained_variance_ratio_,np.sum(pca.explained_variance_ratio_)))
        return feature_pca

    def get_tsvd(feature, components):
        tsvd = TruncatedSVD(n_components=components)
        tsvd.fit(feature)
        feature_tsvd = tsvd.transform(feature)
        print("TSVD : {}\nSum: {:.3f}".format(tsvd.explained_variance_ratio_,np.sum(tsvd.explained_variance_ratio_)))
        return feature_tsvd

    def get_lda(feature, components, target):
        lda = LinearDiscriminantAnalysis(n_components=components)
        lda.fit(feature, target)
        feature_lda = lda.transform(feature)
        print("LDA : {}\nSum: {:.3f}".format(lda.explained_variance_ratio_, np.sum(lda.explained_variance_ratio_)))
        return feature_lda

    def get_nmf(feature, components):
        nmf = NMF(n_components=components)
        nmf.fit(feature)
        feature_nmf = nmf.transform(feature)
        return feature_nmf

    # Data is used as given (no scaler applied inside this function).
    feature_scaled = feature

    # The previous checks used `len(target) < 0`, which is never true, and
    # crashed on the default `target=None`; the unsupervised paths were
    # therefore unreachable.
    has_target = target is not None and len(target) > 0
    # NMF requires non-negative input.
    non_negative = np.asarray(feature_scaled).min() >= 0

    results = [
        get_pca(feature_scaled, components),
        get_tsvd(feature_scaled, components),
    ]
    if non_negative:
        results.append(get_nmf(feature_scaled, components))
    if has_target:
        results.append(get_lda(feature_scaled, components, target))

    return tuple(results)

In [None]:
# Example
# NOTE(review): `iris` is presumably the result of sklearn's load_iris().

x = iris.data
y = iris.target

# `y` is passed by keyword: a positional argument after `components=2`
# is a SyntaxError.
feature_pca, feature_tsvd, feature_nmf, feature_lda = get_dimension_reduction(x, components=2, target=y)

In [None]:
# 차원축소된 array를 DataFrame으로 변환

def create_df(feature_, columns, target):
    """Wrap a reduced feature array in a DataFrame and attach the target.

    Args:
        feature_: 2-D array-like of reduced features.
        columns: Column names for the features.
        target: Values stored in the additional ``'target'`` column.

    Returns:
        DataFrame with the feature columns followed by ``'target'``.
    """
    import pandas as pd

    frame = pd.DataFrame(data=feature_, columns=columns)
    frame['target'] = target
    return frame

# Two component columns, matching components=2 in the reduction above.
col = ['1_com', '2_com']
pca_df = create_df(feature_pca, col, y)
tsvd_df = create_df(feature_tsvd, col, y)
lda_df = create_df(feature_lda, col, y)
nmf_df = create_df(feature_nmf, col, y)

In [None]:
# Can only be drawn when the data was reduced to two components.

import seaborn as sns

# 2x2 grid: one scatter plot per reduction method, coloured by target.
f, ax = plt.subplots(2,2, figsize=(12,8))

ax[0,0].set_title('PCA')
sns.scatterplot(x='1_com', y='2_com', data=pca_df, hue='target', ax=ax[0,0])

ax[0,1].set_title('TSVD')
sns.scatterplot(x='1_com', y='2_com', data=tsvd_df, hue='target', ax=ax[0,1])

ax[1,0].set_title('LDA')
sns.scatterplot(x='1_com', y='2_com', data=lda_df, hue='target', ax=ax[1,0])

ax[1,1].set_title('NMF')
sns.scatterplot(x='1_com', y='2_com', data=nmf_df, hue='target', ax=ax[1,1])

plt.tight_layout()
plt.show()

<hr>

## 군집화(Clustering)에 따른 실루엣 스코어 확인

* K-means, GMM 2개의 군집 모델에 적용 가능한 실루엣 plot 을 그려주는 코드
* 한계
1. 실루엣 스코어 만으로 클러스터링의 정확도를 판단하는 것은 무리
2. 실제 분포(시각화)를 통해서 눈으로 보는 것도 중요함
3. 여러 클러스터링을 사용하여 최적의 군집개수(k)를 찾아보고 테스트 해보는 것을 추천함

In [None]:
### 여러개의 클러스터링 개수를 List로 입력 받아 각각의 실루엣 계수를 면적으로 시각화한 함수 작성  
def visualize_silhouette(cluster_model, cluster_lists, X_features):
    """Draw one silhouette plot per requested number of clusters.

    Args:
        cluster_model: ``'kmeans'`` (KMeans) or ``'gmm'`` (GaussianMixture).
        cluster_lists: List of cluster counts; one subplot is drawn for each.
        X_features: Feature matrix to cluster.

    Returns:
        None. The plots are drawn on a new matplotlib figure.

    Raises:
        ValueError: If *cluster_model* is not ``'kmeans'`` or ``'gmm'``.
    """
    if cluster_model not in ('kmeans', 'gmm'):
        # Previously an unknown name surfaced as an unbound `clusterer`.
        raise ValueError(
            "Unsupported cluster_model: {!r} (expected 'kmeans' or 'gmm')".format(cluster_model)
        )

    from sklearn.cluster import KMeans
    from sklearn.mixture import GaussianMixture
    from sklearn.metrics import silhouette_samples, silhouette_score

    import matplotlib.pyplot as plt
    import matplotlib.cm as cm

    # One subplot per entry of cluster_lists.
    n_cols = len(cluster_lists)

    # squeeze=False keeps `axs` indexable even when only one cluster count
    # is requested (a single Axes object is not subscriptable).
    fig, axs = plt.subplots(figsize=(4*n_cols, 4), nrows=1, ncols=n_cols, squeeze=False)
    axs = axs[0]

    for ind, n_cluster in enumerate(cluster_lists):

        if cluster_model == 'kmeans':
            clusterer = KMeans(n_clusters=n_cluster, max_iter=300, random_state=0)
        else:
            clusterer = GaussianMixture(n_components=n_cluster, random_state=0)

        # Average silhouette score and the per-sample silhouette values.
        cluster_labels = clusterer.fit_predict(X_features)
        sil_avg = silhouette_score(X_features, cluster_labels)
        sil_values = silhouette_samples(X_features, cluster_labels)

        y_lower = 10
        axs[ind].set_title('Number of Cluster : '+ str(n_cluster)+'\n' \
                          'Silhouette Score :' + str(round(sil_avg,3)) )
        axs[ind].set_xlabel("The silhouette coefficient values")
        axs[ind].set_ylabel("Cluster label")
        axs[ind].set_xlim([-0.1, 1])
        axs[ind].set_ylim([0, len(X_features) + (n_cluster + 1) * 10])
        axs[ind].set_yticks([])  # Clear the yaxis labels / ticks
        axs[ind].set_xticks([0, 0.2, 0.4, 0.6, 0.8, 1])

        # One filled band per cluster, stacked vertically with a gap of 10.
        for i in range(n_cluster):
            ith_cluster_sil_values = sil_values[cluster_labels==i]
            ith_cluster_sil_values.sort()

            size_cluster_i = ith_cluster_sil_values.shape[0]
            y_upper = y_lower + size_cluster_i

            color = cm.nipy_spectral(float(i) / n_cluster)
            axs[ind].fill_betweenx(np.arange(y_lower, y_upper), 0, ith_cluster_sil_values, \
                                facecolor=color, edgecolor=color, alpha=0.7)
            axs[ind].text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
            y_lower = y_upper + 10

        # Dashed red line marks the average silhouette score.
        axs[ind].axvline(x=sil_avg, color="red", linestyle="--")

In [None]:
# Arguments: (model name, list of cluster counts, feature matrix)
visualize_silhouette('gmm', [2,3,4,5], X_features)

### cluster별 Plot 생성
* kmeans, GMM, DBSCAN 가능

In [None]:
### 클러스터 결과를 담은 DataFrame과 사이킷런의 Cluster 객체등을 인자로 받아 클러스터링 결과를 시각화하는 함수  
def visualize_cluster_plot(cluster_model, X_feature, n_clusters=3):
    """Cluster the 2-D PCA projection of *X_feature* and scatter-plot it.

    Args:
        cluster_model: ``'kmeans'``, ``'gmm'`` or ``'dbscan'``.
        X_feature: Feature matrix; it is reduced to two principal components
            before clustering.
        n_clusters: Number of clusters/components for KMeans and
            GaussianMixture (ignored by DBSCAN). Defaults to 3.

    Returns:
        None. The fitted estimator is printed and the plot is shown.

    Raises:
        ValueError: If *cluster_model* is not a supported name.
    """
    if cluster_model not in ('kmeans', 'gmm', 'dbscan'):
        # Previously an unknown name surfaced as an unbound `clusterer`.
        raise ValueError(
            "Unsupported cluster_model: {!r} (expected 'kmeans', 'gmm' or 'dbscan')".format(cluster_model)
        )

    from sklearn.cluster import KMeans, DBSCAN
    from sklearn.mixture import GaussianMixture
    from sklearn.decomposition import PCA

    # Reduce to two components so the clusters can be drawn on a plane.
    pca = PCA(n_components=2)
    pca_df = pca.fit_transform(X_feature)
    pca_df = pd.DataFrame(pca_df, columns=['pca1','pca2'])

    if cluster_model == 'kmeans':
        clusterer = KMeans(n_clusters=n_clusters, init='k-means++', max_iter=500, random_state=0)
    elif cluster_model == 'gmm':
        clusterer = GaussianMixture(n_components=n_clusters, random_state=0)
    else:
        clusterer = DBSCAN(eps=0.5, min_samples=10, metric='euclidean')

    print(clusterer)
    clusterer_label = clusterer.fit_predict(pca_df)
    pca_df[cluster_model] = clusterer_label

    unique_labels = np.unique(pca_df[cluster_model].values)
    markers=['o', 's', '^', 'x', '*']
    isNoise=False

    for label in unique_labels:
        label_cluster = pca_df[pca_df[cluster_model]==label]
        # DBSCAN marks noise points with the label -1.
        if label == -1:
            cluster_legend = 'Noise'
            isNoise=True
        else:
            cluster_legend = 'Cluster '+str(label)

        # Cycle through the markers so more than five clusters do not raise
        # an IndexError; -1 still maps to the last marker as before.
        marker = markers[label % len(markers)]
        plt.scatter(x=label_cluster['pca1'], y=label_cluster['pca2'], s=70,\
                    edgecolor='k', marker=marker, label=cluster_legend)

    if isNoise:
        legend_loc='upper center'
    else:
        legend_loc='upper right'

    plt.legend(loc=legend_loc)
    plt.show()

In [None]:
# Example run: draw the cluster plot for each supported model.
# NOTE(review): `iris` is presumably the result of sklearn's load_iris().
visualize_cluster_plot('kmeans', iris.data)
visualize_cluster_plot('gmm', iris.data)
visualize_cluster_plot('dbscan', iris.data)

<hr>