In [1]:
import pandas as pd
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn import mixture

In [2]:
def cluster_kmeans(data_path='../init_data/train.csv', cluster_range=range(3, 20), random_state=0):
    """Print K-Means clustering quality scores for a range of cluster counts.

    Stocks are clustered on the rows of the stock-by-stock correlation
    matrix of ``target``. For every cluster count the function prints the
    silhouette score under Euclidean and cosine distance, followed by the
    Davies-Bouldin score.

    Parameters
    ----------
    data_path : str
        CSV file containing ``time_id``, ``stock_id`` and ``target`` columns.
    cluster_range : iterable of int
        Cluster counts to evaluate. Defaults to 3..19 inclusive.
    random_state : int
        Seed for the K-Means centroid initialisation.

    Returns
    -------
    None
        Results are written to stdout only.
    """
    train_p = pd.read_csv(data_path)

    # Pivot into a time_id x stock_id table of targets. According to the
    # original notes the table contains missing values (28 of them) because
    # the stocks do not all share exactly the same set of time_ids.
    train_p = train_p.pivot(index='time_id', columns='stock_id', values='target')
    # Correlation matrix between stock_id columns (noted as 112 x 112 in the
    # original comments); each row is used as that stock's feature vector.
    train_p_corr = train_p.corr()
    features = train_p_corr.values

    for i in cluster_range:
        # n_init is pinned to 10 (the default before scikit-learn 1.4) so the
        # scores do not silently change, or emit a FutureWarning, when the
        # library default moves to 'auto'.
        kmeans = KMeans(n_clusters=i, n_init=10, random_state=random_state).fit(features)
        print("欧式距离", i, silhouette_score(features, kmeans.labels_))
        print("余弦距离", i, silhouette_score(features, kmeans.labels_, metric='cosine'))
        print("davies_bouldin_score", i, davies_bouldin_score(features, kmeans.labels_))
        print("*" * 100)

In [3]:
# Run the K-Means sweep; the scores for each cluster count are printed below.
cluster_kmeans()

欧式距离 3 0.3974102647483843
余弦距离 3 0.44068935163101763
davies_bouldin_score 3 0.909788678581063
****************************************************************************************************
欧式距离 4 0.32001485570710503
余弦距离 4 0.3927916506991904
davies_bouldin_score 4 1.0983442127807246
****************************************************************************************************
欧式距离 5 0.33232290541616416
余弦距离 5 0.23823111070771358
davies_bouldin_score 5 1.0680829975661201
****************************************************************************************************
欧式距离 6 0.23860600932843856
余弦距离 6 0.2439325354844353
davies_bouldin_score 6 1.17710175134732
****************************************************************************************************
欧式距离 7 0.26090351390012806
余弦距离 7 0.19145468679142338
davies_bouldin_score 7 0.9973781402178068
****************************************************************************************************
欧式距离 8 0.213219097168

In [4]:
def cluster_GaussianMixture(data_path='../init_data/train.csv', cluster_range=range(3, 20), random_state=0):
    """Print Gaussian-mixture clustering quality scores for a range of component counts.

    Stocks are clustered on the rows of the stock-by-stock correlation
    matrix of ``target``. For every component count the function prints the
    silhouette score under Euclidean and cosine distance, followed by the
    Davies-Bouldin score.

    Parameters
    ----------
    data_path : str
        CSV file containing ``time_id``, ``stock_id`` and ``target`` columns.
    cluster_range : iterable of int
        Numbers of mixture components to evaluate. Defaults to 3..19 inclusive.
    random_state : int
        Seed for the mixture-model initialisation. Without a fixed seed the
        fitted model, and therefore every printed score, differs between runs.

    Returns
    -------
    None
        Results are written to stdout only.
    """
    train_p = pd.read_csv(data_path)

    # Pivot into a time_id x stock_id table of targets. According to the
    # original notes the table contains missing values (28 of them) because
    # the stocks do not all share exactly the same set of time_ids.
    train_p = train_p.pivot(index='time_id', columns='stock_id', values='target')
    # Correlation matrix between stock_id columns (noted as 112 x 112 in the
    # original comments); each row is used as that stock's feature vector.
    train_p_corr = train_p.corr()
    features = train_p_corr.values

    for i in cluster_range:
        # Seeded so that a Restart & Run All reproduces the same labels.
        gauss = mixture.GaussianMixture(n_components=i, random_state=random_state).fit(features)
        cluster_labels = gauss.predict(features)
        print("欧式距离", i, silhouette_score(features, cluster_labels))
        print("余弦距离", i, silhouette_score(features, cluster_labels, metric='cosine'))
        print("davies_bouldin_score", i, davies_bouldin_score(features, cluster_labels))
        print("*" * 100)

In [5]:
# Run the Gaussian-mixture sweep; the scores for each component count are printed below.
cluster_GaussianMixture()

欧式距离 3 0.3573114784818623
余弦距离 3 0.3969427842196652
davies_bouldin_score 3 1.0093210181480508
****************************************************************************************************
欧式距离 4 0.23559260150186004
余弦距离 4 0.2935479958477141
davies_bouldin_score 4 1.191883248420416
****************************************************************************************************
欧式距离 5 0.3120057673971827
余弦距离 5 0.2834826593621762
davies_bouldin_score 5 0.9301453297543057
****************************************************************************************************
欧式距离 6 0.23973375776776495
余弦距离 6 0.21185954070736748
davies_bouldin_score 6 1.1744669654862392
****************************************************************************************************
欧式距离 7 0.22113103011797416
余弦距离 7 0.24671302488412486
davies_bouldin_score 7 1.277643610063864
****************************************************************************************************
欧式距离 8 0.2421402844145

In [6]:
def cluster_AgglomerativeClustering(data_path='../init_data/train.csv', cluster_range=range(3, 20)):
    """Print agglomerative (hierarchical) clustering quality scores for a range of cluster counts.

    Stocks are clustered on the rows of the stock-by-stock correlation
    matrix of ``target`` using average linkage with cosine distance. For
    every cluster count the function prints the silhouette score under
    Euclidean and cosine distance, followed by the Davies-Bouldin score.

    Parameters
    ----------
    data_path : str
        CSV file containing ``time_id``, ``stock_id`` and ``target`` columns.
    cluster_range : iterable of int
        Cluster counts to evaluate. Defaults to 3..19 inclusive.

    Returns
    -------
    None
        Results are written to stdout only.
    """
    train_p = pd.read_csv(data_path)

    # Pivot into a time_id x stock_id table of targets. According to the
    # original notes the table contains missing values (28 of them) because
    # the stocks do not all share exactly the same set of time_ids.
    train_p = train_p.pivot(index='time_id', columns='stock_id', values='target')
    # Correlation matrix between stock_id columns (noted as 112 x 112 in the
    # original comments); each row is used as that stock's feature vector.
    train_p_corr = train_p.corr()
    features = train_p_corr.values

    for i in cluster_range:
        # scikit-learn 1.2 renamed `affinity` to `metric` and 1.4 removed the
        # old keyword. Try the current name first and fall back for older
        # installs, where the unknown keyword raises TypeError.
        try:
            agg = AgglomerativeClustering(n_clusters=i, metric='cosine', linkage='average')
        except TypeError:
            agg = AgglomerativeClustering(n_clusters=i, affinity='cosine', linkage='average')
        cluster_labels = agg.fit_predict(features)
        print("欧式距离", i, silhouette_score(features, cluster_labels))
        print("余弦距离", i, silhouette_score(features, cluster_labels, metric='cosine'))
        print("davies_bouldin_score", i, davies_bouldin_score(features, cluster_labels))
        print("*" * 100)

In [7]:
# Run the agglomerative-clustering sweep; the scores for each cluster count are printed below.
cluster_AgglomerativeClustering()

欧式距离 3 0.6701976094021231
余弦距离 3 0.7813802666973293
davies_bouldin_score 3 0.186876597076697
****************************************************************************************************
欧式距离 4 0.6314059608793937
余弦距离 4 0.574990343128351
davies_bouldin_score 4 0.20260441602830237
****************************************************************************************************
欧式距离 5 0.3488134683263294
余弦距离 5 0.5958432195986667
davies_bouldin_score 5 0.5832779693103645
****************************************************************************************************
欧式距离 6 0.3089034029952998
余弦距离 6 0.528667322310333
davies_bouldin_score 6 0.5771938808918109
****************************************************************************************************
欧式距离 7 0.27590977328120875
余弦距离 7 0.4775245610448362
davies_bouldin_score 7 0.7216916994900553
****************************************************************************************************
欧式距离 8 0.2765489187000022
