In [1]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
heart_failure_clinical_records = fetch_ucirepo(id=519) 
  
# data (as pandas dataframes) 
X = heart_failure_clinical_records.data.features 
y = heart_failure_clinical_records.data.targets 
  
# metadata 
print(heart_failure_clinical_records.metadata) 
  
# variable information 
print(heart_failure_clinical_records.variables) 


{'uci_id': 519, 'name': 'Heart Failure Clinical Records', 'repository_url': 'https://archive.ics.uci.edu/dataset/519/heart+failure+clinical+records', 'data_url': 'https://archive.ics.uci.edu/static/public/519/data.csv', 'abstract': 'This dataset contains the medical records of 299 patients who had heart failure, collected during their follow-up period, where each patient profile has 13 clinical features.', 'area': 'Health and Medicine', 'tasks': ['Classification', 'Regression', 'Clustering'], 'characteristics': ['Multivariate'], 'num_instances': 299, 'num_features': 12, 'feature_types': ['Integer', 'Real'], 'demographics': ['Age', 'Sex'], 'target_col': ['death_event'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 2020, 'last_updated': 'Mon Feb 26 2024', 'dataset_doi': '10.24432/C5Z89R', 'creators': [], 'intro_paper': {'title': 'Machine learning can predict survival of patients with heart failure from serum creatinine and ejec

In [2]:
X

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time
0,75.0,0,582,0,20,1,265000.00,1.9,130,1,0,4
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6
2,65.0,0,146,0,20,0,162000.00,1.3,129,1,1,7
3,50.0,1,111,0,20,0,210000.00,1.9,137,1,0,7
4,65.0,1,160,1,20,0,327000.00,2.7,116,0,0,8
...,...,...,...,...,...,...,...,...,...,...,...,...
294,62.0,0,61,1,38,1,155000.00,1.1,143,1,1,270
295,55.0,0,1820,0,38,0,270000.00,1.2,139,0,0,271
296,45.0,0,2060,1,60,0,742000.00,0.8,138,0,0,278
297,45.0,0,2413,0,38,0,140000.00,1.4,140,1,1,280


In [4]:
y

Unnamed: 0,death_event
0,1
1,1
2,1
3,1
4,1
...,...
294,0
295,0
296,0
297,0


In [19]:
import numpy as np

class HierarchicalClustering:

    #n_clusters：指定要生成的簇的数量，默认为 2。
    #linkage：指定用于计算簇之间距离的链接方式。
    def __init__(self, n_clusters=2, linkage='single'):
        self.n_clusters = n_clusters
        self.linkage = linkage
    
    def fit(self, X):
        print(X)
        # 获取样本数量和特征维度
        n_samples, _ = X.shape       #tips：使用下划线（underscore）作为一个占位符，用于表示一个变量的值不被使用或者忽略。在这个代码中，_ 作为一个占位符，用于接收 X.shape 返回的元组中的第二个值，但在后续的代码中并未使用到。这种做法是为了符合Python的习惯和规范，同时也表明了程序员的意图：只关心 n_samples，而不关心样本的特征维度。
        self.labels_ = np.zeros(n_samples)
        
        # 初始化距离矩阵，一个n*n的距离矩阵
        distances = np.zeros((n_samples, n_samples))
        
        print(distances.shape)


        # 计算距离矩阵
        for i in range(n_samples):
            for j in range(n_samples):
                print("i的值是:" + str(i))
                print("j的值是:" + str(j))
                print("X[i]:", X[i])
                print("X[j]:",X[j])
                distances[i, j] = self.calculate_distance(X[i], X[j])
                print("距离计算出来是:"+str(distances[i, j]))
        
        print(distances)
        
        # 开始聚类过程，直到达到指定的簇数量
        for _ in range(n_samples - self.n_clusters):
            min_distance = np.inf
            
            for b in range(n_samples):
                for c in range(b+1, n_samples):
                    if distances[b, c] < min_distance:
                        min_distance = distances[b, c]
                        min_i = b
                        min_j = c
            
            # 将最近的两个簇合并
            for i in range(n_samples):
                if i != min_i and i != min_j:
                    if self.linkage == 'single':
                        distances[min_i, i] = min(distances[min_i, i], distances[min_j, i])
                    elif self.linkage == 'complete':
                        distances[min_i, i] = max(distances[min_i, i], distances[min_j, i])
                    elif self.linkage == 'average':
                        distances[min_i, i] = (distances[min_i, i] + distances[min_j, i]) / 2
            
            # 从距离矩阵中删除已合并的簇
            distances = np.delete(distances, min_j, axis=0)
            distances = np.delete(distances, min_j, axis=1)
            
        # 根据簇的合并情况得到最终的标签
        self.labels_ = self.get_labels(distances)
    
    def calculate_distance(self, x1, x2):
        # 根据选择的链接方式计算两个样本之间的距离
        if self.linkage == 'single':
            return np.linalg.norm(x1 - x2, ord=1)  # 使用曼哈顿距离
        elif self.linkage == 'complete':
            return np.linalg.norm(x1 - x2, ord=np.inf)  # 使用切比雪夫距离
        elif self.linkage == 'average':
            return np.linalg.norm(x1 - x2)  # 使用欧氏距离
    
    def get_labels(self, distances):
        n_samples = distances.shape[0]
        labels = np.zeros(n_samples)
        current_label = 0
        cluster_dict = {}
        
        for i in range(n_samples):
            if i not in cluster_dict:
                cluster_dict[i] = current_label
                current_label += 1
            
            for j in range(i+1, n_samples):
                if distances[i, j] == 0:
                    if j not in cluster_dict:
                        cluster_dict[j] = cluster_dict[i]
                    else:
                        for k, v in cluster_dict.items():
                            if v == cluster_dict[j]:
                                cluster_dict[k] = cluster_dict[i]
                    break
        
        for i in range(n_samples):
            labels[i] = cluster_dict[i]
        
        return labels.astype(int)

# 测试算法
X_t = np.array([[1, 2], [1, 3], [2, 2], [8, 7], [8, 8], [7, 7]])
model_single = HierarchicalClustering(n_clusters=2, linkage='single')
model_complete = HierarchicalClustering(n_clusters=2, linkage='complete')
model_average = HierarchicalClustering(n_clusters=2, linkage='average')

model_single.fit(X_t)
model_complete.fit(X_t)
model_average.fit(X_t)

print("Cluster Labels (Single Linkage):", model_single.labels_)
print("Cluster Labels (Complete Linkage):", model_complete.labels_)
print("Cluster Labels (Average Linkage):", model_average.labels_)


[[1 2]
 [1 3]
 [2 2]
 [8 7]
 [8 8]
 [7 7]]
(6, 6)
i的值是:0
j的值是:0
X[i]: [1 2]
X[j]: [1 2]
距离计算出来是:0.0
i的值是:0
j的值是:1
X[i]: [1 2]
X[j]: [1 3]
距离计算出来是:1.0
i的值是:0
j的值是:2
X[i]: [1 2]
X[j]: [2 2]
距离计算出来是:1.0
i的值是:0
j的值是:3
X[i]: [1 2]
X[j]: [8 7]
距离计算出来是:12.0
i的值是:0
j的值是:4
X[i]: [1 2]
X[j]: [8 8]
距离计算出来是:13.0
i的值是:0
j的值是:5
X[i]: [1 2]
X[j]: [7 7]
距离计算出来是:11.0
i的值是:1
j的值是:0
X[i]: [1 3]
X[j]: [1 2]
距离计算出来是:1.0
i的值是:1
j的值是:1
X[i]: [1 3]
X[j]: [1 3]
距离计算出来是:0.0
i的值是:1
j的值是:2
X[i]: [1 3]
X[j]: [2 2]
距离计算出来是:2.0
i的值是:1
j的值是:3
X[i]: [1 3]
X[j]: [8 7]
距离计算出来是:11.0
i的值是:1
j的值是:4
X[i]: [1 3]
X[j]: [8 8]
距离计算出来是:12.0
i的值是:1
j的值是:5
X[i]: [1 3]
X[j]: [7 7]
距离计算出来是:10.0
i的值是:2
j的值是:0
X[i]: [2 2]
X[j]: [1 2]
距离计算出来是:1.0
i的值是:2
j的值是:1
X[i]: [2 2]
X[j]: [1 3]
距离计算出来是:2.0
i的值是:2
j的值是:2
X[i]: [2 2]
X[j]: [2 2]
距离计算出来是:0.0
i的值是:2
j的值是:3
X[i]: [2 2]
X[j]: [8 7]
距离计算出来是:11.0
i的值是:2
j的值是:4
X[i]: [2 2]
X[j]: [8 8]
距离计算出来是:12.0
i的值是:2
j的值是:5
X[i]: [2 2]
X[j]: [7 7]
距离计算出来是:10.0
i的值是:3
j的值是:0
X[i]: [8 7]
X[j]: [1 2]
距离计

IndexError: index 5 is out of bounds for axis 1 with size 5