In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from IDK2 import *

# Load the dataset. A forward slash is used as the separator: the previous
# 'csv\C...' literal contained the invalid escape sequence '\C' (a warning on
# recent Python versions) and only resolved to a real path on Windows.
data = pd.read_csv('csv/CinCECGTorso_TEST.csv')

# The class label is stored in the last column.
labels = data.iloc[:, -1]

# Feature matrix: every column except the trailing label column.
data_without_label = data.iloc[:, :-1]

# Colours for the distinct labels, sliced from matplotlib's tab10 palette
# (the slice keeps one more entry than there are labels).
unique_labels = labels.unique()
num_colors = len(unique_labels)
colors = plt.cm.tab10.colors[:num_colors + 1]

# Number of rows (time series) that were loaded.
print(len(data))

## IDK mapping

In [None]:
# Wrap every row of data_without_label as a one-element list holding that
# row's values, so each time series becomes a "distribution" with one point.
# row.tolist() is used directly on the ndarray row: this avoids referring to
# `np`, which is only imported in a later cell of this notebook.
list_of_distributions = [[row.tolist()] for row in data_without_label.values]
# NOTE(review): the meaning of the second argument (13) is defined by
# idk_kernel_map in IDK2 — presumably a sample-size parameter; confirm.
idk_map = idk_kernel_map(list_of_distributions, 13)

## Visualize before clustering

In [None]:
# Preview the first 10 time series, coloured by class label.
plt.figure(figsize=(10, 6))

# Fixed label -> colour lookup (wraps around if there are more labels than
# colours) instead of searching the label list on every iteration.
color_by_label = {
    label: colors[idx % len(colors)] for idx, label in enumerate(unique_labels)
}

# Only the first line drawn for each class gets a legend entry. Passing the
# label list straight to plt.legend() would attach the labels to the first
# plotted lines in plot order, regardless of which class they belong to.
labels_in_legend = set()
for i, (_, row) in enumerate(data_without_label.head(10).iterrows()):
    label = labels.iloc[i]
    legend_label = '_nolegend_' if label in labels_in_legend else str(label)
    labels_in_legend.add(label)
    plt.plot(row, color=color_by_label[label], label=legend_label)

plt.legend(title='Labels')
plt.xlabel('Time')
plt.ylabel('Value')
plt.title('Visualization of Time Series Data')
plt.show()

## Preprocessing


In [None]:
# DWT
import pywt
import numpy as np

# Toy input for the DWT demo: a 5-cycle sine wave sampled at 200 evenly
# spaced points on [0, 1].
t = np.linspace(0, 1, num=200)
signal = np.sin(2 * np.pi * 5 * t)
print(signal.shape)

# Single-level discrete wavelet transform with the 'db1' wavelet.
# pywt.dwt returns two arrays: cA and cD (approximation and detail
# coefficients in PyWavelets' terminology).
cA, cD = pywt.dwt(signal, wavelet='db1')

# Show the length of each coefficient array.
print(cA.shape, cD.shape)

## Reduce dimension

In [None]:
from sklearn.manifold import TSNE
import numpy as np

# 2-D t-SNE embedding of the raw series. t-SNE is stochastic, so a fixed
# random_state is set to make the embedding (and everything derived from it
# further down) reproducible across Restart & Run All.
tsne = TSNE(perplexity=3, n_components=2, random_state=42)

x = np.array(data_without_label)
y = np.array(labels)
# Optional: uncomment to collapse the labels to two classes (1 -> 0, rest -> 1).
# y = [0 if y[i] == 1 else 1 for i in range(len(y))]
print(y)
x_tsne = tsne.fit_transform(x)

# Scatter plot of the embedding, coloured by the true label.
plt.scatter(x_tsne[:, 0], x_tsne[:, 1], c=y)
plt.show()

## IK mapping

In [None]:
# Map the t-SNE embedding through iNN_IK and store the result as a dense array.
ik = iNN_IK(5, 10)  # positional arguments: psi, then t
ik.fit(x_tsne)
# NOTE(review): fit() is immediately followed by fit_transform() on the same
# data, so the model is presumably fitted twice — confirm whether the explicit
# fit() call above is needed.
sparse_representation = ik.fit_transform(x_tsne).toarray()

## Clustering

In [None]:
from MyKMeans import *

# Show which method is currently selected.
print(C)

# Pick the point representation that matches the selected method. The branches
# must be mutually exclusive (elif): with independent `if` statements the
# trailing `else` replaced the IDK and IK selections with x_tsne.
if C == choose.IDK:
    points = idk_map
elif C == choose.IK:
    points = sparse_representation
elif C == choose.GDK or C == choose.DTW:
    points = x
else:
    points = x_tsne

# Run k-means with 7 clusters and at most 25 iterations; `points` is rebound
# to the third value returned by KMeans.
centroids, closestCentroid, points = KMeans(points, k=7, maxIters=25)

## Visualize after clustering

In [None]:
# Side-by-side comparison: true labels (left) vs. cluster assignments (right).
# Only the first two columns of `points` are plotted.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6))

# Left panel: points coloured by their ground-truth label.
ax1.set_title('Original Points')
ax1.scatter(points[:, 0], points[:, 1], c=y, label='Original Points')

# Right panel: points coloured by the cluster they were assigned to.
ax2.set_title('Points after clustering')
unique_clusters, counts_array = np.unique(closestCentroid, return_counts=True)
# Stored under its own name so the `colors` palette defined in the loading
# cell is not overwritten, which would break a re-run of the preview plot.
cluster_colors = plt.cm.jet(np.linspace(0, 1, len(unique_clusters)))
for cluster, color in zip(unique_clusters, cluster_colors):
    cluster_points = points[closestCentroid == cluster]
    # `color=` (not `c=`): a single RGBA row passed as `c` is ambiguous and is
    # treated as four values to colour-map when the cluster has exactly 4 points.
    ax2.scatter(
        cluster_points[:, 0],
        cluster_points[:, 1],
        color=color,
        label=f'Cluster {cluster}',
    )

# Legend for the cluster panel.
ax2.legend()

# Axis labels for both panels.
for ax in (ax1, ax2):
    ax.set_xlabel('X Axis')
    ax.set_ylabel('Y Axis')

# Avoid overlapping titles and labels between the panels.
plt.tight_layout()

plt.show()


## NMI & ARI

In [None]:
from sklearn.metrics.cluster import normalized_mutual_info_score
# Normalized mutual information between the ground-truth labels and the
# cluster index assigned to each point.
labels_true, labels_pred = y, closestCentroid

score = normalized_mutual_info_score(labels_true=labels_true, labels_pred=labels_pred)
print(score)

In [None]:
from sklearn import metrics
# Adjusted Rand index between the ground-truth labels and the cluster index
# assigned to each point.
labels_true, labels_pred = y, closestCentroid

score = metrics.adjusted_rand_score(labels_true=labels_true, labels_pred=labels_pred)
print(score)