# 1.探讨多大的样本量可以代表总体

In [None]:
# Colab-only setup: mount Google Drive under /content/drive and install the
# raster I/O and plotting packages used by this section.
from google.colab import drive
drive.mount('/content/drive')
!pip install rasterio matplotlib

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import glob
import os
import rasterio
import pandas as pd

统计

In [None]:
VI_feature_file = '/content/drive/MyDrive/14SNE/VI_parameter_EVIcos1_14SNE.tif'
with rasterio.open(VI_feature_file) as src:
    feature_image = src.read()

# Population statistics over every value that was read from the raster.
true_mean = np.mean(feature_image)
true_std = np.std(feature_image)
print('***** total statistics *****')
print('image size information:',feature_image.shape)
print('total mean:',true_mean,'total std:',true_std)

# Flatten once, outside the loop: flatten() copies the whole raster, so calling
# it per iteration repeated that copy 100 times.
population = feature_image.ravel()

# Draw progressively larger samples and record their mean / standard deviation.
sample_means = []
sample_stds = []
# 100 log-spaced sizes between 10 and 1e8.  Sampling without replacement cannot
# draw more values than the population holds, so the sizes are capped there.
sample_sizes = np.minimum(
    np.logspace(1, 8, num=100, base=10).astype(int),
    population.size,
)
print('***** different sample ratio statistics *****')
for size in sample_sizes:
    # Random sample without replacement from the whole population.
    sample = np.random.choice(population, size, replace=False)
    sample_mean = np.mean(sample)
    sample_std = np.std(sample)
    sample_means.append(sample_mean)
    sample_stds.append(sample_std)
    print(f'sample size: {size}, sample mean: {sample_mean}, sample std: {sample_std}')

# Persist the results so they can be plotted without repeating the sampling.
np.save('sample_sizes.npy', sample_sizes)
np.save('sample_means.npy', sample_means)
np.save('sample_stds.npy', sample_stds)
np.save('true_mean.npy', true_mean)
np.save('true_std.npy', true_std)


绘图

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

# Use the stored sample sizes when a 'sample_sizes.npy' file is present;
# otherwise rebuild the nominal log-spaced schedule.
try:
    sample_sizes = np.load('sample_sizes.npy')
except FileNotFoundError:
    sample_sizes = np.logspace(1, 8, num=100, base=10).astype(int)
sample_means = np.load('sample_means.npy')
sample_stds = np.load('sample_stds.npy')
# Read the population statistics back from disk as well, so this cell does not
# need the full raster (feature_image) to still be in memory.
true_mean = np.load('true_mean.npy').item()
true_std = np.load('true_std.npy').item()

# One figure showing both the sample mean and the sample standard deviation.
plt.figure(figsize=(4, 3))

plt.rcParams['font.family'] = 'serif'
plt.rcParams['font.size'] = 8

# Sample mean and standard deviation versus sample size.
plt.plot(sample_sizes, sample_means, linestyle='--', label='Sample Means',color='green',linewidth=1)
plt.plot(sample_sizes, sample_stds, linestyle='--', label='Sample Std Dev', color='orange',linewidth=1)

# Logarithmic x axis.
plt.xscale('log')

# Major ticks at powers of ten, labelled by their exponent only.
ax = plt.gca()
ax.xaxis.set_major_locator(ticker.LogLocator(base=10.0))
ax.xaxis.set_minor_locator(ticker.NullLocator())
ax.xaxis.set_major_formatter(ticker.LogFormatterExponent(base=10.0))

plt.xlabel('Sample Size / 10^')
plt.ylabel('EVI paramater Value / 10^-3')

# Threshold line at a sample size of 10^4.
plt.axvline(x=10**4, color='gray', linestyle='--', label=r'$x=10^4$', linewidth=0.5)

# Build the legend only after every labelled artist exists; calling it before
# axvline() left the threshold line out of the legend.
plt.legend()

plt.savefig('sampleSize_figure.jpg', dpi=300,bbox_inches='tight')
# Show the figure.
plt.show()


# 2.置信学习去除样本标签噪声

In [None]:
# Colab-only setup: mount Google Drive and install the packages used by the
# confident-learning section (rasterio, matplotlib, cleanlab, scikit-learn).
from google.colab import drive
drive.mount('/content/drive')
!pip install rasterio matplotlib
!pip install cleanlab scikit-learn

In [None]:
import glob
import os
import rasterio
import numpy as np
import pandas as pd
from cleanlab.classification import CleanLearning
from sklearn.ensemble import RandomForestClassifier

In [None]:
# 1. Load the feature table; keep FID as the identifier and the CDL code as the label source.
feature_file = '/content/drive/MyDrive/14SNE/feature_14SNE.csv'
feature_df = pd.read_csv(feature_file)

# FID is the unique identifier of each sample.
fid_column = feature_df['FID']
cropland_column = feature_df['cropland']

# Binary target: CDL codes 24, 26 and 27 map to 1, every other code to 0.
cdl_labels_binary = cropland_column.isin([24, 26, 27]).astype(int)

# Feature matrix: everything except the label and the bookkeeping columns.
features = feature_df.drop(columns=['cropland', 'FID', 'system:index', '.geo'])

# 2. Confident-learning wrapper around a random-forest classifier.
rf_classifier = RandomForestClassifier()
cl = CleanLearning(clf=rf_classifier)

# 3. Stopping rule: stop once the number of flagged samples changes by less
#    than noise_threshold between two consecutive iterations.
noise_threshold = 50  # maximum change in the noise count that counts as converged
max_iterations = 20  # hard cap on the number of iterations
previous_noise_count = np.inf  # no previous iteration yet

# 4. Iterate confident learning, feeding each round's predicted labels to the next.
previous_labels = cdl_labels_binary
for iteration in range(max_iterations):
    # fit() already runs the cross-validated label-issue search internally, so
    # its stored result is reused instead of repeating the search.
    cl.fit(features, previous_labels)
    label_issues = cl.get_label_issues()
    print('label_issues information:')
    print(label_issues.head())
    noise_indices = np.where(label_issues.is_label_issue)[0]  # positions of flagged samples
    current_noise_count = len(noise_indices)

    print(f"Iteration {iteration + 1}: Noise points = {current_noise_count}")

    # Change relative to the previous round (inf on the first round, so the
    # loop can never stop before a second round has been compared).
    noise_change = abs(previous_noise_count - current_noise_count)
    previous_noise_count = current_noise_count

    # The next round starts from the labels predicted in this round.
    previous_labels = label_issues.predicted_label

    if noise_change < noise_threshold:
        print(f"Stopped at iteration {iteration + 1}: Noise change < {noise_threshold}")
        break

# 5. Noise flag: 1 where the final label differs from the original CDL label, else 0.
is_noise = np.where(previous_labels != cdl_labels_binary, 1, 0)

# 6. Combine identifier, original label, cleaned label and noise flag.
result_df = pd.DataFrame({
    'FID': fid_column,  # original unique identifier
    'cropland': cropland_column,
    'CDL_Label': cdl_labels_binary,  # original binary CDL label
    'Confident_Label': previous_labels,  # label after confident learning
    'Is_Noise': is_noise  # 1 = noise, 0 = clean
})

# 7. Preview and persist the result for the following sections.
print(result_df.head())
result_df.to_csv('/content/drive/MyDrive/14SNE/cleaned_labels_14SNE.csv', index=False)

# 3.绘制t-SNE展示置信学习效果

In [None]:
import pandas as pd
import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.colors import ListedColormap
import seaborn as sns

In [None]:
# 1. Load the feature table and the confident-learning result.
feature_file = '/content/drive/MyDrive/14SNE/feature_14SNE.csv'
feature_df = pd.read_csv(feature_file)
features = feature_df.drop(columns=['cropland', 'FID', 'system:index', '.geo'])

result_file = '/content/drive/MyDrive/14SNE/cleaned_labels_14SNE.csv'
result_df = pd.read_csv(result_file)
cdl_labels_binary = result_df['CDL_Label']
previous_labels = result_df['Confident_Label']
is_noise = result_df['Is_Noise']

# 2. Embed the seven EVI harmonic coefficients into three dimensions with t-SNE.
features_EVI = features[['constant_EVI','cos_1_EVI','cos_2_EVI','cos_3_EVI','sin_1_EVI','sin_2_EVI','sin_3_EVI']]

tsne = TSNE(n_components=3, random_state=42)
features_3d = tsne.fit_transform(features_EVI)

# 3. Collect the embedding together with the original and cleaned labels.
tsne_df = pd.DataFrame({
    'X': features_3d[:, 0],  # first t-SNE coordinate
    'Y': features_3d[:, 1],  # second t-SNE coordinate
    'Z': features_3d[:, 2],  # third t-SNE coordinate
    'Original_Label': cdl_labels_binary,  # original label
    'Confident_Label': previous_labels,  # label after confident learning
    'Is_Noise': is_noise  # noise flag
})
# np.save() stores only the bare value matrix of a DataFrame (the column names
# are lost).  The .npy file is kept for compatibility; the CSV written next to
# it preserves the column names.
np.save('tsne_df.npy', tsne_df)
tsne_df.to_csv('tsne_df.csv', index=False)


In [None]:

# Draw the 3-D t-SNE embedding twice: coloured by the original labels (left)
# and by the labels after confident learning (right).
fig = plt.figure(figsize=(8, 3.5))

plt.rcParams['font.family'] = 'serif'
plt.rcParams['font.size'] = 8

custom_cmap = ListedColormap(['green', 'orange'])


def _draw_tsne_panel(position, label_column, title, title_pad):
    """Add one 3-D scatter of tsne_df coloured by ``label_column``; return (axes, scatter)."""
    ax = fig.add_subplot(position, projection='3d')
    # Coordinates and colours both come from tsne_df; the name 'tsne_df2' used
    # here before is not defined anywhere in this file.
    scatter = ax.scatter(tsne_df['X'], tsne_df['Y'], tsne_df['Z'], c=tsne_df[label_column], cmap=custom_cmap, s=2)
    ax.set_title(title, pad=title_pad)
    ax.set_xlabel('t-SNE 1', labelpad=-5)
    ax.set_ylabel('t-SNE 2', labelpad=-5)
    ax.set_zlabel('t-SNE 3', labelpad=-8)
    ax.view_init(elev=30, azim=30)  # rotate to a more readable viewing angle

    # Pull the tick labels closer to the axes.
    ax.tick_params(axis='x', pad=-3.5)
    ax.tick_params(axis='y', pad=-2)
    ax.tick_params(axis='z', pad=-2)
    return ax, scatter


# Panel 1: original labels.
ax1, scatter1 = _draw_tsne_panel(121, 'Original_Label', ' t-SNE without using CL', -1)

# Panel 2: labels after confident learning.
ax2, scatter2 = _draw_tsne_panel(122, 'Confident_Label', 't-SNE using CL', 0)

plt.subplots_adjust(left=0.5, wspace=0.3)

# Finalise layout, save and show.
plt.tight_layout()
plt.savefig('CL_tSNE.jpg', dpi=300,bbox_inches='tight')

plt.show()

# 4.样本重聚类，同时获取参考物候曲线

In [None]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans

In [None]:
# 1. Load the feature table and the confident-learning result.
feature_file = '/content/drive/MyDrive/14SNE/feature_14SNE.csv'
feature_df = pd.read_csv(feature_file)

cdl_labels = feature_df['cropland']

result_file = '/content/drive/MyDrive/14SNE/cleaned_labels_14SNE.csv'
result_df = pd.read_csv(result_file)
result_df['cropland'] = cdl_labels

feature_EVI_name = ['constant_EVI','cos_1_EVI','cos_2_EVI','cos_3_EVI','sin_1_EVI','sin_2_EVI','sin_3_EVI']
feature_RE1_name = ['constant_RE1','cos_1_RE1','cos_2_RE1','cos_3_RE1','sin_1_RE1','sin_2_RE1','sin_3_RE1']
feature_LSWI_name = ['constant_LSWI','cos_1_LSWI','cos_2_LSWI','cos_3_LSWI','sin_1_LSWI','sin_2_LSWI','sin_3_LSWI']
VI_names = feature_EVI_name
features_EVI = feature_df[feature_EVI_name]
# NOTE(review): the clustering below runs on the LSWI coefficients although
# VI_names points at the EVI list - confirm this is the intended feature set.
features_VI = feature_df[feature_LSWI_name]

# CDL classes 121, 122, 123 and 111 are always treated as clean.  Is_Noise is a
# 0/1 integer column in the CSV, so 0 is written rather than the boolean False.
result_df.loc[result_df['cropland'].isin([121, 122, 123, 111]), 'Is_Noise'] = 0

# 2. Keep only the samples that are not flagged as noise.
filtered_data = result_df[result_df['Is_Noise'] == 0]
filtered_features = features_VI.loc[filtered_data.index]

# Split by cleaned label.  copy() makes each subset an independent frame so the
# 'Cluster' column added below is not a chained assignment on a slice.
data_label_0 = filtered_data[filtered_data['Confident_Label'] == 0].copy()
features_label_0 = filtered_features.loc[data_label_0.index]

data_label_1 = filtered_data[filtered_data['Confident_Label'] == 1].copy()
features_label_1 = filtered_features.loc[data_label_1.index]

# 3. K-means per label: five clusters for label 0, a single cluster for label 1.
kmeans_0 = KMeans(n_clusters=5, random_state=42)
clusters_label_0 = kmeans_0.fit_predict(features_label_0)

kmeans_1 = KMeans(n_clusters=1, random_state=42)
clusters_label_1 = kmeans_1.fit_predict(features_label_1)

# 4. Encode cluster ids so the last digit carries the label:
#    label 0 -> 10, 20, 30, 40, 50; label 1 -> 11.
cluster_labels_0 = (clusters_label_0 + 1) * 10
cluster_labels_1 = (clusters_label_1 + 1) * 10 + 1

data_label_0['Cluster'] = cluster_labels_0
data_label_1['Cluster'] = cluster_labels_1

# 5. Merge both label groups.
clustered_data = pd.concat([data_label_0, data_label_1])

# 6. Write the cluster ids back by index; noisy samples keep NaN.
result_df['Cluster'] = np.nan
result_df.loc[clustered_data.index, 'Cluster'] = clustered_data['Cluster']

# 7. Final table: FID, CDL code, original label, cleaned label and cluster id.
final_result = result_df[['FID', 'cropland','CDL_Label', 'Confident_Label', 'Cluster']]

final_result.to_csv('/content/drive/MyDrive/14SNE/clustered_labels_14SNE_V2.csv', index=False)


# 5.绘制不同聚类的EVI曲线示意图

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import cm

# Names of the seven EVI harmonic coefficient columns (constant, cos 1-3, sin 1-3).
feature_EVI_name = [
    f'{term}_EVI'
    for term in ('constant', 'cos_1', 'cos_2', 'cos_3', 'sin_1', 'sin_2', 'sin_3')
]

# Colour maps: greens for label-0 clusters, oranges for label-1 clusters.
cmap_label_0, cmap_label_1 = cm.Greens, cm.Oranges

# 37 evenly spaced day-of-year positions from 241 to 611 and the phase values
# (DOY * pi / 365) at which the harmonic terms are evaluated.
x_values = np.linspace(241, 611, 37)
t_values = x_values * np.pi / 365

# Third-order harmonic series built from the EVI coefficients of one sample.
def harmonic_function(x, feature_row):
    """Evaluate the EVI harmonic series at ``x``.

    ``feature_row`` must provide 'constant_EVI' and 'cos_k_EVI' / 'sin_k_EVI'
    for k = 1, 2, 3.  Returns
    constant + sum_k (cos_k * cos(k * x) + sin_k * sin(k * x)).
    """
    total = feature_row['constant_EVI']
    for order in (1, 2, 3):
        total = total + feature_row[f'cos_{order}_EVI'] * np.cos(order * x)
        total = total + feature_row[f'sin_{order}_EVI'] * np.sin(order * x)
    return total

# Figure with two panels: mean EVI curve per crop type (left) and median EVI
# curve per K-means cluster (right).
fig, axes = plt.subplots(1, 2, figsize=(8.5, 3.5))

plt.rcParams['font.family'] = 'serif'
plt.rcParams['font.size'] = 8

### Custom axis ticks ###
xticks_values = [250, 300, 350, 400, 450, 500, 550, 600]  # positions on the continuous DOY axis
xticks_labels = [250, 300, 350, 35, 85, 135, 185, 235]     # same positions as wrapped day-of-year labels

yticks_values = [100, 200, 300, 400, 500, 600]

# Line colours, one per curve, for each panel.
colors_sub1 = ['orange', 'brown', 'purple', 'green', 'blue', 'red', 'pink']
colors_sub2 = ['orange', 'brown', 'purple', 'green', 'blue', 'red', 'pink']


def _harmonic_curves(samples):
    """Return the harmonic curve of every row of ``samples`` as an (n_rows, n_times) array."""
    # Column vectors broadcast against t_values, so all rows are evaluated in
    # one call instead of one Python-level iteration per sample.
    coefficients = {name: samples[name].to_numpy()[:, np.newaxis] for name in feature_EVI_name}
    return harmonic_function(t_values, coefficients)


### Left panel: one curve per crop type ###

crops = [24, 27, 4, 36, 37, 176, 142]
# CDL code -> legend name.
crops_dict = {
    24: 'Winter wheat / Ww',
    27: 'Rye',
    4: 'Sorghum',
    36: 'Alfalfa',
    37: 'Other Hay',
    176: 'Pasture',
    142: 'Forest'
}

for idx, crop in enumerate(crops):
    # Samples whose CDL code equals the current crop.
    cluster_data = features_EVI[result_df['cropland'] == crop]

    print(crop, ' sample number:', len(cluster_data))
    if cluster_data.empty:
        # Nothing to average; skip instead of plotting an all-NaN curve.
        continue

    # Mean of the per-sample harmonic curves.
    mean_harmonic_values = _harmonic_curves(cluster_data).mean(axis=0)

    axes[0].plot(x_values, mean_harmonic_values, label=f'{crops_dict[crop]}', color=colors_sub1[idx],linestyle='--', alpha=0.8, linewidth=2)

axes[0].set_title('EVI Curves for Different Crop types')
axes[0].set_xlabel('DOY')
axes[0].set_ylabel('EVI / 10^-2')

axes[0].set_xticks(xticks_values)
axes[0].set_xticklabels(xticks_labels)
axes[0].set_yticks(yticks_values)

### Right panel: one curve per cluster ###

# Cluster id -> legend name; only the clusters listed here are drawn.
clusters_dict = {
    #21.0: 'Ww c1',
    11.0: 'Ww c2',
    50.0: 'Non-Ww c1',
    30.0: 'Non-Ww c2',
    20.0: 'Non-Ww c3',
    40.0: 'Non-Ww c4',
    10.0: 'Non-Ww c5'
}

for idx, cluster in enumerate(clusters_dict.keys()):
    # Samples assigned to the current cluster.
    cluster_data = features_EVI[result_df['Cluster'] == cluster]
    if cluster_data.empty:
        # A listed cluster without samples has no curve; the median of an
        # empty set would otherwise break the plot call.
        print(cluster, ' has no samples, skipped')
        continue

    # Point-wise median (not mean) of the per-sample harmonic curves.
    mean_harmonic_values = np.median(_harmonic_curves(cluster_data), axis=0)

    axes[1].plot(x_values, mean_harmonic_values, label=f'{clusters_dict[cluster]}', color=colors_sub2[idx],linestyle='--', alpha=0.8, linewidth=2)

axes[1].set_title('EVI Curves for Each Cluster using K-means')
axes[1].set_xlabel('DOY')
axes[1].set_ylabel('EVI / 10^-2')

axes[1].set_xticks(xticks_values)
axes[1].set_xticklabels(xticks_labels)
axes[1].set_yticks(yticks_values)

### Same y range on both panels ###
axes[0].set_ylim(60, 650)
axes[1].set_ylim(60, 650)

# Collect the legend entries of each panel.
handles_1, labels_1 = axes[0].get_legend_handles_labels()
handles_2, labels_2 = axes[1].get_legend_handles_labels()

# Figure-level legends placed inside the upper part of each panel.
fig.legend(handles_1, labels_1, loc='lower center', ncol=3, bbox_to_anchor=(0.281, 0.754), frameon=False)
fig.legend(handles_2, labels_2, loc='lower center', ncol=3, bbox_to_anchor=(0.775, 0.754), frameon=False)

# Leave room at the bottom when tightening the layout.
plt.tight_layout(rect=[0, 0.1, 1, 1])

plt.savefig('clustering impect visialization.jpg', dpi=300,bbox_inches='tight')

plt.show()


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import cm

# Assumes feature_df and result_df (with 'FID', 'Confident_Label' and 'Cluster')
# were created by an earlier cell.
# Names of the seven EVI harmonic coefficient columns.
feature_EVI_name = ['constant_EVI','cos_1_EVI','cos_2_EVI','cos_3_EVI','sin_1_EVI','sin_2_EVI','sin_3_EVI']

# EVI coefficient columns of every sample.
features_EVI = feature_df[feature_EVI_name]

# Distinct cluster ids.  NaN marks samples without a cluster; it is dropped
# because `== NaN` never matches any row and would only yield an empty group.
clusters = result_df['Cluster'].dropna().unique()


# Colour maps: greens for label-0 clusters, oranges for label-1 clusters.
cmap_label_0 = cm.Greens
cmap_label_1 = cm.Oranges

# Third-order harmonic series built from the EVI coefficients of one sample.
def harmonic_function(x, feature_row):
    """Return constant + sum over k = 1..3 of cos_k*cos(k*x) + sin_k*sin(k*x).

    The coefficients are read from ``feature_row`` under the keys
    'constant_EVI', 'cos_k_EVI' and 'sin_k_EVI'.
    """
    result = feature_row['constant_EVI']
    for k in range(1, 4):
        cos_term = feature_row[f'cos_{k}_EVI'] * np.cos(k * x)
        sin_term = feature_row[f'sin_{k}_EVI'] * np.sin(k * x)
        result = result + cos_term + sin_term
    return result

# 37 evenly spaced day-of-year positions from 241 to 611 and the phase values
# (DOY * pi / 365) at which the harmonic terms are evaluated.
x_values = np.linspace(241, 611, 37)
t_values = x_values*np.pi/365

# Running positions inside each colour map.
color_idx_0 = 0  # next shade of green (label-0 clusters)
color_idx_1 = 0  # next shade of orange (label-1 clusters)

# One harmonic curve per cluster.
plt.figure(figsize=(10, 8))

for cluster in clusters:
    if pd.isna(cluster):
        # NaN means "no cluster assigned": it matches no rows and would only
        # add an empty curve and use up a colour.
        continue

    # Samples assigned to the current cluster.
    cluster_data = features_EVI[result_df['Cluster'] == cluster]

    # The last digit of the cluster id encodes the label (0 or 1), which
    # selects the colour map.
    if cluster % 10 == 0:
        cmap = cmap_label_0
        color_idx = color_idx_0
        color_idx_0 += 1
    else:
        cmap = cmap_label_1
        color_idx = color_idx_1
        color_idx_1 += 1

    # Shade inside the colour map; the step of 1/10 leaves room for up to ten
    # clusters per label.
    color = cmap((color_idx + 1) / 10)

    # Curve of the mean coefficients.  The per-sample averaging loop that used
    # to precede this was overwritten by this value and has been removed.
    mean_params = cluster_data[feature_EVI_name].mean()
    mean_harmonic_values = harmonic_function(t_values, mean_params)

    plt.plot(x_values, mean_harmonic_values, label=f'Cluster {cluster}', color=color)

# Title, axis labels and legend.
plt.title('Harmonic Curves for Each Cluster')
plt.xlabel('Time (radians)')
plt.ylabel('Harmonic Value')
plt.legend()
plt.show()

In [None]:

# CDL codes of the crop / land-cover classes to compare.
crops = [24,27,176,37,142,141,36]

# One mean harmonic curve per crop code.
plt.figure(figsize=(10, 8))

for crop in crops:
    # Samples whose CDL code equals the current crop.
    cluster_data = features_EVI[result_df['cropland'] == crop]

    # The harmonic model is linear in its coefficients, so the mean of the
    # per-sample curves equals the curve of the mean coefficients.  This avoids
    # one Python-level iteration per sample.  A crop without samples yields an
    # all-NaN curve, i.e. nothing is drawn for it.
    mean_params = cluster_data[feature_EVI_name].mean()
    mean_harmonic_values = harmonic_function(t_values, mean_params)
    print(crop,' sample number:',len(cluster_data))

    plt.plot(x_values, mean_harmonic_values, label=f'Cluster {crop}')

# Title, axis labels and legend.
plt.title('Harmonic Curves for Each Cluster')
plt.xlabel('Time (radians)')
plt.ylabel('Harmonic Value')
plt.legend()
plt.show()

# 6.绘制光谱和植被指数曲线图

In [None]:
# Colab-only setup: mount Google Drive and install the raster I/O and plotting
# packages used by this section.
from google.colab import drive
drive.mount('/content/drive')
!pip install rasterio matplotlib

In [None]:
import matplotlib.pyplot as plt
from matplotlib import cm
import glob
import os
import rasterio
import numpy as np
import pandas as pd

In [None]:
# Build the harmonic coefficient column names of one band.
def genetate_feature_names(bandname):
    """Return the seven coefficient column names for ``bandname``.

    Order: constant, cos_1, cos_2, cos_3, sin_1, sin_2, sin_3, each suffixed
    with ``_<bandname>``.
    """
    terms = ('constant', 'cos_1', 'cos_2', 'cos_3', 'sin_1', 'sin_2', 'sin_3')
    return [f'{term}_{bandname}' for term in terms]

# Third-order harmonic series of one band.
def harmonic_function(x, feature_row,bandname):
    """Evaluate the harmonic series of ``bandname`` at ``x``.

    Args:
        x: Scalar or NumPy array of phase values passed to cos/sin.
        feature_row: Mapping or Series providing the coefficient names
            returned by ``genetate_feature_names(bandname)``.
        bandname: Band suffix used to build the coefficient names.

    Returns:
        constant + sum over k = 1..3 of cos_k * cos(k * x) + sin_k * sin(k * x).
    """
    # Name order: constant, cos_1, cos_2, cos_3, sin_1, sin_2, sin_3.
    constant, cos_1, cos_2, cos_3, sin_1, sin_2, sin_3 = genetate_feature_names(bandname)
    harmonic_sum = (
        feature_row[constant] +
        feature_row[cos_1] * np.cos(x) +
        feature_row[sin_1] * np.sin(x) +
        feature_row[cos_2] * np.cos(2 * x) +
        feature_row[sin_2] * np.sin(2 * x) +
        feature_row[cos_3] * np.cos(3 * x) +
        feature_row[sin_3] * np.sin(3 * x)
    )
    return harmonic_sum

# Plot the single-band harmonic curves per crop type and per cluster.
def plot_harmonic_singleBand(features,labels,bandname):
    """Plot mean harmonic curves of one band by crop type and by cluster.

    The left panel shows one curve per CDL crop code, the right panel one curve
    per cluster id.  The figure is saved as
    '<bandname> clustering impect visialization.jpg' in the working directory
    and then shown.

    Args:
        features: DataFrame with the seven harmonic coefficient columns of
            ``bandname`` (names from ``genetate_feature_names``).
        labels: DataFrame row-aligned with ``features`` that provides the
            'cropland' and 'Cluster' columns used to group the samples.
        bandname: Band or index suffix, e.g. 'EVI'.
    """
    features_name = genetate_feature_names(bandname)
    # 37 evenly spaced day-of-year positions from 241 to 611 and the phase
    # values (DOY * pi / 365) at which the harmonic terms are evaluated.
    x_values = np.linspace(241, 611, 37)
    t_values = x_values * np.pi / 365

    def mean_curve(samples):
        # The model is linear in its coefficients, so the mean of the
        # per-sample curves equals the curve of the mean coefficients.
        return harmonic_function(t_values, samples[features_name].mean(), bandname)

    # Two panels side by side.
    fig, axes = plt.subplots(1, 2, figsize=(8.5, 3.5))

    plt.rcParams['font.family'] = 'serif'
    plt.rcParams['font.size'] = 8

    ### Custom x ticks ###
    xticks_values = [250, 300, 350, 400, 450, 500, 550, 600]  # positions on the continuous DOY axis
    xticks_labels = [250, 300, 350, 35, 85, 135, 185, 235]     # same positions as wrapped day-of-year labels

    # Value range of the left-panel curves; drives the y ticks and limits.
    global_min = float('inf')
    global_max = float('-inf')

    # Line colours, one per curve, for each panel.
    colors_sub1 = ['orange', 'brown', 'purple', 'green', 'blue', 'red', 'pink']
    colors_sub2 = ['orange', 'brown', 'purple', 'green', 'blue', 'red', 'pink']

    ### Left panel: one curve per crop type ###

    crops = [24, 27, 4, 36, 37, 176, 142]
    # CDL code -> legend name.
    crops_dict = {
        24: 'Winter wheat / Ww',
        27: 'Rye',
        4: 'Sorghum',
        36: 'Alfalfa',
        37: 'Other Hay',
        176: 'Pasture',
        142: 'Forest'
    }

    for idx, crop in enumerate(crops):
        # Samples whose CDL code equals the current crop.
        cluster_data = features[labels['cropland'] == crop]

        print(crop, ' sample number:', len(cluster_data))
        if cluster_data.empty:
            # No samples: skip instead of plotting an all-NaN curve.
            continue

        mean_harmonic_values = mean_curve(cluster_data)

        # Track the overall value range (NaN values never win a comparison).
        current_min = mean_harmonic_values.min()
        current_max = mean_harmonic_values.max()
        if current_min < global_min:
            global_min = current_min
        if current_max > global_max:
            global_max = current_max

        axes[0].plot(x_values, mean_harmonic_values, label=f'{crops_dict[crop]}', color=colors_sub1[idx],linestyle='--', alpha=0.8, linewidth=2)

    axes[0].set_title('{} Curves for Different Crop types'.format(bandname))
    axes[0].set_xlabel('DOY')
    axes[0].set_ylabel('{} / 10^-2'.format(bandname))

    # Dynamic y ticks: step 100 when the padded range is below 800, else 200.
    # Without any plotted crop curve the range stays infinite; ticks and limits
    # are then left to matplotlib instead of failing in np.arange.
    have_range = bool(np.isfinite(global_min) and np.isfinite(global_max))
    if have_range:
        range_span = global_max - global_min + 130
        interval = 100 if range_span < 800 else 200
        yticks_values = np.arange(start=np.ceil(global_min / interval) * interval,
                                stop=np.floor(global_max / interval) * interval + interval,
                                step=interval)

    axes[0].set_xticks(xticks_values)
    axes[0].set_xticklabels(xticks_labels)
    if have_range:
        axes[0].set_yticks(yticks_values)

    ### Right panel: one curve per cluster ###

    # Cluster id -> legend name; only the clusters listed here are drawn.
    clusters_dict = {
        21.0: 'Ww c1',
        11.0: 'Ww c2',
        50.0: 'Non-Ww c1',
        30.0: 'Non-Ww c2',
        20.0: 'Non-Ww c3',
        40.0: 'Non-Ww c4',
        10.0: 'Non-Ww c5'
    }

    for idx, cluster in enumerate(clusters_dict.keys()):
        # Samples assigned to the current cluster (taken from the ``labels``
        # argument, not from a notebook-level global).
        cluster_data = features[labels['Cluster'] == cluster]
        if cluster_data.empty:
            # A listed cluster that does not occur in ``labels`` has no curve.
            continue

        mean_harmonic_values = mean_curve(cluster_data)

        axes[1].plot(x_values, mean_harmonic_values, label=f'{clusters_dict[cluster]}', color=colors_sub2[idx],linestyle='--', alpha=0.8, linewidth=2)

    axes[1].set_title('{} Curves for Each Cluster using K-means'.format(bandname))
    axes[1].set_xlabel('DOY')
    axes[1].set_ylabel('{} / 10^-2'.format(bandname))

    axes[1].set_xticks(xticks_values)
    axes[1].set_xticklabels(xticks_labels)
    if have_range:
        axes[1].set_yticks(yticks_values)

        ### Same y range on both panels (extra head room for the legends) ###
        axes[0].set_ylim(global_min-30, global_max+140)
        axes[1].set_ylim(global_min-30, global_max+140)

    # Collect the legend entries of each panel.
    handles_1, labels_1 = axes[0].get_legend_handles_labels()
    handles_2, labels_2 = axes[1].get_legend_handles_labels()

    # Figure-level legends placed inside the upper part of each panel.
    fig.legend(handles_1, labels_1, loc='lower center', ncol=3, bbox_to_anchor=(0.281, 0.754), frameon=False)
    fig.legend(handles_2, labels_2, loc='lower center', ncol=3, bbox_to_anchor=(0.775, 0.754), frameon=False)

    # Leave room at the bottom when tightening the layout.
    plt.tight_layout(rect=[0, 0.1, 1, 1])

    plt.savefig('{} clustering impect visialization.jpg'.format(bandname), dpi=300,bbox_inches='tight')

    plt.show()


In [None]:
# Vegetation-index harmonic coefficients, plus the CDL code of every sample.
VI_harmonicPara_file = '/content/drive/MyDrive/14SNE/feature_14SNE.csv'
VI_harmonicPara_df = pd.read_csv(VI_harmonicPara_file)
cropland_column = VI_harmonicPara_df['cropland']

# Cluster assignment per sample; the CDL code column is (re)attached to it.
# NOTE(review): this reads 'clustered_labels_14SNE.csv' while the clustering
# section writes 'clustered_labels_14SNE_V2.csv' - confirm which file is meant.
result_file = '/content/drive/MyDrive/14SNE/clustered_labels_14SNE.csv'
result_df = pd.read_csv(result_file)
result_df['cropland'] = cropland_column

# Surface-reflectance harmonic coefficients.
SR_harmonicPara_file = '/content/drive/MyDrive/14SNE/surfaceReflectance_14SNE.csv'
SR_harmonicPara_df = pd.read_csv(SR_harmonicPara_file)

SR_band = ['B', 'G', 'R', 'RE1', 'RE2', 'RE3', 'NIR', 'RE4', 'SWIR1', 'SWIR2']
VI_band = ['EVI','LSWI','OSAVI','RVI']

# One figure per band: all reflectance bands first, then the vegetation indices.
plot_jobs = (
    (SR_harmonicPara_df, SR_band),
    (VI_harmonicPara_df, VI_band),
)
for harmonic_df, band_list in plot_jobs:
    for band in band_list:
        plot_harmonic_singleBand(harmonic_df,result_df,band)



# 7.生成参考曲线库

In [None]:
# Colab-only setup: mount Google Drive and install the raster I/O and plotting
# packages used by this section.
from google.colab import drive
drive.mount('/content/drive')
!pip install rasterio matplotlib

In [None]:
import matplotlib.pyplot as plt
from matplotlib import cm
import glob
import os
import rasterio
import numpy as np
import pandas as pd
from scipy.optimize import curve_fit

In [None]:
# Build the harmonic coefficient column names of one band.
def genetate_feature_names(bandname):
    """Return the seven '<term>_<bandname>' coefficient column names.

    The terms appear in the order constant, cos_1, cos_2, cos_3, sin_1, sin_2,
    sin_3.
    """
    feature_names = [f'constant_{bandname}']
    for kind in ('cos', 'sin'):
        feature_names.extend(f'{kind}_{order}_{bandname}' for order in (1, 2, 3))
    return feature_names

# Third-order harmonic series of one band.
def harmonic_function(x, feature_row,bandname):
    """Evaluate the harmonic series of ``bandname`` at ``x``.

    Args:
        x: Scalar or NumPy array of phase values passed to cos/sin.
        feature_row: Mapping or Series providing the coefficient names
            returned by ``genetate_feature_names(bandname)``.
        bandname: Band suffix used to build the coefficient names.

    Returns:
        constant + sum over k = 1..3 of cos_k * cos(k * x) + sin_k * sin(k * x).
    """
    # Name order: constant, cos_1, cos_2, cos_3, sin_1, sin_2, sin_3.
    constant, cos_1, cos_2, cos_3, sin_1, sin_2, sin_3 = genetate_feature_names(bandname)
    harmonic_sum = (
        feature_row[constant] +
        feature_row[cos_1] * np.cos(x) +
        feature_row[sin_1] * np.sin(x) +
        feature_row[cos_2] * np.cos(2 * x) +
        feature_row[sin_2] * np.sin(2 * x) +
        feature_row[cos_3] * np.cos(3 * x) +
        feature_row[sin_3] * np.sin(3 * x)
    )
    return harmonic_sum

# Harmonic model in the positional-parameter form expected by curve_fit.
def harmonic_model(x, constant, cos_1, cos_2, cos_3, sin_1, sin_2, sin_3):
    """Return constant + sum over k = 1..3 of cos_k*cos(k*x) + sin_k*sin(k*x)."""
    value = constant
    coefficient_pairs = ((cos_1, sin_1), (cos_2, sin_2), (cos_3, sin_3))
    for order, (cos_coef, sin_coef) in enumerate(coefficient_pairs, start=1):
        value = value + cos_coef * np.cos(order * x) + sin_coef * np.sin(order * x)
    return value

# Harmonic parameters of the mean curve of every cluster.
def get_harmonic_parameter(features,labels,bandname):
    """Fit one harmonic parameter set per cluster for ``bandname``.

    For every distinct non-NaN value of ``labels['Cluster']`` the mean harmonic
    curve of the cluster's samples is built and ``harmonic_model`` is fitted to
    it with ``curve_fit``.

    Args:
        features: DataFrame with the coefficient columns named by
            ``genetate_feature_names(bandname)``.
        labels: DataFrame row-aligned with ``features`` that provides the
            'Cluster' column.
        bandname: Band or index suffix, e.g. 'EVI'.

    Returns:
        List of ``(cluster, popt)`` tuples; ``popt`` is ordered
        [constant, cos_1, cos_2, cos_3, sin_1, sin_2, sin_3].
    """
    features_name = genetate_feature_names(bandname)
    # 37 evenly spaced day-of-year positions from 241 to 611 and the phase
    # values (DOY * pi / 365) at which the harmonic terms are evaluated.
    x_values = np.linspace(241, 611, 37)
    t_values = x_values * np.pi / 365

    # Cluster ids come from the ``labels`` argument rather than from a
    # notebook-level global.
    clusters = labels['Cluster'].dropna().unique()

    all_popt = []

    for cluster in clusters:
        # Samples assigned to the current cluster.
        cluster_data = features[labels['Cluster'] == cluster]

        # The model is linear in its coefficients, so the mean of the
        # per-sample curves equals the curve of the mean coefficients.
        mean_coefficients = cluster_data[features_name].mean()
        mean_harmonic_values = harmonic_function(t_values, mean_coefficients, bandname)

        # Fit against the same phase values the curve was generated with.
        # Fitting against the raw day-of-year values gave parameters that do
        # not reproduce the curve when evaluated at DOY * pi / 365.
        popt, pcov = curve_fit(harmonic_model, t_values, mean_harmonic_values)
        all_popt.append((cluster, popt))
    return all_popt

# Export the fitted harmonic parameters of all bands to one CSV file.
def export_cluster_parameters_to_csv(all_bands_popt, output_file):
    """Write one CSV row per cluster with the parameters of every band.

    Args:
        all_bands_popt: List of ``(band_name, [(cluster, popt), ...])`` where
            ``popt`` holds seven values ordered
            [constant, cos1, cos2, cos3, sin1, sin2, sin3].
        output_file: Destination path of the CSV file.

    The file has a 'Cluster' column followed by seven '<band>_<term>' columns
    per band.  Rows are sorted by cluster id so repeated exports produce the
    same file.  A cluster missing from a band gets empty cells for that band.
    """
    suffixes = ('constant', 'cos1', 'cos2', 'cos3', 'sin1', 'sin2', 'sin3')

    # Column names: 'Cluster' first, then seven columns per band.
    columns = ['Cluster']
    for band, _ in all_bands_popt:
        columns.extend(f'{band}_{suffix}' for suffix in suffixes)

    # One cluster -> popt lookup per band.  The first entry wins when a cluster
    # appears more than once for a band.
    band_lookups = []
    for _, band_popt in all_bands_popt:
        lookup = {}
        for cluster, popt in band_popt:
            lookup.setdefault(cluster, popt)
        band_lookups.append(lookup)

    # Sorted union of all cluster ids; iterating a bare set gave an arbitrary
    # row order.
    cluster_keys = sorted({cluster for lookup in band_lookups for cluster in lookup})

    missing = [None] * len(suffixes)
    result_rows = []
    for cluster in cluster_keys:
        row = [cluster]
        for lookup in band_lookups:
            row.extend(lookup.get(cluster, missing))
        result_rows.append(row)

    df = pd.DataFrame(result_rows, columns=columns)
    df.to_csv(output_file, index=False)


In [None]:
# Vegetation-index harmonic coefficients, plus the CDL code of every sample.
VI_harmonicPara_file = '/content/drive/MyDrive/14SNE/feature_14SNE.csv'
VI_harmonicPara_df = pd.read_csv(VI_harmonicPara_file)
cropland_column = VI_harmonicPara_df['cropland']

# Cluster assignment per sample; the CDL code column is (re)attached to it.
result_file = '/content/drive/MyDrive/14SNE/clustered_labels_14SNE_V2.csv'
result_df = pd.read_csv(result_file)
result_df['cropland'] = cropland_column

# Surface-reflectance harmonic coefficients.
SR_harmonicPara_file = '/content/drive/MyDrive/14SNE/surfaceReflectance_14SNE.csv'
SR_harmonicPara_df = pd.read_csv(SR_harmonicPara_file)

SR_band = ['B', 'G', 'R', 'RE1', 'RE2', 'RE3', 'NIR', 'RE4', 'SWIR1', 'SWIR2']
VI_band = ['EVI','LSWI','OSAVI','RVI']

# Output files of the reference curve library.
result_SR_file = '/content/drive/MyDrive/14SNE/all_SRbands_cluster_parameters_V2.csv'
result_VI_file = '/content/drive/MyDrive/14SNE/all_VIbands_cluster_parameters_V2.csv'

all_SRbands_popt = []
add_VIbands_popt = []

# Reflectance bands first, then vegetation indices: collect the per-cluster
# parameters of every band and export each group to its own CSV.
export_jobs = (
    (SR_harmonicPara_df, SR_band, all_SRbands_popt, result_SR_file),
    (VI_harmonicPara_df, VI_band, add_VIbands_popt, result_VI_file),
)
for harmonic_df, band_list, collected, output_path in export_jobs:
    for band in band_list:
        collected.append((band, get_harmonic_parameter(harmonic_df,result_df,band)))
    export_cluster_parameters_to_csv(collected, output_path)

# 8.计算目标年份随机样本与参考曲线库的相似度度量指标值


In [None]:
# Colab-only setup: mount Google Drive and install the packages used by the
# similarity section (rasterio, matplotlib, tslearn, joblib).
from google.colab import drive
drive.mount('/content/drive')
!pip install rasterio matplotlib
!pip install tslearn joblib

In [None]:
import numpy as np
import pandas as pd
import os
import glob
from tslearn.metrics import dtw_path
from joblib import Parallel, delayed
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score, confusion_matrix
import matplotlib.pyplot as plt
from scipy.signal import savgol_filter

DTLS 关于dtw距离以及sad距离计算相关函数

In [None]:
#**** functions related to the reference VI curves
# Build the harmonic parameter column names of a given band.
def generate_parameter_names(bandname):
    """Return the seven '<bandname>_<term>' parameter names.

    Order: constant, cos1, cos2, cos3, sin1, sin2, sin3.
    """
    terms = ('constant', 'cos1', 'cos2', 'cos3', 'sin1', 'sin2', 'sin3')
    return [f'{bandname}_{term}' for term in terms]

# Third-order harmonic series of one band, read from a parameter row.
def harmonic_function(x, feature_row,bandname):
    """Evaluate the harmonic series of ``bandname`` at ``x``.

    Args:
        x: Scalar or NumPy array of phase values passed to cos/sin.
        feature_row: Mapping or Series providing the parameter names returned
            by ``generate_parameter_names(bandname)``.
        bandname: Band prefix used to build the parameter names.

    Returns:
        constant + sum over k = 1..3 of cos_k * cos(k * x) + sin_k * sin(k * x).
    """
    # Name order: constant, cos1, cos2, cos3, sin1, sin2, sin3.
    constant, cos_1, cos_2, cos_3, sin_1, sin_2, sin_3 = generate_parameter_names(bandname)
    VI_harmonic_values = (
        feature_row[constant] +
        feature_row[cos_1] * np.cos(x) +
        feature_row[sin_1] * np.sin(x) +
        feature_row[cos_2] * np.cos(2 * x) +
        feature_row[sin_2] * np.sin(2 * x) +
        feature_row[cos_3] * np.cos(3 * x) +
        feature_row[sin_3] * np.sin(3 * x)
    )
    return VI_harmonic_values

# build the reference VI curves of the whole period from the harmonic parameters
def get_refer_VIcurve(VI_file,t_values,bandname):
    """Evaluate the harmonic model of every row of *VI_file* at *t_values*.

    Args:
        VI_file: CSV with the harmonic parameters of *bandname* and a
            'Cluster' column, one row per reference curve.
        t_values: array of time values; each is scaled by pi / 365 to obtain
            the phase passed to ``harmonic_function``.
        bandname: band whose harmonic parameters are evaluated.

    Returns:
        DataFrame with one 'Time_<int(t)>' column per entry of *t_values*
        plus the 'Cluster' column copied from the parameter file.
    """
    harmonic_params = pd.read_csv(VI_file)
    phase = t_values * np.pi / 365

    def _evaluate(param_row):
        return harmonic_function(phase, param_row, bandname)

    curves = harmonic_params.apply(_evaluate, axis=1)
    time_labels = [f'Time_{int(t)}' for t in t_values]
    curve_table = pd.DataFrame(curves.tolist(), columns=time_labels)
    curve_table['Cluster'] = harmonic_params['Cluster']
    return curve_table

# get the spectral reflectance curve based on the harmonic parameters and given time
def get_refer_srCurve(refer_SRpara,SR_bandnames,time_Ts):
    """Evaluate the reference surface-reflectance spectrum at the given time steps.

    Args:
        refer_SRpara: DataFrame whose first row holds the harmonic parameters
            of every band in *SR_bandnames*.
        SR_bandnames: names of the surface-reflectance bands to evaluate.
        time_Ts: sequence of time-step indices; step ``t`` is mapped to the
            day value ``241 + (t - 1) * 10``.

    Returns:
        Array of shape (len(time_Ts), len(SR_bandnames)).
    """
    refer_SRpara = refer_SRpara.iloc[0]
    time_Ts = np.array(time_Ts)
    # The whole day value must be scaled to a phase, as get_refer_VIcurve does
    # (t * pi / 365). The previous expression only scaled the 10-day offset
    # and added 241 as if it were already in radians.
    day_values = 241 + (time_Ts - 1) * 10
    x = day_values * np.pi / 365
    refer_SR_curve_list = [
        harmonic_function(x, refer_SRpara, sr_bandname)
        for sr_bandname in SR_bandnames
    ]
    # rows = time steps, columns = bands
    refer_SR_curve_array = np.array(refer_SR_curve_list).T
    return refer_SR_curve_array

# look up, in each DTW path, the reference index matched to sample index time_T
def get_referTime(optimal_paths,time_T):
    """Return, per path, the first reference index paired with *time_T*.

    Each path is an iterable of ``(refer_index, sample_index)`` pairs. The
    result holds one entry per path: the ``refer_index`` of the first pair
    whose ``sample_index`` equals *time_T*, or ``None`` when no pair matches.
    """
    return [
        next((ref_idx for ref_idx, smp_idx in warp_path if smp_idx == time_T), None)
        for warp_path in optimal_paths
    ]

# get the spectral reflectance curve of target samples based on the given filename
def get_target_srCurve(fileDir,fileIndex,sr_bandnames,FID_df):
    """Read the surface reflectance of the requested samples at one time step.

    Args:
        fileDir: directory holding one ``<index>.csv`` file per time step.
        fileIndex: time-step index of the file to read (cast to int).
        sr_bandnames: band columns to extract.
        FID_df: DataFrame whose 'FID' column lists the wanted samples.

    Returns:
        List of per-sample band-value lists, one entry per row of *FID_df*
        and in the same order. Callers pair the result positionally with
        other per-sample arrays (see compute_sad), so the rows are aligned
        on FID instead of being returned in file order. A FID missing from
        the file yields a row of NaN.
    """
    target_sr_file = os.path.join(fileDir, f'{int(fileIndex)}.csv')
    target_sr_df = pd.read_csv(target_sr_file)
    # a unique FID index is required for the alignment below
    sr_by_fid = target_sr_df.drop_duplicates(subset='FID').set_index('FID')
    aligned_sr = sr_by_fid.reindex(FID_df['FID'])[sr_bandnames]
    return aligned_sr.values.tolist()

#*** get target VI value of given time period
def get_target_VIs(target_features_fileDir, VI_bandname, time_T):
    """Build the per-sample VI time series for time steps 1..time_T.

    Reads ``1.csv`` ... ``<time_T>.csv`` from *target_features_fileDir*. Only
    samples (FIDs) present in the last file are kept, in the order in which
    they first appear there.

    Args:
        target_features_fileDir: directory with one CSV per time step, each
            holding a 'FID' column and a *VI_bandname* column.
        VI_bandname: name of the vegetation-index column to extract.
        time_T: index of the last time step to include (int, >= 1).

    Returns:
        DataFrame with a 'FID' column followed by one
        ``<VI_bandname>_<day>`` column per step, where
        ``day = 241 + (step - 1) * 10``. Values missing at a step are filled
        by linear interpolation along time (both directions).
    """
    file_paths = [os.path.join(target_features_fileDir, f'{i}.csv') for i in range(1, time_T + 1)]
    last_step_df = pd.read_csv(file_paths[-1])
    time_T_FIDs = last_step_df['FID'].unique()

    vi_series_list = []
    for index, file_path in enumerate(file_paths, start=1):
        df = last_step_df if index == time_T else pd.read_csv(file_path)
        df = df[df['FID'].isin(time_T_FIDs)].drop_duplicates(subset='FID')
        time = 241 + (index - 1) * 10
        # Index by FID so that the concat below aligns samples on their
        # identifier. Aligning on the files' row labels mixes up samples as
        # soon as two files differ in row order or in the set of FIDs.
        vi_series_list.append(df.set_index('FID')[VI_bandname].rename(f'{VI_bandname}_{int(time)}'))

    combined_data = pd.concat(vi_series_list, axis=1, join='outer')
    combined_data = combined_data.reindex(pd.Index(time_T_FIDs, name='FID'))

    # fill steps at which a sample is missing by linear interpolation over time
    # (Savitzky-Golay smoothing is intentionally not applied)
    vi_columns = list(combined_data.columns)
    combined_data[vi_columns] = combined_data[vi_columns].interpolate(method='linear', axis=1, limit_direction='both')

    return combined_data.reset_index()

#*** functions related to the DTLS similarity calculation
# run constrained DTW between the reference curve and one sample time series
def compute_dtw(reference,sample,search_radius):
    """Align *sample* to *reference* with Sakoe-Chiba constrained DTW.

    Returns the two values produced by ``tslearn.metrics.dtw_path`` in their
    original order; per the tslearn documentation that is
    ``(warping_path, similarity_score)``.
    """
    warping_path, dtw_score = dtw_path(
        reference,
        sample,
        global_constraint='sakoe_chiba',
        sakoe_chiba_radius=search_radius,
    )
    return warping_path, dtw_score

# function to compute SAD of the target surface reflectance at given time T and refer surface reflectance at related time refer_T
def compute_sad(refer_SRs, target_SRs):
    """Row-wise spectral angle distance between two sets of spectra.

    Args:
        refer_SRs: 2-D array-like, one reference spectrum per row.
        target_SRs: 2-D array-like of the same shape, one target spectrum per row.

    Returns:
        1-D array with one value per row: ``2 * arccos(cos_theta) / pi * 10000``
        where ``cos_theta`` is the cosine similarity of the two spectra. Rows
        in which either spectrum has zero magnitude get 10000 (i.e. 1.0
        before scaling).

    Raises:
        ValueError: if the two inputs do not have the same shape.
    """
    refer_SRs = np.array(refer_SRs)
    target_SRs = np.array(target_SRs)
    if refer_SRs.shape != target_SRs.shape:
        raise ValueError("refer_SRs 和 target_SRs 的形状必须匹配！")
    dot_product = np.einsum('ij,ij->i', refer_SRs, target_SRs)
    refer_magnitudes = np.linalg.norm(refer_SRs, axis=1)
    target_magnitudes = np.linalg.norm(target_SRs, axis=1)
    # rows for which the cosine similarity is undefined
    zero_mask = (refer_magnitudes == 0) | (target_magnitudes == 0)
    # np.divide(..., where=...) leaves masked slots untouched, so give it an
    # initialised output buffer instead of letting it return uninitialised
    # memory for the zero-magnitude rows.
    cos_theta = np.zeros(dot_product.shape, dtype=float)
    np.divide(dot_product, refer_magnitudes * target_magnitudes, out=cos_theta, where=~zero_mask)
    # clamp rounding noise so arccos stays defined
    cos_theta = np.clip(cos_theta, -1.0, 1.0)
    spectral_angle_distance = 2 * np.arccos(cos_theta) / np.pi
    # undefined rows get the indicator value 1.0 (10000 after scaling)
    spectral_angle_distance[zero_mask] = 1.0
    return spectral_angle_distance*10000

# DTLS method for sample similarity distance calculation for a given cluster and a given time T
def DTLS_singleCluster(refer_curve,target_curve_df,target_features_fileDir,time_T,refer_SRpara,SR_bandnames,search_radius,VI_bandname='EVI'):
    """Compute the DTW and spectral-angle distances of every sample to one cluster.

    Args:
        refer_curve: one-row DataFrame with the cluster's reference VI curve
            in its 'Time_*' columns.
        target_curve_df: DataFrame with a 'FID' column and the samples' VI
            time series in '<VI_bandname>_*' columns.
        target_features_fileDir: directory with the per-step sample CSV files.
        time_T: index of the last time step (1-based).
        refer_SRpara: DataFrame with the cluster's surface-reflectance
            harmonic parameters.
        SR_bandnames: surface-reflectance band names.
        search_radius: Sakoe-Chiba radius; also the number of steps by which
            both curves are extended on each side.
        VI_bandname: prefix of the VI columns in *target_curve_df*.

    Returns:
        ``(dtw_distance, spectral_angle_distance)``: a tuple of per-sample
        DTW scores and an array of per-sample spectral angle distances.
    """
    # reference curve as an array for the dtw input
    time_columns = [col for col in refer_curve.columns if col.startswith('Time_')]
    refer_curve_array = refer_curve.iloc[0][time_columns].values

    # sample curves as an array, edge-padded by search_radius steps on both
    # sides so that they cover the same time span as the reference curve
    vi_prefix = f'{VI_bandname}_'
    vi_columns = [col for col in target_curve_df.columns if col.startswith(vi_prefix)]
    samples_curve_series = target_curve_df.loc[:, vi_columns].values
    extended_samples_curve_series = np.pad(
        samples_curve_series,
        pad_width=((0, 0), (search_radius, search_radius)),  # pad columns only
        mode='edge'
    )

    # run the DTW of every sample in parallel on all available cores
    results = Parallel(n_jobs=-1)(
        delayed(compute_dtw)(refer_curve_array, sample, search_radius)
        for sample in extended_samples_curve_series
    )
    # compute_dtw hands back tslearn's (path, score) pair
    optimal_path, dtw_distance = zip(*results)

    # surface reflectance of the samples at the last time step
    target_lastT_srCurve = get_target_srCurve(target_features_fileDir,time_T,SR_bandnames,target_curve_df)

    # position of the last time step inside the padded sample series
    relavent_target_lastT = time_T + search_radius - 1
    refer_positions = get_referTime(optimal_path,relavent_target_lastT)
    # Position i of the reference array corresponds to the 1-based time index
    # i + 1 - search_radius (the array starts at index 1 - search_radius), and
    # get_refer_srCurve expects time indices, not array positions.
    refer_lastT = [
        None if position is None else position + 1 - search_radius
        for position in refer_positions
    ]
    refer_lastT_srCurve = get_refer_srCurve(refer_SRpara,SR_bandnames,refer_lastT)

    # spectral angle distance between matched reference and sample spectra
    spectral_angle_distance = compute_sad(refer_lastT_srCurve,target_lastT_srCurve)

    return dtw_distance,spectral_angle_distance

#*** main producer of DTLS for sample similarity distance calculation for each cluster at given time time_T
def DTLS_distance_calculate(target_features_fileDir,refer_SR_file,refer_VI_file,VI_bandname,SR_bandnames,out_fileDir,time_T,search_radius,tilname):
    """Compute and save the per-cluster DTW / SAD distances of all samples at one step.

    Writes ``sample_distance_<time_T>_<tilname>.csv`` into *out_fileDir* with
    the columns 'FID', *VI_bandname* (the VI value at step *time_T*),
    '<cluster>_dtw_distance' and '<cluster>_sad_distance' for every cluster,
    and 'cluster label' (the cluster with the smallest DTW + SAD sum).

    Args:
        target_features_fileDir: directory with the per-step sample CSV files.
        refer_SR_file: CSV with per-cluster surface-reflectance harmonic parameters.
        refer_VI_file: CSV with per-cluster VI harmonic parameters.
        VI_bandname: vegetation index used for the DTW matching.
        SR_bandnames: surface-reflectance band names used for the SAD.
        out_fileDir: output directory.
        time_T: index of the last time step to use (1-based int).
        search_radius: Sakoe-Chiba radius of the DTW.
        tilname: tile name used in the output file name.
    """
    # reference surface-reflectance harmonic parameters
    refer_SRpara_df = pd.read_csv(refer_SR_file)
    # reference VI curves, extended by search_radius steps on both sides;
    # integer steps keep the 'Time_*' column names exact
    referT_indexs = np.arange(1 - search_radius, time_T + search_radius + 1)
    referT_values = 241 + (referT_indexs-1)*10
    refer_VI_curve_df = get_refer_VIcurve(refer_VI_file,referT_values,VI_bandname)
    # VI time series of every sample
    target_VI_df = get_target_VIs(target_features_fileDir,VI_bandname,time_T)

    sample_cluster_resultDF = pd.DataFrame()
    sample_cluster_resultDF['FID'] = target_VI_df['FID']
    sample_cluster_resultDF[VI_bandname] = target_VI_df[f'{VI_bandname}_{int(241 + (time_T-1)*10)}']
    dtls_by_cluster = {}
    for cluster in refer_VI_curve_df['Cluster'].unique():
        print('  cluster ',cluster,' labeling calculating ....')
        refer_VI_curve_cluster = refer_VI_curve_df[refer_VI_curve_df['Cluster'] == cluster]
        refer_SRpara_cluster = refer_SRpara_df[refer_SRpara_df['Cluster'] == cluster]
        dtw_distance_column,sad_distance_column = DTLS_singleCluster(
            refer_VI_curve_cluster,target_VI_df,target_features_fileDir,time_T,
            refer_SRpara_cluster,SR_bandnames,search_radius,VI_bandname=VI_bandname)
        dtw_distance_columnName = f'{int(cluster)}_dtw_distance'
        sad_distance_columnName = f'{int(cluster)}_sad_distance'
        sample_cluster_resultDF[dtw_distance_columnName] = dtw_distance_column
        sample_cluster_resultDF[sad_distance_columnName] = sad_distance_column
        dtls_by_cluster[str(int(cluster))] = (
            sample_cluster_resultDF[dtw_distance_columnName]
            + sample_cluster_resultDF[sad_distance_columnName]
        )

    # Label = cluster with the smallest combined (DTW + SAD) distance. Taking
    # the minimum over the individual dtw / sad columns would compare values
    # on different scales and produce labels such as '11_dtw'.
    sample_cluster_resultDF['cluster label'] = pd.DataFrame(dtls_by_cluster).idxmin(axis=1)
    # write the distances of each sample to each cluster to a csv file
    sample_label_file = os.path.join(out_fileDir, f'sample_distance_{time_T}_{tilname}.csv')
    sample_cluster_resultDF.to_csv(sample_label_file, index=False)


组合不同维度距离，获取最终距离指标

In [None]:
def DTLS_distance_combinate(Results_dir,VI_bandname,tilname,timeT_indexs):
    """Combine DTW and SAD distances into the final DTLS distance per time step.

    For every step in *timeT_indexs* the file
    ``sample_distance_<t>_<tilname>.csv`` is read and
    ``sample_label_<t>_<tilname>.csv`` is written. The DTLS distance of a
    sample to a cluster is the current DTW distance plus the SAD distance
    taken from the step at which the sample's VI was highest so far. The
    running maximum is initialised from the step-1 file and carried over the
    steps in the given order.

    Args:
        Results_dir: directory holding the input files and receiving the outputs.
        VI_bandname: name of the VI column in the distance files.
        tilname: tile name used in the file names.
        timeT_indexs: iterable of time-step indices (cast to int).
    """
    first_distance_file = os.path.join(Results_dir, f'sample_distance_1_{tilname}.csv')
    first_distance_df = pd.read_csv(first_distance_file).set_index('FID')

    sad_columns = [col for col in first_distance_df.columns if '_sad_distance' in col]
    max_EVI = first_distance_df[VI_bandname]
    max_sad_distances = first_distance_df[sad_columns].copy()
    # keep the clusters in file order: iterating a set would make the output
    # column order (and the tie-breaking of idxmin) vary between runs
    cluster_names = list(dict.fromkeys(
        col.split('_')[0] for col in first_distance_df.columns if '_dtw_distance' in col
    ))

    for time_T in timeT_indexs:
        time_T = int(time_T)
        sample_label_file = os.path.join(Results_dir, f'sample_distance_{time_T}_{tilname}.csv')
        final_result_file = os.path.join(Results_dir, f'sample_label_{time_T}_{tilname}.csv')
        distance_df = pd.read_csv(sample_label_file).set_index('FID')
        final_distance_df = pd.DataFrame(index=distance_df.index)

        # bring the running state and the current step onto one FID index
        all_fids = max_EVI.index.union(distance_df.index)
        max_EVI = max_EVI.reindex(all_fids, fill_value=-np.inf)
        max_sad_distances = max_sad_distances.reindex(all_fids)
        EVI_cur = distance_df[VI_bandname].reindex(all_fids)

        # adopt the current SAD only for samples present at this step whose
        # VI exceeds the stored maximum (or that have no stored VI yet)
        is_present = pd.Series(all_fids.isin(distance_df.index), index=all_fids)
        use_current = is_present & ((EVI_cur > max_EVI) | max_EVI.isna())

        distance_columns = []
        for cluster in cluster_names:
            sad_column = f'{cluster}_sad_distance'
            sad_distance_cur = distance_df[sad_column].reindex(all_fids)
            # computed on the full index so that samples absent from this
            # step keep their stored value instead of being reset to NaN
            updated_sad_distance = sad_distance_cur.where(use_current, max_sad_distances[sad_column])
            max_sad_distances[sad_column] = updated_sad_distance

            # DTLS distance = current DTW distance + SAD at the VI maximum
            distance_column = f'{cluster}_distance'
            final_distance_df[distance_column] = (
                distance_df[f'{cluster}_dtw_distance']
                + updated_sad_distance.reindex(distance_df.index)
            )
            distance_columns.append(distance_column)

        # the VI maximum does not depend on the cluster: update it once per step
        max_EVI = EVI_cur.where(use_current, max_EVI)

        # the cluster with the smallest DTLS distance becomes the label
        final_distance_df['cluster label'] = final_distance_df[distance_columns].idxmin(axis=1)
        final_distance_df['cluster label'] = final_distance_df['cluster label'].str.replace('_distance', '', regex=False)

        # move FID back to a column and save
        final_distance_df.reset_index(inplace=True)
        final_distance_df.to_csv(final_result_file, index=False)
        print(f'Time step {time_T} calculation done.')

计算获取样点标签和CDL之间的一致性

In [None]:
def get_finalLable_accuracyAssessment(cdl_file,target_sampleResults_dir,metrics_file,tilname,timeT_indexs,cdl_maizeLabels,cluster_maizeLabels):
    """Assess the agreement between the sample labels and the CDL labels over time.

    Three files are written:
      * ``single_labels_<tilname>.csv``: CDL label plus the label of every step;
      * ``cumulative_mode_labels_<tilname>.csv``: per step, the most frequent
        label over all steps up to and including that step;
      * *metrics_file*: per step, the binary (maize = 1 / non-maize = 0)
        confusion counts, precision, recall, F1, IoU and overall accuracy of
        the cumulative mode labels against the CDL labels.

    Args:
        cdl_file: CSV with 'FID' and 'cropland' (CDL label) columns.
        target_sampleResults_dir: directory with the
            ``sample_label_<t>_<tilname>.csv`` files; also receives the two
            label files.
        metrics_file: output CSV for the accuracy metrics.
        tilname: tile name used in the file names.
        timeT_indexs: iterable of time-step indices (cast to int).
        cdl_maizeLabels: CDL label values counted as maize.
        cluster_maizeLabels: cluster label values counted as maize.
    """
    time_steps = [int(t) for t in timeT_indexs]

    cdl_df = pd.read_csv(cdl_file)
    final_labels = pd.DataFrame()
    final_labels['FID'] = cdl_df['FID']
    final_labels['CDL_Label'] = cdl_df['cropland']
    single_label_file = os.path.join(target_sampleResults_dir, f'single_labels_{tilname}.csv')
    cumulated_label_file = os.path.join(target_sampleResults_dir,f'cumulative_mode_labels_{tilname}.csv')

    for time in time_steps:
        sample_label_file = os.path.join(target_sampleResults_dir, f'sample_label_{time}_{tilname}.csv')
        sample_label_df = pd.read_csv(sample_label_file)
        # Join on FID: the label files need not list the samples in the same
        # order (or even the same samples) as the CDL file, so a positional
        # assignment could attach labels to the wrong sample.
        labels_by_fid = sample_label_df.drop_duplicates(subset='FID').set_index('FID')['cluster label']
        final_labels[f'{time}_Label'] = final_labels['FID'].map(labels_by_fid)
    final_labels.to_csv(single_label_file, index=False)
    print(single_label_file,'write done.')

    # cumulative mode: most frequent label over all steps up to the current one
    mode_labels = final_labels[['FID', 'CDL_Label']].copy()
    for i, time in enumerate(time_steps):
        cols_to_mode = [f'{t}_Label' for t in time_steps[:i+1]]
        row_modes = final_labels[cols_to_mode].mode(axis=1)
        # mode() yields no column at all when every label is missing
        mode_labels[f'{time}_Mode_Label'] = row_modes[0] if 0 in row_modes.columns else np.nan

    mode_labels.to_csv(cumulated_label_file, index=False)
    print(cumulated_label_file,'write done.')

    metric_columns = ['Time', '00', '01', '10', '11',
                      'Precision (Class 0)', 'Recall (Class 0)', 'Accuracy', 'F1-score (Class 0)', 'IoU (Class 0)',
                      'Precision (Class 1)', 'Recall (Class 1)', 'F1-score (Class 1)', 'IoU (Class 1)']
    metrics_rows = []
    for time in time_steps:
        time_label_column = f'{time}_Mode_Label'
        # samples without a label at this step are left out of the assessment
        valid_labels = mode_labels.dropna(subset=[time_label_column])
        y_true = valid_labels['CDL_Label'].isin(cdl_maizeLabels).astype(int)
        y_pred = valid_labels[time_label_column].isin(cluster_maizeLabels).astype(int)

        if valid_labels.empty:
            # nothing to assess: zero counts, undefined metrics
            metrics_rows.append({'Time': time, '00': 0, '01': 0, '10': 0, '11': 0})
            continue

        # labels=[0, 1] keeps the matrix 2x2 even if only one class occurs
        tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
        union_0 = tn + fp + fn
        union_1 = tp + fp + fn
        metrics_rows.append({
            'Time': time,
            # count columns are named '<predicted><CDL>'
            '00': tn,
            '01': fp,
            '10': fn,
            '11': tp,
            'Precision (Class 0)': precision_score(y_true, y_pred, pos_label=0),
            'Recall (Class 0)': recall_score(y_true, y_pred, pos_label=0),
            'Accuracy': accuracy_score(y_true, y_pred),
            'F1-score (Class 0)': f1_score(y_true, y_pred, pos_label=0),
            'IoU (Class 0)': tn / union_0 if union_0 else np.nan,
            'Precision (Class 1)': precision_score(y_true, y_pred, pos_label=1),
            'Recall (Class 1)': recall_score(y_true, y_pred, pos_label=1),
            'F1-score (Class 1)': f1_score(y_true, y_pred, pos_label=1),
            'IoU (Class 1)': tp / union_1 if union_1 else np.nan,
        })

    # build the table once instead of concatenating onto an empty frame per step
    metrics_df = pd.DataFrame(metrics_rows, columns=metric_columns)
    metrics_df.to_csv(metrics_file, index=False)
    print(metrics_file,'write done.')



绘制标签精度时间变化曲线

In [None]:
def plot_accuracyCurve(metrics_file,tilname):
    """Plot the accuracy metrics of the maize / non-maize labels over time.

    Reads *metrics_file*, draws precision and F1-score of both classes plus
    the overall accuracy against time, saves the figure as
    ``<tilname>_samples_accuracy.jpg`` in the working directory and shows it.

    Args:
        metrics_file: CSV holding a 'Time' column and the metric columns
            plotted below.
        tilname: tile name used in the output file name.
    """
    metrics_df = pd.read_csv(metrics_file).sort_values(by='Time')

    # Map time step i to the day value 240 + (i - 1) * 10.
    # NOTE(review): the distance functions in this notebook start at day 241,
    # the plot at day 240 - confirm which offset is intended.
    metrics_df['Mapped_Days'] = 240 + (metrics_df['Time'] - 1) * 10

    plt.figure(figsize=(8, 5.2))
    plt.rcParams['font.family'] = 'serif'
    plt.rcParams['font.size'] = 8

    # (metric column, legend label, marker) in drawing order
    curves = [
        ('Precision (Class 1)', 'Precision (Maize)', 'x'),
        ('F1-score (Class 1)', 'F1-score (Maize)', 'x'),
        ('Accuracy', 'Overall Accuracy', 'o'),
        ('Precision (Class 0)', 'Precision (Non-Maize)', 'x'),
        ('F1-score (Class 0)', 'F1-score (Non-Maize)', 'x'),
    ]
    for column, label, marker in curves:
        plt.plot(metrics_df['Mapped_Days'], metrics_df[column], label=label, marker=marker)

    plt.xlabel('Time (Days)')
    plt.ylabel('Score')
    plt.title('Time Series of Accuracy Metrics for Maize and Non-Maize samples')

    # one tick every 30 days from day 240; labels wrap around at 365 so that
    # days of the following year are shown as day-of-year values
    custom_ticks = np.arange(240, 240 + 370, 30)
    custom_labels = [tick % 365 for tick in custom_ticks]
    plt.xticks(ticks=custom_ticks, labels=custom_labels, rotation=45)

    # show exactly one year, starting at day 240
    plt.xlim(240, 240 + 365)

    plt.legend()
    plt.grid(True)
    plt.savefig(f'{tilname}_samples_accuracy.jpg', dpi=300,bbox_inches='tight')

    plt.show()

主函数

In [None]:
# Input locations: per-step sample features of the target year, the reference
# curve library (SR / VI harmonic parameters per cluster) and the CDL labels
# of the target samples.
# NOTE(review): the reference library files written earlier in this notebook
# carry a '_V2' suffix, the two files read here do not - confirm which
# version is intended.
target_features_fileDir = '/content/drive/MyDrive/14SNE/samples_SR2023/'
refer_SR_file = '/content/drive/MyDrive/14SNE/all_SRbands_cluster_parameters.csv'
refer_VI_file = '/content/drive/MyDrive/14SNE/all_VIbands_cluster_parameters.csv'
cdl_file = '/content/drive/MyDrive/14SNE/cdlTarget_samples_14SNE.csv'

# Output directory for the per-step distance and label files.
target_sampleResults_dir = '/content/drive/MyDrive/14SNE/target_sample_label/'
os.makedirs(target_sampleResults_dir, exist_ok=True)

tilname = '14SNE'
metrics_file = f'/content/drive/MyDrive/{tilname}/accuracy_metrics_{tilname}.csv'

# VI used for the DTW matching, SR bands used for the spectral angle distance,
# and the Sakoe-Chiba radius of the DTW.
VI_bandname = 'EVI'
SR_bandnames = ['B', 'G', 'R', 'RE1', 'RE2', 'RE3', 'NIR', 'RE4', 'SWIR1', 'SWIR2']
search_radius = 2

# Label values counted as maize in the CDL data and in the cluster labels.
cdl_maizeLabels = [24,27]
cluster_maizeLabels = [11,21]

# Step 1: compute the DTW / SAD distances for each of the 37 time steps.
timeT_indexs = np.linspace(1, 37, 37)
for time_T in timeT_indexs:
    time_T = int(time_T)
    print('time ',time_T,'th labeling calculating....')
    DTLS_distance_calculate(target_features_fileDir,refer_SR_file,refer_VI_file,VI_bandname,SR_bandnames,target_sampleResults_dir,time_T,search_radius,tilname)
    print('time ',time_T,'th labeling finished....')

# Step 2: combine the distances into labels, assess them against the CDL
# labels and plot the accuracy over time.
DTLS_distance_combinate(target_sampleResults_dir,VI_bandname,tilname,timeT_indexs)
get_finalLable_accuracyAssessment(cdl_file,target_sampleResults_dir,metrics_file,tilname,timeT_indexs,cdl_maizeLabels,cluster_maizeLabels)
plot_accuracyCurve(metrics_file,tilname)