In [1]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split

root_path = 'dataset/datahow_2020/insilico_data'

# Load the raw measurements from the Excel workbook.
data = pd.read_excel(f'{root_path}/rawdata.xlsx')

# Feature matrix for PCA: every column except the experiment id and the time stamp.
features = data.drop(columns=['Experiment No.', 'Time [h]'])

# Project the features onto the first two principal components.
pca = PCA(n_components=2)
pca_result = pca.fit_transform(features)

# Keep the 2-D projection next to the rows it was computed from.
data['PCA1'] = pca_result[:, 0]
data['PCA2'] = pca_result[:, 1]

# Cluster the rows in PCA space (5 clusters is an assumption, not a tuned value).
# n_init is pinned explicitly: scikit-learn warns that its default is changing,
# and pinning keeps the clustering identical across library versions.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)
data['Cluster'] = kmeans.fit_predict(pca_result)

# Stratified hold-out: draw 20% of the rows from every cluster.
# The sampled rows deliberately KEEP their original index labels here, because
# those labels are what identifies the rows to remove from the training set.
# (Resetting the index before this point would exclude rows 0..k-1 instead of
# the sampled rows and leave most hold-out rows inside the training data.)
interpolation_data = pd.concat(
    [
        cluster_rows.sample(frac=0.2, random_state=42)
        for _, cluster_rows in data.groupby('Cluster')
    ]
)

# Training set = every row that was not drawn into the interpolation set.
train_data = data.drop(index=interpolation_data.index)

# Sanity checks: the two sets must partition the data without overlap.
assert len(train_data) + len(interpolation_data) == len(data)
assert train_data.index.intersection(interpolation_data.index).empty

# Remove the helper columns that were only needed for the split.
helper_columns = ['PCA1', 'PCA2', 'Cluster']
interpolation_data = interpolation_data.drop(columns=helper_columns).reset_index(drop=True)
train_data = train_data.drop(columns=helper_columns)

# Persist the interpolation (hold-out) set.
interpolation_data.to_excel(f'{root_path}/interpolation_data.xlsx', index=False)

# Persist the training set.
train_data.to_excel(f'{root_path}/train_data.xlsx', index=False)


  super()._check_params_vs_input(X, default_n_init=10)


In [3]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split

# Load the raw measurements from the Excel workbook.
# NOTE: `root_path` is defined in the earlier cell; run that cell first.
data = pd.read_excel(f'{root_path}/rawdata.xlsx')

# All distinct experiment ids present in the data.
experiments = data['Experiment No.'].unique()

# Row-level feature columns (experiment id and time stamp removed).
features = data.drop(columns=['Experiment No.', 'Time [h]'])

# One row per experiment: the mean of every column over that experiment's rows.
# The resulting frame is indexed by 'Experiment No.'.
experiment_means = data.groupby('Experiment No.').mean().drop(columns=['Time [h]'])

# Project the per-experiment means onto the first two principal components.
pca = PCA(n_components=2)
pca_result = pca.fit_transform(experiment_means)

# Keep the 2-D projection next to the experiment it belongs to.
experiment_means['PCA1'] = pca_result[:, 0]
experiment_means['PCA2'] = pca_result[:, 1]

# Cluster the experiments in PCA space (5 clusters is an assumption, not a tuned value).
# n_init is pinned explicitly: scikit-learn warns that its default is changing,
# and pinning keeps the clustering identical across library versions.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)
experiment_means['Cluster'] = kmeans.fit_predict(pca_result)

# Stratified hold-out: draw 20% of the experiments from every cluster.
# The index ('Experiment No.') must be preserved: it is the only place the
# experiment ids live. Resetting it would turn the ids into positions 0..k-1
# and the split below would select the wrong experiments.
interpolation_experiments = pd.concat(
    [
        cluster_experiments.sample(frac=0.2, random_state=42)
        for _, cluster_experiments in experiment_means.groupby('Cluster')
    ]
)
interpolation_experiment_ids = interpolation_experiments.index

# Sanity check: every selected id is a real experiment id.
assert interpolation_experiment_ids.isin(experiments).all()

# Split whole experiments, so no experiment contributes rows to both sets.
is_interpolation_row = data['Experiment No.'].isin(interpolation_experiment_ids)

# Interpolation set: all rows of the selected experiments.
interpolation_data = data[is_interpolation_row]

# Training set: all rows of the remaining experiments.
train_data = data[~is_interpolation_row]

# Sanity check: the two sets partition the data.
assert len(train_data) + len(interpolation_data) == len(data)

# Persist the interpolation (hold-out) set.
# This writes to the same file as the earlier row-level split and replaces it.
interpolation_data.to_excel(f'{root_path}/interpolation_data.xlsx', index=False)

# Persist the training set (same note: replaces the earlier file).
train_data.to_excel(f'{root_path}/train_data.xlsx', index=False)



  super()._check_params_vs_input(X, default_n_init=10)
