In [None]:
import pandas as pd
import random

# 文件路径
file_path = "merged_ieee123_128bus_voltage_data.csv"
output_file = "incremental_random_10000_groups.csv"

try:
    # 读取文件
    df = pd.read_csv(file_path)

    # 每组的行数
    rows_per_group = 128

    # 总行数和分组数量
    total_rows = len(df)
    total_groups = total_rows // rows_per_group

    # 检查是否有足够的组
    if total_groups < 10000:
        raise ValueError(f"数据中只有 {total_groups} 组，无法随机选择 10000 组。")

    # 创建组索引
    group_indices = list(range(total_groups))

    # 随机选择 10,000 个组
    random_indices = random.sample(group_indices, 10000)

    # 初始化输出文件（写入标题）
    df.iloc[:0].to_csv(output_file, index=False)  # 写入标题但不写入数据

    # 按组逐步写入数据
    for idx in random_indices:
        start_idx = idx * rows_per_group
        end_idx = start_idx + rows_per_group
        group_data = df.iloc[start_idx:end_idx]

        # 追加写入文件
        group_data.to_csv(output_file, index=False, mode='a', header=False)

    print(f"随机选择了 10000 组数据，共 {10000 * rows_per_group} 行。")
    print(f"结果已保存到 {output_file}")

except FileNotFoundError:
    print(f"文件 {file_path} 未找到，请检查路径！")
except ValueError as e:
    print(e)
except Exception as e:
    print(f"处理文件时出错：{e}")


  df = pd.read_csv(file_path)


随机选择了 10000 组数据，共 1280000 行。
结果已保存到 incremental_random_10000_groups.csv


In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

# 读取数据
file_path = 'merged_ieee123_128bus_voltage_data_test.csv'
data = pd.read_csv(file_path)

# 检查数据
print("数据预览：")
print(data.head())

# 提取特征列和标签列
features = ['Magnitude_1', 'Angle_1', 'Magnitude_2', 'Angle_2', 'Magnitude_3', 'Angle_3']
label = 'True Fault Location'

# 将数据分为每组128节点
num_nodes = 128
num_groups = data.shape[0] // num_nodes  # 计算总组数

# 初始化特征和标签
X = []  # 输入特征
y = []  # 标签（包括故障位置或无故障标志）

for group_id in range(num_groups):
    # 提取每组的128行数据
    group_data = data.iloc[group_id * num_nodes:(group_id + 1) * num_nodes]
    
    # 将128个节点的特征展平为一行作为输入
    X.append(group_data[features].values.flatten())
    
    # 判断是否为无故障组（所有标签均为零）
    fault_labels = group_data[label].values
    if fault_labels.sum() == 0:  # 无故障组
        y.append(num_nodes)  # 将无故障组标记为新类别（索引为128）
    else:
        y.append(np.argmax(fault_labels))  # 故障组，取故障节点位置索引

X = np.array(X)
y = np.array(y)

# 数据集划分
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 特征标准化
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# SVM模型训练

svm_model = SVC(kernel='linear', probability=True, random_state=42)
svm_model.fit(X_train, y_train)

# 模型预测
y_pred = svm_model.predict(X_test)

# 评估性能
print("模型性能：")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, zero_division=1))

# 显示部分结果
results = pd.DataFrame({
    "True_Label": y_test,
    "Predicted_Label": y_pred
})
print("预测结果：")
print(results)


  data = pd.read_csv(file_path)


数据预览：
    Bus  BaseKV  Node_1  Magnitude_1  Angle_1     pu_1  Node_2 Magnitude_2  \
0  150R    4.16       1     2160.210      0.0  0.89942       2      2161.6   
1   149    4.16       1     2160.190      0.0  0.89941       2      2161.6   
2     1    4.16       1      915.506    -44.0  0.38118       2     2807.26   
3     2    4.16       0        0.000      0.0  0.00000       2     2807.08   
4     3    4.16       0        0.000      0.0  0.00000       0         0.0   

   Angle_2    pu_2  Node_3 Magnitude_3  Angle_3     pu_3  Fault Location  \
0   -120.0     0.9       3     2161.61    120.0      0.9               1   
1   -120.0     0.9       3     2161.61    120.0      0.9               1   
2   -127.9  1.1688       3     2293.43    134.2  0.95489               1   
3   -127.9  1.1688       0         0.0      0.0      0.0               1   
4      0.0     0.0       3     2292.25    134.2   0.9544               1   

  Fault Type  Fault Resistance  Load Factor  True Fault Location  \


ValueError: Input X contains NaN.
SVC does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# 读取数据
file_path = 'merged_ieee123_128bus_voltage_data_test.csv'
data = pd.read_csv(file_path)

# 检查数据
print("数据预览：")
print(data.head())

# 提取特征列和标签列
features = ['Magnitude_1', 'Angle_1', 'Magnitude_2', 'Angle_2', 'Magnitude_3', 'Angle_3']
label = 'True Fault Location'

# 将数据分为每组128节点
num_nodes = 128
num_groups = data.shape[0] // num_nodes  # 计算总组数

# 初始化特征和标签
X = []  # 输入特征
y = []  # 标签（包括故障位置或无故障标志）

for group_id in range(num_groups):
    # 提取每组的128行数据
    group_data = data.iloc[group_id * num_nodes:(group_id + 1) * num_nodes]
    
    # 将128个节点的特征展平为一行作为输入
    X.append(group_data[features].values.flatten())
    
    # 判断是否为无故障组（所有标签均为零）
    fault_labels = group_data[label].values
    if fault_labels.sum() == 0:  # 无故障组
        y.append(num_nodes)  # 将无故障组标记为新类别（索引为128）
    else:
        y.append(np.argmax(fault_labels))  # 故障组，取故障节点位置索引

X = np.array(X)
y = np.array(y)

# 数据集划分

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 特征标准化
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# 随机森林模型训练
rf_model = RandomForestClassifier(n_estimators=300, random_state=42)
rf_model.fit(X_train, y_train)

# 模型预测
y_pred = rf_model.predict(X_test)

# 评估性能
print("模型性能：")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, zero_division=1))

# 显示部分结果
results = pd.DataFrame({
    "True_Label": y_test,
    "Predicted_Label": y_pred
})
print("预测结果：")
print(results)


  data = pd.read_csv(file_path)


ValueError: Input X contains NaN.
SVC does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [None]:
import pandas as pd

# 读取 CSV 文件
file_path = 'merged_ieee123_128bus_voltage_data_test.csv'  # 替换为您的文件路径
data = pd.read_csv(file_path)

# 检查是否存在 NaN 数据
if data.isnull().values.any():
    print("CSV 文件中存在 NaN 数据！")
    # 打印 NaN 数据的具体信息
    print("\n每列 NaN 数据的数量：")
    print(data.isnull().sum())
    print("\n包含 NaN 数据的行：")
    print(data[data.isnull().any(axis=1)])
else:
    print("CSV 文件中不存在 NaN 数据。")


  data = pd.read_csv(file_path)


CSV 文件中不存在 NaN 数据。
