In [2]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
import os
import numpy as np
import ast
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, accuracy_score, classification_report
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor


def get_files_by_keywords(directory, keywords):
    # 遍历指定文件夹，根据关键字数组提取文件。
    matched_files = []
    for filename in os.listdir(directory):
        if all(keyword in filename for keyword in keywords):
            matched_files.append(filename)
    return matched_files


def scaler_data(data_to_normalize, columns_to_normalize=["eirp", "seq_time"]):
    # 归一化数值列
    scaler = MinMaxScaler()
    data_scaled = data_to_normalize.copy()
    for column in columns_to_normalize:
        data_scaled[column] = data_scaled[column].astype(float)
    data_scaled.loc[:, columns_to_normalize] = scaler.fit_transform(data_scaled.loc[:, columns_to_normalize])
    return data_scaled


def process_array_string(array_string):
    # 去掉最大值、最小值并计算平均值
    # 如果输入是单个值，则直接返回
    if isinstance(array_string, (int, float)):
        return array_string
    # 将字符串转换为列表
    values = ast.literal_eval(array_string)
    # 如果列表长度大于2，去掉最大值和最小值
    if len(values) > 2:
        values.remove(max(values))
        values.remove(min(values))
    mean_value = np.mean(values)
    return mean_value


##### 修改这里 #####
ap_name = "3ap"
project_root = "/Users/daylight/Desktop/macos/1Code/Competition/2024ShuMo"
data_dir = project_root + "/data/results/question1_add_column"
training_data_names = get_files_by_keywords(data_dir, ["training", ap_name, "csv"])
training_data_all_path = project_root + f"/data/processed/training_data_{ap_name}.csv"
os.makedirs(os.path.dirname(training_data_all_path), exist_ok=True)
training_data_names = sorted(training_data_names)

#### 读取所有训练数据 #####
training_data_all = pd.DataFrame()
# 初始化一个变量来追踪全局的最大 test_id
current_max_test_id = 0
file_split_id = []  # 记录分隔文件的id位置
for file in training_data_names:
    file_path = os.path.join(data_dir, file)
    df = pd.read_csv(file_path)
    # 获取当前合并DataFrame中的最大test_id，如果为空则设置为0
    if not training_data_all.empty:
        max_test_id = training_data_all["test_id"].max()
    else:
        max_test_id = 0
    # 调整新df的test_id，保证test_id连续递增
    df["test_id"] = df["test_id"] + max_test_id
    # 将当前DataFrame追加到总的training_data_all中
    training_data_all = pd.concat([training_data_all, df], ignore_index=True)
    file_split_id.append(training_data_all["test_id"].max())

columns_class = ["ap_id", "sta_id"]
columns_numerical = ["eirp", "nav", "add_change", "predict seq_time"]
columns_basic = ["test_id", "seq_time", "nss", "mcs", "protocol"] + columns_numerical
# protocol_name = ["tcp", "udp"]

### 提取对应的列rssi ###
ap_0_sta_0 = ["ap_from_ap_1_mean_ant_rssi", "ap_from_ap_2_mean_ant_rssi", "sta_to_ap_0_mean_ant_rssi", "sta_to_ap_1_mean_ant_rssi", "sta_to_ap_2_mean_ant_rssi", "sta_from_ap_0_mean_ant_rssi", "sta_from_ap_1_mean_ant_rssi", "sta_from_ap_2_mean_ant_rssi", "sta_from_sta_1_rssi", "sta_from_sta_2_rssi"]
ap_1_sta_1 = ["ap_from_ap_0_mean_ant_rssi", "ap_from_ap_2_mean_ant_rssi", "sta_to_ap_0_mean_ant_rssi", "sta_to_ap_1_mean_ant_rssi", "sta_to_ap_2_mean_ant_rssi", "sta_from_ap_0_mean_ant_rssi", "sta_from_ap_1_mean_ant_rssi", "sta_from_ap_2_mean_ant_rssi", "sta_from_sta_0_rssi", "sta_from_sta_2_rssi"]
ap_2_sta_2 = ["ap_from_ap_0_mean_ant_rssi", "ap_from_ap_1_mean_ant_rssi", "sta_to_ap_0_mean_ant_rssi", "sta_to_ap_1_mean_ant_rssi", "sta_to_ap_2_mean_ant_rssi", "sta_from_ap_0_mean_ant_rssi", "sta_from_ap_1_mean_ant_rssi", "sta_from_ap_2_mean_ant_rssi", "sta_from_sta_0_rssi", "sta_from_sta_1_rssi"]

training_data_all_ap_0 = training_data_all.loc[training_data_all["ap_id"] == "ap_0"].copy()
for i, column in enumerate(ap_0_sta_0):
    training_data_all_ap_0[column] = training_data_all_ap_0[column].apply(process_array_string)

training_data_all_ap_1 = training_data_all.loc[training_data_all["ap_id"] == "ap_1"].copy()
for i, column in enumerate(ap_1_sta_1):
    training_data_all_ap_1[ap_0_sta_0[i]] = training_data_all_ap_1[column].apply(process_array_string)

training_data_all_ap_2 = training_data_all.loc[training_data_all["ap_id"] == "ap_2"].copy()
for i, column in enumerate(ap_2_sta_2):
    training_data_all_ap_2[ap_0_sta_0[i]] = training_data_all_ap_2[column].apply(process_array_string)

training_data_all_processed = pd.concat([training_data_all_ap_0[columns_basic + ap_0_sta_0], training_data_all_ap_1[columns_basic + ap_0_sta_0]], ignore_index=True)
training_data_all_processed = pd.concat([training_data_all_processed, training_data_all_ap_2[columns_basic + ap_0_sta_0]], ignore_index=True)


######## 训练模型 #######
training_data = training_data_all_processed.loc[:, columns_basic + ap_0_sta_0].copy()
# 编码非数值变量
training_data_encoded = pd.get_dummies(training_data, columns=["protocol"])

# 创建新的联合类标签 (nss 和 mcs 组合成一组类)
training_data_encoded["nss_mcs"] = training_data_encoded["nss"].astype(str) + "_" + training_data_encoded["mcs"].astype(str)

# 拼接向量
X = training_data_encoded[["eirp", "nav"] + ap_0_sta_0 + [col for col in training_data_encoded.columns if col.startswith("protocol_")]]
y = training_data_encoded["nss_mcs"]
# 移除只有 1 个样本的类别
unique, counts = np.unique(y, return_counts=True)
class_counts = dict(zip(unique, counts))
classes_to_keep = [label for label, count in class_counts.items() if count > 1]

# 创建掩码，过滤掉样本较少的类别
mask = np.isin(y, classes_to_keep)
X_filtered = X[mask]
y_filtered = y[mask]

# 使用 LabelEncoder 将 y 中的字符串标签转换为整数编码
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y_filtered)

# 将数据分为训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X_filtered, y_encoded, test_size=0.2, stratify=y_encoded)

# 标准化特征
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

#### 模型训练 ####
# 随机森林
# model = RandomForestClassifier(random_state=42)
# XGBoost
model = xgb.XGBClassifier(objective="multi:softmax", num_class=14, random_state=42)
# 线性
# model = LogisticRegression(multi_class='multinomial', solver='lbfgs', random_state=42)
# 支持向量机
# model = SVC(kernel='rbf', random_state=42)
# 朴素贝叶斯
# model = GaussianNB()
# model = MLPClassifier(hidden_layer_sizes=(128, 64), max_iter=1000, random_state=42)

# 训练模型
model.fit(X_train_scaled, y_train)
# 对测试集进行预测
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print(f"Classification Accuracy: {accuracy}")

# 生成分类报告
y_test_labels = label_encoder.inverse_transform(y_test)
y_pred_labels = label_encoder.inverse_transform(y_pred)
unique_labels = sorted(list(set(np.unique(y_test_labels)) | set(np.unique(y_pred_labels))))
report = classification_report(y_test_labels, y_pred_labels, labels=unique_labels, target_names=[str(label) for label in unique_labels])
print(report)
# model.classes_

Classification Accuracy: 0.7151162790697675
              precision    recall  f1-score   support

         1_0       0.00      0.00      0.00         1
         1_5       0.00      0.00      0.00         2
        2_10       0.67      0.50      0.57         8
        2_11       0.92      0.92      0.92        71
         2_3       0.33      1.00      0.50         1
         2_4       0.72      0.62      0.67        21
         2_5       0.57      0.50      0.53        16
         2_6       0.54      0.78      0.64         9
         2_7       0.67      0.46      0.55        13
         2_8       0.53      0.56      0.55        16
         2_9       0.59      0.71      0.65        14

    accuracy                           0.72       172
   macro avg       0.50      0.55      0.51       172
weighted avg       0.73      0.72      0.72       172



In [6]:
####### 添加了传输方式变换列之后的 test_set 文件夹 。 自动识别 set_2#######
test_data_names = get_files_by_keywords(data_dir, [ap_name, "csv"])
test_data_names = sorted(test_data_names)
question2_results_dir = project_root + "/data/results/question2"
os.makedirs(question2_results_dir, exist_ok=True)
for test_data_name in test_data_names:
    file_path = os.path.join(data_dir, test_data_name)
    test_data_all = pd.read_csv(file_path)

    test_data_ap_0 = test_data_all.loc[test_data_all["ap_id"] == "ap_0"].copy()
    for i, column in enumerate(ap_0_sta_0):
        test_data_ap_0[column] = test_data_ap_0[column].apply(process_array_string)

    test_data_ap_1 = test_data_all.loc[test_data_all["ap_id"] == "ap_1"].copy()
    for i, column in enumerate(ap_1_sta_1):
        test_data_ap_1[ap_0_sta_0[i]] = test_data_ap_1[column].apply(process_array_string)

    test_data_ap_2 = test_data_all.loc[test_data_all["ap_id"] == "ap_2"].copy()
    for i, column in enumerate(ap_2_sta_2):
        test_data_ap_2[ap_0_sta_0[i]] = test_data_ap_2[column].apply(process_array_string)

    test_data_processed = pd.concat([test_data_ap_0[columns_basic + ap_0_sta_0], test_data_ap_1[columns_basic + ap_0_sta_0]], ignore_index=True)
    test_data_processed = pd.concat([test_data_processed, test_data_ap_2[columns_basic + ap_0_sta_0]], ignore_index=True)

    ####### 预测数据 ########
    test_data = test_data_processed.loc[:, columns_basic + ap_0_sta_0].copy()
    # 编码非数值变量
    test_data_encoded = pd.get_dummies(test_data, columns=["protocol"])
    X_test_data = test_data_encoded[columns_numerical + ap_0_sta_0 + [col for col in test_data_encoded.columns if col.startswith("protocol_")]]
    X_test_data = X_test_data[X_train.columns]

    # print(f"########## {test_data_name} ###########")
    # print(X_test_data.info())

    # 对测试数据进行归一化（使用与训练集相同的 scaler）
    X_test_final_scaled = scaler.transform(X_test_data)

    # 使用训练好的模型进行预测
    y_test_pred = model.predict(X_test_final_scaled)
    y_test_pred_labels = label_encoder.inverse_transform(y_test_pred)

    y_pred_df = pd.DataFrame([label.split("_") for label in y_test_pred_labels], columns=["nss", "mcs"])

    # 输出预测结果
    test_data_all[["predict nss", "predict mcs"]] = y_pred_df[["nss", "mcs"]]
    test_data_all.to_csv(f"{question2_results_dir}/{test_data_name}", index=False)

    # plt.figure(figsize=(10, 3))
    # plt.plot(np.arange(len(test_data["predict seq_time"])), test_data["predict seq_time"], label="predict seq_time")
    # plt.plot(np.arange(len(test_data["ap_from_ap_1_mean_ant_rssi"])), test_data["ap_from_ap_1_mean_ant_rssi"], label="ap_from_ap_1_mean_ant_rssi")
    # plt.legend()
    # plt.show()

########## test_set_1_3ap.csv ###########
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105 entries, 0 to 104
Data columns (total 14 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   eirp                         105 non-null    int64  
 1   nav                          105 non-null    int64  
 2   ap_from_ap_1_mean_ant_rssi   105 non-null    float64
 3   ap_from_ap_2_mean_ant_rssi   105 non-null    float64
 4   sta_to_ap_0_mean_ant_rssi    105 non-null    float64
 5   sta_to_ap_1_mean_ant_rssi    105 non-null    float64
 6   sta_to_ap_2_mean_ant_rssi    105 non-null    float64
 7   sta_from_ap_0_mean_ant_rssi  105 non-null    float64
 8   sta_from_ap_1_mean_ant_rssi  105 non-null    float64
 9   sta_from_ap_2_mean_ant_rssi  105 non-null    float64
 10  sta_from_sta_1_rssi          105 non-null    float64
 11  sta_from_sta_2_rssi          105 non-null    float64
 12  protocol_tcp                 105 non