In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib
# 指定支持中文的字体，例如 'SimHei'
plt.rcParams['font.sans-serif'] = ['SimHei']
# 解决负号 '-' 显示为方块的问题
plt.rcParams['axes.unicode_minus'] = False

In [5]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# 加载数据集
df = pd.read_csv('system_log.csv')

# --- 数据预处理 ---

# 1. 将时间戳转换为datetime对象并排序
df['timestamp'] = pd.to_datetime(df['timestamp'])
df = df.sort_values('timestamp').reset_index(drop=True)

# 2. 识别数据采集的会话（Session）
time_diff = df['timestamp'].diff()
session_threshold = pd.Timedelta('1 minute')
df['session_id'] = (time_diff > session_threshold).cumsum()

# 3. 设置时间索引
# 这是最核心的改动。让所有操作都基于一个显式的时间索引。
df = df.set_index('timestamp')

# 4. 在每个会话内部进行滑动窗口处理
window_size = '10s' 
features_to_roll = [
    'mouse_left_click',
    'mouse_right_click',
    'mouse_scroll',
    'keyboard_counts',
    'mouse_distance',
    'bytes_sent_per_sec',
    'bytes_recv_per_sec',
    'packets_sent_per_sec',
    'packets_recv_per_sec',
    'read_bytes_per_sec',
    'write_bytes_per_sec'
]

for col in features_to_roll:
    new_col_name = f'{col}_freq'
    # 现在我们按session_id分组，并在时间索引上直接进行滚动计算
    # 注意，这里不再需要 on='timestamp' 参数，因为rolling操作默认就在索引上执行
    rolled_series = df.groupby('session_id')[col].rolling(window=window_size).sum()
    
    # 结果的索引是 (session_id, timestamp)，我们需要去掉第一层session_id来对齐
    df[new_col_name] = rolled_series.reset_index(level=0, drop=True)

df = df.reset_index()

# 6. 将时间戳转换为自第一个时间戳以来的总秒数
df['timestamp'] = (df['timestamp'] - df['timestamp'].min()).dt.total_seconds()

# 7. 对 'label' 列进行编码
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['label'])
print("标签分布情况:\n", df['label'].value_counts())
num_classes = df['label'].nunique()

# 9. 处理滑动窗口产生的NaN值（通常是每个会话最开始的几个点）
df.fillna(0, inplace=True)

# 保存处理后的数据
df.to_csv('processed_system.csv', index=False)

标签分布情况:
 label
1    1916
3    1466
2     649
0     585
Name: count, dtype: int64


In [None]:
# 分离特征 (X) 和目标 (y)
features_to_drop = [
    'label', 'timestamp', 'session_id',
    'mouse_distance', 'mouse_left_click', 'mouse_right_click', 'mouse_scroll', 'keyboard_counts',
    'bytes_sent_per_sec', 'bytes_recv_per_sec', 'packets_sent_per_sec',
    'packets_recv_per_sec', 'read_bytes_per_sec', 'write_bytes_per_sec'
]
X = df.drop(features_to_drop, axis=1)
y = df['label']

print("用于训练的特征列信息:")
X.info()

# 将数据拆分为训练集和测试集
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

--- 数据集划分信息 ---
训练集大小: 3150
验证集大小: 0
测试集大小: 1466
--------------------
总会话数: 5
训练集会话ID数量: 4
验证集会话ID数量: 0
测试集会话ID数量: 1


In [None]:
model = XGBClassifier(
    objective='multi:softprob',  # 明确指定为多分类任务
    num_class=num_classes,       # 明确告知类别的数量
    eval_metric='mlogloss'       
)
# 评估方法一: 5折交叉验证
print("--- 1. 执行5折交叉验证 ---")
# 使用 cross_val_score 函数进行交叉验证，cv=5 表示5折
cv_scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
print(f"交叉验证的每次准确率分数: {cv_scores}")
print(f"交叉验证的平均准确率: {np.mean(cv_scores):.4f} (+/- {np.std(cv_scores):.4f})\n")
# 在训练集上训练模型，用于后续的评估
model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)

# 在测试集上进行预测
y_pred = model.predict(X_test)

--- 1. 执行基于会话分组的5折交叉验证 ---


2 fits failed out of a total of 5.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "e:\anaconda3\envs\esp_robot_env\Lib\site-packages\sklearn\model_selection\_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "e:\anaconda3\envs\esp_robot_env\Lib\site-packages\xgboost\core.py", line 729, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^
  File "e:\anaconda3\envs\esp_robot_env\Lib\site-packages\xgboost\sklearn.py", line 1641, in fit
    raise ValueError(
ValueError: Invalid classes inferred from unique values of `y`.  Expected: [0 1 2], got [0 1 3]

-----------------------------------------------

交叉验证的每次准确率分数: [0.99154746 0.                nan        nan 0.98412698]
交叉验证的平均准确率: nan (+/- nan)



ValueError: could not convert string to float: '-nan(ind)'

In [None]:
# 评估方法二: 分类报告
print("--- 2. 分类报告 ---")
# 获取原始的类别名称，用于报告显示
target_names = label_encoder.classes_
# 打印每个类别的精确率、召回率和F1分数
print(classification_report(y_test, y_pred, target_names=target_names))

--- 2. 分类报告 ---
              precision    recall  f1-score   support

      coding       1.00      1.00      1.00        52
      gaming       1.00      1.00      1.00       193
        idle       1.00      1.00      1.00        74
       video       1.00      1.00      1.00       143

    accuracy                           1.00       462
   macro avg       1.00      1.00      1.00       462
weighted avg       1.00      1.00      1.00       462



In [None]:
# 评估方法三: 混淆矩阵可视化
print("--- 3. 生成混淆矩阵图 (confusion_matrix.png) ---")
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=target_names, yticklabels=target_names)
plt.title('混淆矩阵 (Confusion Matrix)', fontsize=16)
plt.ylabel('真实标签 (Actual Label)', fontsize=12)
plt.xlabel('预测标签 (Predicted Label)', fontsize=12)
# 保存图像到文件
plt.savefig('confusion_matrix.png')
plt.close() # 关闭图像，防止显示混乱
print("混淆矩阵图已保存。\n")

--- 3. 生成混淆矩阵图 (confusion_matrix.png) ---
混淆矩阵图已保存。



In [None]:
# 评估方法四: 特征重要性可视化
print("--- 4. 生成特征重要性图 (feature_importance.png) ---")
feature_importances = model.feature_importances_
features = X.columns
importance_df = pd.DataFrame({'Feature': features, 'Importance': feature_importances})
# 按重要性得分降序排列
importance_df = importance_df.sort_values(by='Importance', ascending=False)
plt.figure(figsize=(12, 6))
sns.barplot(x='Importance', y='Feature', data=importance_df, palette='viridis')
plt.title('特征重要性 (Feature Importance)', fontsize=16)
plt.xlabel('重要性得分 (Importance Score)', fontsize=12)
plt.ylabel('特征 (Features)', fontsize=12)
plt.tight_layout() # 调整布局以防标签重叠
# 保存图像到文件
plt.savefig('feature_importance.png')
plt.close() # 关闭图像
print("特征重要性图已保存。\n")

print("所有评估已完成。")

--- 4. 生成特征重要性图 (feature_importance.png) ---



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x='Importance', y='Feature', data=importance_df, palette='viridis')


特征重要性图已保存。

所有评估已完成。


In [None]:
# 保存模型到文件
joblib.dump(model, 'xgboost_model.joblib')

# 保存 LabelEncoder 到文件
joblib.dump(label_encoder, 'label_encoder.joblib')

print("模型已保存为 'xgboost_model.joblib'")
print("编码器已保存为 'label_encoder.joblib'")

模型已保存为 'xgboost_model.joblib'
编码器已保存为 'label_encoder.joblib'
