In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib
# 指定支持中文的字体，例如 'SimHei'
plt.rcParams['font.sans-serif'] = ['SimHei']
# 解决负号 '-' 显示为方块的问题
plt.rcParams['axes.unicode_minus'] = False

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# 加载数据集
df = pd.read_csv('system_log.csv')

# --- 数据预处理 ---

# 1. 将时间戳转换为datetime对象并排序
df['timestamp'] = pd.to_datetime(df['timestamp'])
df = df.sort_values('timestamp').reset_index(drop=True)

# 2. 识别数据采集的会话（Session）
time_diff = df['timestamp'].diff()
session_threshold = pd.Timedelta('1 minute')
df['session_id'] = (time_diff > session_threshold).cumsum()

# 3. 【关键修正】将'timestamp'设置成索引
# 这是最核心的改动。让所有操作都基于一个显式的时间索引。
df = df.set_index('timestamp')

# 4. 在每个会话内部进行滑动窗口处理
window_size = '10s' 
features_to_roll = [
    'mouse_left_click',
    'mouse_right_click',
    'mouse_scroll',
    'keyboard_counts',
    'mouse_distance'
]

for col in features_to_roll:
    new_col_name = f'{col}_freq'
    # 现在我们按session_id分组，并在时间索引上直接进行滚动计算
    # 注意，这里不再需要 on='timestamp' 参数，因为rolling操作默认就在索引上执行
    rolled_series = df.groupby('session_id')[col].rolling(window=window_size).sum()
    
    # 结果的索引是 (session_id, timestamp)，我们需要去掉第一层session_id来对齐
    df[new_col_name] = rolled_series.reset_index(level=0, drop=True)

df = df.reset_index()

# 6. 将时间戳转换为自第一个时间戳以来的总秒数
df['timestamp'] = (df['timestamp'] - df['timestamp'].min()).dt.total_seconds()

# 7. 对 'label' 列进行编码
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['label'])
print("标签分布情况:\n", df['label'].value_counts())
num_classes = df['label'].nunique()

# 8. 用idle均值校准系统资源使用率
idle_value = label_encoder.transform(['idle'])[0]
is_idle = df['label'] == idle_value
idle_data = df[is_idle]

cpu_mean = idle_data['cpu_percent'].mean()
ram_mean = idle_data['ram_percent'].mean()
gpu_mean = idle_data['gpu_percent'].mean()
gpu_vram_mean = idle_data['gpu_vram_percent'].mean()

print(f"\nidle状态下RAM均值: {ram_mean:.4f}")
print(f"idle状态下CPU均值: {cpu_mean:.4f}")
print(f"idle状态下GPU均值: {gpu_mean:.4f}")
print(f"idle状态下GPU VRAM均值: {gpu_vram_mean:.4f}\n")

df['cpu_percent'] -= cpu_mean
df['ram_percent'] -= ram_mean
df['gpu_percent'] -= gpu_mean
df['gpu_vram_percent'] -= gpu_vram_mean

# 9. 处理滑动窗口产生的NaN值（通常是每个会话最开始的几个点）
df.fillna(0, inplace=True)

print("处理后的数据预览:")
print(df[['timestamp', 'session_id', 'keyboard_counts', 'keyboard_counts_freq']].tail(20))
df.to_csv('processed_system_log.csv', index=False)

标签分布情况:
 label
1    1191
3     562
2     534
0     421
Name: count, dtype: int64

idle状态下RAM均值: 59.5805
idle状态下CPU均值: 6.0687
idle状态下GPU均值: 26.4307
idle状态下GPU VRAM均值: 16.2869

处理后的数据预览:
      timestamp  session_id  keyboard_counts  keyboard_counts_freq
2688     8618.0           5                0                   0.0
2689     8619.0           5                0                   0.0
2690     8620.0           5                0                   0.0
2691     8621.0           5                0                   0.0
2692     8622.0           5                0                   0.0
2693     8623.0           5                0                   0.0
2694     8624.0           5                0                   0.0
2695     8625.0           5                0                   0.0
2696     8626.0           5                0                   0.0
2697     8627.0           5                0                   0.0
2698     8628.0           5                0                   0.0
2699     86

In [None]:
# 分离特征 (X) 和目标 (y)
X = df.drop(['label','timestamp','mouse_distance','mouse_left_click','mouse_right_click','mouse_scroll','keyboard_counts'], axis=1)
y = df['label']
X.info()
# 将数据拆分为训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2708 entries, 0 to 2707
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   mouse_distance     2708 non-null   float64
 1   mouse_left_click   2708 non-null   int64  
 2   mouse_right_click  2708 non-null   int64  
 3   mouse_scroll       2708 non-null   int64  
 4   keyboard_counts    2708 non-null   int64  
 5   cpu_percent        2708 non-null   float64
 6   ram_percent        2708 non-null   float64
 7   gpu_percent        2708 non-null   float64
 8   gpu_vram_percent   2708 non-null   float64
dtypes: float64(5), int64(4)
memory usage: 190.5 KB


In [None]:
model = XGBClassifier(
    objective='multi:softprob',  # 明确指定为多分类任务
    num_class=num_classes,       # 明确告知类别的数量
    eval_metric='mlogloss'       
)
# 评估方法一: 5折交叉验证
print("--- 1. 执行5折交叉验证 ---")
# 使用 cross_val_score 函数进行交叉验证，cv=5 表示5折
cv_scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
print(f"交叉验证的每次准确率分数: {cv_scores}")
print(f"交叉验证的平均准确率: {np.mean(cv_scores):.4f} (+/- {np.std(cv_scores):.4f})\n")
# 在训练集上训练模型，用于后续的评估
model.fit(X_train, y_train)

# 在测试集上进行预测
y_pred = model.predict(X_test)

--- 1. 执行5折交叉验证 ---
交叉验证的每次准确率分数: [0.9797048  1.         0.99077491 0.99630314 0.94454713]
交叉验证的平均准确率: 0.9823 (+/- 0.0201)



In [None]:
# 评估方法二: 分类报告
print("--- 2. 分类报告 ---")
# 获取原始的类别名称，用于报告显示
target_names = label_encoder.classes_
# 打印每个类别的精确率、召回率和F1分数
print(classification_report(y_test, y_pred, target_names=target_names))

--- 2. 分类报告 ---
              precision    recall  f1-score   support

      coding       0.98      0.98      0.98        89
      gaming       1.00      1.00      1.00       240
        idle       1.00      1.00      1.00       109
       video       0.98      0.97      0.98       104

    accuracy                           0.99       542
   macro avg       0.99      0.99      0.99       542
weighted avg       0.99      0.99      0.99       542



In [None]:
# 评估方法三: 混淆矩阵可视化
print("--- 3. 生成混淆矩阵图 (confusion_matrix.png) ---")
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=target_names, yticklabels=target_names)
plt.title('混淆矩阵 (Confusion Matrix)', fontsize=16)
plt.ylabel('真实标签 (Actual Label)', fontsize=12)
plt.xlabel('预测标签 (Predicted Label)', fontsize=12)
# 保存图像到文件
plt.savefig('confusion_matrix.png')
plt.close() # 关闭图像，防止显示混乱
print("混淆矩阵图已保存。\n")

--- 3. 生成混淆矩阵图 (confusion_matrix.png) ---
混淆矩阵图已保存。



In [None]:
# 评估方法四: 特征重要性可视化
print("--- 4. 生成特征重要性图 (feature_importance.png) ---")
feature_importances = model.feature_importances_
features = X.columns
importance_df = pd.DataFrame({'Feature': features, 'Importance': feature_importances})
# 按重要性得分降序排列
importance_df = importance_df.sort_values(by='Importance', ascending=False)
plt.figure(figsize=(12, 6))
sns.barplot(x='Importance', y='Feature', data=importance_df, palette='viridis')
plt.title('特征重要性 (Feature Importance)', fontsize=16)
plt.xlabel('重要性得分 (Importance Score)', fontsize=12)
plt.ylabel('特征 (Features)', fontsize=12)
plt.tight_layout() # 调整布局以防标签重叠
# 保存图像到文件
plt.savefig('feature_importance.png')
plt.close() # 关闭图像
print("特征重要性图已保存。\n")

print("所有评估已完成。")

--- 4. 生成特征重要性图 (feature_importance.png) ---
特征重要性图已保存。

所有评估已完成。



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x='Importance', y='Feature', data=importance_df, palette='viridis')


In [None]:
# 保存模型到文件
joblib.dump(model, 'xgboost_model.joblib')

# 保存 LabelEncoder 到文件
joblib.dump(label_encoder, 'label_encoder.joblib')

print("模型已保存为 'xgboost_model.joblib'")
print("编码器已保存为 'label_encoder.joblib'")

模型已保存为 'xgboost_model.joblib'
编码器已保存为 'label_encoder.joblib'
