In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib
# 指定支持中文的字体，例如 'SimHei'
plt.rcParams['font.sans-serif'] = ['SimHei']
# 解决负号 '-' 显示为方块的问题
plt.rcParams['axes.unicode_minus'] = False

In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# 加载数据集
df = pd.read_csv('system_log.csv')

# --- 数据预处理 ---

# 1. 将时间戳转换为datetime对象并排序
df['timestamp'] = pd.to_datetime(df['timestamp'])
df = df.sort_values('timestamp').reset_index(drop=True)

# 2. 识别数据采集的会话（Session）
time_diff = df['timestamp'].diff()
session_threshold = pd.Timedelta('1 minute')
df['session_id'] = (time_diff > session_threshold).cumsum()

# 3. 【关键修正】将'timestamp'设置成索引
# 这是最核心的改动。让所有操作都基于一个显式的时间索引。
df = df.set_index('timestamp')

# 4. 在每个会话内部进行滑动窗口处理
window_size = '10s' 
features_to_roll = [
    'mouse_left_click',
    'mouse_right_click',
    'mouse_scroll',
    'keyboard_counts',
    'mouse_distance'
]

for col in features_to_roll:
    new_col_name = f'{col}_freq'
    # 现在我们按session_id分组，并在时间索引上直接进行滚动计算
    # 注意，这里不再需要 on='timestamp' 参数，因为rolling操作默认就在索引上执行
    rolled_series = df.groupby('session_id')[col].rolling(window=window_size).sum()
    
    # 结果的索引是 (session_id, timestamp)，我们需要去掉第一层session_id来对齐
    df[new_col_name] = rolled_series.reset_index(level=0, drop=True)

df = df.reset_index()

# 6. 将时间戳转换为自第一个时间戳以来的总秒数
df['timestamp'] = (df['timestamp'] - df['timestamp'].min()).dt.total_seconds()

# 7. 对 'label' 列进行编码
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['label'])
print("标签分布情况:\n", df['label'].value_counts())
num_classes = df['label'].nunique()

# 9. 处理滑动窗口产生的NaN值（通常是每个会话最开始的几个点）
df.fillna(0, inplace=True)

print("处理后的数据预览:")
print(df[['timestamp', 'session_id', 'keyboard_counts', 'keyboard_counts_freq']].tail(20))
df.to_csv('processed_system.csv', index=False)

标签分布情况:
 label
3    3923
1    2390
2    1901
0    1019
Name: count, dtype: int64
处理后的数据预览:
      timestamp  session_id  keyboard_counts  keyboard_counts_freq
9213   351773.0          11                0                   0.0
9214   351774.0          11                0                   0.0
9215   351775.0          11                0                   0.0
9216   351776.0          11                0                   0.0
9217   351777.0          11                0                   0.0
9218   351778.0          11                0                   0.0
9219   351779.0          11                0                   0.0
9220   351780.0          11                0                   0.0
9221   351781.0          11                0                   0.0
9222   351782.0          11                0                   0.0
9223   351783.0          11                0                   0.0
9224   351784.0          11                0                   0.0
9225   351785.0          11           

In [3]:
# 分离特征 (X) 和目标 (y)
X = df.drop(['label','timestamp','mouse_distance','mouse_left_click','mouse_right_click','mouse_scroll','keyboard_counts','session_id'], axis=1)
y = df['label']
X.info()
# 将数据拆分为训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9233 entries, 0 to 9232
Data columns (total 9 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   cpu_percent             9233 non-null   float64
 1   ram_percent             9233 non-null   float64
 2   gpu_percent             9233 non-null   float64
 3   gpu_vram_percent        9233 non-null   float64
 4   mouse_left_click_freq   9233 non-null   float64
 5   mouse_right_click_freq  9233 non-null   float64
 6   mouse_scroll_freq       9233 non-null   float64
 7   keyboard_counts_freq    9233 non-null   float64
 8   mouse_distance_freq     9233 non-null   float64
dtypes: float64(9)
memory usage: 649.3 KB


In [4]:
model = XGBClassifier(
    objective='multi:softprob',  # 明确指定为多分类任务
    num_class=num_classes,       # 明确告知类别的数量
    eval_metric='mlogloss'       
)
# 评估方法一: 5折交叉验证
print("--- 1. 执行5折交叉验证 ---")
# 使用 cross_val_score 函数进行交叉验证，cv=5 表示5折
cv_scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
print(f"交叉验证的每次准确率分数: {cv_scores}")
print(f"交叉验证的平均准确率: {np.mean(cv_scores):.4f} (+/- {np.std(cv_scores):.4f})\n")
# 在训练集上训练模型，用于后续的评估
model.fit(X_train, y_train)

# 在测试集上进行预测
y_pred = model.predict(X_test)

--- 1. 执行5折交叉验证 ---
交叉验证的每次准确率分数: [0.97401191 0.98267461 0.99729291 0.98699892 0.99187432]
交叉验证的平均准确率: 0.9866 (+/- 0.0080)



In [5]:
# 评估方法二: 分类报告
print("--- 2. 分类报告 ---")
# 获取原始的类别名称，用于报告显示
target_names = label_encoder.classes_
# 打印每个类别的精确率、召回率和F1分数
print(classification_report(y_test, y_pred, target_names=target_names))

--- 2. 分类报告 ---
              precision    recall  f1-score   support

      coding       0.99      0.98      0.99       192
      gaming       1.00      1.00      1.00       489
        idle       1.00      1.00      1.00       384
       video       1.00      1.00      1.00       782

    accuracy                           1.00      1847
   macro avg       1.00      1.00      1.00      1847
weighted avg       1.00      1.00      1.00      1847



In [6]:
# 评估方法三: 混淆矩阵可视化
print("--- 3. 生成混淆矩阵图 (confusion_matrix.png) ---")
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=target_names, yticklabels=target_names)
plt.title('混淆矩阵 (Confusion Matrix)', fontsize=16)
plt.ylabel('真实标签 (Actual Label)', fontsize=12)
plt.xlabel('预测标签 (Predicted Label)', fontsize=12)
# 保存图像到文件
plt.savefig('confusion_matrix.png')
plt.close() # 关闭图像，防止显示混乱
print("混淆矩阵图已保存。\n")

--- 3. 生成混淆矩阵图 (confusion_matrix.png) ---
混淆矩阵图已保存。



In [7]:
# 评估方法四: 特征重要性可视化
print("--- 4. 生成特征重要性图 (feature_importance.png) ---")
feature_importances = model.feature_importances_
features = X.columns
importance_df = pd.DataFrame({'Feature': features, 'Importance': feature_importances})
# 按重要性得分降序排列
importance_df = importance_df.sort_values(by='Importance', ascending=False)
plt.figure(figsize=(12, 6))
sns.barplot(x='Importance', y='Feature', data=importance_df, palette='viridis')
plt.title('特征重要性 (Feature Importance)', fontsize=16)
plt.xlabel('重要性得分 (Importance Score)', fontsize=12)
plt.ylabel('特征 (Features)', fontsize=12)
plt.tight_layout() # 调整布局以防标签重叠
# 保存图像到文件
plt.savefig('feature_importance.png')
plt.close() # 关闭图像
print("特征重要性图已保存。\n")

print("所有评估已完成。")

--- 4. 生成特征重要性图 (feature_importance.png) ---



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x='Importance', y='Feature', data=importance_df, palette='viridis')


特征重要性图已保存。

所有评估已完成。


In [8]:
# 保存模型到文件
joblib.dump(model, 'xgboost_model.joblib')

# 保存 LabelEncoder 到文件
joblib.dump(label_encoder, 'label_encoder.joblib')

print("模型已保存为 'xgboost_model.joblib'")
print("编码器已保存为 'label_encoder.joblib'")

模型已保存为 'xgboost_model.joblib'
编码器已保存为 'label_encoder.joblib'
