In [7]:
import os
import pandas as pd

# Folder that holds the CSV files to merge.
folder_path = "./dataset/normal"  # <- replace with your actual path

# Collect the CSV file names. os.listdir() returns entries in arbitrary order,
# so sort them to make the row order of the merged frame reproducible.
csv_files = sorted(f for f in os.listdir(folder_path) if f.endswith(".csv"))

# pd.concat() raises an opaque "No objects to concatenate" ValueError on an
# empty list; fail with the same exception type but a message naming the folder.
if not csv_files:
    raise ValueError(f"No .csv files found in {folder_path!r}")

# Read every file and stack them into a single DataFrame with a fresh index.
df_list = [pd.read_csv(os.path.join(folder_path, f)) for f in csv_files]
combined_df = pd.concat(df_list, ignore_index=True)

# Done: combined_df is ready for the following steps.
print(f"✅ 已合併 {len(csv_files)} 個檔案，共 {len(combined_df)} 筆資料")

✅ 已合併 3 個檔案，共 6532 筆資料


In [10]:
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler, LabelEncoder

# `df` is an alias of `combined_df`, not a copy: the in-place fill and every
# column added below are therefore also visible through `combined_df`.
df = combined_df


# Fill missing values column by column with the defaults below; columns that
# are not listed are left untouched.
df.fillna({
    "protocol": "Other",
    "src_port": -1,
    "dst_port": -1,
    "packet_length": 0,
    "payload_len": 0,
    "ttl": 0,
    "tcp_flags_int": 0,
    "tcp_window": 0
}, inplace=True)

# Encode protocol to numeric values. The fitted encoder is kept in a variable
# because the protocol -> integer mapping lives only inside it; it is needed to
# encode new data the same way the training data was encoded.
protocol_encoder = LabelEncoder()
df["protocol_encoded"] = protocol_encoder.fit_transform(df["protocol"])

# Columns fed to the model, in the order the scaler and the model are fitted on.
feature_cols = [
    "protocol_encoded",
    "src_port",
    "dst_port",
    "packet_length",
    "payload_len",
    "ttl",
    "tcp_flags_int",
    "tcp_window"
]

# Extract the feature matrix and standardise it; `scaler` keeps the fitted
# statistics so the same transform can be applied later.
X = df[feature_cols]
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train Isolation Forest. fit_predict() labels each row -1 (outlier) or
# 1 (inlier).
# NOTE(review): contamination=0.1 sets the threshold so that about 10% of the
# training rows are flagged as outliers -- confirm this rate is intended.
model = IsolationForest(contamination=0.1, random_state=42)
df["prediction"] = model.fit_predict(X_scaled)

# Binary flag derived from the prediction: 1 for outliers (-1), 0 otherwise.
# Vectorised comparison instead of a row-wise apply(); same int64 result.
df["anomaly"] = (df["prediction"] == -1).astype("int64")


In [11]:
import joblib  # for model saving
# Write the fitted Isolation Forest and the fitted scaler to the current
# working directory, one joblib file each.
joblib.dump(value=model, filename="isolation_forest_model.joblib")
joblib.dump(value=scaler, filename="feature_scaler.joblib")

['feature_scaler.joblib']