In [1]:
import os

import joblib  # for model saving
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Root data folder (it contains several sub-folders holding the CSV files).
root_folder = "./new_dataset/csv/"  # <- change this to your actual data folder

# One DataFrame per CSV file that could be read.
df_list = []

# Walk the folder tree and read every *.csv file found.
for subdir, dirs, files in os.walk(root_folder):
    for file in files:
        if file.endswith(".csv"):
            file_path = os.path.join(subdir, file)
            try:
                df = pd.read_csv(file_path)
                df["source_file"] = file  # optional: remember the file name
                df["source_folder"] = os.path.basename(subdir)  # optional: remember the folder name
                df_list.append(df)
            except Exception as e:
                # Best-effort load: report the unreadable file and keep going.
                print(f"⚠️ 無法讀取 {file_path}: {e}")

# Merge everything into a single table. Fail loudly when nothing was read:
# continuing would either hit a NameError on `combined_df` or, worse, silently
# reuse a stale `combined_df` left in the kernel by an earlier run.
if not df_list:
    raise ValueError("❌ 沒有讀取到任何 CSV 檔案")

combined_df = pd.concat(df_list, ignore_index=True)
print(f"✅ 合併完成：共 {len(df_list)} 個檔案，{len(combined_df)} 筆資料")

# To persist the merged table:
# combined_df.to_csv("merged_all_data.csv", index=False)

# Handle missing values (keys that are not columns of the frame are ignored).
combined_df = combined_df.fillna({
    "protocol": "Other",
    "src_port": -1,
    "dst_port": -1,
    "packet_length": 0,
    "payload_len": 0,
    "ttl": 0,
    "tcp_flags_int": 0,
    "tcp_window": 0,
    "global_delta_time": 0,
    "src_ip_delta_time": 0
})

# Point `df` at the merged table. Without this, `df` would still be the last
# per-file frame left over from the loading loop and the model below would be
# trained on a single CSV instead of the whole dataset. `ddf` is kept as an
# alias of the same object for any code that already refers to it.
df = ddf = combined_df

# Encode protocol to numeric values. Casting to str first keeps the encoder
# input uniformly typed even if some files stored the protocol as numbers.
df["protocol_encoded"] = LabelEncoder().fit_transform(df["protocol"].astype(str))

# Define features to use
feature_cols = [
    "protocol_encoded",
    "src_port",
    "dst_port",
    "packet_length",
    "payload_len",
    "ttl",
    "tcp_flags_int",
    "tcp_window",
    "global_delta_time",
    "src_ip_delta_time"
]

# Extract feature matrix and normalize
X = df[feature_cols]
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train Isolation Forest
model = IsolationForest(contamination=0.1, random_state=42)
df["prediction"] = model.fit_predict(X_scaled)
# Rows predicted as -1 are flagged anomaly=1, everything else 0 (vectorized
# instead of a per-row Python lambda).
df["anomaly"] = df["prediction"].eq(-1).astype("int64")

# Save model and scaler
joblib.dump(model, "isolation_forest_model.joblib")
joblib.dump(scaler, "feature_scaler.joblib")


  df = pd.read_csv(file_path)


✅ 合併完成：共 14 個檔案，2647923 筆資料


['feature_scaler.joblib']

In [2]:
import os
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
import joblib

# === 1. Recursively read every CSV file under the root folder ===
root_folder = "./new_dataset/cleaned_csv_simple_freq"
df_list = []

for folder, _subfolders, filenames in os.walk(root_folder):
    for filename in filenames:
        if not filename.endswith(".csv"):
            continue
        csv_path = os.path.join(folder, filename)
        try:
            frame = pd.read_csv(csv_path)
            # Tag every row with the file and folder it was loaded from.
            frame["source_file"] = filename
            frame["source_folder"] = os.path.basename(folder)
            df_list.append(frame)
        except Exception as err:
            # Report the unreadable file and carry on with the rest.
            print(f"⚠️ 無法讀取 {csv_path}: {err}")

# === 2. Merge all per-file frames into one table ===
if len(df_list) == 0:
    raise ValueError("❌ 沒有讀取到任何 CSV 檔案")

combined_df = pd.concat(df_list, ignore_index=True)
print(f"✅ 合併完成：共 {len(df_list)} 個檔案，{len(combined_df)} 筆資料")

# === 3. Fill missing values ===
# Column -> replacement used wherever that column is missing a value.
missing_value_defaults = {
    "src_port": -1,
    "dst_port": -1,
    "packet_length": 0,
    "payload_len": 0,
    "ttl": 0,
    "tcp_flags_int": 0,
    "tcp_window": 0,
    "src_ip_delta_time": 0,
    "log_src_ip_avg_freq": 0,
}
combined_df.fillna(missing_value_defaults, inplace=True)

# === 4. Feature columns (new version) ===
wanted_features = [
    "src_ip_delta_time",
    "src_port",
    "dst_port",
    "packet_length",
    "payload_len",
    "ttl",
    "tcp_flags_int",
    "tcp_window",
    "log_src_ip_avg_freq",
]

# Keep only the features that are actually present in the merged table.
present_columns = set(combined_df.columns)
feature_cols = [name for name in wanted_features if name in present_columns]

# === 5. Standardize the features ===
scaler = StandardScaler()
X = combined_df[feature_cols]
X_scaled = scaler.fit_transform(X)

# === 6. Train the Isolation Forest ===
model = IsolationForest(contamination=0.1, random_state=42)
combined_df["prediction"] = model.fit_predict(X_scaled)
# anomaly is 1 where the prediction is -1, otherwise 0.
combined_df["anomaly"] = combined_df["prediction"].eq(-1).astype("int64")

# === 7. Persist the model, the scaler and the labelled rows ===
joblib.dump(model, "isolation_forest_model.joblib")
joblib.dump(scaler, "feature_scaler.joblib")
combined_df.to_csv("packets_with_anomaly.csv", index=False)

print("✅ 模型訓練與結果輸出完成。")


  df = pd.read_csv(file_path)


✅ 合併完成：共 14 個檔案，2647923 筆資料
✅ 模型訓練與結果輸出完成。


In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
import joblib

# === 1. Recursively read every CSV file under the root folder ===
root_folder = "./new_dataset/cleaned_csv_simplelog_freq"
df_list = []

for current_dir, _child_dirs, names in os.walk(root_folder):
    folder_label = os.path.basename(current_dir)
    for name in (n for n in names if n.endswith(".csv")):
        csv_path = os.path.join(current_dir, name)
        try:
            loaded = pd.read_csv(csv_path)
            # Record provenance of each row: file name and containing folder.
            loaded["source_file"] = name
            loaded["source_folder"] = folder_label
            df_list.append(loaded)
        except Exception as err:
            # Report the unreadable file and carry on with the rest.
            print(f"⚠️ 無法讀取 {csv_path}: {err}")

# === 2. Merge all per-file frames into one table ===
if len(df_list) == 0:
    raise ValueError("❌ 沒有讀取到任何 CSV 檔案")

combined_df = pd.concat(df_list, ignore_index=True)
print(f"✅ 合併完成：共 {len(df_list)} 個檔案，{len(combined_df)} 筆資料")

# === 3. Derive the log-frequency column when it is not already there ===
has_raw_freq = "src_ip_avg_freq" in combined_df.columns
has_log_freq = "log_src_ip_avg_freq" in combined_df.columns
if has_raw_freq and not has_log_freq:
    # log1p(x) = log(1 + x)
    combined_df["log_src_ip_avg_freq"] = np.log1p(combined_df["src_ip_avg_freq"])

# === 4. Fill missing values ===
# Column -> replacement used wherever that column is missing a value.
missing_value_defaults = {
    "src_port": -1,
    "dst_port": -1,
    "packet_length": 0,
    "payload_len": 0,
    "ttl": 0,
    "tcp_flags_int": 0,
    "tcp_window": 0,
    "src_ip_delta_time": 0,
    "log_src_ip_avg_freq": 0,
}
combined_df.fillna(missing_value_defaults, inplace=True)

# === 5. Feature columns ===
wanted_features = [
    "src_ip_delta_time",
    "src_port",
    "dst_port",
    "packet_length",
    "payload_len",
    "ttl",
    "tcp_flags_int",
    "tcp_window",
    "log_src_ip_avg_freq",
]
# Keep only the features that are actually present in the merged table.
present_columns = set(combined_df.columns)
feature_cols = [name for name in wanted_features if name in present_columns]

# === 6. Standardize the features ===
scaler = StandardScaler()
X = combined_df[feature_cols]
X_scaled = scaler.fit_transform(X)

# === 7. Train the Isolation Forest ===
model = IsolationForest(contamination=0.1, random_state=42)
combined_df["prediction"] = model.fit_predict(X_scaled)
# anomaly is 1 where the prediction is -1, otherwise 0.
combined_df["anomaly"] = combined_df["prediction"].eq(-1).astype("int64")

# === 8. Persist the model, the scaler and the labelled rows ===
joblib.dump(model, "isolation_forest_model.joblib")
joblib.dump(scaler, "feature_scaler.joblib")
combined_df.to_csv("packets_with_anomaly.csv", index=False)

print("✅ 模型訓練與結果輸出完成（含 log frequency 特徵）。")


  df = pd.read_csv(file_path)


✅ 合併完成：共 16 個檔案，2650002 筆資料
✅ 模型訓練與結果輸出完成（含 log frequency 特徵）。


In [2]:
import os
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
import joblib

# === 1. Recursively read every CSV file under the root folder ===
root_folder = "./new_dataset/new_cleaned_csv_simplelog_freq"
df_list = []

for folder, _subfolders, filenames in os.walk(root_folder):
    for filename in filenames:
        if not filename.endswith(".csv"):
            continue
        csv_path = os.path.join(folder, filename)
        try:
            frame = pd.read_csv(csv_path)
            # Tag every row with the file and folder it was loaded from.
            frame["source_file"] = filename
            frame["source_folder"] = os.path.basename(folder)
            df_list.append(frame)
        except Exception as err:
            # Report the unreadable file and carry on with the rest.
            print(f"⚠️ 無法讀取 {csv_path}: {err}")

# === 2. Merge all per-file frames into one table ===
if len(df_list) == 0:
    raise ValueError("❌ 沒有讀取到任何 CSV 檔案")

combined_df = pd.concat(df_list, ignore_index=True)
print(f"✅ 合併完成：共 {len(df_list)} 個檔案，{len(combined_df)} 筆資料")

# === 3. Derive the log-frequency column when it is not already there ===
has_raw_freq = "src_ip_avg_freq" in combined_df.columns
has_log_freq = "log_src_ip_avg_freq" in combined_df.columns
if has_raw_freq and not has_log_freq:
    # log1p(x) = log(1 + x)
    combined_df["log_src_ip_avg_freq"] = np.log1p(combined_df["src_ip_avg_freq"])

# === 4. Fill missing values (only for the columns that are kept) ===
missing_value_defaults = {
    "src_ip_delta_time": 0,
    "log_src_ip_avg_freq": 0,
}
combined_df.fillna(missing_value_defaults, inplace=True)

# === 5. Features fed to the model, plus the IP columns kept in the output ===
feature_cols = [
    "src_ip_delta_time",
    "log_src_ip_avg_freq",
]
keep_columns = [
    "src_ip_delta_time",
    "log_src_ip_avg_freq",
    "src_ip",
    "dst_ip",
]
# Drop every other column; a missing kept column raises KeyError here.
combined_df = combined_df[keep_columns]

# === 6. Standardize the features ===
scaler = StandardScaler()
X = combined_df[feature_cols]
X_scaled = scaler.fit_transform(X)

# === 7. Train the Isolation Forest ===
model = IsolationForest(contamination=0.1, random_state=42)
combined_df["prediction"] = model.fit_predict(X_scaled)
# anomaly is 1 where the prediction is -1, otherwise 0.
combined_df["anomaly"] = combined_df["prediction"].eq(-1).astype("int64")

# === 8. Persist the model, the scaler and the labelled rows ===
joblib.dump(model, "isolation_forest_model.joblib")
joblib.dump(scaler, "feature_scaler.joblib")
combined_df.to_csv("packets_with_anomaly.csv", index=False)

print("✅ 模型訓練完成（僅使用 log frequency 與 src_ip_delta_time 特徵）。")


✅ 合併完成：共 16 個檔案，2650002 筆資料
✅ 模型訓練完成（僅使用 log frequency 與 src_ip_delta_time 特徵）。
