# INSTALLATION OF DEPENDENCIES

In [4]:
# Install ---> https://npcap.com/#download
try:
    import nfstream
except ImportError:
    !pip install nfstream pandas xgboost scikit-learn tqdm

import pandas as pd
import glob
import os
import numpy as np
from nfstream import NFStreamer
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
import pickle

# 1. CONFIGURATION & DATASET PATHS

In [5]:
DATASET_DIRS = [
    # --- GOTHAM DATASET 2025 ---
    "./data/raw/benign",
    "./data/raw/malicious/merlin",
    
    # --- IOT-23 DATASET ---
    # "./IoT-23/CTU-IoT-Malware-Capture-34-1/", 
    # "./IoT-23/CTU-Honeypot-Capture-4-1/"
]

# Features selected based on Uğurlu et al. (2021) paper methodology.
# The paper emphasizes time-related features like IAT (Inter-Arrival Time) and packet sizes.
# We exclude IPs and Ports to force the model to learn behavior, not addresses.
EXCLUDED_COLS = [
    'id', 'src_ip', 'src_mac', 'src_oui', 'dst_ip', 'dst_mac', 'dst_oui',
    'src_port', 'dst_port', 'protocol', 'ip_version', 'vlan_id', 'tunnel_id',
    'bidirectional_first_seen_ms', 'bidirectional_last_seen_ms',
    'src2dst_first_seen_ms', 'src2dst_last_seen_ms',
    'dst2src_first_seen_ms', 'dst2src_last_seen_ms',
    'client_fingerprint', 'user_agent', 'content_type', 'requested_server_name',
    'application_category_name', 'application_is_guessed'
]

# 2. FEATURE EXTRACTION (FLOW-BASED)

In [6]:
def process_pcap_to_df(pcap_path):
    """
    Extracts flow features from a PCAP file using nfstream.
    
    Args:
        pcap_path (str): Path to the .pcap file.
        
    Returns:
        pd.DataFrame: DataFrame containing flow features and the detected protocol label.
    """
    try:
        # statistical_analysis=True computes the features (Mean, StdDev, Duration) 
        # required by the paper's methodology.
        streamer = NFStreamer(source=pcap_path, 
                              statistical_analysis=True, 
                              n_dissections=20) # n_dissections helps identify the protocol label
        
        df = streamer.to_pandas()
        
        if df.empty:
            return None
            
        # We filter out flows where the protocol could not be identified ('Unknown')
        # to ensure high-quality training data.
        df = df[~df['application_name'].str.contains("Unknown", na=False)]
        
        return df
        
    except Exception as e:
        print(f"[!] Error processing {os.path.basename(pcap_path)}: {e}")
        return None

# 3. DATASET GENERATION

In [7]:
print("[*] Starting Dataset Generation...")
print("[*] Reading PCAP files from Benign and C&C folders...")

dataframes = []

for folder in DATASET_DIRS:
    # Recursively find all .pcap files in the folder
    pcap_files = glob.glob(os.path.join(folder, "*.pcap"))
    
    if not pcap_files:
        print(f"[!] Warning: No pcap files found in {folder}")
        continue
        
    print(f" -> Processing {len(pcap_files)} files in {folder}...")
    
    # Process each file (Using a subset [:20] for speed demonstration. Remove [:20] for full training)
    for pcap_file in pcap_files: # <-- Remove limit for full dataset
        df = process_pcap_to_df(pcap_file)
        if df is not None:
            dataframes.append(df)

if not dataframes:
    raise ValueError("No valid data extracted. Please check your dataset paths.")

# Merge all processed flows into one big dataset
full_dataset = pd.concat(dataframes, ignore_index=True)

print(f"\n[+] Dataset Loaded Successfully!")
print(f"[+] Total Flows: {len(full_dataset)}")
print(f"[+] Detected Protocols (Classes): {full_dataset['application_name'].unique()}")

[*] Starting Dataset Generation...
[*] Reading PCAP files from Benign and C&C folders...
 -> Processing 78 files in ./data/raw/benign...
 -> Processing 7 files in ./data/raw/malicious/merlin...


  df = pd.read_csv(
  df = pd.read_csv(
  df = pd.read_csv(



[+] Dataset Loaded Successfully!
[+] Total Flows: 72143
[+] Detected Protocols (Classes): ['DNS' 'ICMPV6' 'NTP' 'ICMP' 'MQTT' 'COAP' 'DTLS' 'TLS' 'RTCP' 'RTP'
 'RTSP' 'Skype_Teams.Skype_TeamsCall' 'Teredo' 'SSH' 'BGP' 's7comm'
 'Telnet' 'Whois-DAS' 'FTP_DATA' 'SMTP' 'XDMCP' 'Kerberos' 'NetBIOS'
 'HTTP' 'POP3' 'IMAP' 'IRC' 'RPC' 'Z3950' 'LDAP' 'Modbus' 'SMBv23'
 'Syslog' 'SMTPS' 'IPSec' 'AFP' 'HTTP_Proxy' 'SAP' 'TINC' 'MySQL'
 'DoH_DoT' 'RSYNC' 'VMware' 'IMAPS' 'POPS' 'Starcraft' 'SOCKS' 'OpenVPN'
 'LotusNotes' 'Diameter' 'MsSQL-TDS' 'Citrix' 'H323' 'Radius'
 'FTP_CONTROL' 'OSPF' 'RTMP' 'NFS' 'CiscoSkinny' 'TiVoConnect' 'IEC60870'
 'SIP' 'TargusDataspeed' 'RDP' 'Viber' 'LLMNR' 'PostgreSQL' 'VNC' 'Oracle'
 'TeamViewer' 'IAX' 'CHECKMK' 'Munin' 'Redis' 'DNP3']


# 4. PREPROCESSING & TRAINING (XGBOOST)

In [9]:
print("\n[*] Preparing Data for XGBoost Training...")

# Select only numeric features (statistical features)
X = full_dataset.drop(columns=[c for c in EXCLUDED_COLS if c in full_dataset.columns])
# The target variable is 'application_name' (The Protocol, e.g., MQTT, HTTP)
# We drop it from X because that's what we want to predict
X = X.drop(columns=['application_name'], errors='ignore')
X = X.select_dtypes(include=[np.number])

# Handle Missing Values (Impute with 0)
X = X.fillna(0)

# Encode Target Labels (Strings -> Numbers)
y = full_dataset['application_name']
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split into Training (70%) and Testing (30%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.3, random_state=42, stratify=y_encoded)

print(f" -> Training Samples: {X_train.shape[0]}")
print(f" -> Testing Samples: {X_test.shape[0]}")
print(f" -> Features used: {X.shape[1]}")

print("\n[*] Training XGBoost Classifier...")
# We use XGBoost as it provided the best accuracy (94.53%) in the Uğurlu et al. paper.
model = XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    eval_metric='mlogloss',
    n_jobs=-1  # Use all CPU cores
)

model.fit(X_train, y_train)

print("[+] Model Training Complete!")


[*] Preparing Data for XGBoost Training...
 -> Training Samples: 50500
 -> Testing Samples: 21643
 -> Features used: 59

[*] Training XGBoost Classifier...
[+] Model Training Complete!


# 5. EVALUATION

In [11]:
print("\n[*] Evaluating Model Performance...")
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"\n=== OVERALL ACCURACY: {accuracy * 100:.4f}% ===")
print("\n=== DETAILED REPORT BY PROTOCOL ===")

all_labels_ids = range(len(label_encoder.classes_))
# Decode labels back to protocol names for the report
print(classification_report(
    y_test, 
    y_pred, 
    labels=all_labels_ids, 
    target_names=label_encoder.classes_,
    zero_division=0  
))


[*] Evaluating Model Performance...

=== OVERALL ACCURACY: 98.4475% ===

=== DETAILED REPORT BY PROTOCOL ===
                             precision    recall  f1-score   support

                        AFP       0.00      0.00      0.00         4
                        BGP       0.00      0.00      0.00        10
                    CHECKMK       0.00      0.00      0.00         3
                       COAP       1.00      1.00      1.00      1476
                CiscoSkinny       0.00      0.00      0.00         4
                     Citrix       0.00      0.00      0.00        10
                       DNP3       0.00      0.00      0.00         0
                        DNS       1.00      1.00      1.00      7992
                       DTLS       1.00      1.00      1.00       588
                   Diameter       0.00      0.00      0.00         7
                    DoH_DoT       0.00      0.00      0.00         4
                FTP_CONTROL       0.00      0.00      0.00   

# 6. INFERENCE FUNCTION (USE THIS FOR NEW FILES)

In [19]:
def identify_protocol(pcap_path):
    """
    Takes any raw PCAP (encrypted or not) and predicts the predominant protocol
    using the trained XGBoost model based on flow statistics.
    """
    print(f"\n[*] Analyzing: {pcap_path}")
    
    # 1. Extract Stats (Same way as training)
    streamer = NFStreamer(source=pcap_path, statistical_analysis=True)
    df_new = streamer.to_pandas()
    
    if df_new.empty:
        return "Error: Empty or unreadable PCAP."
        
    # 2. Align Features (Ensure columns match training data)
    X_new = df_new.reindex(columns=X.columns, fill_value=0).fillna(0)
    
    # 3. Predict
    preds = model.predict(X_new)
    pred_labels = label_encoder.inverse_transform(preds)
    
    # 4. Find most frequent protocol
    from collections import Counter
    counts = Counter(pred_labels)
    top_protocol, count = counts.most_common(1)[0]
    
    return f"Result: The file contains mostly '{top_protocol}' traffic ({count} flows)."

# Example Usage:
test_file = "./data/raw/malicious/mirai-dos/iotsim-building-monitor-1_0-0_to_OpenvSwitch-28_1-0.pcap"
print(identify_protocol(test_file))


[*] Analyzing: ./data/raw/malicious/mirai-dos/iotsim-building-monitor-1_0-0_to_OpenvSwitch-28_1-0.pcap


  df = pd.read_csv(


Result: The file contains mostly 'Skype_Teams.Skype_TeamsCall' traffic (1068117 flows).


# 6. SAVED MODEL

In [16]:
print("[*] Saving model...")

with open("xgboost_protocol_selector.pkl", "wb") as f:
    pickle.dump(model, f)

with open("label_encoder.pkl", "wb") as f:
    pickle.dump(label_encoder, f)

with open("feature_columns.pkl", "wb") as f:
    pickle.dump(list(X.columns), f)

print("[+] ¡SAVED!")

[*] Saving model...
[+] ¡SAVED!
