In [5]:
# ============================================================================
# IMPORTS
# ============================================================================
import pandas as pd
import numpy as np
from pymfe.mfe import MFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
import joblib
import os
import time
import warnings
warnings.filterwarnings('ignore')

# ============================================================================
# CONFIGURATION
# ============================================================================
BASE_PATH = 'selector-data'

# Doorbell-data (Context 1)
DOORBELL_PATH = os.path.join(BASE_PATH, 'Doorbell-data')
DOORBELL_FILES = ['ack', 'benign_traffic', 'scan', 'syn', 'udp', 'udpplain']

# Other-devices-data (Context 2)
OTHER_PATH = os.path.join(BASE_PATH, 'Other-devices-data')
OTHER_FILES = [
    'BenignTraffic.pcap',
    'Mirai-greeth_flood2.pcap',
    'Mirai-greeth_flood3.pcap',
    'Mirai-greeth_flood4.pcap',
    'Mirai-greip_flood1.pcap',
    'Mirai-greip_flood2.pcap',
    'Mirai-greip_flood3.pcap',
    'Mirai-greip_flood4.pcap',
    'Mirai-udpplain.pcap',
    'Mirai-udpplain1.pcap',
    'Mirai-udpplain2.pcap',
    'Mirai-udpplain3.pcap',
    'Mirai-udpplain4.pcap'
]


CHUNK_SIZE = 5000          
STRIDE = 10000             

# ============================================================================
# FUNCTION: Extract meta-features using ONLY pymfe
# ============================================================================
def extract_metafeatures(df):
    """Compute pymfe "general" and "statistical" meta-features for one chunk.

    Args:
        df: DataFrame holding the rows of a single chunk.

    Returns:
        dict mapping meta-feature name to value, with every non-finite value
        replaced by 0.0. If extraction raises, a reduced dict with the keys
        ``nr_attr``, ``nr_inst`` and ``mean`` is returned instead. That
        fallback has a different key set than the pymfe result, so callers
        building a fixed-width matrix must account for it.
    """
    try:
        # Same extractor configuration as the prediction cell, so training
        # and inference describe data with identical settings.
        mfe = MFE(
            groups=["general", "statistical"],
            summary=["mean", "sd"],
            suppress_warnings=True,
        )

        # Unsupervised fit: no target vector is supplied.
        mfe.fit(df.values, None)
        feature_names, feature_values = mfe.extract()

        # pymfe reports features it could not compute as nan/inf.
        return {
            name: (value if np.isfinite(value) else 0.0)
            for name, value in zip(feature_names, feature_values)
        }

    except Exception as exc:
        # Best-effort fallback, but never silent: the reduced feature set
        # must be visible in the run log.
        print(
            f"\n  [extract_metafeatures] pymfe failed "
            f"({type(exc).__name__}: {exc}); using minimal fallback"
        )

        # numeric_only keeps the fallback itself from raising on text columns.
        if df.shape[1] > 0:
            overall_mean = df.mean(numeric_only=True).mean()
        else:
            overall_mean = 0.0
        if not np.isfinite(overall_mean):
            # No numeric columns or no rows: the mean is undefined.
            overall_mean = 0.0

        return {
            'nr_attr': df.shape[1],
            'nr_inst': len(df),
            'mean': overall_mean,
        }

# ============================================================================
# FUNCTION: Process file into chunks (with sampling)
# ============================================================================
def process_file_chunks(file_path, chunk_size, stride):
    """Sample fixed-size row windows from a CSV and extract meta-features.

    Windows of ``chunk_size`` rows start at row 0 and then every ``stride``
    rows. Only full windows are used, so a file with fewer than
    ``chunk_size`` rows produces no samples.

    Args:
        file_path: Path of the CSV file to read.
        chunk_size: Number of rows in each window.
        stride: Row offset between the starts of consecutive windows.

    Returns:
        Tuple ``(samples, coverage)``. ``samples`` is the list of
        meta-feature dicts, one per window. ``coverage`` is the percentage
        of the file's rows that fall inside at least one window (0.0 when no
        window fits, including an empty file).
    """
    df = pd.read_csv(file_path)
    file_size = len(df)

    samples = []
    window_starts = range(0, file_size - chunk_size + 1, stride)
    for n_done, start in enumerate(window_starts):
        if n_done % 5 == 0:  # Progress every 5 chunks
            print(".", end="", flush=True)
        samples.append(extract_metafeatures(df.iloc[start:start + chunk_size]))

    if not samples:
        # Nothing sampled; also avoids dividing by zero for an empty file.
        return samples, 0.0

    # Overlapping windows (stride < chunk_size) share rows, so each window
    # after the first only adds `stride` new rows.
    new_rows_per_window = min(stride, chunk_size)
    rows_covered = (len(samples) - 1) * new_rows_per_window + chunk_size
    coverage = min(100, (rows_covered / file_size) * 100)

    return samples, coverage

# ============================================================================
# FUNCTION: Process datasets with chunking
# ============================================================================
def process_datasets(file_list, base_path, label):
    """Build labelled meta-feature rows from every file of one context.

    Each name in ``file_list`` is looked up under ``base_path`` as given,
    then with ``.csv`` and ``.pcap.csv`` appended. Files that are found are
    windowed with the module-level ``CHUNK_SIZE`` / ``STRIDE`` settings.

    Args:
        file_list: Base names of the files to process.
        base_path: Directory that contains the files.
        label: Class label assigned to every sample of this context.

    Returns:
        Tuple ``(X_samples, y_labels)``: a list of feature-value lists and a
        list of labels of the same length. Samples whose meta-feature names
        differ from the most common name set seen in this call are dropped,
        so all returned rows have the same width.
    """
    from collections import Counter

    print(f"\n Processing: {label}")
    print("="*70)

    collected = []  # (feature-name tuple, feature-value list) per chunk
    total_files_coverage = []
    n_files = len(file_list)

    for idx, filename in enumerate(file_list, 1):
        # Try multiple extensions
        candidates = (
            os.path.join(base_path, filename),
            os.path.join(base_path, filename + '.csv'),
            os.path.join(base_path, filename + '.pcap.csv'),
        )
        file_path = next((p for p in candidates if os.path.exists(p)), None)

        if file_path is None:
            print(f"  [{idx}/{n_files}]   {filename} - Not found")
            continue

        print(f"  [{idx}/{n_files}] {filename}... ", end="", flush=True)
        try:
            samples, coverage = process_file_chunks(file_path, CHUNK_SIZE, STRIDE)
        except Exception as e:
            # One unreadable file must not abort the whole context.
            print(f"\n  [{idx}/{n_files}]  {filename}: {e}")
            continue

        total_files_coverage.append(coverage)
        collected.extend((tuple(meta), list(meta.values())) for meta in samples)
        print(f"  {len(samples)} chunks (~{coverage:.0f}% sampled)")

    X_samples = []
    if collected:
        # The extractor's fallback returns a reduced key set; mixing those
        # rows with full ones would make the feature matrix ragged.
        schema = Counter(names for names, _ in collected).most_common(1)[0][0]
        X_samples = [values for names, values in collected if names == schema]
        dropped = len(collected) - len(X_samples)
        if dropped:
            print(f"  Dropped {dropped} chunk(s) with a non-matching meta-feature set")
    y_labels = [label] * len(X_samples)

    avg_coverage = np.mean(total_files_coverage) if total_files_coverage else 0
    print(f"\n {label}: {len(y_labels)} samples | Avg coverage: ~{avg_coverage:.0f}%")
    return X_samples, y_labels

# ============================================================================
# TRAINING
# ============================================================================
print("="*70)
print(" TRAINING CONTEXT CLASSIFIER ")
print("="*70)

# Wall-clock start of the run; not reported anywhere in this cell.
total_start = time.time()

# Build one labelled sample set per context
X_doorbell, y_doorbell = process_datasets(DOORBELL_FILES, DOORBELL_PATH, 'Doorbell')
X_other, y_other = process_datasets(OTHER_FILES, OTHER_PATH, 'Other')

# Combine datasets
X = np.array(X_doorbell + X_other)
y = np.array(y_doorbell + y_other)

# count_nonzero also copes with the scalar result some NumPy versions give
# when an empty array is compared with a string.
n_doorbell = np.count_nonzero(y == 'Doorbell')
n_other = np.count_nonzero(y == 'Other')
print(f"\n Dataset generated: {X.shape}")
print(f"   Balance: Doorbell={n_doorbell}, Other={n_other}")

# Check for inf/nan
if not np.isfinite(X).all():
    print("\n  Cleaning inf/nan values...")
    X = np.nan_to_num(X, nan=0.0, posinf=1e10, neginf=-1e10)
    print("    Cleaned")

if len(X) < 10:
    print("\n ERROR: Too few samples generated")
else:
    # Normalize features.
    # NOTE(review): the scaler is fit on all samples before the split. A
    # per-feature affine rescaling should not change tree splits, so the
    # forest's scores are not expected to be biased by this; move the scaler
    # into a Pipeline if a scale-sensitive estimator is ever used here.
    print("\n Normalizing features...")
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Train/test split
    print(" Splitting train/test (80/20)...")
    X_train, X_test, y_train, y_test = train_test_split(
        X_scaled, y, test_size=0.2, stratify=y, random_state=42
    )
    print(f"   Train: {len(X_train)} | Test: {len(X_test)}")

    # Train Random Forest
    print("\n Training Random Forest...")
    model = RandomForestClassifier(
        n_estimators=100,
        max_depth=10,
        class_weight='balanced',
        random_state=42,
        n_jobs=-1
    )
    model.fit(X_train, y_train)
    print("   Complete")

    # Evaluate on test set
    print("\n TEST SET EVALUATION:")
    y_pred = model.predict(X_test)
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    # Cross-validation
    print("\n Cross-Validation (5-fold)...")
    scores = cross_val_score(model, X_scaled, y, cv=5, n_jobs=-1)
    print(f"   Accuracy: {scores.mean():.2%} ± {scores.std():.2%}")
    print(f"   Individual scores: {[f'{s:.1%}' for s in scores]}")

    # Feature importances. This cell never defines `feature_names`; it only
    # exists if another notebook cell left it in the kernel. Fall back to
    # positional names instead of failing with NameError.
    try:
        importance_names = list(feature_names)
    except NameError:
        importance_names = []
    if len(importance_names) != X.shape[1]:
        importance_names = [f"feature_{i}" for i in range(X.shape[1])]
    importances = pd.DataFrame({
        'feature': importance_names,
        'importance': model.feature_importances_
    }).sort_values('importance', ascending=False)
    print(importances.head(10).to_string(index=False))

    # Save model
    # NOTE(review): artifacts are written to the working directory, while the
    # prediction cell loads them from 'models/' — confirm which is intended.
    print("\n Saving model...")
    joblib.dump(model, 'device_selector_classifier_pymfe.pkl')
    joblib.dump(scaler, 'device_selector_scaler_pymfe.pkl')
    print("  device_selector_classifier_pymfe.pkl")
    print("  device_selector_scaler_pymfe.pkl")


print("\n" + "="*70)
print("COMPLETE")
print("="*70)



 TRAINING CONTEXT CLASSIFIER 

 Processing: Doorbell
  [1/6] ack... ..  10 chunks (~49% sampled)
  [2/6] benign_traffic... .  5 chunks (~50% sampled)
  [3/6] scan... ...  11 chunks (~51% sampled)
  [4/6] syn... ...  12 chunks (~49% sampled)
  [5/6] udp... .....  24 chunks (~50% sampled)
  [6/6] udpplain... ..  8 chunks (~49% sampled)

 Doorbell: 70 samples | Avg coverage: ~50%

 Processing: Other
  [1/13] BenignTraffic.pcap... ........  36 chunks (~50% sampled)
  [2/13] Mirai-greeth_flood2.pcap... .  3 chunks (~44% sampled)
  [3/13] Mirai-greeth_flood3.pcap... .  3 chunks (~44% sampled)
  [4/13] Mirai-greeth_flood4.pcap... .  3 chunks (~44% sampled)
  [5/13] Mirai-greip_flood1.pcap... .  4 chunks (~57% sampled)
  [6/13] Mirai-greip_flood2.pcap... .  3 chunks (~43% sampled)
  [7/13] Mirai-greip_flood3.pcap... .  4 chunks (~57% sampled)
  [8/13] Mirai-greip_flood4.pcap... .  4 chunks (~56% sampled)
  [9/13] Mirai-udpplain.pcap... .  4 chunks (~55% sampled)
  [10/13] Mirai-udpplain1.pcap.

In [3]:
# DEVICE CLASSIFIER - PREDICTION TEST

import pandas as pd
import numpy as np
import joblib
from pymfe.mfe import MFE
import warnings

# Suppress all warnings for cleaner output
warnings.filterwarnings('ignore')

# CONFIGURATION
MODEL_PATH = 'models/device_selector_classifier_pymfe.pkl'
SCALER_PATH = 'models/device_selector_scaler_pymfe.pkl'
TEST_FILE = 'Mirai-greip_flood19.pcap.csv'

# The training cell of this notebook extracts meta-features from 5000-row
# windows (its CHUNK_SIZE). Size-dependent meta-features computed on a whole
# file would lie outside what the classifier saw, so the test file is windowed
# the same way. Windows are non-overlapping so every full window is scored.
# NOTE(review): assumes the loaded model was trained with that window size.
PREDICT_CHUNK_SIZE = 5000

# LOAD MODEL AND PREPROCESSING PIPELINE
print("Loading trained model and scaler...")
# joblib.load unpickles arbitrary objects: only load artifacts you produced.
model = joblib.load(MODEL_PATH)
scaler = joblib.load(SCALER_PATH)
print("✓ Model loaded successfully\n")

# LOAD TEST DATA

print(f"Reading test file: {TEST_FILE}")
df = pd.read_csv(TEST_FILE)
print(f" Loaded dataset with {df.shape[0]} rows and {df.shape[1]} features\n")
if df.empty:
    raise ValueError(f"{TEST_FILE} contains no data to classify")

# EXTRACT META-FEATURES
print("Extracting meta-features...")
window_starts = range(0, len(df) - PREDICT_CHUNK_SIZE + 1, PREDICT_CHUNK_SIZE)
# A file shorter than one window is scored as a single, smaller window.
windows = [df.iloc[s:s + PREDICT_CHUNK_SIZE] for s in window_starts] or [df]

window_features = []
for window in windows:
    mfe = MFE(
        groups=["general", "statistical"],
        summary=["mean", "sd"],
        suppress_warnings=True  # Suppress pymfe internal warnings
    )

    # Fit and extract features
    mfe.fit(window.values, None)
    feature_names, feature_values = mfe.extract()

    # Clean infinite and NaN values
    feature_values = [0.0 if not np.isfinite(v) else v for v in feature_values]
    window_features.append(feature_values)
print(f"✓ Extracted {len(feature_values)} meta-features from {len(windows)} window(s)\n")

# NORMALIZE AND PREDICT
print("Running prediction...")

# Prepare input for model: one row per window
X_new = np.array(window_features)
X_scaled = scaler.transform(X_new)

# Average the per-window class probabilities and pick the most likely class
window_probabilities = model.predict_proba(X_scaled)
probabilities = window_probabilities.mean(axis=0)
prediction = model.classes_[probabilities.argmax()]
confidence = probabilities.max()

# DISPLAY RESULTS
print("\n" + "="*60)
print("PREDICTION RESULTS")
print("="*60)
print(f"\nPredicted Class: {prediction}")
print(f"Confidence: {confidence:.2%}")

Loading trained model and scaler...
✓ Model loaded successfully

Reading test file: Mirai-greip_flood19.pcap.csv
 Loaded dataset with 34844 rows and 39 features

Extracting meta-features...
✓ Extracted 48 meta-features

Running prediction...

PREDICTION RESULTS

Predicted Class: Other
Confidence: 100.00%
