In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import IsolationForest
from sklearn.covariance import EllipticEnvelope
from sklearn.neighbors import LocalOutlierFactor
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping
import logging
from typing import List, Dict, Tuple
import time
from datetime import datetime

class NetworkAnomalyDetector:
    """Ensemble anomaly detector for network-connection records.

    Combines three scikit-learn detectors (Isolation Forest, robust covariance
    via ``EllipticEnvelope`` and Local Outlier Factor) with a Keras
    autoencoder. A record's anomaly score is the fraction of the four
    detectors that flag it.

    Every input record is a dict that must provide the keys ``timestamp``,
    ``app_name``, ``remote_addr``, ``local_port`` and ``proto``.
    """

    def __init__(self, contamination: float = 0.1, random_state: int = 42):
        """Create the (unfitted) detectors.

        Args:
            contamination: Expected share of outliers, forwarded to all three
                scikit-learn detectors.
            random_state: Seed forwarded to the Isolation Forest and the
                robust-covariance estimator.
        """
        self.contamination = contamination
        self.random_state = random_state
        self.feature_names = None
        self.models = {
            'isolation_forest': IsolationForest(
                contamination=contamination,
                random_state=random_state,
                n_jobs=-1
            ),
            'robust_covariance': EllipticEnvelope(
                contamination=contamination,
                random_state=random_state
            ),
            # novelty=False keeps LOF in outlier-detection mode: it can only
            # score the batch it is fitted on (see predict()).
            'lof': LocalOutlierFactor(
                contamination=contamination,
                n_jobs=-1,
                novelty=False
            )
        }
        self.autoencoder = None
        self.preprocessor = None
        self.setup_logging()

    def setup_logging(self):
        """Configure root logging at INFO level and attach a module logger."""
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s'
        )
        self.logger = logging.getLogger(__name__)

    def _create_autoencoder(self, input_dim: int) -> "Model":
        """Build and compile a symmetric dense autoencoder.

        Layer widths shrink from ``input_dim // 2`` to ``input_dim // 8``
        (with floors of 8, 4 and 2) and then mirror back to ``input_dim``.

        Args:
            input_dim: Number of features per sample.

        Returns:
            A compiled Keras model trained with Adam on mean squared error.
        """
        encoding_dim = max(input_dim // 2, 8)
        hidden_dim = max(input_dim // 4, 4)
        bottleneck_dim = max(input_dim // 8, 2)

        input_layer = Input(shape=(input_dim,))
        encoded = Dense(encoding_dim, activation='relu')(input_layer)
        encoded = Dropout(0.2)(encoded)
        encoded = Dense(hidden_dim, activation='relu')(encoded)
        bottleneck = Dense(bottleneck_dim, activation='relu')(encoded)
        decoded = Dense(hidden_dim, activation='relu')(bottleneck)
        decoded = Dropout(0.2)(decoded)
        decoded = Dense(encoding_dim, activation='relu')(decoded)
        # The numeric inputs are standardised (zero mean, unit variance), so
        # they take negative values and values above 1. A sigmoid output is
        # confined to (0, 1) and could never reconstruct them; use a linear
        # output so the reconstruction error reflects the data, not the
        # activation's range.
        output_layer = Dense(input_dim, activation='linear')(decoded)

        autoencoder = Model(input_layer, output_layer)
        autoencoder.compile(optimizer='adam', loss='mse')
        return autoencoder

    @staticmethod
    def _categorize_ports(ports: pd.Series) -> pd.Series:
        """Label each port number with its port range, as plain strings.

        ``0-1023`` -> ``'system'``, ``1024-49151`` -> ``'user'``,
        ``49152-65535`` -> ``'dynamic'``. ``pd.cut`` bins are right-inclusive,
        so the first edge is ``-1`` to include port 0 and the second edge is
        ``1023`` so that port 1024 is classed as a user port. Values outside
        ``0-65535`` fall into no bin and end up as a missing-value string.
        """
        return pd.cut(
            ports,
            bins=[-1, 1023, 49151, 65535],
            labels=['system', 'user', 'dynamic']
        ).astype(str)

    def _preprocess_data(self, data: List[Dict]) -> Tuple[pd.DataFrame, np.ndarray]:
        """Turn raw connection records into a model-ready feature matrix.

        Frequency features are counted within the batch that is passed in.
        The scaler/one-hot preprocessor is fitted the first time this method
        runs (when ``self.preprocessor`` is ``None``) and reused afterwards.

        Args:
            data: Connection records (see the class docstring for the keys).

        Returns:
            ``(df, features)``: the enriched DataFrame and the dense,
            transformed feature array with one row per record.
        """
        # Convert to DataFrame
        df = pd.DataFrame(data)

        # Basic features
        df['timestamp'] = pd.to_datetime(df['timestamp'])
        df['hour'] = df['timestamp'].dt.hour
        df['minute'] = df['timestamp'].dt.minute

        # Connection frequencies
        df['app_freq'] = df.groupby('app_name')['app_name'].transform('count')
        df['remote_addr_freq'] = df.groupby('remote_addr')['remote_addr'].transform('count')
        df['port_freq'] = df.groupby('local_port')['local_port'].transform('count')

        # Port categories (as string to avoid categorical dtype issues)
        df['port_category'] = self._categorize_ports(df['local_port'])

        # Localhost flag (prefix match on the literal '127.0.0.1')
        df['is_localhost'] = df['remote_addr'].str.startswith('127.0.0.1').astype(int)

        # Calculate connection counts (simplified approach)
        df['connections_count'] = 1  # Start with 1 connection per row

        # Define features for the model
        numeric_features = [
            'local_port', 'hour', 'minute',
            'app_freq', 'remote_addr_freq', 'port_freq',
            'is_localhost', 'connections_count'
        ]
        categorical_features = ['proto', 'app_name', 'port_category']
        model_columns = numeric_features + categorical_features

        # Initialize preprocessor if needed
        if self.preprocessor is None:
            self.preprocessor = ColumnTransformer(
                transformers=[
                    ('num', StandardScaler(), numeric_features),
                    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
                ]
            )
            self.preprocessor.fit(df[model_columns])

        # Transform features
        features = self.preprocessor.transform(df[model_columns])

        # Convert to dense array if sparse
        if not isinstance(features, np.ndarray):
            features = features.toarray()

        # Store feature names
        if self.feature_names is None:
            # Get categorical feature names after one-hot encoding
            cat_encoder = self.preprocessor.named_transformers_['cat']
            cat_feature_names = [
                f"{feature}_{category}"
                for feature, categories in zip(categorical_features, cat_encoder.categories_)
                for category in categories
            ]
            self.feature_names = numeric_features + cat_feature_names

        return df, features

    def fit(self, data: List[Dict]) -> None:
        """Fit the preprocessor and every detector on ``data``.

        Each call is a fresh fit: the preprocessor, the cached feature names
        and the autoencoder are rebuilt, so a second ``fit`` on different data
        never reuses a scaler/encoder (or a network input width) learned from
        the earlier data.

        Args:
            data: Training connection records.

        Raises:
            Exception: Any error raised while preprocessing or training is
                logged and re-raised unchanged.
        """
        self.logger.info("Starting model training...")
        start_time = time.time()

        try:
            # Drop state from any previous fit so that _preprocess_data
            # refits the preprocessor on this data set.
            self.preprocessor = None
            self.feature_names = None

            # Preprocess data
            _, features = self._preprocess_data(data)

            # Train models
            self.logger.info("Training Isolation Forest...")
            self.models['isolation_forest'].fit(features)

            self.logger.info("Training Robust Covariance...")
            self.models['robust_covariance'].fit(features)

            # NOTE: with novelty=False this fit is not used by predict(),
            # which refits LOF on the batch being scored.
            self.logger.info("Training LOF...")
            self.models['lof'].fit(features)

            # Train Autoencoder (rebuilt so its input width matches the
            # freshly fitted preprocessor)
            self.logger.info("Training Autoencoder...")
            self.autoencoder = self._create_autoencoder(features.shape[1])

            self.autoencoder.fit(
                features,
                features,
                epochs=50,
                batch_size=32,
                validation_split=0.2,
                callbacks=[EarlyStopping(patience=3, restore_best_weights=True)],
                verbose=0
            )

            training_time = time.time() - start_time
            self.logger.info(f"Model training completed in {training_time:.2f} seconds")

        except Exception as e:
            self.logger.error(f"Error during training: {str(e)}")
            raise

    def predict(self, data: List[Dict], threshold_multiplier: float = 1.5) -> Dict:
        """Score ``data`` with every detector and combine the votes.

        A record is reported as an anomaly when at least half of the four
        detectors flag it AND its application or remote address occurs more
        often in ``data`` than the 75th percentile of the respective counts.

        Args:
            data: Connection records to score.
            threshold_multiplier: The autoencoder flags records whose
                reconstruction error exceeds ``mean + multiplier * std`` of
                the errors in this batch.

        Returns:
            A dict with ``anomaly_scores``, ``is_anomaly``,
            ``model_predictions``, ``reconstruction_error``,
            ``frequency_info`` and ``anomaly_details`` (plus
            ``feature_importance`` when the Isolation Forest exposes it).

        Raises:
            sklearn.exceptions.NotFittedError: If ``fit`` has not been called.
            Exception: Any other error is logged and re-raised unchanged.
        """
        self.logger.info("Starting anomaly detection...")

        try:
            # Fail fast instead of silently fitting the preprocessor on the
            # prediction batch and crashing later inside a detector.
            if self.preprocessor is None or self.autoencoder is None:
                from sklearn.exceptions import NotFittedError
                raise NotFittedError(
                    "NetworkAnomalyDetector is not fitted yet; "
                    "call fit() before predict()."
                )

            # Preprocess data
            df, features = self._preprocess_data(data)
            n_samples = features.shape[0]

            # Get predictions from each model (True == flagged as outlier)
            predictions = {}

            predictions['isolation_forest'] = (
                self.models['isolation_forest'].predict(features) == -1
            )

            predictions['robust_covariance'] = (
                self.models['robust_covariance'].predict(features) == -1
            )

            # LOF runs in outlier-detection mode (novelty=False), which has
            # no predict(); it is refitted on the batch being scored.
            predictions['lof'] = (
                self.models['lof'].fit_predict(features) == -1
            )

            # Autoencoder predictions
            reconstructed = self.autoencoder.predict(features)
            reconstruction_error = np.mean(np.power(features - reconstructed, 2), axis=1)
            threshold = np.mean(reconstruction_error) + threshold_multiplier * np.std(reconstruction_error)
            predictions['autoencoder'] = reconstruction_error > threshold

            # Combine predictions: share of detectors that flagged each row
            ensemble_scores = np.zeros(n_samples)
            for pred_array in predictions.values():
                ensemble_scores += pred_array.astype(int)

            anomaly_scores = ensemble_scores / len(predictions)

            # Get frequency information for filtering
            app_frequencies = df.groupby('app_name').size()
            remote_addr_frequencies = df.groupby('remote_addr').size()

            # Calculate frequency thresholds (e.g., above 75th percentile)
            app_freq_threshold = np.percentile(app_frequencies, 75)
            remote_freq_threshold = np.percentile(remote_addr_frequencies, 75)

            # Filter anomalies based on high frequency
            high_freq_mask = (
                (df.groupby('app_name')['app_name'].transform('count') > app_freq_threshold) |
                (df.groupby('remote_addr')['remote_addr'].transform('count') > remote_freq_threshold)
            ).values

            # Only mark as anomalies if both anomaly score is high AND frequency is high
            is_anomaly = (anomaly_scores >= 0.5) & high_freq_mask

            # Prepare results
            results = {
                'anomaly_scores': anomaly_scores,
                'is_anomaly': is_anomaly,
                'model_predictions': predictions,
                'reconstruction_error': reconstruction_error,
                'frequency_info': {
                    'app_frequencies': app_frequencies.to_dict(),
                    'remote_addr_frequencies': remote_addr_frequencies.to_dict(),
                    'app_threshold': app_freq_threshold,
                    'remote_threshold': remote_freq_threshold
                }
            }

            # Add feature importance if available
            # NOTE(review): IsolationForest does not appear to expose
            # feature_importances_, so this branch is normally skipped.
            if hasattr(self.models['isolation_forest'], 'feature_importances_'):
                results['feature_importance'] = dict(zip(
                    self.feature_names,
                    self.models['isolation_forest'].feature_importances_
                ))

            # Add anomaly details (loop variable deliberately not named
            # ``is_anomaly`` so the array above is not shadowed)
            anomaly_details = []
            for idx, flagged in enumerate(is_anomaly):
                if flagged:
                    anomaly_details.append({
                        'timestamp': data[idx]['timestamp'],
                        'app_name': data[idx]['app_name'],
                        'remote_addr': data[idx]['remote_addr'],
                        'local_port': data[idx]['local_port'],
                        'anomaly_score': anomaly_scores[idx],
                        'detection_methods': [
                            method for method, preds in predictions.items()
                            if preds[idx]
                        ]
                    })

            results['anomaly_details'] = anomaly_details
            self.logger.info(f"Detection completed. Found {len(anomaly_details)} anomalies.")
            return results

        except Exception as e:
            self.logger.error(f"Error during prediction: {str(e)}")
            raise

def format_results(results: Dict) -> str:
    """Render the dict returned by ``NetworkAnomalyDetector.predict`` as text.

    Args:
        results: Must contain ``anomaly_scores``, ``is_anomaly``,
            ``anomaly_details`` and (when there are anomaly details)
            ``frequency_info``; ``feature_importance`` is optional.

    Returns:
        A multi-line, human-readable report. Anomalies are listed from the
        highest to the lowest anomaly score.
    """
    output = ["=== Network Anomaly Detection Results ===\n"]

    total_connections = len(results['anomaly_scores'])
    total_anomalies = int(sum(results['is_anomaly']))
    # An empty batch has no meaningful rate; report 0% instead of dividing by zero.
    anomaly_rate = total_anomalies / total_connections if total_connections else 0.0

    output.append(f"Total connections analyzed: {total_connections}")
    output.append(f"Anomalies detected: {total_anomalies} ({anomaly_rate:.1%})")

    if results['anomaly_details']:
        frequency_info = results['frequency_info']
        app_frequencies = frequency_info['app_frequencies']
        remote_addr_frequencies = frequency_info['remote_addr_frequencies']

        output.append("\nDetailed Anomaly Information:")
        for detail in sorted(results['anomaly_details'],
                             key=lambda x: x['anomaly_score'],
                             reverse=True):
            output.append(f"\nTimestamp: {detail['timestamp']}")
            output.append(f"Application: {detail['app_name']}")
            output.append(f"Remote Address: {detail['remote_addr']}")
            output.append(f"Local Port: {detail['local_port']}")
            output.append(f"Anomaly Score: {detail['anomaly_score']:.3f}")
            output.append("Frequency Info:")
            output.append(f"  - App Frequency: {app_frequencies.get(detail['app_name'], 0)}")
            output.append(f"  - Remote Addr Frequency: {remote_addr_frequencies.get(detail['remote_addr'], 0)}")
            output.append(f"Detection Methods: {', '.join(detail['detection_methods'])}")

        output.append("\nFrequency Thresholds:")
        output.append(f"App Frequency Threshold: {frequency_info['app_threshold']:.0f}")
        output.append(f"Remote Address Frequency Threshold: {frequency_info['remote_threshold']:.0f}")

    if 'feature_importance' in results:
        output.append("\nTop Contributing Features:")
        # Show only the five most important features.
        sorted_features = sorted(
            results['feature_importance'].items(),
            key=lambda x: x[1],
            reverse=True
        )[:5]
        for feature, importance in sorted_features:
            output.append(f"{feature}: {importance:.3f}")

    return "\n".join(output)

In [2]:
# Sample connection records used to exercise the detector. Every record is a
# TCP connection stamped "2024-11-16 01:00:00"; only the addresses, the local
# port and the application differ, so they are listed as compact tuples of
# (local_addr, remote_addr, local_port, app_name) and expanded into dicts.
data_list = [
    {
        "local_addr": src_addr,
        "remote_addr": dst_addr,
        "local_port": src_port,
        "proto": "tcp",
        "app_name": application,
        "timestamp": "2024-11-16 01:00:00",
    }
    for src_addr, dst_addr, src_port, application in [
        ("127.0.0.1", "127.0.0.1", 49152, "TabNine-dee"),
        ("127.0.0.1", "127.0.0.1", 4452, "TabNine-dee"),
        ("127.0.0.1", "127.0.0.1", 4152, "TabNine-dee"),
        ("127.0.0.1", "127.0.0.1", 493, "TabNine-dee"),
        ("127.0.0.1", "127.0.0.1", 492, "TabNine-dee"),
        ("127.0.0.1", "127.0.0.1", 49152, "TabNine-dee"),
        ("127.0.0.1", "127.0.0.1", 49152, "TabNine-dee"),
        ("127.0.0.1", "127.0.0.1", 49152, "TabNine-dee"),
        ("10.105.8.94", "35.174.127.31", 57534, "firefox"),
        ("127.0.0.1", "127.0.0.1", 480, "TabNine"),
        ("10.105.8.94", "34.117.41.85", 39832, "warp-terminal"),
        ("10.105.8.94", "34.123.33.186", 39308, "TabNine"),
        ("10.105.8.94", "142.250.192.2", 40042, "brave"),
        ("127.0.0.1", "127.0.0.1", 49152, "TabNine-dee"),
        ("10.105.8.94", "34.170.65.59", 45770, "TabNine"),
        ("10.145.254.10", "34.96.117.51", 49608, "warp-terminal"),
        ("10.105.8.94", "142.250.77.19", 51310, "brave"),
        ("10.105.8.94", "13.89.178.26", 37338, "Code"),
        ("10.105.8.94", "34.197.157.88", 56172, "brave"),
        ("10.105.8.94", "13.89.178.26", 37338, "Code"),
        ("10.105.8.94", "34.197.157.88", 56172, "brave"),
        ("10.105.8.94", "13.89.178.26", 37338, "Code"),
        ("10.105.8.94", "34.197.157.88", 56172, "brave"),
        ("10.105.8.94", "13.89.178.26", 37338, "Code"),
        ("10.105.8.94", "34.197.157.88", 56172, "brave"),
        ("10.105.8.94", "140.82.112.25", 59970, "brave"),
        ("10.105.8.94", "34.117.41.85", 33412, "warp-terminal"),
        ("10.105.8.94", "142.250.207.2", 55382, "brave"),
        ("10.105.8.94", "142.250.207.2", 55382, "brave"),
        ("10.105.8.94", "142.250.207.2", 55382, "brave"),
        ("10.105.8.94", "142.250.207.2", 55382, "brave"),
        ("10.105.8.94", "142.250.207.2", 55382, "brave"),
        ("10.105.8.94", "142.250.207.2", 55382, "brave"),
        ("10.105.8.94", "142.250.207.2", 5245, "brave"),
        ("10.105.8.94", "142.250.207.2", 52, "brave"),
        ("10.105.8.94", "142.250.207.2", 582, "brave"),
        ("10.105.8.94", "142.250.207.2", 5382, "brave"),
    ]
]

In [3]:
# Build a detector with the constructor defaults (contamination=0.1, random_state=42).
n = NetworkAnomalyDetector()

In [4]:
# Train every model in the ensemble on the sample connection records.
n.fit(data_list)

2024-12-11 10:59:46,359 - INFO - Starting model training...
2024-12-11 10:59:46,386 - INFO - Training Isolation Forest...
2024-12-11 10:59:46,465 - INFO - Training Robust Covariance...
2024-12-11 10:59:46,475 - INFO - Training LOF...
2024-12-11 10:59:46,509 - INFO - Training Autoencoder...
2024-12-11 10:59:47,877 - INFO - Model training completed in 1.52 seconds


In [5]:
# Score the same records the detector was trained on (default threshold_multiplier=1.5).
results = n.predict(data_list)

2024-12-11 11:00:02,415 - INFO - Starting anomaly detection...


2/2 ━━━━━━━━━━━━━━━━━━━━ 0s 15ms/step


2024-12-11 11:00:02,508 - INFO - Detection completed. Found 1 anomalies.


In [7]:
# Inspect the per-record details of the connections reported as anomalies.
results["anomaly_details"]

[{'timestamp': '2024-11-16 01:00:00',
  'app_name': 'TabNine',
  'remote_addr': '127.0.0.1',
  'local_port': 480,
  'anomaly_score': 0.75,
  'detection_methods': ['isolation_forest', 'lof', 'autoencoder']}]