In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler, RobustScaler
from imblearn.combine import SMOTEENN
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (
    BatchNormalization, Conv1D, MaxPooling1D, GRU, Dense, 
    Dropout, Bidirectional, Flatten
)
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
import keras_tuner as kt
import kagglehub
from sklearn.feature_selection import SelectKBest, mutual_info_classif


class NetworkTrafficPipeline:
    def __init__(self):
        self.preprocessor = None
        self.model = None
        self.path = None
        
    def download_dataset(self):
        """Download dataset using kagglehub."""
        self.path = kagglehub.dataset_download("dhoogla/unswnb15")
        print("Dataset downloaded to:", self.path)
        return self.path
    
    def load_data(self):
        """Load training and testing datasets."""
        train_data = pd.read_parquet(f"{self.path}/UNSW_NB15_training-set.parquet")
        test_data = pd.read_parquet(f"{self.path}/UNSW_NB15_testing-set.parquet")
        return train_data, test_data
# Preprocessor Class
class DataAnalyzer:
    """
    A comprehensive class for exploratory data analysis and feature engineering insights.
    
    This class provides methods for:
    - Basic statistical analysis
    - Correlation detection
    - Outlier identification
    - Feature engineering suggestions and visualization
    - Data distribution analysis
    """
    
    def __init__(self, df):
        """
        Initialize the analyzer with a pandas DataFrame.
        
        Args:
            df (pd.DataFrame): Input dataset for analysis
        """
        self.df = df.copy()  # Create a copy to avoid modifying original data
        self.num_cols = self.df.select_dtypes(include=['int64', 'float64']).columns
        self.cat_cols = self.df.select_dtypes(include=['object', 'category', 'bool']).columns

    def basic_analysis(self):
        """
        Perform basic dataset analysis.
        
        Returns:
            dict: Contains missing values, numerical statistics, and categorical counts
        """
        analysis = {
            'missing_values': {
                'count': self.df.isnull().sum(),
                'percentage': (self.df.isnull().sum() / len(self.df) * 100).round(2)
            },
            'numerical_stats': self.df[self.num_cols].describe(),
            'categorical_stats': {
                col: {
                    'unique_values': self.df[col].nunique(),
                    'value_counts': self.df[col].value_counts(),
                    'missing_percentage': (self.df[col].isnull().sum() / len(self.df) * 100).round(2)
                } for col in self.cat_cols
            }
        }
        return analysis

    def identify_correlations(self, threshold=0.7):
        """
        Identify strong correlations between numerical features.
        
        Args:
            threshold (float): Correlation coefficient threshold (default: 0.7)
            
        Returns:
            pd.DataFrame: Pairs of highly correlated features
        """
        if len(self.num_cols) < 2:
            return pd.DataFrame()
            
        corr_matrix = self.df[self.num_cols].corr()
        high_corr = np.where(np.abs(corr_matrix) > threshold)
        high_corr_pairs = [(corr_matrix.index[x], corr_matrix.columns[y], corr_matrix.iloc[x, y])
                          for x, y in zip(*high_corr) if x != y and x < y]
        return pd.DataFrame(high_corr_pairs, columns=['Feature1', 'Feature2', 'Correlation'])

    def detect_outliers(self, cols=None, method='zscore', threshold=3):
        """
        Detect outliers using multiple methods.
        
        Args:
            cols (list): Specific columns to check (default: all numerical columns)
            method (str): 'zscore' or 'iqr' (default: 'zscore')
            threshold (float): Threshold for outlier detection (default: 3)
            
        Returns:
            dict: Outlier statistics for each analyzed column
        """
        if cols is None:
            cols = self.num_cols
            
        outliers = {}
        for col in cols:
            if method == 'zscore':
                z_scores = np.abs(stats.zscore(self.df[col].dropna()))
                outlier_mask = z_scores > threshold
            else:  # IQR method
                Q1 = self.df[col].quantile(0.25)
                Q3 = self.df[col].quantile(0.75)
                IQR = Q3 - Q1
                outlier_mask = ((self.df[col] < (Q1 - 1.5 * IQR)) | 
                              (self.df[col] > (Q3 + 1.5 * IQR)))
                
            outliers[col] = {
                'count': outlier_mask.sum(),
                'percentage': (outlier_mask.sum() / len(self.df[col].dropna()) * 100).round(2),
                'indexes': self.df[outlier_mask].index.tolist(),
                'range': {
                    'min': self.df[col][~outlier_mask].min(),
                    'max': self.df[col][~outlier_mask].max()
                }
            }
        return outliers

    def analyze_class_distribution(self, data, plot=True):
        """
        Analyze the distribution of classes in a dataset.
        
        Args:
            data (pd.DataFrame): Dataset 
            plot (bool): If True, display a bar plot of class distribution (default: True)
            
        Returns:
            pd.Series: Contagem de cada classe
        """
        class_distribution = data['attack_cat'].value_counts()
        
        if plot:
            plt.figure(figsize=(12, 6))
            sns.barplot(x=class_distribution.index, y=class_distribution.values)
            plt.xticks(rotation=45)
            plt.title('Attack Category Distribution')
            plt.xlabel('Attack Category')
            plt.ylabel('Count')
            plt.tight_layout()
            plt.show()
            
            # Gráfico de pizza para proporção normal vs. ataque
            plt.figure(figsize=(8, 8))
            attack_prop = data['label'].value_counts()
            plt.pie(attack_prop, labels=['Normal', 'Attack'], autopct='%1.1f%%')
            plt.title('Normal vs. Attack Traffic Distribution')
            plt.show()
        
        return class_distribution


class Preprocessor:
    def __init__(self, train_data, test_data):
        self.train_data = train_data.copy()
        self.test_data = test_data.copy()
        self.scalers = {
            'standard': StandardScaler(),
            'robust': RobustScaler()
        }
        self.label_encoder = {}
        self.categorical_columns = ['proto', 'service', 'state']
        self.selected_features = None
        self.feature_selector = None

    def add_engineered_features(self, data):
        """Add engineered features to improve model performance."""
        # Packet-based features
        data['bytes_per_packet'] = data['sbytes'] / (data['spkts'] + 1e-8)
        data['packet_size_ratio'] = (data['sbytes'] + 1e-8) / (data['dbytes'] + 1e-8)
        
        # Time-based features
        data['bytes_per_second'] = (data['sbytes'] + data['dbytes']) / (data['dur'] + 1e-8)
        data['packets_per_second'] = (data['spkts'] + data['dpkts']) / (data['dur'] + 1e-8)
        
        # Rate-based features
        data['srate'] = data['spkts'] / (data['dur'] + 1e-8)
        data['drate'] = data['dpkts'] / (data['dur'] + 1e-8)
        
        # Statistical features
        data['byte_ratio'] = np.log1p(data['sbytes']) - np.log1p(data['dbytes'])
        data['packet_ratio'] = np.log1p(data['spkts']) - np.log1p(data['dpkts'])
        
        # Interaction features
        data['sload'] = (data['sbytes'] * data['srate']) / (1e-8 + data['dur'])
        data['dload'] = (data['dbytes'] * data['drate']) / (1e-8 + data['dur'])
        
        return data

    def handle_outliers(self, data, columns, method='iqr'):
        """Handle outliers using IQR method."""
        for column in columns:
            Q1 = data[column].quantile(0.25)
            Q3 = data[column].quantile(0.75)
            IQR = Q3 - Q1
            lower_bound = Q1 - 1.5 * IQR
            upper_bound = Q3 + 1.5 * IQR
            data[column] = data[column].clip(lower_bound, upper_bound)
        return data

    def select_features(self, X, y, n_features=24):
        """Select most important features using mutual information."""
        if self.feature_selector is None:
            self.feature_selector = SelectKBest(score_func=mutual_info_classif, k=n_features)
            self.feature_selector.fit(X, y)
            self.selected_features = X.columns[self.feature_selector.get_support()].tolist()
        return X[self.selected_features]

    def balance_classes(self, X, y):
        """Advanced class balancing using combination of techniques."""
        # Remove noise using Tomek links
        tl = TomekLinks(sampling_strategy='majority')
        X_cleaned, y_cleaned = tl.fit_resample(X, y)
        
        # Apply ADASYN for minority classes
        minority_classes = [cls for cls, count in pd.Series(y_cleaned).value_counts().items() 
                          if count < len(y_cleaned) * 0.1]
        
        if minority_classes:
            sampling_strategy = {cls: int(len(y_cleaned) * 0.1) for cls in minority_classes}
            adasyn = ADASYN(sampling_strategy=sampling_strategy, random_state=42)
            X_balanced, y_balanced = adasyn.fit_resample(X_cleaned, y_cleaned)
        else:
            X_balanced, y_balanced = X_cleaned, y_cleaned
        
        return X_balanced, y_balanced

    def train_set_preprocess(self):
        """Preprocess training data."""
        # Remove unnecessary columns
        columns_to_drop = ['swin', 'stcpb', 'dtcpb', 'dwin', 'is_sm_ips_ports', 'ct_flw_http_mthd']
        self.train_data.drop(columns=columns_to_drop, errors='ignore', inplace=True)
        
        # Add engineered features
        self.train_data = self.add_engineered_features(self.train_data)
        
        # Handle categorical features
        for col in self.categorical_columns:
            self.label_encoder[col] = LabelEncoder()
            self.train_data[col] = self.label_encoder[col].fit_transform(self.train_data[col])
        
        # Separate features and target
        X = self.train_data.drop(['attack_cat', 'label'], axis=1)
        y = self.train_data['attack_cat']
        
        # Handle numerical features
        numerical_columns = X.select_dtypes(include=['float64', 'int64']).columns
        X = self.handle_outliers(X, numerical_columns)
        
        # Encode target
        self.label_encoder['attack_cat'] = LabelEncoder()
        y = self.label_encoder['attack_cat'].fit_transform(y)
        
        # Feature selection
        X = self.select_features(X, y)
        
        # Scale features
        X = pd.DataFrame(self.scalers['robust'].fit_transform(X), columns=X.columns)
        
        return X, y

    def test_set_preprocess(self):
        """Preprocess test data."""
        self.test_data.drop(columns=['swin', 'stcpb', 'dtcpb', 'dwin', 'is_sm_ips_ports', 'ct_flw_http_mthd'], 
                           errors='ignore', inplace=True)
        
        self.test_data = self.add_engineered_features(self.test_data)
        
        for col in self.categorical_columns:
            known_classes = set(self.label_encoder[col].classes_)
            self.test_data[col] = self.test_data[col].map(lambda x: x if x in known_classes else list(known_classes)[0])
            self.test_data[col] = self.label_encoder[col].transform(self.test_data[col])
        
        X = self.test_data.drop(['attack_cat', 'label'], axis=1)
        y = self.test_data['attack_cat']
        
        numerical_columns = X.select_dtypes(include=['float64', 'int64']).columns
        X = self.handle_outliers(X, numerical_columns)
        
        known_classes = set(self.label_encoder['attack_cat'].classes_)
        y = y.map(lambda x: x if x in known_classes else list(known_classes)[0])
        y = self.label_encoder['attack_cat'].transform(y)
        
        X = X[self.selected_features]
        X = pd.DataFrame(self.scalers['robust'].transform(X), columns=X.columns)
        
        return X, y

# CNN-GRU Model with AutoML
class CNNGRUAutoML:
    def __init__(self, input_shape, num_classes, sequence_length=10):
        self.input_shape = input_shape
        self.num_classes = num_classes
        self.sequence_length = sequence_length
    
    def build_model(self, hp):
        model = Sequential()
        
        # Convolutional layers
        for i in range(hp.Int("conv_blocks", 1, 3)):
            model.add(Conv1D(
                filters=hp.Choice(f"filters_{i}", [32, 64, 128]),
                kernel_size=hp.Choice(f"kernel_size_{i}", [3, 5]),
                activation="relu",
                padding="same"))
            model.add(MaxPooling1D(pool_size=2))
        
        # GRU layers
        for i in range(hp.Int("gru_blocks", 1, 3)):
            model.add(Bidirectional(GRU(
                units=hp.Choice(f"gru_units_{i}", [64, 128, 256]),
                return_sequences=True if i < hp.Int("gru_blocks", 1, 3) - 1 else False)))
            model.add(Dropout(hp.Float(f"dropout_gru_{i}", 0.1, 0.5, step=0.1)))
        
        # Dense layers
        model.add(Dense(hp.Choice("dense_units", [64, 128, 256]), activation="relu"))
        model.add(Dropout(hp.Float("dropout_dense", 0.1, 0.5, step=0.1)))
        model.add(Dense(self.num_classes, activation="softmax"))
        
        # Compile
        model.compile(
            optimizer=Adam(learning_rate=hp.Choice("learning_rate", [1e-3, 1e-4])),
            loss="sparse_categorical_crossentropy",
            metrics=["accuracy"])
        
        return model

    def search_hyperparameters(self, X_train, y_train, X_val, y_val, max_trials=10):
        tuner = kt.Hyperband(
            self.build_model,
            objective="val_accuracy",
            max_epochs=50,
            directory="automl_cnn_gru",
            project_name="unsw_nb15")
        
        tuner.search(X_train, y_train, validation_data=(X_val, y_val), epochs=50, batch_size=32)
        best_model = tuner.get_best_models(num_models=1)[0]
        return best_model

if __name__ == "__main__":
    # Instanciar o pipeline
    pipeline = NetworkTrafficPipeline()
    
    # Baixar o dataset
    path = pipeline.download_dataset()
    
    # Carregar os dados
    train_data, test_data = pipeline.load_data()
    
    # Preprocessamento
    preprocessor = Preprocessor(train_data, test_data)
    X_train, y_train = preprocessor.train_set_preprocess()
    X_test, y_test = preprocessor.test_set_preprocess()
    
    # Preparar dados sequenciais
    def prepare_sequence_data(X, sequence_length):
        pad_size = sequence_length - 1
        X_padded = np.pad(X, ((pad_size, 0), (0, 0)), mode='edge')
        sequences = [X_padded[i:i + sequence_length] for i in range(len(X))]
        return np.array(sequences)
    
    sequence_length = 10
    X_train_seq = prepare_sequence_data(X_train, sequence_length)
    X_test_seq = prepare_sequence_data(X_test, sequence_length)
    
    # Divisão treino-validação
    from sklearn.model_selection import train_test_split
    X_train_seq, X_val_seq, y_train, y_val = train_test_split(X_train_seq, y_train, test_size=0.2, random_state=42)
    
    # AutoML com CNN-GRU
    cnn_gru_automl = CNNGRUAutoML(input_shape=X_train_seq.shape[2], num_classes=len(np.unique(y_train)))
    best_model = cnn_gru_automl.search_hyperparameters(X_train_seq, y_train, X_val_seq, y_val)
    
    # Avaliação nos dados de teste
    test_loss, test_accuracy = best_model.evaluate(X_test_seq, y_test)
    print(f"Test Loss: {test_loss}, Test Accuracy: {test_accuracy}")

Trial 54 Complete [00h 07m 15s]
val_accuracy: 0.7976560592651367

Best val_accuracy So Far: 0.825686514377594
Total elapsed time: 16h 38m 27s

Search: Running Trial #55

Value             |Best Value So Far |Hyperparameter
1                 |1                 |conv_blocks
128               |32                |filters_0
3                 |5                 |kernel_size_0
2                 |1                 |gru_blocks
64                |128               |gru_units_0
0.2               |0.5               |dropout_gru_0
256               |64                |dense_units
0.1               |0.2               |dropout_dense
0.0001            |0.001             |learning_rate
256               |64                |gru_units_1
0.3               |0.1               |dropout_gru_1
256               |64                |gru_units_2
0.1               |0.2               |dropout_gru_2
128               |32                |filters_1
3                 |3                 |kernel_size_1
128               

KeyboardInterrupt: 