In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
import kagglehub
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder, RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import TomekLinks
from imblearn.over_sampling import ADASYN
from sklearn.feature_selection import SelectKBest, mutual_info_classif
import warnings
warnings.filterwarnings('ignore')
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.regularizers import l1_l2
from sklearn.ensemble import RandomForestClassifier
from scipy import stats
from tensorflow.keras.layers import Dense, GRU, Dropout, BatchNormalization, Conv1D, Bidirectional
from tensorflow.keras.models import load_model
from typing import Dict, List, Optional, Tuple
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from typing import Dict, List, Optional, Tuple
from sklearn.metrics import classification_report, confusion_matrix
from scipy import stats
import warnings
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (
    Input, Conv1D, BatchNormalization, Activation, SpatialDropout1D, Add,
    GRU, Concatenate, GlobalAveragePooling1D, GlobalMaxPooling1D, Dense,
    Dropout, LayerNormalization
)
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import (
    EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
)
from tensorflow.keras.optimizers.schedules import ExponentialDecay
from tensorflow.keras.regularizers import l2
from tensorflow.keras.layers import MultiHeadAttention
import tensorflow as tf
import xgboost as xgb
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight


# Seed NumPy's and TensorFlow's global random generators with the same value
# so that stochastic steps further down start from a fixed state.
seed_value = 42
np.random.seed(seed_value)
tf.random.set_seed(seed_value)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
class NetworkTrafficPipeline:
    """Download the UNSW-NB15 parquet files with kagglehub and load them.

    Attributes:
        preprocessor: Initialised to None; not used by the methods of this class.
        model: Initialised to None; not used by the methods of this class.
        path: Directory returned by ``kagglehub.dataset_download``; None until
            ``download_dataset`` has been called.
    """

    def __init__(self):
        self.preprocessor = None
        self.model = None
        self.path = None

    def download_dataset(self):
        """Download the "dhoogla/unswnb15" dataset using kagglehub.

        Returns:
            The local dataset directory, which is also stored on ``self.path``.
        """
        self.path = kagglehub.dataset_download("dhoogla/unswnb15")
        print("Dataset downloaded to:", self.path)
        return self.path

    def load_data(self):
        """Load the training and testing parquet files from ``self.path``.

        Returns:
            Tuple ``(train_data, test_data)`` of DataFrames.

        Raises:
            FileNotFoundError: If no dataset path has been set yet, i.e.
                ``download_dataset`` was not called first.
        """
        import os

        # Without this guard the path would silently become "None/..." and the
        # resulting error would not point at the real cause.
        if self.path is None:
            raise FileNotFoundError(
                "Dataset path is not set; call download_dataset() first."
            )

        train_data = pd.read_parquet(
            os.path.join(self.path, "UNSW_NB15_training-set.parquet")
        )
        test_data = pd.read_parquet(
            os.path.join(self.path, "UNSW_NB15_testing-set.parquet")
        )
        return train_data, test_data

In [4]:
# Create the pipeline helper (no I/O happens in the constructor).
pipeline = NetworkTrafficPipeline()
    
# Fetch the dataset via kagglehub, then read the train/test parquet files
# from the returned directory. load_data() relies on the path set by
# download_dataset(), so the order of these two calls matters.
path = pipeline.download_dataset()
train_data, test_data = pipeline.load_data()

Dataset downloaded to: C:\Users\Administrator\.cache\kagglehub\datasets\dhoogla\unswnb15\versions\5


In [5]:
from typing import Dict, List, Tuple
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, RobustScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from imblearn.under_sampling import TomekLinks
from imblearn.over_sampling import SMOTENC
from imblearn.over_sampling import SMOTE

class UNSWPreprocessor:
    """Preprocess the UNSW-NB15 train/test frames for multi-class modelling.

    The pipeline implemented by ``preprocess_data`` drops a fixed set of
    columns, adds engineered features, label-encodes the categorical columns,
    clips outliers, merges rare attack classes, splits off a validation set,
    selects features, scales them and finally rebalances the training split.

    Args:
        data: Training frame; copied, the caller's frame is not modified.
        test_data: Test frame; copied, the caller's frame is not modified.
        val_size: Fraction of the training frame held out for validation.
    """

    def __init__(self, data: pd.DataFrame, test_data: pd.DataFrame, val_size: float = 0.2):
        self.data = data.copy()
        self.test_data = test_data.copy()
        self.val_size = val_size
        self.scalers = {
            'robust': RobustScaler(),
            'standard': StandardScaler()
        }
        # One fitted LabelEncoder per encoded column (including 'attack_cat').
        self.label_encoder: Dict[str, LabelEncoder] = {}
        self.categorical_columns = ['proto', 'service', 'state']
        self.selected_features = None
        self.feature_selector = None
        # Classes whose relative frequency is below this are merged.
        self.merge_threshold = 0.05
        self.merged_classes_mapping = None
        # Per-column {'Q1': ..., 'Q3': ...} computed from the training frame.
        self.bounds = {}

        # Correlation analysis thresholds
        self.high_corr_threshold = 0.9
        self.correlation_pairs = [
            ('sbytes', 'spkts'), ('sbytes', 'sloss'),
            ('dbytes', 'dpkts'), ('dbytes', 'dloss'),
            ('dwin', 'swin'), ('synack', 'tcprtt'),
            ('ct_dst_sport_ltm', 'ct_src_dport_ltm')
        ]

        # Important features based on correlation with target
        self.important_features = [
            'dload', 'ct_dst_sport_ltm', 'dmean', 'rate',
            'ct_src_dport_ltm', 'is_sm_ips_ports'
        ]

    def handle_unknown_labels(self, data: pd.DataFrame, col: str) -> np.ndarray:
        """Encode ``data[col]`` with the fitted encoder, mapping unseen values to -1.

        The code of a known label is its position in the encoder's ``classes_``
        array, which is what ``LabelEncoder.transform`` returns. A dict lookup
        is used so the whole column is encoded in a single pass instead of one
        ``transform`` call per row.

        Args:
            data: Frame containing column ``col``.
            col: Name of a column that already has an entry in
                ``self.label_encoder``.

        Returns:
            int64 array of codes with the same length as ``data``.
        """
        known_classes = self.label_encoder[col].classes_
        code_by_label = {label: code for code, label in enumerate(known_classes)}
        return np.array(
            [code_by_label.get(value, -1) for value in data[col]],
            dtype=np.int64,
        )

    def split_train_val(self, X: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]:
        """Stratified train/validation split using ``self.val_size`` and a fixed seed.

        Returns:
            ``(X_train, X_val, y_train, y_val)`` as produced by
            ``train_test_split``.
        """
        return train_test_split(
            X, y,
            test_size=self.val_size,
            stratify=y,
            random_state=42
        )

    def add_engineered_features(self, data: pd.DataFrame) -> pd.DataFrame:
        """Add ratio/rate/difference features to ``data`` in place.

        Note that the existing 'sload' and 'dload' columns, if any, are
        overwritten by the values computed here. ``eps`` guards every division
        against a zero denominator.

        Args:
            data: Frame to extend. It is modified in place.

        Returns:
            The same frame object that was passed in.
        """
        eps = 1e-8

        # Basic rate and ratio features
        data['bytes_per_packet'] = (data['sbytes'] + data['dbytes']) / (data['spkts'] + data['dpkts'] + eps)
        data['bytes_ratio'] = np.log1p(data['sbytes']) - np.log1p(data['dbytes'])
        data['packets_ratio'] = np.log1p(data['spkts']) - np.log1p(data['dpkts'])
        data['packet_rate'] = (data['spkts'] + data['dpkts']) / (data['dur'] + eps)
        data['byte_rate'] = (data['sbytes'] + data['dbytes']) / (data['dur'] + eps)

        # Source and destination specific features
        data['srate'] = data['spkts'] / (data['dur'] + eps)
        data['drate'] = data['dpkts'] / (data['dur'] + eps)
        data['sload'] = (data['sbytes'] * data['srate']) / (eps + data['dur'])
        data['dload'] = (data['dbytes'] * data['drate']) / (eps + data['dur'])

        # Loss-based features
        data['total_loss_ratio'] = (data['sloss'] + data['dloss']) / (data['spkts'] + data['dpkts'] + eps)
        data['loss_ratio_diff'] = (data['sloss'] / (data['spkts'] + eps)) - (data['dloss'] / (data['dpkts'] + eps))

        # Connection-based features
        if 'ct_state_ttl' in data.columns:
            data['conn_ratio'] = data['ct_src_dport_ltm'] / (data['ct_dst_sport_ltm'] + eps)
            data['state_ratio'] = data['ct_state_ttl'] / (data['ct_dst_ltm'] + eps)
            data['ct_dst_src_ratio'] = data['ct_dst_ltm'] / (data['ct_src_dport_ltm'] + eps)

        # Statistical features
        data['mean_diff'] = data['smean'] - data['dmean']
        data['jitter_ratio'] = np.log1p(data['sjit']) - np.log1p(data['djit'])

        # Complexity features
        if 'swin' in data.columns and 'dwin' in data.columns:
            data['protocol_complexity'] = data['swin'] * data['dwin']
        if 'tcprtt' in data.columns and 'synack' in data.columns:
            data['tcp_behaviour'] = (data['tcprtt'] * data['synack']) / (data['dur'] + eps)

        return data

    def handle_outliers(self, data: pd.DataFrame, columns: List[str], is_test: bool = False) -> pd.DataFrame:
        """Clip ``columns`` to IQR-based bounds taken from ``self.bounds``.

        Args:
            data: Frame to clip; a copy is returned, the input is untouched.
            columns: Columns to clip. Each must have an entry in
                ``self.bounds``.
            is_test: Use the wider factor (2.0 instead of 1.5) and widen the
                bounds to the column's own min/max.

        Returns:
            Clipped copy of ``data``.
        """
        data_clean = data.copy()
        iqr_factor = 2.0 if is_test else 1.5

        for column in columns:
            Q1 = self.bounds[column]['Q1']
            Q3 = self.bounds[column]['Q3']
            IQR = Q3 - Q1
            lower_bound = Q1 - iqr_factor * IQR
            upper_bound = Q3 + iqr_factor * IQR

            if is_test:
                # NOTE(review): widening to the column's own extremes means the
                # clip below cannot change any test value. Confirm that leaving
                # test data unclipped is the intent.
                lower_bound = min(lower_bound, data_clean[column].min())
                upper_bound = max(upper_bound, data_clean[column].max())

            data_clean[column] = data_clean[column].clip(lower_bound, upper_bound)

        return data_clean

    def create_class_mapping(self, y: pd.Series) -> Dict[str, str]:
        """Map every class rarer than ``self.merge_threshold`` to 'other_attack'."""
        class_counts = y.value_counts(normalize=True)
        classes_to_merge = class_counts[class_counts < self.merge_threshold].index
        return {cls: 'other_attack' for cls in classes_to_merge}

    def merge_minority_classes(self, y: pd.Series) -> pd.Series:
        """Replace rare classes in ``y`` using a mapping built on the first call.

        The mapping is cached on ``self.merged_classes_mapping`` so later calls
        (e.g. for the test labels) reuse the mapping derived from the first
        series passed in.
        """
        if self.merged_classes_mapping is None:
            self.merged_classes_mapping = self.create_class_mapping(y)
        return y.replace(self.merged_classes_mapping)

    def balance_classes(self, X: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.Series]:
        """
        Balance classes using a combination of undersampling and oversampling.
        Automatically switches between SMOTE and SMOTENC based on feature types.

        Tomek links are removed from the majority class first; the cleaned data
        is then oversampled. SMOTENC is used when any of
        ``self.categorical_columns`` is present in ``X``, otherwise SMOTE.

        Returns:
            ``(X_resampled, y_resampled)``.
        """
        # First use Tomek Links to remove majority class noise
        tl = TomekLinks(sampling_strategy='majority')
        X_cleaned, y_cleaned = tl.fit_resample(X, y)

        # Positional indices of the categorical columns that survived selection
        categorical_features = [X.columns.get_loc(col) for col in self.categorical_columns
                                if col in X.columns]

        if categorical_features:
            # Use SMOTENC if we have categorical features
            smote = SMOTENC(
                categorical_features=categorical_features,
                sampling_strategy='auto',
                random_state=42
            )
        else:
            # Use regular SMOTE if we only have numerical features
            smote = SMOTE(
                sampling_strategy='auto',
                random_state=42
            )

        return smote.fit_resample(X_cleaned, y_cleaned)

    def select_features(self, X: pd.DataFrame, y: pd.Series, n_features: int = None) -> pd.DataFrame:
        """
        Select features while ensuring categorical features are preserved if needed.

        On the first call the selection is fitted: listed correlation pairs
        above ``self.high_corr_threshold`` lose one member, then the remaining
        non-categorical columns are ranked by mutual information. The chosen
        column names are stored in ``self.selected_features``; later calls only
        apply that stored selection.

        Args:
            X: Feature frame.
            y: Target aligned with ``X`` by index.
            n_features: Total number of columns to keep. Defaults to
                ``2 * sqrt(n_columns)``, but never fewer than the number of
                categorical columns present.

        Returns:
            ``X`` restricted to the selected columns.
        """
        if self.feature_selector is None:
            # Categorical columns are always kept so SMOTENC can be used later
            categorical_cols_to_keep = [col for col in self.categorical_columns if col in X.columns]

            # Dynamic feature selection based on data dimensionality
            if n_features is None:
                n_features = min(int(np.sqrt(len(X.columns)) * 2), len(X.columns))
                # Ensure we have space for categorical features
                n_features = max(n_features, len(categorical_cols_to_keep))

            # Remove highly correlated features
            features_to_drop = []
            corr_matrix = X.corr().abs()

            for first, second in self.correlation_pairs:
                if first not in X.columns or second not in X.columns:
                    continue
                # Written as "not (>)" so a NaN correlation never triggers a drop
                if not (corr_matrix.loc[first, second] > self.high_corr_threshold):
                    continue

                if first not in categorical_cols_to_keep and first not in self.important_features:
                    candidate = first
                elif second not in categorical_cols_to_keep:
                    candidate = second
                else:
                    continue

                # The same column can be nominated by several pairs
                if candidate not in features_to_drop:
                    features_to_drop.append(candidate)

            X = X.drop(columns=features_to_drop)

            # Sample for large datasets
            if len(X) > 50000:
                X_sample = X.sample(n=50000, random_state=42)
                y_sample = y.loc[X_sample.index]
            else:
                X_sample, y_sample = X, y

            # Select features while preserving categorical columns
            non_categorical_cols = [col for col in X.columns if col not in categorical_cols_to_keep]
            # SelectKBest rejects k larger than the number of candidate columns
            n_features_to_select = min(
                n_features - len(categorical_cols_to_keep),
                len(non_categorical_cols),
            )

            if n_features_to_select > 0 and non_categorical_cols:
                self.feature_selector = SelectKBest(
                    # Seeded so the ranking is the same on every run
                    score_func=lambda X, y: mutual_info_classif(X, y, n_jobs=-1, random_state=42),
                    k=n_features_to_select
                )
                # Fit only on non-categorical features
                X_non_cat = X_sample[non_categorical_cols]
                self.feature_selector.fit(X_non_cat, y_sample)

                # Get selected features
                selected_non_cat = X_non_cat.columns[self.feature_selector.get_support()].tolist()
                self.selected_features = categorical_cols_to_keep + selected_non_cat
            else:
                self.selected_features = X.columns.tolist()

        return X[self.selected_features]

    def _numeric_feature_columns(self, X: pd.DataFrame) -> List[str]:
        """Return the float64/int64 columns of ``X`` that are not categorical codes.

        Label-encoded categorical columns are integer typed but their values
        are arbitrary codes, so they must not take part in IQR clipping.
        """
        numeric = X.select_dtypes(include=['float64', 'int64']).columns
        return [col for col in numeric if col not in self.categorical_columns]

    def preprocess_data(self) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.Series, pd.Series, pd.Series]:
        """Run the full preprocessing pipeline on the stored frames.

        Returns:
            ``(X_train_balanced, X_val, X_test, y_train_balanced, y_val, y_test)``.
            ``y_test`` is a NumPy array of encoded labels in which classes not
            seen during training are -1.
        """
        # Remove redundant or low variance columns
        columns_to_drop = ['swin', 'stcpb', 'dtcpb', 'dwin', 'is_sm_ips_ports',
                           'ct_flw_http_mthd', 'label']

        for frame in (self.data, self.test_data):
            present = [col for col in columns_to_drop if col in frame.columns]
            frame.drop(columns=present, inplace=True)
            # Mutates the frame in place, so the stored frames are updated
            self.add_engineered_features(frame)

        # Handle categorical features
        for col in self.categorical_columns:
            if col in self.data.columns:
                self.label_encoder[col] = LabelEncoder()
                self.data[col] = self.label_encoder[col].fit_transform(self.data[col])
                if col in self.test_data.columns:
                    self.test_data[col] = self.handle_unknown_labels(self.test_data, col)

        X = self.data.drop(['attack_cat'], axis=1)
        y = self.data['attack_cat']
        X_test = self.test_data.drop(['attack_cat'], axis=1)
        y_test = self.test_data['attack_cat']

        # Calculate bounds for outlier handling (categorical codes excluded)
        numerical_columns = self._numeric_feature_columns(X)
        for column in numerical_columns:
            Q1 = X[column].quantile(0.25)
            Q3 = X[column].quantile(0.75)
            self.bounds[column] = {'Q1': Q1, 'Q3': Q3}

        # Handle outliers
        X = self.handle_outliers(X, numerical_columns)
        X_test = self.handle_outliers(X_test, numerical_columns, is_test=True)

        # Process target variable
        y = self.merge_minority_classes(y)
        y_test = self.merge_minority_classes(y_test)
        self.label_encoder['attack_cat'] = LabelEncoder()
        y = pd.Series(self.label_encoder['attack_cat'].fit_transform(y), index=y.index)
        y_test = self.handle_unknown_labels(pd.DataFrame({'attack_cat': y_test}), 'attack_cat')

        # Split data
        X_train, X_val, y_train, y_val = self.split_train_val(X, y)

        # Select features; copies make the in-place scaling below unambiguous
        X_train = self.select_features(X_train, y_train).copy()
        X_val = X_val[self.selected_features].copy()
        X_test = X_test[self.selected_features].copy()

        # Scale features
        for scaler_name, scaler in self.scalers.items():
            if scaler_name == 'robust':
                cols_to_scale = [col for col in X_train.columns if col not in self.categorical_columns]
            else:  # standard scaler for important features
                cols_to_scale = [col for col in X_train.columns if col in self.important_features]

            if cols_to_scale:
                X_train[cols_to_scale] = scaler.fit_transform(X_train[cols_to_scale])
                X_val[cols_to_scale] = scaler.transform(X_val[cols_to_scale])
                X_test[cols_to_scale] = scaler.transform(X_test[cols_to_scale])

        # Balance classes
        X_train_balanced, y_train_balanced = self.balance_classes(X_train, y_train)

        return X_train_balanced, X_val, X_test, y_train_balanced, y_val, y_test

In [6]:
# Build the preprocessor on copies of the loaded frames (20% of the training
# frame is held out for validation) and run the whole pipeline. Only the
# training split is returned rebalanced; validation and test are not resampled.
preprocessor = UNSWPreprocessor(train_data, test_data, val_size=0.2)
X_train, X_val, X_test, y_train, y_val, y_test = preprocessor.preprocess_data()

In [13]:
import tensorflow as tf
from tensorflow.keras import layers, Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from keras_tuner import HyperModel, RandomSearch
import xgboost as xgb
import numpy as np
from sklearn.metrics import f1_score
import optuna

class AutoMLEnsembleHyperModel(HyperModel):
    """KerasTuner hypermodel for a Conv/GRU/attention sequence classifier.

    The network is: residual Conv1D blocks -> GRU block -> multi-scale Conv1D
    block -> self-attention block -> global average pooling -> dense head ->
    softmax over ``num_classes``.

    Args:
        input_shape: Number of features per time step (an int, not a tuple).
        num_classes: Size of the softmax output.
        sequence_length: Number of time steps in each input sequence.
    """

    def __init__(self, input_shape, num_classes, sequence_length):
        # Let the KerasTuner base class set up its own attributes first.
        super().__init__()
        self.input_shape = input_shape
        self.num_classes = num_classes
        self.sequence_length = sequence_length

    def build(self, hp):
        """Build and compile one model from the hyperparameters in ``hp``.

        Args:
            hp: KerasTuner ``HyperParameters`` object.

        Returns:
            A compiled Keras model using sparse categorical cross-entropy.
        """
        inputs = tf.keras.Input(shape=(self.sequence_length, self.input_shape))

        # Hyperparameters for architecture
        base_filters = hp.Int('base_filters', 32, 128, step=32)
        num_residual_blocks = hp.Int('num_residual_blocks', 1, 3)
        num_attention_heads = hp.Int('num_attention_heads', 4, 16, step=4)
        dense_units = hp.Int('dense_units', 64, 512, step=64)
        dropout_rate = hp.Float('dropout_rate', 0.1, 0.5, step=0.1)

        # 1. Residual Blocks
        x = inputs
        for _ in range(num_residual_blocks):
            x = self._create_residual_block(x, base_filters, hp)

        # 2. Temporal Block
        x = self._create_temporal_block(x, base_filters, hp)

        # 3. Multi-Scale Block
        x = self._create_multi_scale_block(x, base_filters, hp)

        # 4. Attention Block
        x = self._create_attention_block(x, base_filters, num_attention_heads)

        # 5. Global Pooling
        x = layers.GlobalAveragePooling1D()(x)

        # 6. Dense Layers: each layer is half as wide as the previous one
        num_dense_layers = hp.Int('num_dense_layers', 1, 3)
        for i in range(num_dense_layers):
            x = layers.Dense(dense_units // (2**i))(x)
            x = layers.BatchNormalization()(x)
            x = layers.Activation(hp.Choice(f'dense_activation_{i}', ['relu', 'swish']))(x)
            x = layers.Dropout(dropout_rate)(x)

        outputs = layers.Dense(self.num_classes, activation='softmax')(x)

        model = Model(inputs, outputs)

        # Compile model
        learning_rate = hp.Float('learning_rate', 1e-4, 1e-2, sampling='log')
        model.compile(
            optimizer=Adam(learning_rate=learning_rate),
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy']
        )

        return model

    def _create_residual_block(self, x, filters, hp):
        """Conv1D -> BatchNorm -> activation -> dropout, added to a shortcut.

        The hyperparameter names are the same on every call, so all residual
        blocks of one model share kernel size and activation.
        """
        kernel_size = hp.Int('res_kernel_size', 3, 7, step=2)
        activation = hp.Choice('res_activation', ['relu', 'swish'])

        shortcut = x
        x = layers.Conv1D(filters, kernel_size, padding='same')(x)
        x = layers.BatchNormalization()(x)
        x = layers.Activation(activation)(x)
        x = layers.SpatialDropout1D(0.2)(x)

        # Project the shortcut with a 1x1 convolution when channel counts differ
        if x.shape[-1] != shortcut.shape[-1]:
            shortcut = layers.Conv1D(filters, 1, padding='same')(shortcut)

        return layers.Add()([x, shortcut])

    def _create_temporal_block(self, x, filters, hp):
        """Apply a (optionally bidirectional) GRU that returns the full sequence."""
        # NOTE(review): the range of 'gru_units' depends on base_filters, so it
        # can differ between trials under the same name; confirm the tuner
        # handles that as intended.
        gru_units = hp.Int('gru_units', filters//2, filters, step=32)
        bidirectional = hp.Boolean('bidirectional')

        if bidirectional:
            x = layers.Bidirectional(layers.GRU(gru_units, return_sequences=True))(x)
        else:
            x = layers.GRU(gru_units, return_sequences=True)(x)

        return x

    def _create_multi_scale_block(self, x, filters, hp):
        """Run parallel Conv1D branches with kernel sizes 3, 5, ... and concatenate.

        ``filters`` is divided (integer division) across the branches.
        """
        num_scales = hp.Int('num_scales', 2, 4)
        scales = list(range(3, 3 + num_scales * 2, 2))
        convs = []

        for scale in scales:
            conv = layers.Conv1D(filters//len(scales), scale, padding='same')(x)
            conv = layers.BatchNormalization()(conv)
            conv = layers.Activation('swish')(conv)
            convs.append(conv)

        return layers.Concatenate()(convs)

    def _create_attention_block(self, x, filters, num_heads):
        """Self-attention over ``x`` with a residual connection and layer norm."""
        attention_output = layers.MultiHeadAttention(
            num_heads=num_heads,
            key_dim=filters//num_heads
        )(x, x)

        x = layers.Add()([attention_output, x])
        return layers.LayerNormalization()(x)

class EnhancedAutoMLEnsemble:
    """Ensemble of a tuned deep sequence model and XGBoost on its features.

    ``optimize`` tunes the deep model with KerasTuner, trains an
    Optuna-tuned XGBoost classifier on the deep model's penultimate-layer
    activations, and then searches for the blending weight of the two
    probability outputs.

    Args:
        input_shape: Number of features per time step (an int).
        num_classes: Number of target classes.
        sequence_length: Number of consecutive rows stacked into one sequence.
    """

    def __init__(self, input_shape, num_classes, sequence_length=10):
        self.input_shape = input_shape
        self.num_classes = num_classes
        self.sequence_length = sequence_length
        self.deep_model = None
        self.xgb_model = None
        # [deep_weight, xgb_weight]; set by optimize()
        self.model_weights = None

    def optimize(self, X_train, y_train, X_val, y_val, max_trials=10):
        """Tune and fit both models and the ensemble weights.

        Args:
            X_train, y_train: Training features and integer labels.
            X_val, y_val: Validation features and integer labels.
            max_trials: Number of trials for the KerasTuner search and for the
                XGBoost Optuna study.
        """
        # Reset indices so pandas inputs are positionally aligned
        if isinstance(X_train, pd.DataFrame):
            X_train = X_train.reset_index(drop=True)
        if isinstance(y_train, pd.Series):
            y_train = y_train.reset_index(drop=True)
        if isinstance(X_val, pd.DataFrame):
            X_val = X_val.reset_index(drop=True)
        if isinstance(y_val, pd.Series):
            y_val = y_val.reset_index(drop=True)

        # 1. Optimize Deep Learning Model
        hypermodel = AutoMLEnsembleHyperModel(
            self.input_shape,
            self.num_classes,
            self.sequence_length
        )

        tuner = RandomSearch(
            hypermodel,
            objective='val_accuracy',
            max_trials=max_trials,
            directory='keras_tuner_dir',
            project_name='automl_ensemble'
        )

        stop_early = EarlyStopping(monitor='val_loss', patience=5)

        X_train_seq = self.prepare_sequence_data(X_train)
        X_val_seq = self.prepare_sequence_data(X_val)

        # Convert targets to numpy arrays
        y_train_np = np.array(y_train)
        y_val_np = np.array(y_val)

        tuner.search(
            X_train_seq, y_train_np,
            validation_data=(X_val_seq, y_val_np),
            epochs=5,
            callbacks=[stop_early]
        )

        # Get best deep learning model
        self.deep_model = tuner.get_best_models(1)[0]

        # 2. Extract features for XGBoost (sequences are already built)
        train_features = self._deep_features_from_sequences(X_train_seq)
        val_features = self._deep_features_from_sequences(X_val_seq)

        # 3. Optimize XGBoost with Optuna
        fixed_xgb_kwargs = {
            'use_label_encoder': False,
            'eval_metric': 'mlogloss',
            'tree_method': 'hist',
        }

        def objective(trial):
            # Parameter names match XGBClassifier keyword arguments so that
            # study.best_params can be passed straight to the final model.
            params = {
                'max_depth': trial.suggest_int('max_depth', 3, 10),
                'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.1, log=True),
                'n_estimators': trial.suggest_int('n_estimators', 100, 500),
            }
            candidate = xgb.XGBClassifier(**params, **fixed_xgb_kwargs)
            candidate.fit(train_features, y_train_np)

            y_pred_classes = candidate.predict(val_features)

            accuracy = (y_pred_classes == y_val_np).mean()
            f1 = f1_score(y_val_np, y_pred_classes, average='weighted')

            # Combine metrics (adjust the weights as needed)
            return 0.5 * accuracy + 0.5 * f1

        study = optuna.create_study(direction='maximize')
        study.optimize(objective, n_trials=max_trials)

        # Create final XGBoost model
        self.xgb_model = xgb.XGBClassifier(**study.best_params, **fixed_xgb_kwargs)
        self.xgb_model.fit(train_features, y_train_np)

        # 4. Optimize ensemble weights on the raw validation features
        self._optimize_ensemble_weights(X_val, y_val_np)

    def prepare_sequence_data(self, X):
        """Turn a 2-D feature table into overlapping sequences.

        Sequence ``i`` holds rows ``i - sequence_length + 1 .. i``. For the
        first rows, where not enough history exists, the missing leading steps
        are filled with the first row.

        Args:
            X: DataFrame, Series or array of shape ``(n_samples, input_shape)``;
                a 1-D input is reshaped to that form.

        Returns:
            float64 array of shape ``(n_samples, sequence_length, input_shape)``.

        Raises:
            ValueError: If the feature dimension does not equal
                ``self.input_shape``.
        """
        from numpy.lib.stride_tricks import sliding_window_view

        if isinstance(X, (pd.DataFrame, pd.Series)):
            X = X.to_numpy()
        X = np.asarray(X, dtype=np.float64)

        # Ensure X is 2-D
        if X.ndim == 1:
            X = X.reshape(-1, self.input_shape)
        if X.ndim != 2 or X.shape[1] != self.input_shape:
            raise ValueError(
                f"expected input of shape (n_samples, {self.input_shape}), got {X.shape}"
            )

        n_samples = X.shape[0]
        if n_samples == 0:
            return np.zeros((0, self.sequence_length, self.input_shape))

        # Prepend copies of the first row so every sample has a full window
        front_padding = np.repeat(X[:1], self.sequence_length - 1, axis=0)
        padded = np.concatenate([front_padding, X], axis=0)

        # sliding_window_view puts the window axis last: (n, features, steps)
        windows = sliding_window_view(padded, self.sequence_length, axis=0)
        return np.ascontiguousarray(windows.transpose(0, 2, 1))

    def _deep_features_from_sequences(self, X_seq):
        """Return the deep model's second-to-last layer output for ``X_seq``."""
        feature_model = Model(
            inputs=self.deep_model.input,
            outputs=self.deep_model.layers[-2].output
        )
        return feature_model.predict(X_seq)

    def extract_deep_features(self, X):
        """Build sequences from ``X`` and return the deep model's features."""
        return self._deep_features_from_sequences(self.prepare_sequence_data(X))

    def _optimize_ensemble_weights(self, X_val, y_val):
        """Search the blending weight that maximises weighted F1 on validation.

        Args:
            X_val: Raw validation features (same layout as the training
                features). Both the deep predictions and the XGBoost inputs
                are derived from them.
            y_val: Validation labels.
        """
        X_seq = self.prepare_sequence_data(X_val)
        deep_pred = self.deep_model.predict(X_seq)
        xgb_pred = self.xgb_model.predict_proba(self._deep_features_from_sequences(X_seq))

        def objective(trial):
            w1 = trial.suggest_float('deep_weight', 0.2, 0.8)
            w2 = 1 - w1

            ensemble_pred = w1 * deep_pred + w2 * xgb_pred
            y_pred = np.argmax(ensemble_pred, axis=1)
            return f1_score(y_val, y_pred, average='weighted')

        study = optuna.create_study(direction='maximize')
        study.optimize(objective, n_trials=100)

        self.model_weights = [study.best_params['deep_weight'],
                              1 - study.best_params['deep_weight']]

    def predict(self, X):
        """Return ensemble class predictions for ``X``.

        Raises:
            RuntimeError: If ``optimize`` has not been run yet.
        """
        if self.deep_model is None or self.xgb_model is None or self.model_weights is None:
            raise RuntimeError("EnhancedAutoMLEnsemble.optimize() must be called before predict().")

        X_seq = self.prepare_sequence_data(X)
        deep_pred = self.deep_model.predict(X_seq)
        xgb_pred = self.xgb_model.predict_proba(self._deep_features_from_sequences(X_seq))

        ensemble_pred = (self.model_weights[0] * deep_pred +
                         self.model_weights[1] * xgb_pred)
        return np.argmax(ensemble_pred, axis=1)

In [11]:
import tensorflow as tf
from tensorflow.keras import layers, Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from keras_tuner import HyperModel
import xgboost as xgb
import numpy as np
from sklearn.metrics import f1_score
import optuna
import pandas as pd

class AutoMLEnsembleHyperModel(HyperModel):
    """Conv/GRU/attention sequence classifier parameterised by an Optuna trial.

    The network is: residual Conv1D blocks -> GRU block -> multi-scale Conv1D
    block -> self-attention block -> global average pooling -> dense head ->
    softmax over ``num_classes``. Every hyperparameter is drawn through the
    Optuna ``trial.suggest_*`` API.

    Args:
        input_shape: Number of features per time step (an int, not a tuple).
        num_classes: Size of the softmax output.
        sequence_length: Number of time steps in each input sequence.
    """

    def __init__(self, input_shape, num_classes, sequence_length):
        # Let the KerasTuner base class set up its own attributes first.
        super().__init__()
        self.input_shape = input_shape
        self.num_classes = num_classes
        self.sequence_length = sequence_length

    def build(self, trial):
        """Build and compile one model from the values suggested by ``trial``.

        Args:
            trial: Optuna trial used to sample every hyperparameter.

        Returns:
            A compiled Keras model using sparse categorical cross-entropy.
        """
        inputs = tf.keras.Input(shape=(self.sequence_length, self.input_shape))

        # Hyperparameters for architecture
        base_filters = trial.suggest_int('base_filters', 32, 128, step=32)
        num_residual_blocks = trial.suggest_int('num_residual_blocks', 1, 3)
        num_attention_heads = trial.suggest_int('num_attention_heads', 4, 16, step=4)
        dense_units = trial.suggest_int('dense_units', 64, 512, step=64)
        dropout_rate = trial.suggest_float('dropout_rate', 0.1, 0.5, step=0.1)

        # 1. Residual Blocks
        x = inputs
        for _ in range(num_residual_blocks):
            x = self._create_residual_block(x, base_filters, trial)

        # 2. Temporal Block
        x = self._create_temporal_block(x, base_filters, trial)

        # 3. Multi-Scale Block
        x = self._create_multi_scale_block(x, base_filters, trial)

        # 4. Attention Block
        x = self._create_attention_block(x, base_filters, num_attention_heads)

        # 5. Global Pooling
        x = layers.GlobalAveragePooling1D()(x)

        # 6. Dense Layers: each layer is half as wide as the previous one
        num_dense_layers = trial.suggest_int('num_dense_layers', 1, 3)
        for i in range(num_dense_layers):
            x = layers.Dense(dense_units // (2**i))(x)
            x = layers.BatchNormalization()(x)
            x = layers.Activation(trial.suggest_categorical(f'dense_activation_{i}', ['relu', 'swish']))(x)
            x = layers.Dropout(dropout_rate)(x)

        outputs = layers.Dense(self.num_classes, activation='softmax')(x)

        model = Model(inputs, outputs)

        # Compile model
        learning_rate = trial.suggest_float('learning_rate', 1e-4, 1e-2, log=True)
        model.compile(
            optimizer=Adam(learning_rate=learning_rate),
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy']
        )

        return model

    def _create_residual_block(self, x, filters, trial):
        """Conv1D -> BatchNorm -> activation -> dropout, added to a shortcut.

        The parameter names are the same on every call, so all residual blocks
        of one model share kernel size and activation.
        """
        kernel_size = trial.suggest_int('res_kernel_size', 3, 7, step=2)
        activation = trial.suggest_categorical('res_activation', ['relu', 'swish'])

        shortcut = x
        x = layers.Conv1D(filters, kernel_size, padding='same')(x)
        x = layers.BatchNormalization()(x)
        x = layers.Activation(activation)(x)
        x = layers.SpatialDropout1D(0.2)(x)

        # Project the shortcut with a 1x1 convolution when channel counts differ
        if x.shape[-1] != shortcut.shape[-1]:
            shortcut = layers.Conv1D(filters, 1, padding='same')(shortcut)

        return layers.Add()([x, shortcut])

    def _create_temporal_block(self, x, filters, trial):
        """Apply a (optionally bidirectional) GRU that returns the full sequence."""
        # The range depends on base_filters, so it varies between trials.
        gru_units = trial.suggest_int('gru_units', filters//2, filters, step=32)
        bidirectional = trial.suggest_categorical('bidirectional', [True, False])

        if bidirectional:
            x = layers.Bidirectional(layers.GRU(gru_units, return_sequences=True))(x)
        else:
            x = layers.GRU(gru_units, return_sequences=True)(x)

        return x

    def _create_multi_scale_block(self, x, filters, trial):
        """Run parallel Conv1D branches with kernel sizes 3, 5, ... and concatenate.

        ``filters`` is divided (integer division) across the branches.
        """
        num_scales = trial.suggest_int('num_scales', 2, 4)
        scales = list(range(3, 3 + num_scales * 2, 2))
        convs = []

        for scale in scales:
            conv = layers.Conv1D(filters//len(scales), scale, padding='same')(x)
            conv = layers.BatchNormalization()(conv)
            conv = layers.Activation('swish')(conv)
            convs.append(conv)

        return layers.Concatenate()(convs)

    def _create_attention_block(self, x, filters, num_heads):
        """Self-attention over ``x`` with a residual connection and layer norm."""
        attention_output = layers.MultiHeadAttention(
            num_heads=num_heads,
            key_dim=filters//num_heads
        )(x, x)

        x = layers.Add()([attention_output, x])
        return layers.LayerNormalization()(x)

class EnhancedAutoMLEnsemble:
    """Weighted ensemble of a deep sequence model and an XGBoost classifier.

    ``optimize`` runs an Optuna study. Each trial trains a deep model on
    sliding-window sequences, fits XGBoost on the deep model's
    second-to-last-layer outputs, and scores a blend of both models'
    class probabilities (weighted by their validation F1) on the validation
    set. The best trial's models and blend weights are kept on the instance
    and used by ``predict``.
    """

    def __init__(self, input_shape, num_classes, sequence_length=10):
        """Store the configuration; no model is built until ``optimize`` runs.

        Args:
            input_shape: Number of features per sample (an int, despite the
                name); used as the last axis of the generated sequences.
            num_classes: Number of target classes, forwarded to the hypermodel.
            sequence_length: Number of consecutive rows in each window.
        """
        self.input_shape = input_shape
        self.num_classes = num_classes
        self.sequence_length = sequence_length
        self.deep_model = None
        self.xgb_model = None
        self.model_weights = None
        self.study = None

    @staticmethod
    def _build_feature_extractor(deep_model):
        """Return a model mapping ``deep_model``'s input to its second-to-last layer output."""
        return Model(inputs=deep_model.input, outputs=deep_model.layers[-2].output)

    def optimize(self, X_train, y_train, X_val, y_val, max_trials=10):
        """Search deep-model and XGBoost settings and keep the best ensemble.

        Args:
            X_train: Training features, shape (n_samples, input_shape).
            y_train: Training labels.
            X_val: Validation features, shape (n_samples, input_shape).
            y_val: Validation labels.
            max_trials: Number of Optuna trials to run.

        Returns:
            The finished Optuna study (also stored on ``self.study``).
        """
        # Everything is converted to NumPy below, so any pandas index on the
        # inputs is irrelevant and no reset_index copy is needed.
        y_train_np = np.asarray(y_train)
        y_val_np = np.asarray(y_val)

        hypermodel = AutoMLEnsembleHyperModel(
            self.input_shape,
            self.num_classes,
            self.sequence_length
        )

        X_train_seq = self.prepare_sequence_data(X_train)
        X_val_seq = self.prepare_sequence_data(X_val)

        def objective(trial):
            # NOTE(review): these suggestions are recorded on the trial but the
            # dict is never read here; the deep model is configured solely by
            # whatever hypermodel.build does with ``trial``.
            dl_params = {
                'base_filters': trial.suggest_int('base_filters', 32, 128, step=32),
                'num_residual_blocks': trial.suggest_int('num_residual_blocks', 1, 3),
                'num_attention_heads': trial.suggest_int('num_attention_heads', 4, 16, step=4),
                'dense_units': trial.suggest_int('dense_units', 64, 512, step=64),
                'dropout_rate': trial.suggest_float('dropout_rate', 0.1, 0.5, step=0.1),
                'learning_rate': trial.suggest_float('dl_learning_rate', 1e-4, 1e-2, log=True)
            }

            xgb_params = {
                'max_depth': trial.suggest_int('xgb_max_depth', 3, 10),
                'learning_rate': trial.suggest_float('xgb_learning_rate', 1e-3, 0.1, log=True),
                'n_estimators': trial.suggest_int('n_estimators', 100, 500),
                'min_child_weight': trial.suggest_int('min_child_weight', 1, 7),
                'subsample': trial.suggest_float('subsample', 0.6, 1.0),
                'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
                'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True)
            }

            # NOTE(review): build() is defined outside this class. The builder
            # helpers visible in this file call hp.Int / hp.Choice / hp.Boolean
            # (KerasTuner-style), which an Optuna Trial does not provide --
            # confirm that build() really accepts an Optuna trial.
            deep_model = hypermodel.build(trial)
            deep_model.fit(
                X_train_seq, y_train_np,
                validation_data=(X_val_seq, y_val_np),
                epochs=5,
                callbacks=[EarlyStopping(monitor='val_loss', patience=3)],
                verbose=0
            )

            deep_pred_prob = deep_model.predict(X_val_seq, verbose=0)
            deep_pred = np.argmax(deep_pred_prob, axis=1)
            deep_f1 = f1_score(y_val_np, deep_pred, average='weighted')

            # XGBoost is trained on the deep model's learned representation,
            # not on the raw sequences.
            feature_model = self._build_feature_extractor(deep_model)
            train_features = feature_model.predict(X_train_seq, verbose=0)
            val_features = feature_model.predict(X_val_seq, verbose=0)

            xgb_model = xgb.XGBClassifier(
                **xgb_params,
                use_label_encoder=False,
                eval_metric='mlogloss',
                tree_method='hist'
            )
            xgb_model.fit(train_features, y_train_np)
            xgb_pred = xgb_model.predict(val_features)
            xgb_f1 = f1_score(y_val_np, xgb_pred, average='weighted')

            # Blend weights are proportional to each model's validation F1;
            # fall back to an even split when both scores are zero.
            total_f1 = deep_f1 + xgb_f1
            deep_weight = deep_f1 / total_f1 if total_f1 > 0 else 0.5

            ensemble_pred = (deep_weight * deep_pred_prob +
                           (1 - deep_weight) * xgb_model.predict_proba(val_features))
            final_pred = np.argmax(ensemble_pred, axis=1)
            final_f1 = f1_score(y_val_np, final_pred, average='weighted')

            # Keep the models of the best trial seen so far. The first trial
            # is always kept because the study has no best value yet.
            if trial.number == 0 or final_f1 > trial.study.best_value:
                self.deep_model = deep_model
                self.xgb_model = xgb_model
                self.model_weights = [deep_weight, 1 - deep_weight]

            return final_f1

        # NOTE(review): the objective never reports intermediate values, so the
        # MedianPruner presumably has nothing to act on -- confirm intent.
        study = optuna.create_study(
            direction='maximize',
            pruner=optuna.pruners.MedianPruner()
        )
        study.optimize(objective, n_trials=max_trials)
        self.study = study
        return study

    def prepare_sequence_data(self, X):
        """Turn a 2-D feature table into overlapping trailing windows.

        Window ``i`` holds rows ``i - sequence_length + 1 .. i`` of ``X``.
        Rows that would fall before the start of the data are filled with the
        first row, so every sample gets a full-length window.

        Args:
            X: DataFrame, Series, ndarray or nested list of shape
                (n_samples, input_shape). A 1-D input is reshaped to
                (-1, input_shape).

        Returns:
            float64 ndarray of shape (n_samples, sequence_length, input_shape).

        Raises:
            ValueError: If ``X`` is not 2-D with ``input_shape`` columns.
        """
        X = np.asarray(X, dtype=np.float64)

        if X.ndim == 1:
            X = X.reshape(-1, self.input_shape)

        # Fail loudly on a feature-count mismatch; a single-column input would
        # otherwise be broadcast across every feature column without warning.
        if X.ndim != 2 or X.shape[1] != self.input_shape:
            raise ValueError(
                f"Expected input of shape (n_samples, {self.input_shape}), "
                f"got {X.shape}"
            )

        # row_idx[i, j] is the source row for step j of window i. Negative
        # positions are clipped to 0, which repeats the first row as padding.
        window = self.sequence_length
        offsets = np.arange(window) - (window - 1)
        row_idx = np.arange(X.shape[0])[:, None] + offsets[None, :]
        np.clip(row_idx, 0, None, out=row_idx)

        return X[row_idx]

    def predict(self, X):
        """Predict class indices with the weighted deep + XGBoost ensemble.

        Args:
            X: Features of shape (n_samples, input_shape).

        Returns:
            1-D array holding the argmax class index for each sample.

        Raises:
            RuntimeError: If ``optimize`` has not produced a fitted ensemble.
        """
        if self.deep_model is None or self.xgb_model is None or self.model_weights is None:
            raise RuntimeError(
                "The ensemble has not been fitted; call optimize() before predict()."
            )

        X_seq = self.prepare_sequence_data(X)
        deep_pred = self.deep_model.predict(X_seq, verbose=0)

        feature_model = self._build_feature_extractor(self.deep_model)
        X_features = feature_model.predict(X_seq, verbose=0)
        xgb_pred = self.xgb_model.predict_proba(X_features)

        ensemble_pred = (self.model_weights[0] * deep_pred +
                        self.model_weights[1] * xgb_pred)
        return np.argmax(ensemble_pred, axis=1)

In [None]:
# Drop the existing indices so the train/validation features and labels are
# all positionally indexed from 0.
X_train, y_train, X_val, y_val = (
    split.reset_index(drop=True)
    for split in (X_train, y_train, X_val, y_val)
)

model = EnhancedAutoMLEnsemble(
    input_shape=X_train.shape[1],  # number of features
    num_classes=len(np.unique(y_train)),  # number of classes
    sequence_length=10,  # length of the temporal sequence
)

# Optimize the model.
model.optimize(X_train, y_train, X_val, y_val, max_trials=10)

# Make predictions on the held-out test set.
predictions = model.predict(X_test)

Trial 8 Complete [00h 09m 45s]
val_accuracy: 0.6541674733161926

Best val_accuracy So Far: 0.7033562660217285
Total elapsed time: 02h 01m 13s

Search: Running Trial #9

Value             |Best Value So Far |Hyperparameter
96                |64                |base_filters
2                 |2                 |num_residual_blocks
4                 |12                |num_attention_heads
192               |384               |dense_units
0.5               |0.1               |dropout_rate
7                 |3                 |res_kernel_size
swish             |swish             |res_activation
16                |16                |gru_units
True              |True              |bidirectional
3                 |3                 |num_scales
2                 |1                 |num_dense_layers
relu              |relu              |dense_activation_0
0.0071211         |0.00073687        |learning_rate
swish             |None              |dense_activation_1
relu              |None          