In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split, StratifiedKFold
from imblearn.over_sampling import SMOTE
from collections import Counter, defaultdict
import torch
import torch.nn as nn
import torch.utils.data as data_utils
import torch.nn.functional as F
import torch.optim as optim

from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    classification_report,
    roc_auc_score, 
    roc_curve
)

from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
from torch.utils.data import TensorDataset, DataLoader
from tqdm.notebook import tqdm



import warnings
warnings.filterwarnings("ignore")

pd.set_option('display.max_columns', None)


In [None]:
# Load the NB15 CSV export and preview the first and last ten rows.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
csv_path = r"E:\Thesis\Defence\AnD\Version_3\Recent_UpdatedNB15.csv"
df1 = pd.read_csv(csv_path)
print(f"Dataset Shape: {df1.shape}")
for preview in (df1.head(10), df1.tail(10)):
    display(preview)

In [None]:
# Integer-encode the two columns. Each column gets its own encoder so that both
# mappings remain recoverable later via ``inverse_transform`` (a single shared
# encoder would only remember the column it was fitted on last).
le_sm_ips_ports = LabelEncoder()
df1['is_sm_ips_ports'] = le_sm_ips_ports.fit_transform(df1['is_sm_ips_ports'])

le_label = LabelEncoder()
df1['label'] = le_label.fit_transform(df1['label'])

# Backward-compatible alias: this cell used to leave ``le_city`` fitted on 'label'.
le_city = le_label

In [None]:
# Preview the frame after label encoding.
encoded_preview = df1.head(10)
display(encoded_preview)

In [None]:
feature_cols = [col for col in df1.columns if col not in ['label', 'attack_cat']]
print(f"Number of features: {len(feature_cols)}")
print(f"Feature columns: {len(feature_cols)} columns")

features_with_label = feature_cols + ['label']
features_with_attack_cat = feature_cols + ['attack_cat']

# DataFrame.corr() is computed pair-by-pair, so a single pass over
# features + both targets yields every coefficient needed below; the two
# per-target matrices are just slices of it (avoids a second full pass).
_corr_all = df1[feature_cols + ['label', 'attack_cat']].corr()

# Correlation matrix: features + binary label
corr_matrix_binary = _corr_all.loc[features_with_label, features_with_label]
print(f"Correlation matrix shape: {corr_matrix_binary.shape}")

# Features ranked by |correlation| with the binary label
label_correlations = corr_matrix_binary['label'].drop('label').sort_values(key=abs, ascending=False)
print("Top 20 features correlated with binary label:")
print(label_correlations.head(20))

# Correlation matrix: features + multi-class label, ranked the same way
corr_matrix_multiclass = _corr_all.loc[features_with_attack_cat, features_with_attack_cat]
attack_cat_correlations = corr_matrix_multiclass['attack_cat'].drop('attack_cat').sort_values(key=abs, ascending=False)
print("Top 20 features correlated with attack_cat:")
print(attack_cat_correlations.head(20))

def _show_annotated_heatmap(corr_subset, title):
    """Draw one annotated 14x12 correlation heatmap with the given title."""
    plt.figure(figsize=(14, 12))
    sns.heatmap(corr_subset, annot=True, cmap='RdBu_r', center=0,
                square=True, fmt='.3f', cbar_kws={"shrink": .8})
    plt.title(title)
    plt.tight_layout()
    plt.show()


# Top-20 features by |correlation| with the binary label, plus the label itself.
top_20_features_binary = label_correlations.index[:20].tolist()
corr_subset_binary = df1[[*top_20_features_binary, 'label']].corr()
_show_annotated_heatmap(corr_subset_binary,
                        'Correlation Matrix: Top 20 Features + Binary Label')

# Same view for the multi-class target.
top_20_features_multiclass = attack_cat_correlations.index[:20].tolist()
corr_subset_multiclass = df1[[*top_20_features_multiclass, 'attack_cat']].corr()
_show_annotated_heatmap(corr_subset_multiclass,
                        'Correlation Matrix: Top 20 Features + Multi-class Label')

# Correlation across every column of df1 (features and both targets).
full_corr_matrix = df1.corr()
print(f"Full correlation matrix shape: {full_corr_matrix.shape}")

# Un-annotated overview heatmap of the full matrix.
fig_full, ax_full = plt.subplots(figsize=(20, 16))
sns.heatmap(
    full_corr_matrix,
    cmap='RdBu_r',
    center=0,
    square=True,
    cbar_kws={"shrink": .5},
    ax=ax_full,
)
ax_full.set_title('Full Correlation Matrix - All Features')
fig_full.tight_layout()
plt.show()

# One row per feature: signed and absolute correlation with each target,
# ordered by |correlation| with the binary label.
_binary_corr = corr_matrix_binary.loc[feature_cols, 'label']
_attack_corr = corr_matrix_multiclass.loc[feature_cols, 'attack_cat']

label_corr_summary = pd.DataFrame({
    'feature': feature_cols,
    'binary_label_corr': _binary_corr.tolist(),
    'attack_cat_corr': _attack_corr.tolist(),
})
label_corr_summary = label_corr_summary.assign(
    abs_binary_corr=label_corr_summary['binary_label_corr'].abs(),
    abs_attack_cat_corr=label_corr_summary['attack_cat_corr'].abs(),
).sort_values('abs_binary_corr', ascending=False)

print("Feature correlations with both labels:")
print(label_corr_summary.head(15))

def _plot_top_correlations(rows, corr_col, positive_color, xlabel, title):
    """Horizontal bar chart of signed correlations; returns the bar colours used.

    Negative values are drawn in red, everything else in ``positive_color``.
    """
    bar_colors = ['red' if value < 0 else positive_color for value in rows[corr_col]]
    positions = range(len(rows))
    plt.figure(figsize=(12, 8))
    plt.barh(positions, rows[corr_col], color=bar_colors, alpha=0.7)
    plt.yticks(positions, rows['feature'])
    plt.xlabel(xlabel)
    plt.title(title)
    plt.axvline(x=0, color='black', linestyle='-', alpha=0.3)
    plt.tight_layout()
    plt.show()
    return bar_colors


# Binary label: 15 strongest features (summary is already sorted by |binary corr|).
top_15_binary = label_corr_summary.head(15)
colors = _plot_top_correlations(
    top_15_binary, 'binary_label_corr', 'blue',
    'Correlation with Binary Label',
    'Top 15 Features - Binary Label Correlation',
)

# Multi-class label: re-rank by |attack_cat corr| first.
label_corr_summary_mc = label_corr_summary.sort_values('abs_attack_cat_corr', ascending=False)
top_15_multiclass = label_corr_summary_mc.head(15)
colors_mc = _plot_top_correlations(
    top_15_multiclass, 'attack_cat_corr', 'green',
    'Correlation with Attack Category',
    'Top 15 Features - Multi-class Label Correlation',
)

# Scatter of |corr with binary label| vs |corr with attack category| per feature;
# the dashed diagonal marks equal strength for both targets.
fig_cmp, ax_cmp = plt.subplots(figsize=(10, 8))
ax_cmp.scatter(
    label_corr_summary['abs_binary_corr'],
    label_corr_summary['abs_attack_cat_corr'],
    alpha=0.6,
    s=50,
)
ax_cmp.plot([0, 1], [0, 1], 'r--', alpha=0.8)
ax_cmp.set_xlabel('Absolute Correlation with Binary Label')
ax_cmp.set_ylabel('Absolute Correlation with Attack Category')
ax_cmp.set_title('Feature Correlations: Binary vs Multi-class')
ax_cmp.grid(True, alpha=0.3)
fig_cmp.tight_layout()
plt.show()

# Recap of the correlation objects built above (nothing is written to disk here).
_recap_lines = [
    "Correlation matrices created:",
    f"- corr_matrix_binary: shape {corr_matrix_binary.shape}",
    f"- corr_matrix_multiclass: shape {corr_matrix_multiclass.shape}",
    f"- full_corr_matrix: shape {full_corr_matrix.shape}",
    "- label_corr_summary: feature correlations with both labels",
    "Analysis completed.",
]
for _line in _recap_lines:
    print(_line)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

In [None]:
# Binary target: distinct values and dtype.
labels_bin = df1['label']
unique_labels = labels_bin.unique()
for _msg in (
    f"Number of unique labels: {len(unique_labels)}",
    f"Unique labels: {unique_labels}",
    f"Label data type: {labels_bin.dtype}",
):
    print(_msg)

In [None]:
# Samples per binary class (value_counts sorts most frequent first).
label_counts_bin = labels_bin.value_counts()
print("Label Distribution:", label_counts_bin, sep="\n")

In [None]:
# Imbalance of the binary target. label_counts_bin is sorted by count (descending),
# so its first / last index entries are the most / least frequent classes.
max_count, min_count = label_counts_bin.max(), label_counts_bin.min()
balance_ratio = max_count / min_count
_most_frequent, _least_frequent = label_counts_bin.index[0], label_counts_bin.index[-1]
print(f"Most frequent class: {_most_frequent} ({max_count} samples)")
print(f"Least frequent class: {_least_frequent} ({min_count} samples)")
print(f"Imbalance ratio: {balance_ratio:.2f}")

In [None]:
# Summary statistics of the binary target column.
_binary_label_stats = labels_bin.describe()
print(_binary_label_stats)

In [None]:
# Multi-class target: distinct values and dtype.
# Note: ``unique_labels`` is rebound here (it previously held the binary labels).
labels_att = df1['attack_cat']
unique_labels = labels_att.unique()
for _msg in (
    f"Number of unique labels: {len(unique_labels)}",
    f"Unique labels: {unique_labels}",
    f"Label data type: {labels_att.dtype}",
):
    print(_msg)

In [None]:
# Samples per attack category (most frequent first).
label_counts = labels_att.value_counts()
print("Label Distribution:", label_counts, sep="\n")

In [None]:
# Count and percentage share of every attack category, side by side.
label_percentages = labels_att.value_counts(normalize=True).mul(100)
label_distribution = label_counts.to_frame('Count').assign(Percentage=label_percentages)
print(label_distribution)

In [None]:
# Imbalance of the multi-class target. label_counts is sorted by count (descending),
# so its first / last index entries are the most / least frequent categories.
# Note: max_count / min_count / balance_ratio are rebound from the binary analysis.
max_count, min_count = label_counts.max(), label_counts.min()
balance_ratio = max_count / min_count
_most_frequent, _least_frequent = label_counts.index[0], label_counts.index[-1]
print(f"Most frequent class: {_most_frequent} ({max_count} samples)")
print(f"Least frequent class: {_least_frequent} ({min_count} samples)")
print(f"Imbalance ratio: {balance_ratio:.2f}")

In [None]:
# Summary statistics of the attack-category column.
_attack_label_stats = labels_att.describe()
print(_attack_label_stats)


In [None]:
# Targets: binary label and attack category; features: every other column.
y = df1["label"]
y_att = df1["attack_cat"]
X = df1.drop(columns=[y.name, y_att.name])

In [None]:
# 80/20 hold-out split, stratified on the binary label; the attack category is
# split alongside so all three stay row-aligned.
X_train, X_test, y_train, y_test, attack_train, attack_test = train_test_split(
    X, y, y_att,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


def _tercile_codes(column):
    """Integer quantile-bin codes for one column (3 bins requested, duplicate edges dropped)."""
    return pd.qcut(column, q=3, labels=False, duplicates='drop')


# Quantile-binned copy of the *full* feature matrix as an integer tensor.
# NOTE(review): not referenced by the cells visible below — confirm it is still needed.
X_binned = X.apply(_tercile_codes)
X_binned_tensor = torch.tensor(X_binned.values, dtype=torch.long)


In [None]:
from scipy.stats import chi2_contingency
import numpy as np
import pandas as pd
from sklearn.preprocessing import KBinsDiscretizer

def comprehensive_chi2_selection(X_train, y_train, n_features=20, n_bins=10):
    """Rank features by a chi-square test of independence against ``y_train``.

    Every feature is discretised into quantile bins, cross-tabulated against
    the target, and scored with ``scipy.stats.chi2_contingency``. Cramér's V is
    reported next to the raw statistic as an effect size.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Feature matrix; all columns must be acceptable to ``KBinsDiscretizer``.
    y_train : pandas.Series
        Target, index-aligned with ``X_train``.
    n_features : int, default 20
        Number of top-ranked features to print and return.
    n_bins : int, default 10
        Number of quantile bins requested per feature.

    Returns
    -------
    top_features : list
        Names of the ``n_features`` columns with the largest chi-square statistic.
    results_df : pandas.DataFrame
        One row per feature (``feature``, ``chi2``, ``p_value``, ``cramers_v``,
        ``dof``, ``n_bins_used``), sorted by ``chi2`` descending.
    discretizer : KBinsDiscretizer
        The discretiser fitted on ``X_train``.
    """
    discretizer = KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy='quantile')
    X_binned = pd.DataFrame(
        discretizer.fit_transform(X_train),
        columns=X_train.columns,
        index=X_train.index
    )

    results = []
    for col in X_binned.columns:
        contingency = pd.crosstab(X_binned[col], y_train)
        chi2, p_value, dof, _expected = chi2_contingency(contingency)

        # Cramér's V = sqrt(chi2 / (n * (min(rows, cols) - 1))); it is undefined
        # for a one-row/one-column table, in which case 0 is reported.
        n = contingency.to_numpy().sum()
        min_dim = min(contingency.shape) - 1
        cramers_v = np.sqrt(chi2 / (n * min_dim)) if min_dim > 0 else 0

        results.append({
            'feature': col,
            'chi2': chi2,
            'p_value': p_value,
            'cramers_v': cramers_v,
            'dof': dof,
            'n_bins_used': contingency.shape[0]
        })

    # Explicit columns keep the sort below valid even when there are no features.
    result_columns = ['feature', 'chi2', 'p_value', 'cramers_v', 'dof', 'n_bins_used']
    results_df = pd.DataFrame(results, columns=result_columns)
    results_df = results_df.sort_values('chi2', ascending=False)

    display(results_df)
    print("\nChi-Square Feature Selection Results:")
    print("-" * 80)
    print(f"{'Feature':<25} {'Chi2':>12} {'P-value':>12} {'Cramér V':>10} {'DoF':>5}")
    print("-" * 80)

    for row in results_df.head(n_features).itertuples(index=False):
        print(f"{row.feature:<25} {row.chi2:>12.3f} {row.p_value:>12.3e} "
              f"{row.cramers_v:>10.3f} {row.dof:>5}")

    # Flag every feature (not only the top ones) whose association is not significant.
    not_significant = results_df.loc[results_df['p_value'] > 0.05, 'feature']
    for feature in not_significant:
        print(f" {feature} has p-value > 0.05 (not statistically significant)")

    top_features = results_df.head(n_features)['feature'].tolist()

    return top_features, results_df, discretizer


# Chi-square ranking on the training split only; keep the 20 best features.
_chi2_outcome = comprehensive_chi2_selection(X_train, y_train, n_features=20)
top_features, chi2_results, discretizer = _chi2_outcome


In [None]:
# from sklearn.feature_selection import chi2, SelectKBest
# selector = SelectKBest(chi2, k=20)
# selector.fit(X_train, y_train)

# selected_features = X_train.columns[selector.get_support()].tolist()
# print("Top 20 features selected by Chi-Square:")
# print(selected_features)

In [None]:
# Hand-picked feature list.
# NOTE(review): this is typed in by hand rather than taken from ``top_features`` — confirm they agree.
selected_features = [
    'sttl', 'dttl', 'ct_state_ttl', 'rate', 'sload',
    'dpkts', 'dload', 'dinpkt', 'dur', 'proto',
    'state', 'dmean', 'sbytes', 'sinpkt', 'spkts',
    'sjit', 'dbytes', 'ct_dst_sport_ltm', 'dloss', 'ct_srv_dst',
]

X_selected = X_train[selected_features]
correlation_matrix = X_selected.corr()

# Hide the upper triangle (diagonal included) so each pair is shown once.
mask = np.triu(np.ones(correlation_matrix.shape, dtype=bool))

_fig_sel, _ax_sel = plt.subplots(figsize=(12, 10))
sns.heatmap(
    correlation_matrix,
    mask=mask,
    cmap='coolwarm',
    center=0,
    annot=True,
    fmt='.2f',
    square=True,
    linewidths=0.5,
    cbar_kws={"shrink": 0.8},
    ax=_ax_sel,
)
_ax_sel.set_title('Feature Correlation Matrix', fontsize=16)
_fig_sel.tight_layout()
plt.show()

In [None]:
# Start from the list chosen in the previous cell instead of repeating the
# literal, so the two can never drift apart.
original_features = list(selected_features)

# Features dropped after inspecting the correlation heatmap above.
features_to_remove = ['spkts', 'dbytes', 'dloss']
_unknown = [f for f in features_to_remove if f not in original_features]
assert not _unknown, f"features_to_remove contains names not in the selection: {_unknown}"
remaining_features = [f for f in original_features if f not in features_to_remove]

print(f"Remaining features after removal ({len(remaining_features)}):")
print(remaining_features)


# Replacements that bring the list back to 20 features.
additional_features = ['ct_src_dport_ltm', 'ackdat', 'synack']  

final_20_features = remaining_features + additional_features

# The model is built for exactly 20 distinct input features; fail here rather
# than with a shape error deep inside training.
assert len(final_20_features) == len(set(final_20_features)) == 20, (
    f"expected 20 unique features, got {len(final_20_features)}"
)

In [None]:
# Correlation heatmap for the final 20-feature set (training split only).
X_selected = X_train[final_20_features]
correlation_matrix = X_selected.corr()

# Lower-triangle view: mask the diagonal and everything above it.
mask = np.triu(np.ones(correlation_matrix.shape, dtype=bool))
_heatmap_options = dict(
    mask=mask,
    cmap='coolwarm',
    center=0,
    annot=True,
    fmt='.2f',
    square=True,
    linewidths=0.5,
    cbar_kws={"shrink": 0.8},
)

_fig_final, _ax_final = plt.subplots(figsize=(12, 10))
sns.heatmap(correlation_matrix, ax=_ax_final, **_heatmap_options)
_ax_final.set_title('Feature Correlation Matrix', fontsize=16)
_fig_final.tight_layout()
plt.show()

In [None]:
# Nothing more needs to be done up to this point; it only needs to be checked with permutation.

In [None]:
class HierarchicalNIDS(nn.Module):
    """CNN + BiLSTM + attention encoder shared by a binary and a multi-class head.

    Input is a batch of windows shaped ``(batch, seq_len, input_features)``.
    The encoder applies two parallel 1-D convolutions (kernel 3 and 5), a second
    convolution, max-pooling (which halves the sequence length), a 2-layer
    bidirectional LSTM and additive attention pooling, producing a 32-d vector
    per window.

    Both heads return **raw logits**: pair the binary output with
    ``nn.BCEWithLogitsLoss`` (or apply ``torch.sigmoid`` to get probabilities)
    and the multi-class output with ``nn.CrossEntropyLoss``.

    Parameters
    ----------
    input_features : int
        Number of features per time step.
    seq_length : int
        Accepted for API compatibility; no layer depends on it.
    num_attack_types : int
        Number of classes produced by the multi-class head.
    """

    _STAGES = ('binary', 'multiclass', 'both')

    def __init__(self, input_features=20, seq_length=10, num_attack_types=9):
        super().__init__()
        
        # Parallel convolutions with different receptive fields; padding keeps
        # the sequence length unchanged so the outputs can be concatenated.
        self.conv1_3 = nn.Conv1d(input_features, 16, kernel_size=3, padding=1)
        self.conv1_5 = nn.Conv1d(input_features, 8, kernel_size=5, padding=2)
        
        self.bn1 = nn.BatchNorm1d(24)  # 16 + 8 concatenated channels
        self.dropout1 = nn.Dropout(0.2)
        
        self.conv2 = nn.Conv1d(24, 32, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm1d(32)
        self.pool = nn.MaxPool1d(2)
        
        self.bilstm = nn.LSTM(
            input_size=32,
            hidden_size=16,
            num_layers=2,
            batch_first=True,
            bidirectional=True,
            dropout=0.3
        )
        
        # Scores each time step of the BiLSTM output (2 * 16 = 32 channels).
        self.attention = nn.Sequential(
            nn.Linear(32, 16),
            nn.Tanh(),
            nn.Linear(16, 1)
        )
        
        self.binary_head = nn.Sequential(
            nn.Linear(32, 24),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(24, 12),
            nn.ReLU(),
            nn.Linear(12, 1)
        )
        
        self.multiclass_head = nn.Sequential(
            nn.Linear(32, 32),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(32, 16),
            nn.ReLU(),
            nn.Linear(16, num_attack_types)
        )
        
        self._initialize_weights()
    
    def _initialize_weights(self):
        """Kaiming init for convolutions, Xavier for linear layers, Xavier/orthogonal for the LSTM."""
        for m in self.modules():
            if isinstance(m, nn.Conv1d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.LSTM):
                for name, param in m.named_parameters():
                    if 'weight_ih' in name:
                        nn.init.xavier_uniform_(param.data)
                    elif 'weight_hh' in name:
                        nn.init.orthogonal_(param.data)
                    elif 'bias' in name:
                        nn.init.constant_(param.data, 0)
    
    def attention_net(self, lstm_output):
        """Softmax-weighted sum of ``lstm_output`` over the time axis (dim 1)."""
        attention_scores = self.attention(lstm_output)
        attention_weights = torch.softmax(attention_scores, dim=1)
        weighted_output = torch.sum(lstm_output * attention_weights, dim=1)
        return weighted_output
    
    def extract_features(self, x):
        """Encode ``(batch, seq_len, input_features)`` windows into ``(batch, 32)`` vectors."""
        # Conv1d expects (batch, channels, length).
        x = x.permute(0, 2, 1)
        
        conv_3 = self.conv1_3(x)
        conv_5 = self.conv1_5(x)
        x = torch.cat([conv_3, conv_5], dim=1)
        
        x = self.bn1(x)
        x = F.relu(x)
        x = self.dropout1(x)
        
        x = self.conv2(x)
        x = self.bn2(x)
        x = F.relu(x)
        x = self.pool(x)
        
        # Back to (batch, length, channels) for the batch_first LSTM.
        x = x.permute(0, 2, 1)
        
        lstm_out, _ = self.bilstm(x)
        
        features = self.attention_net(lstm_out)
        
        return features
    
    def forward(self, x, stage='both'):
        """Run the encoder and the requested head(s).

        Parameters
        ----------
        x : torch.Tensor
            Windows shaped ``(batch, seq_len, input_features)``.
        stage : {'binary', 'multiclass', 'both'}
            Which head(s) to evaluate.

        Returns
        -------
        torch.Tensor or tuple of torch.Tensor
            ``'binary'``: logits of shape ``(batch, 1)``;
            ``'multiclass'``: logits of shape ``(batch, num_attack_types)``;
            ``'both'``: ``(binary_logits, multiclass_logits)``.

        Raises
        ------
        ValueError
            If ``stage`` is not one of the supported values.
        """
        if stage not in self._STAGES:
            raise ValueError(f"stage must be one of {self._STAGES}, got {stage!r}")

        features = self.extract_features(x)

        # No sigmoid here: the binary output is a logit. Applying sigmoid inside
        # the model and again in a logits-based loss would squash it twice.
        if stage == 'binary':
            return self.binary_head(features)
        if stage == 'multiclass':
            return self.multiclass_head(features)
        return self.binary_head(features), self.multiclass_head(features)

In [None]:
# Restrict both hold-out splits to the final feature set.
X_train_selected, X_test_selected = (
    split[final_20_features] for split in (X_train, X_test)
)

# Carve a validation set (20%) out of the training split, stratified on the
# binary label; the attack category is split alongside to stay row-aligned.
(X_trainfinal, X_val,
 y_trainfinal, y_val,
 attack_trainfinal, attack_val) = train_test_split(
    X_train_selected, y_train, attack_train,
    test_size=0.2,
    stratify=y_train,
    random_state=42,
)

# Features and both targets of the final training portion in one frame, for inspection.
custom_df = pd.concat([X_trainfinal, attack_trainfinal, y_trainfinal], axis=1)
display(custom_df.head(10))

print("Attack label distribution:")
for _target_col in ('attack_cat', 'label'):
    print(custom_df[_target_col].value_counts().sort_index())

In [None]:
# Second check done till here (Friday - 08/ 08/ 2025)

In [None]:
from imblearn.over_sampling import SMOTE
from collections import Counter


# attack_counts = {1: 37567, 2: 28462, 3: 15582, 4: 10571, 
#                  5: 8913, 6: 1722, 7: 1507, 8: 953, 9: 113}


# Partition the final training portion by attack category:
# category 0 is treated as normal traffic, anything greater as an attack.
normal_mask = attack_trainfinal.eq(0)
attack_mask = attack_trainfinal.gt(0)

X_normal, benign_labels = X_trainfinal[normal_mask], attack_trainfinal[normal_mask]
X_attacks, attack_labels = X_trainfinal[attack_mask], attack_trainfinal[attack_mask]

print(f"Original attack distribution: {Counter(attack_labels)}")
for _kind, _rows in (("normal", X_normal), ("attack", X_attacks)):
    print(f"Number of {_kind} samples: {len(_rows)}")

In [None]:
# Sanity-check view: attack rows with their category attached.
# ``assign`` returns a new frame; wrapping an existing DataFrame in
# ``pd.DataFrame(...)`` does not copy, so adding a column to the wrapper can
# leak into ``X_attacks`` depending on the pandas version.
df_temp = X_attacks.assign(attack_label=attack_labels)

display(df_temp.head(10))
display(df_temp.tail(10))

In [None]:
# Sanity-check view: normal rows with their category attached.
# ``assign`` returns a new frame; wrapping an existing DataFrame in
# ``pd.DataFrame(...)`` does not copy, so adding a column to the wrapper can
# leak into ``X_normal`` depending on the pandas version.
df_temp1 = X_normal.assign(normal_label=benign_labels)

display(df_temp1.head(10))
display(df_temp1.tail(10))

In [None]:
# Fixed the label problem faced earlier, here check for sanity too. 08/ 08/ 2025

In [None]:
def create_sequences(X, y_binary, y_attack, seq_length=10):
    """Build overlapping sliding windows (stride 1) over consecutive rows.

    Window ``i`` covers rows ``i .. i + seq_length - 1`` and is labelled with
    the targets of its **last** row.

    Parameters
    ----------
    X : array-like or pandas.DataFrame, shape (n_rows, n_features)
        Feature rows, used in their positional order.
    y_binary, y_attack : array-like or pandas.Series, length n_rows
        Per-row binary and attack-category labels.
    seq_length : int, default 10
        Number of consecutive rows per window.

    Returns
    -------
    X_seq : numpy.ndarray, float64, shape (n_windows, seq_length, n_features)
    y_binary_seq : numpy.ndarray, float64, shape (n_windows,)
    y_attack_seq : numpy.ndarray, int, shape (n_windows,)
        ``n_windows = max(n_rows - seq_length + 1, 0)``; inputs shorter than one
        window yield empty arrays.

    Raises
    ------
    ValueError
        If ``seq_length`` < 1, ``X`` is not 2-D, or the label lengths differ from ``X``.
    """
    if seq_length < 1:
        raise ValueError(f"seq_length must be >= 1, got {seq_length}")

    # Convert once up front: positional NumPy indexing is far cheaper than
    # slicing a DataFrame inside a loop and ignores any shuffled pandas index.
    X_arr = np.asarray(X, dtype=np.float64)
    y_binary_arr = np.asarray(y_binary)
    y_attack_arr = np.asarray(y_attack)

    if X_arr.ndim != 2:
        raise ValueError(f"X must be 2-D (rows, features), got shape {X_arr.shape}")
    if len(y_binary_arr) != len(X_arr) or len(y_attack_arr) != len(X_arr):
        raise ValueError("X, y_binary and y_attack must have the same number of rows")

    num_samples = max(len(X_arr) - seq_length + 1, 0)
    last = seq_length - 1

    # Row indices of every window: window i -> [i, i + 1, ..., i + seq_length - 1].
    window_idx = np.arange(num_samples)[:, None] + np.arange(seq_length)[None, :]
    X_seq = X_arr[window_idx]

    # Each window takes the label of its final row.
    y_binary_seq = y_binary_arr[last:last + num_samples].astype(np.float64)
    y_attack_seq = y_attack_arr[last:last + num_samples].astype(int)

    return X_seq, y_binary_seq, y_attack_seq

SEQ_LENGTH = 10

# Training windows come from the *final* training portion only.
# X_train_selected / y_train / attack_train still contain the rows that were
# split off as X_val, so building the training windows from them would leak
# the validation set into training and make validation metrics meaningless.
X_train_seq, y_binary_train_seq, y_attack_train_seq = create_sequences(
    X_trainfinal,
    y_trainfinal,
    attack_trainfinal,
    seq_length=SEQ_LENGTH
)

# Validation windows (original class distribution, no resampling)
X_val_seq, y_binary_val_seq, y_attack_val_seq = create_sequences(
    X_val, 
    y_val,
    attack_val,
    seq_length=SEQ_LENGTH
)

# Test windows (original class distribution, no resampling)
X_test_seq, y_binary_test_seq, y_attack_test_seq = create_sequences(
    X_test_selected,  
    y_test,
    attack_test,
    seq_length=SEQ_LENGTH
)


def _as_tensors(X_seq, y_binary_seq, y_attack_seq):
    """Convert one split to (float32 features, float32 binary labels, int64 attack labels)."""
    return (
        torch.tensor(X_seq, dtype=torch.float32),
        torch.tensor(y_binary_seq, dtype=torch.float32),
        torch.tensor(y_attack_seq, dtype=torch.long),
    )


X_train_tensor, y_binary_train_tensor, y_attack_train_tensor = _as_tensors(
    X_train_seq, y_binary_train_seq, y_attack_train_seq
)
X_val_tensor, y_binary_val_tensor, y_attack_val_tensor = _as_tensors(
    X_val_seq, y_binary_val_seq, y_attack_val_seq
)
X_test_tensor, y_binary_test_tensor, y_attack_test_tensor = _as_tensors(
    X_test_seq, y_binary_test_seq, y_attack_test_seq
)

# Shapes of the windowed splits.
print("\nSequence shapes:")
print(f"Training: {X_train_tensor.shape} (samples, seq_length, features)")
for _split_name, _features in (("Validation", X_val_tensor), ("Test", X_test_tensor)):
    print(f"{_split_name}: {_features.shape}")

# Binary class balance per split after windowing.
print("\nBinary distribution in sequences:")
for _split_name, _y_bin in (
    ("Train", y_binary_train_tensor),
    ("Val", y_binary_val_tensor),
    ("Test", y_binary_test_tensor),
):
    print(f"{_split_name} - Normal: {(_y_bin == 0).sum()}, Attack: {(_y_bin == 1).sum()}")

# Attack-category counts among training windows labelled as attacks.
_train_attack_labels = y_attack_train_tensor[y_binary_train_tensor == 1]
train_attack_dist = Counter(_train_attack_labels.numpy())
print("\nAttack distribution in training sequences:")
for attack_type, count in sorted(train_attack_dist.items()):
    print(f"Attack_{attack_type}: {count}")

In [None]:
# print("=== SEQUENCE DATA ===")
# if 'X_train_seq' in globals():
#     print(f"Training sequences shape: {X_train_seq.shape}")
#     print(f"Training binary seq labels shape: {y_binary_train_seq.shape}")
#     print(f"Training attack seq labels shape: {y_attack_train_seq.shape}")
    
#     print("\nSequence label distributions:")
#     print(f"  Binary: {Counter(y_binary_train_seq)}")
#     print(f"  Attack: {Counter(y_attack_train_seq)}")
# else:
#     print("Sequences not created yet")

In [None]:
BATCH_SIZE = 64


def _make_loader(dataset, shuffle):
    """DataLoader with the notebook-wide settings; only shuffling differs per split."""
    return DataLoader(
        dataset,
        batch_size=BATCH_SIZE,
        shuffle=shuffle,
        num_workers=0,  # keep at 0 if multiprocessing raises errors
        pin_memory=torch.cuda.is_available(),  # GPU optimization
    )


# Training windows (shuffled every epoch).
# NOTE(review): earlier comments mention SMOTE, but no resampling is applied in the cells shown.
train_dataset = TensorDataset(X_train_tensor, y_binary_train_tensor, y_attack_train_tensor)
train_loader = _make_loader(train_dataset, shuffle=True)

# Validation windows (fixed order).
val_dataset = TensorDataset(X_val_tensor, y_binary_val_tensor, y_attack_val_tensor)
val_loader = _make_loader(val_dataset, shuffle=False)

# Test windows (fixed order).
test_dataset = TensorDataset(X_test_tensor, y_binary_test_tensor, y_attack_test_tensor)
test_loader = _make_loader(test_dataset, shuffle=False)

# Attack-only subset of the training windows for the multi-class stage.
attack_mask_train = y_binary_train_tensor == 1
X_train_attacks = X_train_tensor[attack_mask_train]
y_train_attacks = y_attack_train_tensor[attack_mask_train]

multiclass_dataset = TensorDataset(X_train_attacks, y_train_attacks)
multiclass_loader = _make_loader(multiclass_dataset, shuffle=True)

print("\nDataLoader ready:")
for _loader_name, _loader in (
    ("Train", train_loader),
    ("Val", val_loader),
    ("Test", test_loader),
    ("Multiclass", multiclass_loader),
):
    print(f"{_loader_name} batches: {len(_loader)}")

In [None]:
# Checked up to here. The next task is to work on pos_weight and dynamic class weighting so the model classifies better in the multi-class task.

# Everything up to this point needs to be checked once (peer review) - {Reviewed once and made a bit more modular, addressing previous issues - 08/08/2025}

# Tuning starts from the next cell.

In [None]:
# Class counts straight from the tensor (vectorised) instead of feeding every
# element through a Python-level Counter.
num_normal = int((y_binary_train_tensor == 0).sum())
num_attack = int((y_binary_train_tensor == 1).sum())
binary_counts = Counter({0: num_normal, 1: num_attack})  # kept for later cells

# pos_weight = torch.tensor([num_normal / num_attack])
# pos_weight = torch.tensor([num_attack/ num_normal])
# Neutral weighting for now; a float tensor matches the dtype of the logits and
# targets it is combined with inside the loss.
pos_weight = torch.tensor(1.0)


print(f"Binary Classification:")
print(f"Normal samples: {num_normal}")
print(f"Attack samples: {num_attack}")
print(f"Positive weight: {pos_weight.item():.3f}")

In [None]:
# # Vanilla

# attack_counts = Counter(y_train_attacks.numpy())
# total_attacks = len(y_train_attacks)

# # Create weights for classes 0-8 (model's output space)
# class_weights = []
# for i in range(9):  # 0-8 for model's 9 output classes
#     # Map back to original labels (1-9)
#     original_label = i + 1
#     count = attack_counts.get(original_label, 1)
#     weight = total_attacks / (9 * count)
#     class_weights.append(weight)

# multiclass_weights = torch.FloatTensor(class_weights)

# print(f"\nMulti-class Classification Weights (for model classes 0-8):")
# for i, weight in enumerate(class_weights):
#     original_label = i + 1
#     count = attack_counts.get(original_label, 0)
#     print(f"Model class {i} (Attack_{original_label}): count={count}, weight={weight:.3f}")

In [None]:
# Sqrt
attack_counts = Counter(y_train_attacks.numpy())
total_attacks = len(y_train_attacks)
class_weights_sqrt = []
class_frequencies = []

for i in range(9):
    original_label = i + 1
    count = attack_counts.get(original_label, 1)
    freq = count / total_attacks
    class_frequencies.append(freq)

# Calculate sqrt-based weights
for freq in class_frequencies:
    weight = 1.0 / np.sqrt(freq)
    class_weights_sqrt.append(weight)

# Normalize
total_weight = sum(class_weights_sqrt)
class_weights_sqrt = [w * 9 / total_weight for w in class_weights_sqrt]
multiclass_weights_sqrt = torch.FloatTensor(class_weights_sqrt)

print(f"\nSquare Root Frequency Multiclass Weights:")
for i, weight in enumerate(class_weights_sqrt):
    original_label = i + 1
    count = attack_counts.get(original_label, 0)
    print(f"Class {i} (Attack_{original_label}): count={count}, weight={weight:.3f}")

In [None]:
# # ENS -> For Extreme Imbalance
# attack_counts = Counter(y_train_attacks.numpy())
# total_attacks = len(y_train_attacks)


# beta = 0.999  # Hyperparameter (0.9-0.999)
# effective_num = []
# class_weights_ens = []

# for i in range(9):
#     original_label = i + 1
#     count = attack_counts.get(original_label, 1)
    
#     # Calculate effective number
#     en = (1 - beta**count) / (1 - beta) if count > 0 else 1
#     effective_num.append(en)

# # Normalize ENS weights
# for en in effective_num:
#     weight = 1.0 / en
#     class_weights_ens.append(weight)

# # Normalize to sum to number of classes
# total_weight = sum(class_weights_ens)
# class_weights_ens = [w * 9 / total_weight for w in class_weights_ens]
# multiclass_weights_ens = torch.FloatTensor(class_weights_ens)

# print(f"\nENS-based Multiclass Weights:")
# for i, weight in enumerate(class_weights_ens):
#     original_label = i + 1
#     count = attack_counts.get(original_label, 0)
#     print(f"Class {i} (Attack_{original_label}): count={count}, weight={weight:.3f}")

In [None]:
# # Focal Loss Weights

# alpha_weights = []
# for i in range(9):
#     original_label = i + 1
#     count = attack_counts.get(original_label, 1)
#     freq = count / total_attacks
    
#     # Alpha weight inversely proportional to frequency
#     alpha = (1 - freq) ** 0.5  # Power factor controls aggressiveness
#     alpha_weights.append(alpha)

# # Normalize alphas to [0.25, 0.75] range for stability
# min_alpha, max_alpha = min(alpha_weights), max(alpha_weights)
# alpha_weights = [0.25 + 0.5 * (a - min_alpha) / (max_alpha - min_alpha) for a in alpha_weights]
# focal_alpha_weights = torch.FloatTensor(alpha_weights)

# print(f"\nFocal Loss Alpha Weights:")
# for i, weight in enumerate(alpha_weights):
#     original_label = i + 1
#     count = attack_counts.get(original_label, 0)
#     print(f"Class {i} (Attack_{original_label}): count={count}, alpha={weight:.3f}")

In [None]:
# Tweaking here from optimizer to scheduler.

In [None]:
import gc

# Release leftovers from a previous run before building a fresh model.
gc.collect()
torch.cuda.empty_cache()


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Derive the input width from the feature list so model and data cannot disagree.
model = HierarchicalNIDS(
    input_features=len(final_20_features),
    seq_length=10,
    num_attack_types=9,
)
model = model.to(device)

# Both losses operate on raw logits.
binary_criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight.to(device))
multiclass_criterion = nn.CrossEntropyLoss(weight=multiclass_weights_sqrt.to(device))

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-4, weight_decay=0.01, betas=(0.9, 0.999))

# Halve the LR after 5 epochs without improvement of the monitored (minimised) metric.
# ``verbose`` is deprecated in newer PyTorch and may be rejected outright, so it
# is only passed when the installed version still accepts it.
_plateau_kwargs = dict(mode='min', patience=5, factor=0.5, min_lr=1e-6)
try:
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, verbose=True, **_plateau_kwargs
    )
except TypeError:
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, **_plateau_kwargs)

# scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=10, T_mult=2, eta_min=1e-6)

print("Model created successfully!")

In [None]:
# Quick recap of the training configuration.
_total_params = sum(param.numel() for param in model.parameters())
for _line in (
    "\nModel Configuration:",
    f"Device: {device}",
    f"Total parameters: {_total_params:,}",
    f"Binary loss positive weight: {pos_weight.item():.3f}",
    f"Multi-class weights shape: {multiclass_weights_sqrt.shape}",
):
    print(_line)

In [None]:
# Training Starts from here.

In [None]:
# Regularization parameters
GRADIENT_CLIP = 1.0
# NOISE_STD = 0.01
EARLY_STOPPING_PATIENCE = 10


binary_metrics = {
    'train_loss': [], 'val_loss': [],
    'train_acc': [], 'val_acc': [],
    'train_fpr': [], 'val_fpr': [],
    'train_fnr': [], 'val_fnr': []
}

multiclass_metrics = {
    'train_loss': [], 'val_loss': [],
    'train_acc': [], 'val_acc': [],
    'class_precision': defaultdict(list),
    'class_recall': defaultdict(list),
    'class_f1': defaultdict(list)
}

joint_metrics = {
    'train_loss': [], 'val_loss': [],
    'binary_acc': [], 'multiclass_acc': []
}

In [None]:
# def add_noise_augmentation(batch_x, noise_std=NOISE_STD, training=True):
#     if training:
#         noise = torch.randn_like(batch_x) * noise_std
#         return batch_x + noise
#     return batch_x

In [None]:
def train_binary_epoch(model, loader, criterion, optimizer, device, grad_clip=None):  # noise_std=NOISE_STD
    """Train the binary stage for one epoch and report training metrics.

    ``criterion`` is applied to the model output directly and ``torch.sigmoid``
    is applied afterwards to obtain probabilities, so the model is expected to
    return raw logits for ``stage='binary'`` and ``criterion`` to be a
    logits-based loss (e.g. ``nn.BCEWithLogitsLoss``).

    Parameters
    ----------
    model : nn.Module
        Called as ``model(batch_x, stage='binary')``.
    loader : iterable
        Yields ``(batch_x, batch_y_binary, _)`` triples; the third item is ignored.
    criterion, optimizer
        Loss function and optimiser.
    device : torch.device
        Device the batches are moved to.
    grad_clip : float, optional
        Maximum gradient norm. Defaults to the notebook-level ``GRADIENT_CLIP``.

    Returns
    -------
    tuple
        ``(mean batch loss, accuracy, false-positive rate, false-negative rate,
        predicted probabilities, targets)``; the last two are 1-D NumPy arrays.
        Accuracy/FPR/FNR use a 0.5 probability threshold.

    Raises
    ------
    ValueError
        If ``loader`` yields no batches.
    """
    if grad_clip is None:
        grad_clip = GRADIENT_CLIP

    model.train()
    total_loss = 0.0
    num_batches = 0
    prob_batches = []
    target_batches = []

    for batch_x, batch_y_binary, _ in loader:
        # batch_x = add_noise_augmentation(batch_x, noise_std)
        batch_x, batch_y_binary = batch_x.to(device), batch_y_binary.to(device)

        optimizer.zero_grad()
        # reshape(-1) instead of squeeze(): squeeze() turns a final batch of a
        # single sample into a 0-dim tensor, which no longer matches the
        # target's shape and cannot be iterated when collecting predictions.
        outputs = model(batch_x, stage='binary').reshape(-1)
        loss = criterion(outputs, batch_y_binary)
        loss.backward()

        # Gradient clipping.
        torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)

        optimizer.step()

        total_loss += loss.item()
        num_batches += 1
        prob_batches.append(torch.sigmoid(outputs).detach().cpu().numpy())
        target_batches.append(batch_y_binary.cpu().numpy())

    if num_batches == 0:
        raise ValueError("loader yielded no batches; nothing to train on")

    # Metric evaluation on the whole epoch.
    predictions = np.concatenate(prob_batches)
    targets = np.concatenate(target_batches)
    preds_binary = (predictions > 0.5).astype(int)

    accuracy = np.mean(preds_binary == targets)
    tp = np.sum((preds_binary == 1) & (targets == 1))
    fp = np.sum((preds_binary == 1) & (targets == 0))
    tn = np.sum((preds_binary == 0) & (targets == 0))
    fn = np.sum((preds_binary == 0) & (targets == 1))

    # Rates fall back to 0 when the corresponding class is absent.
    fpr = fp / (fp + tn) if (fp + tn) > 0 else 0
    fnr = fn / (fn + tp) if (fn + tp) > 0 else 0

    return total_loss / num_batches, accuracy, fpr, fnr, predictions, targets

def evaluate_binary(model, loader, criterion, device):
    """Evaluate the binary (Normal vs. Attack) head without updating weights.

    Args:
        model: Module called as ``model(batch_x, stage='binary')``; it is
            expected to produce one logit per sample (TODO confirm shape).
        loader: Sized iterable yielding ``(features, binary_labels, _)``
            batches; the third element is ignored here.
        criterion: Loss taking ``(logits, binary_labels)``.
        device: Device the batch tensors are moved to.

    Returns:
        ``(mean_batch_loss, accuracy, fpr, fnr, predictions, targets, report)``
        where ``predictions`` are sigmoid probabilities, ``targets`` the labels
        (both NumPy arrays) and ``report`` is the ``classification_report``
        dict keyed by ``'Normal'`` and ``'Attack'``.
    """
    model.eval()
    total_loss = 0
    predictions = []
    targets = []

    with torch.no_grad():
        for batch_x, batch_y_binary, _ in loader:
            batch_x, batch_y_binary = batch_x.to(device), batch_y_binary.to(device)

            # reshape(-1) keeps the batch dimension for a single-sample batch;
            # a bare squeeze() would collapse it to a 0-d tensor and break the
            # loss and the per-sample bookkeeping.
            outputs = model(batch_x, stage='binary').reshape(-1)
            loss = criterion(outputs, batch_y_binary)

            total_loss += loss.item()
            pred_probs = torch.sigmoid(outputs)
            predictions.extend(pred_probs.cpu().numpy())
            targets.extend(batch_y_binary.cpu().numpy())

    predictions = np.array(predictions)
    targets = np.array(targets)
    preds_binary = (predictions > 0.5).astype(int)

    accuracy = np.mean(preds_binary == targets)
    tp = np.sum((preds_binary == 1) & (targets == 1))
    fp = np.sum((preds_binary == 1) & (targets == 0))
    tn = np.sum((preds_binary == 0) & (targets == 0))
    fn = np.sum((preds_binary == 0) & (targets == 1))

    # Rates fall back to 0 when their denominator class is absent.
    fpr = fp / (fp + tn) if (fp + tn) > 0 else 0
    fnr = fn / (fn + tp) if (fn + tp) > 0 else 0

    # Per-class metrics. Pinning labels=[0, 1] keeps both 'Normal' and
    # 'Attack' in the report even when only one class shows up; without it
    # scikit-learn raises because target_names no longer matches the labels
    # it discovers. zero_division=0 reports an empty class as 0.
    report = classification_report(targets.astype(int), preds_binary,
                                   labels=[0, 1],
                                   target_names=['Normal', 'Attack'],
                                   output_dict=True,
                                   zero_division=0)

    return total_loss / len(loader), accuracy, fpr, fnr, predictions, targets, report

In [None]:
print("Phase 1: Binary Classification Training")
print("=" * 60)

# Early-stopping bookkeeping: best validation accuracy so far and the number of
# consecutive epochs that did not beat it.
best_val_acc = 0
patience_counter = 0

for epoch in range(15):
    # One optimisation pass over the training data, then a validation pass.
    train_loss, train_acc, train_fpr, train_fnr, _, _ = train_binary_epoch(
        model, train_loader, binary_criterion, optimizer, device
    )
    (val_loss, val_acc, val_fpr, val_fnr,
     val_preds, val_targets, val_report) = evaluate_binary(
        model, val_loader, binary_criterion, device
    )

    # Append this epoch's numbers to the matching history series.
    for series_name, series_value in (
        ('train_loss', train_loss), ('val_loss', val_loss),
        ('train_acc', train_acc), ('val_acc', val_acc),
        ('train_fpr', train_fpr), ('val_fpr', val_fpr),
        ('train_fnr', train_fnr), ('val_fnr', val_fnr),
    ):
        binary_metrics[series_name].append(series_value)

    # The scheduler is driven by the validation loss.
    scheduler.step(val_loss)

    improved = val_acc > best_val_acc
    if improved:
        best_val_acc = val_acc
        patience_counter = 0
        # Checkpoint a CPU copy of the weights so the file carries no CUDA tensors.
        save_dict = {name: tensor.cpu() for name, tensor in model.state_dict().items()}
        torch.save(save_dict, 'best_binary_model.pth')
    else:
        patience_counter += 1

    if patience_counter >= EARLY_STOPPING_PATIENCE:
        print(f"Early stopping at epoch {epoch+1}")
        break

    # Progress report every third epoch.
    if (epoch + 1) % 3 == 0:
        print(f"\nEpoch {epoch+1}/15")
        print(f"Train - Loss: {train_loss:.4f}, Acc: {train_acc:.4f}, FPR: {train_fpr:.4f}, FNR: {train_fnr:.4f}")
        print(f"Val   - Loss: {val_loss:.4f}, Acc: {val_acc:.4f}, FPR: {val_fpr:.4f}, FNR: {val_fnr:.4f}")
        print("Val Per-Class Metrics:")
        for class_label in ('Normal', 'Attack'):
            class_stats = val_report[class_label]
            print(f"  {class_label}   - Precision: {class_stats['precision']:.3f}, "
                  f"Recall: {class_stats['recall']:.3f}, "
                  f"F1: {class_stats['f1-score']:.3f}")

In [None]:
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Top row: train/validation curves for loss (left) and accuracy (right).
for panel, y_label, short_label, key_suffix in (
    (axes[0, 0], 'Loss', 'Loss', 'loss'),
    (axes[0, 1], 'Accuracy', 'Acc', 'acc'),
):
    panel.plot(binary_metrics[f'train_{key_suffix}'], label=f'Train {short_label}')
    panel.plot(binary_metrics[f'val_{key_suffix}'], label=f'Val {short_label}')
    panel.set_xlabel('Epoch')
    panel.set_ylabel(y_label)
    panel.set_title(f'Binary Classification {y_label}')
    panel.legend()
    panel.grid(True, alpha=0.3)

# Bottom-left: validation false-positive / false-negative rates per epoch.
rate_ax = axes[1, 0]
rate_ax.plot(binary_metrics['val_fpr'], label='False Positive Rate', color='red')
rate_ax.plot(binary_metrics['val_fnr'], label='False Negative Rate', color='orange')
rate_ax.set_xlabel('Epoch')
rate_ax.set_ylabel('Rate')
rate_ax.set_title('False Positive/Negative Rates')
rate_ax.legend()
rate_ax.grid(True, alpha=0.3)

# Bottom-right: ROC curve built from the current val_targets / val_preds.
roc_ax = axes[1, 1]
fpr_roc, tpr_roc, _ = roc_curve(val_targets, val_preds)
roc_auc = auc(fpr_roc, tpr_roc)
roc_ax.plot(fpr_roc, tpr_roc, color='darkorange', lw=2,
            label=f'ROC curve (AUC = {roc_auc:.3f})')
# Diagonal reference line (chance level).
roc_ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
roc_ax.set_xlim([0.0, 1.0])
roc_ax.set_ylim([0.0, 1.05])
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC Curve - Binary Classification')
roc_ax.legend(loc="lower right")
roc_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Overfitting / underfitting analysis
# A large positive train-val gap points to overfitting. A small (or negative)
# gap only says train and validation agree, which is the healthy case, so
# underfitting is judged from the absolute training accuracy instead of from
# the gap.
OVERFIT_GAP = 0.1          # train-val accuracy gap above which we flag overfitting
UNDERFIT_TRAIN_ACC = 0.70  # heuristic floor for training accuracy; tune for the task

final_train_acc = binary_metrics['train_acc'][-1]
final_val_acc = binary_metrics['val_acc'][-1]
train_val_gap = final_train_acc - final_val_acc
print(f"\nOverfitting Analysis:")
print(f"Final Train-Val Accuracy Gap: {train_val_gap:.3f}")
if train_val_gap > OVERFIT_GAP:
    print("Possible overfitting.")
elif final_train_acc < UNDERFIT_TRAIN_ACC:
    print("Possible underfitting.")
else:
    print("Well-fitted")

In [None]:
def train_multiclass_epoch(model, loader, criterion, optimizer, device, ): #noise_std=NOISE_STD
    """Run one training epoch for the attack-type (multi-class) head.

    ``loader`` yields ``(features, labels)`` batches. Labels are shifted down
    by one before being handed to the loss (1-9 in the data, 0-8 for the model).

    Returns ``(mean_batch_loss, accuracy, predictions, targets)``; the last two
    are flat lists of zero-based class indices.
    """
    model.train()
    running_loss = 0
    all_preds, all_targets = [], []

    for features, labels in loader:
        # batch_x = add_noise_augmentation(batch_x, noise_std)
        features = features.to(device)
        # Shift labels from 1-9 to 0-8 for the model.
        zero_based = (labels - 1).to(device)

        optimizer.zero_grad()
        logits = model(features, stage='multiclass')
        batch_loss = criterion(logits, zero_based)
        batch_loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), GRADIENT_CLIP)
        optimizer.step()

        running_loss += batch_loss.item()
        # Predicted class = index of the largest logit in each row.
        predicted = logits.max(1)[1]
        all_preds += list(predicted.cpu().numpy())
        all_targets += list(zero_based.cpu().numpy())

    accuracy = (np.array(all_preds) == np.array(all_targets)).mean()
    return running_loss / len(loader), accuracy, all_preds, all_targets

def evaluate_multiclass(model, loader, criterion, device):
    """Evaluate the attack-type (multi-class) head without updating weights.

    Args:
        model: Module called as ``model(batch_x, stage='multiclass')``; it is
            expected to return one row of class logits per sample.
        loader: Sized iterable yielding ``(features, labels)`` batches. Labels
            are shifted down by one before use (1-9 in the data, 0-8 for the
            model).
        criterion: Loss taking ``(logits, zero_based_labels)``.
        device: Device the batch tensors are moved to.

    Returns:
        ``(mean_batch_loss, accuracy, predictions, targets, probabilities,
        report)``. ``predictions`` and ``targets`` are lists of zero-based
        class indices, ``probabilities`` a list of softmax rows, and ``report``
        the ``classification_report`` dict keyed ``'Attack_1'`` ...
        ``'Attack_<n>'``.
    """
    model.eval()
    total_loss = 0
    all_preds = []
    all_targets = []
    all_probs = []
    # Number of attack classes; refreshed from the model's output width below.
    num_classes = 9

    with torch.no_grad():
        for batch_x, batch_y in loader:
            batch_x = batch_x.to(device)
            batch_y = (batch_y - 1).to(device)  # Convert 1-9 to 0-8 for model

            outputs = model(batch_x, stage='multiclass')
            loss = criterion(outputs, batch_y)
            num_classes = outputs.shape[1]

            total_loss += loss.item()
            probs = F.softmax(outputs, dim=1)
            _, predicted = outputs.max(1)

            all_preds.extend(predicted.cpu().numpy())
            all_targets.extend(batch_y.cpu().numpy())
            all_probs.extend(probs.cpu().numpy())

    accuracy = np.mean(np.array(all_preds) == np.array(all_targets))

    # Per-class metrics (displayed as Attack_1 ... Attack_n). The label set is
    # pinned explicitly: if a class is missing from both targets and
    # predictions, scikit-learn would otherwise find fewer labels than
    # target_names and raise.
    class_ids = list(range(num_classes))
    attack_names = [f'Attack_{i+1}' for i in class_ids]
    report = classification_report(all_targets, all_preds,
                                   labels=class_ids,
                                   target_names=attack_names,
                                   output_dict=True,
                                   zero_division=0)

    return total_loss / len(loader), accuracy, all_preds, all_targets, all_probs, report

In [None]:
# Phase 2: multi-class classification training
print("\n\nPhase 2: Multi-class Classification Training")
print("=" * 60)

# Resume from the saved binary checkpoint.
model.load_state_dict(torch.load('best_binary_model.pth'))

# Freeze the feature-extractor modules so their weights are not updated here.
for frozen_block in (model.conv1_3, model.conv1_5, model.conv2, model.bilstm):
    for param in frozen_block.parameters():
        param.requires_grad = False

# The optimizer only receives parameters that are still trainable.
optimizer_multi = torch.optim.AdamW(
    [p for p in model.parameters() if p.requires_grad],
    lr=0.0005,
    weight_decay=0.01,
)

# Validation loader restricted to attack samples (binary label == 1).
val_attack_mask = y_binary_val_tensor == 1
X_val_attacks = X_val_tensor[val_attack_mask]
y_val_attacks = y_attack_val_tensor[val_attack_mask]
val_multiclass_dataset = TensorDataset(X_val_attacks, y_val_attacks)
val_multiclass_loader = DataLoader(val_multiclass_dataset, batch_size=64, shuffle=False)

best_multi_acc = 0

for epoch in range(10):
    # Train for one epoch, then score the attack-only validation set.
    train_loss, train_acc, train_preds, train_targets = train_multiclass_epoch(
        model, multiclass_loader, multiclass_criterion, optimizer_multi, device
    )
    (val_loss, val_acc, val_preds,
     val_targets, val_probs, val_report) = evaluate_multiclass(
        model, val_multiclass_loader, multiclass_criterion, device
    )

    for series_name, series_value in (
        ('train_loss', train_loss), ('val_loss', val_loss),
        ('train_acc', train_acc), ('val_acc', val_acc),
    ):
        multiclass_metrics[series_name].append(series_value)

    # Per-class precision / recall / F1 histories, keyed by zero-based class index.
    for i in range(9):
        class_name = f'Attack_{i+1}'
        if class_name not in val_report:
            continue
        class_stats = val_report[class_name]
        for history_key, report_key in (
            ('class_precision', 'precision'),
            ('class_recall', 'recall'),
            ('class_f1', 'f1-score'),
        ):
            multiclass_metrics[history_key][i].append(class_stats[report_key])

    # Keep the weights with the best validation accuracy seen so far.
    if val_acc > best_multi_acc:
        best_multi_acc = val_acc
        torch.save(model.state_dict(), 'best_multiclass_model.pth')

    # Progress report every second epoch.
    if (epoch + 1) % 2 == 0:
        print(f"\nEpoch {epoch+1}/10")
        print(f"Train - Loss: {train_loss:.4f}, Acc: {train_acc:.4f}")
        print(f"Val   - Loss: {val_loss:.4f}, Acc: {val_acc:.4f}")
        print("\nVal Per-Class Metrics:")
        for i in range(9):
            class_name = f'Attack_{i+1}'
            if class_name not in val_report:
                continue
            class_stats = val_report[class_name]
            print(f"  {class_name} - P: {class_stats['precision']:.3f}, "
                  f"R: {class_stats['recall']:.3f}, "
                  f"F1: {class_stats['f1-score']:.3f}")
        macro_stats = val_report['macro avg']
        print(f"  Macro avg - P: {macro_stats['precision']:.3f}, "
              f"R: {macro_stats['recall']:.3f}, "
              f"F1: {macro_stats['f1-score']:.3f}")

In [None]:
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Zero-based attack class indices used by the multi-class head.
attack_class_ids = list(range(9))

# Top row: train/validation curves for loss (left) and accuracy (right).
for panel, y_label, short_label, key_suffix in (
    (axes[0, 0], 'Loss', 'Loss', 'loss'),
    (axes[0, 1], 'Accuracy', 'Acc', 'acc'),
):
    panel.plot(multiclass_metrics[f'train_{key_suffix}'], label=f'Train {short_label}')
    panel.plot(multiclass_metrics[f'val_{key_suffix}'], label=f'Val {short_label}')
    panel.set_xlabel('Epoch')
    panel.set_ylabel(y_label)
    panel.set_title(f'Multi-class Classification {y_label}')
    panel.legend()
    panel.grid(True, alpha=0.3)

# Bottom-left: per-class validation F1 history, one colour per attack class.
axes[1, 0].set_prop_cycle('color', plt.cm.tab10(np.linspace(0, 1, len(attack_class_ids))))
for i in attack_class_ids:
    axes[1, 0].plot(multiclass_metrics['class_f1'][i], label=f'Attack_{i+1}')
axes[1, 0].set_xlabel('Epoch')
axes[1, 0].set_ylabel('F1-Score')
axes[1, 0].set_title('Per-Class F1-Scores')
axes[1, 0].legend(bbox_to_anchor=(1.05, 1), loc='upper left')
axes[1, 0].grid(True, alpha=0.3)

# Bottom-right: confusion matrix of the current val_targets / val_preds.
# The label set is pinned so the matrix is always 9x9; otherwise a class
# missing from both targets and predictions shrinks the matrix and the nine
# tick labels below no longer line up with its rows/columns.
cm = confusion_matrix(val_targets, val_preds, labels=attack_class_ids)
tick_labels = [f'A{i+1}' for i in attack_class_ids]
im = axes[1, 1].imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
axes[1, 1].figure.colorbar(im, ax=axes[1, 1])
axes[1, 1].set(xticks=np.arange(cm.shape[1]),
               yticks=np.arange(cm.shape[0]),
               xticklabels=tick_labels,
               yticklabels=tick_labels,
               title='Confusion Matrix',
               ylabel='True label',
               xlabel='Predicted label')
plt.setp(axes[1, 1].get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")

plt.tight_layout()
plt.show()

In [None]:
def train_joint_epoch(model, loader, binary_criterion, multiclass_criterion,
                      optimizer, device):
    """Run one joint fine-tuning epoch over both heads.

    The batch loss is the binary loss plus 0.5 x the multi-class loss, where
    the multi-class term is computed only on samples whose binary label is 1
    (attacks) and is skipped when a batch contains none.

    Args:
        model: Module called as ``model(batch_x, stage='both')`` returning
            ``(binary_logits, multiclass_logits)``.
        loader: Sized iterable yielding ``(features, binary_labels,
            attack_labels)`` batches; attack labels are shifted down by one
            before use (1-9 in the data, 0-8 for the model).
        binary_criterion: Loss for the binary head.
        multiclass_criterion: Loss for the multi-class head.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.

    Returns:
        ``(mean_batch_loss, binary_accuracy)`` for the epoch.
    """
    model.train()
    total_loss = 0
    binary_correct = 0
    total_samples = 0

    for batch_x, batch_y_binary, batch_y_attack in loader:
        # Noise augmentation is currently disabled:
        # batch_x = add_noise_augmentation(batch_x, noise_std)
        batch_x = batch_x.to(device)
        batch_y_binary = batch_y_binary.to(device)
        batch_y_attack = batch_y_attack.to(device)

        optimizer.zero_grad()

        # Forward pass for both heads.
        binary_output, multiclass_output = model(batch_x, stage='both')
        # reshape(-1) keeps the batch dimension for a single-sample batch; a
        # bare squeeze() would collapse it to a 0-d tensor and make the binary
        # loss fail on a shape mismatch with the labels.
        binary_output = binary_output.reshape(-1)

        # Binary loss on every sample.
        binary_loss = binary_criterion(binary_output, batch_y_binary)

        # Multi-class loss only for attack samples.
        attack_mask = batch_y_binary == 1
        if attack_mask.any():
            attack_labels = batch_y_attack[attack_mask] - 1  # convert 1-9 to 0-8
            attack_predictions = multiclass_output[attack_mask]
            multiclass_loss = multiclass_criterion(attack_predictions, attack_labels)
            total_loss_batch = binary_loss + 0.5 * multiclass_loss
        else:
            total_loss_batch = binary_loss

        total_loss_batch.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), GRADIENT_CLIP)
        optimizer.step()

        total_loss += total_loss_batch.item()
        predicted = (torch.sigmoid(binary_output) > 0.5).float()
        binary_correct += (predicted == batch_y_binary).sum().item()
        total_samples += batch_y_binary.size(0)

    return total_loss / len(loader), binary_correct / total_samples

In [None]:
print("\n\nPhase 3: Joint Fine-tuning")
print("=" * 60)

# Make every parameter trainable again for the joint phase.
for param in model.parameters():
    param.requires_grad_(True)


optimizer_joint = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=0.01)

for epoch in range(10):
    # Joint training pass over both heads.
    train_loss, train_binary_acc = train_joint_epoch(
        model, train_loader, binary_criterion, multiclass_criterion,
        optimizer_joint, device
    )

    # Validate each head separately: binary on the full validation loader,
    # multi-class on the attack-only validation loader.
    (val_loss, val_binary_acc, val_fpr, val_fnr,
     _, _, val_binary_report) = evaluate_binary(
        model, val_loader, binary_criterion, device
    )
    (val_multi_loss, val_multi_acc,
     _, _, _, val_multi_report) = evaluate_multiclass(
        model, val_multiclass_loader, multiclass_criterion, device
    )

    # Note: the stored 'val_loss' is the binary validation loss.
    for series_name, series_value in (
        ('train_loss', train_loss),
        ('val_loss', val_loss),
        ('binary_acc', val_binary_acc),
        ('multiclass_acc', val_multi_acc),
    ):
        joint_metrics[series_name].append(series_value)

    # Progress report every second epoch.
    if (epoch + 1) % 2 == 0:
        print(
            f"\nEpoch {epoch+1}/10",
            f"Train - Joint Loss: {train_loss:.4f}, Binary Acc: {train_binary_acc:.4f}",
            f"Val   - Binary Acc: {val_binary_acc:.4f}, Multi Acc: {val_multi_acc:.4f}",
            f"Binary FPR: {val_fpr:.4f}, FNR: {val_fnr:.4f}",
            sep="\n",
        )

In [None]:
print("\n\nFinal Model Evaluation")
print("=" * 60)

# Score the binary head on test_loader.
(test_loss, test_binary_acc, test_fpr, test_fnr,
 test_preds, test_targets, test_binary_report) = evaluate_binary(
    model, test_loader, binary_criterion, device
)

print(
    "\nTest Set Binary Classification:",
    f"Accuracy: {test_binary_acc:.4f}",
    f"FPR: {test_fpr:.4f}, FNR: {test_fnr:.4f}",
    "\nBinary Classification Report:",
    sep="\n",
)
# Threshold the predicted probabilities at 0.5 for the text report.
test_pred_labels = (test_preds > 0.5).astype(int)
print(classification_report(test_targets, test_pred_labels,
                            target_names=['Normal', 'Attack']))

# Multi-class evaluation on test set (hierarchical: binary gate, then attack type)
model.eval()
test_binary_probs = []
test_multi_preds = []
test_multi_targets = []

with torch.no_grad():
    for batch_X, batch_binary_y, batch_multi_y in test_loader:
        batch_X = batch_X.to(device)

        # Same call form as joint training: returns (binary_logits, multiclass_logits).
        binary_out, multi_out = model(batch_X, stage='both')
        multi_pred = torch.softmax(multi_out, dim=1)

        test_binary_probs.extend(torch.sigmoid(binary_out).reshape(-1).cpu().numpy())
        test_multi_preds.extend(multi_pred.cpu().numpy())
        test_multi_targets.extend(batch_multi_y.cpu().numpy())

test_binary_probs = np.array(test_binary_probs)
test_multi_preds = np.array(test_multi_preds)
test_multi_targets = np.array(test_multi_targets).astype(int)

# The multi-class head indexes attack types 0-8, while the data labels them 1-9
# (training and validation subtract 1 before the loss). Shift the argmax back
# by +1 so predictions and targets use the same label space, and assign class 0
# to samples the binary head classifies as non-attack.
# NOTE(review): assumes label 0 in the attack-label column means "Normal" - confirm.
attack_classes = np.argmax(test_multi_preds, axis=1) + 1
test_multi_pred_classes = np.where(test_binary_probs > 0.5, attack_classes, 0)

# Classes present in the targets or the predictions (sorted by np.unique), so
# the report and any confusion matrix built from these arrays share one label set.
unique_classes = [
    int(c) for c in np.unique(np.concatenate([test_multi_targets, test_multi_pred_classes]))
]
num_classes = len(unique_classes)

print(f"\nTest Set Multi-class Classification:")
test_multi_acc = accuracy_score(test_multi_targets, test_multi_pred_classes)
print(f"Accuracy: {test_multi_acc:.4f}")
print(f"Number of classes found: {num_classes}")
print(f"Class distribution: {dict(zip(*np.unique(test_multi_targets, return_counts=True)))}")

# Generic display names that match the Attack_<k> naming used during validation;
# the numeric-label -> attack-category mapping is not available in this cell.
target_names = ['Normal' if c == 0 else f'Attack_{c}' for c in unique_classes]

print("\nMulti-class Classification Report:")
print(classification_report(test_multi_targets, test_multi_pred_classes,
                            labels=unique_classes, target_names=target_names,
                            zero_division=0))

fig, axes = plt.subplots(2, 3, figsize=(20, 12))

# Phase comparison. Each phase is labelled with the number of epochs actually
# recorded (Phase 1 can stop early), not the configured maximum.
phases = [
    f"Binary\n({len(binary_metrics['val_acc'])} epochs)",
    f"Multi-class\n({len(multiclass_metrics['val_acc'])} epochs)",
    f"Joint\n({len(joint_metrics['binary_acc'])} epochs)",
]
final_accs = [
    binary_metrics['val_acc'][-1] if binary_metrics['val_acc'] else 0,
    multiclass_metrics['val_acc'][-1] if multiclass_metrics['val_acc'] else 0,
    joint_metrics['binary_acc'][-1] if joint_metrics['binary_acc'] else 0
]

axes[0,0].bar(phases, final_accs, color=['blue', 'green', 'red'])
axes[0,0].set_ylabel('Final Validation Accuracy')
axes[0,0].set_title('Performance by Training Phase')
axes[0,0].set_ylim(0, 1)

# Overfitting analysis: final train-minus-validation accuracy per phase.
train_val_gaps = {
    'Binary': binary_metrics['train_acc'][-1] - binary_metrics['val_acc'][-1] if binary_metrics['train_acc'] else 0,
    'Multi-class': multiclass_metrics['train_acc'][-1] - multiclass_metrics['val_acc'][-1] if multiclass_metrics['train_acc'] else 0,
}

axes[0,1].bar(train_val_gaps.keys(), train_val_gaps.values(), color=['orange', 'purple'])
axes[0,1].axhline(y=0.1, color='r', linestyle='--', label='Overfitting threshold')
axes[0,1].set_ylabel('Train-Val Accuracy Gap')
axes[0,1].set_title('Overfitting Analysis')
axes[0,1].legend()

# Final metrics summary. The macro F1 comes from the last Phase 3 validation
# report, so it is labelled as a validation figure rather than a test one.
n_params = sum(p.numel() for p in model.parameters())
metrics_text = f"""Final Performance Summary:
------------------------
Binary Classification:
  Test Accuracy: {test_binary_acc:.3f}
  Test FPR: {test_fpr:.3f}
  Test FNR: {test_fnr:.3f}

Multi-class:
  Test Accuracy: {test_multi_acc:.3f}
  Val Accuracy (Phase 2): {multiclass_metrics['val_acc'][-1]:.3f}
  Val Macro F1 (Phase 3): {val_multi_report['macro avg']['f1-score']:.3f}

Model Complexity:
  Parameters: {n_params:,}
  Size: ~{n_params * 4 / 1024 / 1024:.2f} MB
"""

axes[0,2].text(0.1, 0.5, metrics_text, transform=axes[0,2].transAxes,
             fontsize=11, verticalalignment='center', fontfamily='monospace')
axes[0,2].axis('off')

# Binary Classification Confusion Matrix
binary_cm = confusion_matrix(test_targets, (test_preds > 0.5).astype(int))
sns.heatmap(binary_cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Normal', 'Attack'], yticklabels=['Normal', 'Attack'], ax=axes[1,0])
axes[1,0].set_title('Binary Classification Confusion Matrix')
axes[1,0].set_xlabel('Predicted')
axes[1,0].set_ylabel('Actual')

# Multi-class Confusion Matrix. Rows/columns are pinned to unique_classes so
# they line up one-to-one with target_names. The heatmap is drawn into
# axes[1,1]; no separate figure is opened (one would render as an empty plot).
multi_cm = confusion_matrix(test_multi_targets, test_multi_pred_classes, labels=unique_classes)
sns.heatmap(multi_cm, annot=True, fmt='d', cmap='Greens',
            xticklabels=target_names, yticklabels=target_names, ax=axes[1,1])
axes[1,1].set_title('Multi-class Classification Confusion Matrix')
axes[1,1].set_xlabel('Predicted')
axes[1,1].set_ylabel('Actual')
plt.setp(axes[1,1].get_xticklabels(), rotation=45, ha='right')
plt.setp(axes[1,1].get_yticklabels(), rotation=0)

# Training Loss Graph: the three phases are laid end to end on one epoch axis.
axes[1,2].plot(range(1, len(binary_metrics['train_loss']) + 1), binary_metrics['train_loss'],
               'b-', label='Binary Train Loss', linewidth=2)
axes[1,2].plot(range(1, len(binary_metrics['val_loss']) + 1), binary_metrics['val_loss'],
               'b--', label='Binary Val Loss', linewidth=2)

if multiclass_metrics['train_loss']:
    offset = len(binary_metrics['train_loss'])
    axes[1,2].plot(range(offset + 1, offset + len(multiclass_metrics['train_loss']) + 1),
                   multiclass_metrics['train_loss'], 'g-', label='Multi-class Train Loss', linewidth=2)
    axes[1,2].plot(range(offset + 1, offset + len(multiclass_metrics['val_loss']) + 1),
                   multiclass_metrics['val_loss'], 'g--', label='Multi-class Val Loss', linewidth=2)

if joint_metrics['train_loss']:
    offset = len(binary_metrics['train_loss']) + len(multiclass_metrics['train_loss'])
    axes[1,2].plot(range(offset + 1, offset + len(joint_metrics['train_loss']) + 1),
                   joint_metrics['train_loss'], 'r-', label='Joint Train Loss', linewidth=2)
    axes[1,2].plot(range(offset + 1, offset + len(joint_metrics['val_loss']) + 1),
                   joint_metrics['val_loss'], 'r--', label='Joint Val Loss', linewidth=2)

axes[1,2].set_xlabel('Epoch')
axes[1,2].set_ylabel('Loss')
axes[1,2].set_title('Training and Validation Loss Over Epochs')
axes[1,2].legend()
axes[1,2].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Bundle the final weights, the per-phase training histories and the headline
# test metrics into one checkpoint file.
final_test_results = {
    'binary_acc': test_binary_acc,
    'multi_acc': test_multi_acc,
    'fpr': test_fpr,
    'fnr': test_fnr,
}
torch.save(
    dict(
        model_state_dict=model.state_dict(),
        binary_metrics=binary_metrics,
        multiclass_metrics=multiclass_metrics,
        joint_metrics=joint_metrics,
        test_results=final_test_results,
    ),
    'final_hierarchical_nids_model.pth',
)

print("\nModel saved successfully!")