<a href="https://colab.research.google.com/github/gabrieladamasceno/Model_Attacks/blob/main/Features_5G_NIDD.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [22]:
from google.colab import drive
from sklearn.preprocessing import LabelEncoder
import pandas as pd

# Acessar dataset
drive.mount('/content/gdrive')

# Carregar o dataset
encoded = pd.read_csv("/content/gdrive/MyDrive/Datasets/Attacks/Encoded/Encoded.csv")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [23]:
# Selecionar apenas colunas numéricas
numeric_cols = encoded.select_dtypes(include=['number']).columns

# Aplicar a suavização apenas nas colunas numéricas
for col in numeric_cols:
    encoded[col] = encoded[col].fillna(encoded[col].mean())

In [24]:
# columns with null values

columns_null = []
columns = encoded.columns
for column in columns:
    c = encoded[column].isnull().sum()
    if c != 0:
        print(column, 'has {} null values'.format(c))
        columns_null.append(column)

In [25]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_selection import mutual_info_classif
from sklearn.ensemble import RandomForestClassifier
from tensorflow.keras import layers, models
import matplotlib.pyplot as plt
import seaborn as sns

# =====================
# 1. Preparar dataset
# =====================
df = encoded  # seu DataFrame
df_features = df.drop(columns=['Attack Type']).copy()
features = df_features.columns.tolist()

# Converter colunas categóricas para numéricas
for col in df_features.columns:
    if df_features[col].dtype == 'object':
        le = LabelEncoder()
        df_features[col] = le.fit_transform(df_features[col])

X = df_features.values
y = df['Attack Type'].values
y_bin = (y != 'Benign').astype(int)  # 0=Benigno, 1=Ataque

leakage_features = [
    'Label',
    'Attack Tool',
    'Unnamed: 0'
]

# Normalização
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# =====================
# 2. Mutual Information
# =====================
mi = mutual_info_classif(X_scaled, y_bin, discrete_features=False)
mi_df = pd.DataFrame({'Feature': features, 'Mutual_Info': mi})

In [26]:
mi_df_sorted = mi_df.sort_values(by='Mutual_Info', ascending=False)

mi_df_filtered = mi_df_sorted[
    ~mi_df_sorted['Feature'].isin(leakage_features)
].reset_index(drop=True)

topmi25 = mi_df_filtered.sort_values(
    by='Mutual_Info',
    ascending=False
).head(25)

print(topmi25)

       Feature  Mutual_Info
0          Seq     0.335629
1         sTtl     0.215913
2     TotBytes     0.212333
3     SrcBytes     0.210312
4       Offset     0.206348
5   sMeanPktSz     0.201101
6        sHops     0.196715
7          tcp     0.167516
8   dMeanPktSz     0.147135
9    *    f        0.146559
10        dTtl     0.138705
11   e             0.138106
12       dHops     0.136319
13      TcpRtt     0.129838
14    DstBytes     0.125641
15      AckDat     0.125524
16         udp     0.122108
17      SrcWin     0.115534
18        Rate     0.114578
19        Load     0.109406
20     SrcLoad     0.108319
21     RunTime     0.108261
22         Dur     0.108065
23         Max     0.108056
24         Sum     0.107919


In [27]:
from sklearn.decomposition import PCA

# PCA completo
pca = PCA()
X_pca = pca.fit_transform(X_scaled)

# Variância explicada
explained_var = pca.explained_variance_ratio_
cum_var = np.cumsum(explained_var)

pca_var_df = pd.DataFrame({
    'PC': [f'PC{i+1}' for i in range(len(explained_var))],
    'Explained_Variance': explained_var,
    'Cumulative_Variance': cum_var
})

pca_importance = np.sum(
    np.abs(pca.components_) * pca.explained_variance_ratio_[:, np.newaxis],
    axis=0
)

In [28]:
pca_df_sorted = pd.DataFrame({
    'Feature': features,
    'PCA_Importance': pca_importance
}).sort_values(by='PCA_Importance', ascending=False)

pca_df_filtered = pca_df_sorted[
    ~pca_df_sorted['Feature'].isin(leakage_features)
].reset_index(drop=True)

top25 = pca_df_filtered.sort_values(
    by='PCA_Importance',
    ascending=False
).head(25)

print(top25)

       Feature  PCA_Importance
0         sTtl        0.078097
1    *    f           0.077936
2    e                0.077442
3   sMeanPktSz        0.075868
4       Offset        0.075586
5          Seq        0.074925
6   dMeanPktSz        0.074721
7          CON        0.074327
8          udp        0.073604
9     SrcBytes        0.073577
10     SrcPkts        0.072947
11     DstPkts        0.072418
12      AckDat        0.071220
13         REQ        0.070521
14         INT        0.070048
15    DstBytes        0.069524
16      TcpRtt        0.069231
17       sHops        0.068672
18         tcp        0.068588
19         FIN        0.067786
20    TotBytes        0.067695
21     TotPkts        0.067255
22        Loss        0.065851
23      Status        0.065359
24       Start        0.064419


In [32]:
pd.set_option('display.max_rows', None)
print(mi_df_sorted)

        Feature  Mutual_Info
44        Label     0.805309
45  Attack Tool     0.734615
0    Unnamed: 0     0.634596
1           Seq     0.335629
10         sTtl     0.215913
17     TotBytes     0.212333
18     SrcBytes     0.210312
20       Offset     0.206348
21   sMeanPktSz     0.201101
12        sHops     0.196715
66          tcp     0.167516
22   dMeanPktSz     0.147135
48    *    f        0.146559
11         dTtl     0.138705
49    e             0.138106
13        dHops     0.136319
41       TcpRtt     0.129838
19     DstBytes     0.125641
43       AckDat     0.125524
67          udp     0.122108
35       SrcWin     0.115534
32         Rate     0.114578
23         Load     0.109406
24      SrcLoad     0.108319
3       RunTime     0.108261
2           Dur     0.108065
7           Max     0.108056
5           Sum     0.107919
6           Min     0.107374
39   SrcTCPBase     0.107281
4          Mean     0.107271
42       SynAck     0.106145
33      SrcRate     0.103289
72          IN

In [30]:
pd.set_option('display.max_rows', None)
print(pca_df_sorted)

        Feature  PCA_Importance
10         sTtl    7.809733e-02
48    *    f       7.793584e-02
49    e            7.744235e-02
21   sMeanPktSz    7.586832e-02
20       Offset    7.558559e-02
1           Seq    7.492480e-02
22   dMeanPktSz    7.472084e-02
69          CON    7.432717e-02
67          udp    7.360407e-02
18     SrcBytes    7.357711e-02
44        Label    7.335703e-02
15      SrcPkts    7.294726e-02
45  Attack Tool    7.253896e-02
16      DstPkts    7.241766e-02
43       AckDat    7.122009e-02
74          REQ    7.052058e-02
0    Unnamed: 0    7.033847e-02
72          INT    7.004827e-02
19     DstBytes    6.952357e-02
41       TcpRtt    6.923149e-02
12        sHops    6.867205e-02
66          tcp    6.858842e-02
71          FIN    6.778634e-02
17     TotBytes    6.769520e-02
14      TotPkts    6.725493e-02
26         Loss    6.585079e-02
81       Status    6.535864e-02
80        Start    6.441879e-02
53    e d          6.351031e-02
76          RST    6.298766e-02
13      