# Libraries

In [14]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats

# FUNÇÕES DE AJUDA

# Identificar Var Numéricas e Categóricas

In [15]:
def get_feature_types(df, target_col='Tipo de Ataque'):
    """
    Split the DataFrame's columns into numeric and non-numeric feature names.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are classified by dtype.
    target_col : str
        Name of the target column; it is removed from whichever list
        contains it.

    Returns
    -------
    tuple of (list, list)
        ``(numeric_features, categorical_features)`` in column order.
    """
    # One pass per selector: 'include' yields numeric columns, 'exclude' the rest
    numeric_features, categorical_features = (
        df.select_dtypes(**{selector: [np.number]}).columns.tolist()
        for selector in ('include', 'exclude')
    )

    # The target variable is not a feature, so take it out of either list
    for feature_list in (numeric_features, categorical_features):
        if target_col in feature_list:
            feature_list.remove(target_col)

    return numeric_features, categorical_features

# Analisar Correlação Entre Features Numéricas

In [16]:
def correlation_analysis(df, numeric_features, threshold=0.85):
    """
    Plot the correlation heatmap of the numeric features and list the
    strongly correlated pairs.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data.
    numeric_features : list
        Column names to correlate.
    threshold : float
        A pair is reported when the absolute value of its correlation is
        strictly greater than this value (default 0.85).

    Returns
    -------
    list of tuple
        ``(row_feature, column_feature, correlation)`` for every pair above
        the threshold. Each unordered pair appears once (upper triangle of
        the matrix, diagonal excluded).
    """
    corr_matrix = df[numeric_features].corr()

    plt.figure(figsize=(20, 20))
    # `linewidths` is the parameter documented by seaborn.heatmap for the cell borders
    sns.heatmap(corr_matrix, annot=False, cmap='coolwarm', center=0, linewidths=0.5)

    plt.title('Correlação entre Features - Heatmap')
    plt.xticks(rotation=90)
    plt.yticks(rotation=0)
    plt.show()

    # Keep only the strict upper triangle so that self-correlations and the
    # mirrored (b, a) duplicates of each (a, b) pair are never considered.
    abs_corr = corr_matrix.abs().to_numpy()
    upper_triangle = np.triu(np.ones(abs_corr.shape, dtype=bool), k=1)
    row_pos, col_pos = np.where((abs_corr > threshold) & upper_triangle)

    high_corr = [
        (corr_matrix.index[r], corr_matrix.columns[c], corr_matrix.iloc[r, c])
        for r, c in zip(row_pos, col_pos)
    ]

    return high_corr


# Análise de Homogeneidade da Variância (Teste de Levene)

In [17]:
def analyze_variance_homogen(df, numeric_features, target_col='Tipo de Ataque'):
    """
    Run Levene's test for homogeneity of variance on each numeric feature,
    using the classes of ``target_col`` as groups.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data.
    numeric_features : list
        Column names to test.
    target_col : str
        Column whose distinct values define the groups.

    Returns
    -------
    dict
        ``{feature: {'Statistic': ..., 'p-value': ...}}``. Features with
        fewer than two usable groups are reported on stdout and left out.
    """
    results_levene = {}

    # The grouping does not depend on the feature, so build it once
    grouped = df.groupby(target_col)

    for feature in numeric_features:
        samples = (values.dropna().values for _, values in grouped[feature])

        # A usable group is non-empty and has positive variance; positive
        # variance already rules out an all-zero (or otherwise constant) group.
        groups = [sample for sample in samples if len(sample) > 0 and np.var(sample) > 0]

        if len(groups) < 2:
            print(f"Não existem grupos válidos para fazer o teste de Levene para a feature: {feature}")
            continue

        stat_levene, p_value_levene = stats.levene(*groups)
        results_levene[feature] = {'Statistic': stat_levene, 'p-value': p_value_levene}

    return results_levene



# Análise da Importância das Features com Teste Kruskal-Wallis

In [18]:
def analyze_feature_importance(df, numeric_features, target_col='Tipo de Ataque'):
    """
    Rank numeric features with the Kruskal-Wallis H-test, using the classes
    of ``target_col`` as groups, and plot the H statistics as a bar chart.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data.
    numeric_features : list
        Column names to test.
    target_col : str
        Column whose distinct values define the groups.

    Returns
    -------
    pandas.DataFrame
        Indexed by feature, with columns ``'H-Statistic'`` and ``'p-value'``,
        sorted by ``'H-Statistic'`` in descending order. Features for which
        the test cannot be computed (fewer than two groups, or every
        observation identical) are reported on stdout and left out.
    """
    h_scores = {}

    # The grouping does not depend on the feature, so build it once
    grouped = df.groupby(target_col)

    for feature in numeric_features:
        groups = [values.dropna().values for _, values in grouped[feature]]
        pooled = np.concatenate(groups) if groups else np.array([])

        # Kruskal-Wallis needs at least two groups and is undefined when all
        # observations are tied; skip such features instead of raising.
        if len(groups) < 2 or pooled.size == 0 or pooled.min() == pooled.max():
            print(f"Não é possível fazer o teste de Kruskal-Wallis para a feature: {feature}")
            continue

        h_stat, p_value = stats.kruskal(*groups)
        h_scores[feature] = {'H-Statistic': h_stat, 'p-value': p_value}

    # Explicit columns keep the frame well-formed even when no feature was testable
    h_scores_df = pd.DataFrame.from_dict(h_scores, orient='index', columns=['H-Statistic', 'p-value'])
    h_scores_df = h_scores_df.sort_values('H-Statistic', ascending=False)

    plt.figure(figsize=(18, 10))
    plt.bar(range(len(h_scores_df)), h_scores_df['H-Statistic'], color='skyblue')
    plt.xticks(range(len(h_scores_df)), h_scores_df.index, rotation=90)
    plt.title('Importância de Features - Teste de Kruskal-Wallis')
    plt.xlabel('Features')
    plt.ylabel('H-Statistic')
    plt.show()

    return h_scores_df

# Análise da Importância das Features com Random Forest

In [19]:
def analyze_feature_importance_rf(df, numeric_features, target_col='Tipo de Ataque'):
    """
    Estimate feature importance with a Random Forest classifier.

    The data is split 70/30 (stratified on the target), the forest is fitted
    on the training part, 5-fold cross-validation scores are computed on the
    training part, and a classification report for the held-out part is
    printed. The importances are also shown as a bar chart.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data.
    numeric_features : list
        Column names used as predictors.
    target_col : str
        Name of the class column.

    Returns
    -------
    tuple
        ``(feature_importance_df, cm, rf_labels, cv_scores)`` where
        ``feature_importance_df`` has columns ``'Feature'`` and
        ``'Importance'`` sorted by importance (descending), ``cm`` is the
        confusion matrix on the held-out part with rows/columns in the order
        of ``rf_labels``, ``rf_labels`` is ``rf.classes_`` and ``cv_scores``
        are the cross-validation scores.
    """
    # NOTE(review): train_test_split, RandomForestClassifier, cross_val_score,
    # confusion_matrix and classification_report are not imported in the
    # visible part of the notebook - presumably from scikit-learn; confirm
    # that an import cell provides them before this function is called.

    hyper_params = {
        'n_estimators': 150,  # number of trees
        'max_depth': 5,       # maximum depth of each tree
        'random_state': 42,   # for reproducibility
        'n_jobs': -1          # use all available cores
    }

    X = df[numeric_features]
    y = df[target_col]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=hyper_params['random_state'], stratify=y
    )

    rf = RandomForestClassifier(**hyper_params)

    rf.fit(X_train, y_train)

    cv_scores = cross_val_score(rf, X_train, y_train, cv=5, n_jobs=-1)
    print(f"Cross-Validation Scores: {np.mean(cv_scores):.4f} +/- {np.std(cv_scores):.4f}")

    y_pred = rf.predict(X_test)

    importances = rf.feature_importances_
    feature_importance_df = pd.DataFrame({'Feature': numeric_features, 'Importance': importances})
    feature_importance_df = feature_importance_df.sort_values('Importance', ascending=False)

    rf_labels = rf.classes_

    # Pass the label order explicitly so the matrix and the report are
    # guaranteed to line up with the returned rf_labels.
    cm = confusion_matrix(y_test, y_pred, labels=rf_labels)

    # target_names must be strings; convert so numeric class labels also work
    report = classification_report(
        y_test, y_pred, labels=rf_labels, target_names=[str(label) for label in rf_labels]
    )
    print("\nClassification Report:\n")
    print(report, end='\n\n')

    plt.figure(figsize=(18, 12))
    plt.bar(feature_importance_df['Feature'], feature_importance_df['Importance'], color='skyblue')
    plt.ylabel('Importância')
    plt.xlabel('Features')
    plt.title('Importância de Features - Random Forest')
    plt.xticks(rotation=90)
    plt.tight_layout()
    plt.show()

    return feature_importance_df, cm, rf_labels, cv_scores


# Cálculo da Percentagem de Outliers para Cada Feature com IQR

In [20]:
def calculate_outliers_percentage(df):
    """
    Compute, for every numeric column, the percentage of rows that are
    outliers according to the IQR rule.

    A value is an outlier when it lies below ``Q1 - 1.5 * IQR`` or above
    ``Q3 + 1.5 * IQR``. Missing values never count as outliers, and the
    percentage is taken over all rows of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data. Non-numeric columns are ignored because quantiles are
        not defined for them.

    Returns
    -------
    dict
        ``{column: percentage}`` with percentages as floats in [0, 100].
        For a frame with no rows every percentage is 0.0.
    """
    outliers_percentage = {}
    n_rows = len(df)

    numeric_df = df.select_dtypes(include=[np.number])

    for column in numeric_df.columns:
        values = numeric_df[column]

        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1

        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr

        is_outlier = (values < lower_bound) | (values > upper_bound)

        # Store the number itself (guarding against division by zero on an
        # empty frame) - never the result dict.
        if n_rows:
            outliers_percentage[column] = float(is_outlier.sum()) / n_rows * 100
        else:
            outliers_percentage[column] = 0.0

    return outliers_percentage

# 1. EXPLORAÇÃO INICIAL

# 1.1. Carregar o Ds

In [29]:
# Location of the dataset file
ds_file = 'project_ds.csv'

# Read the semicolon-separated file, then trim stray whitespace from the headers
ds = pd.read_csv(ds_file, sep=';')
ds.columns = ds.columns.str.strip()

# Report the dataset dimensions
rows = ds.shape[0]
cols = ds.shape[1]
print(f"O Dataset tem {rows} linhas e {cols} colunas")

O Dataset tem 225745 linhas e 79 colunas


# 1.2. Overview dos Dados

In [22]:
# Show the first rows of the dataset
ds.head()

Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,54865,3,2,0,12,0,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
1,55054,109,1,1,6,6,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
2,55055,52,1,1,6,6,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
3,46236,34,1,1,6,6,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
4,54863,3,2,0,12,0,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN


In [23]:
# Show 10 randomly sampled rows (random_state fixed so the sample is repeatable)
ds.sample(n=10, random_state=42)

Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
127940,80,9392887,4,0,24,0,6,6,6.0,0.0,...,20,1982.0,0.0,1982,1982,9390905.0,0.0,9390905,9390905,DDoS
195330,64007,5438011,1,5,6,30,6,6,6.0,0.0,...,20,27151.0,0.0,27151,27151,5410860.0,0.0,5410860,5410860,BENIGN
202016,80,115615609,20,15,1728,3463,578,0,86.4,211.017934,...,32,,134508.6864,566475,120118,10100000.0,,10100000,9988018,BENIGN
18959,80,9827,3,5,26,11601,20,0,8.666667,10.263203,...,20,0.0,0.0,0,0,0.0,0.0,0,0,DDoS
120928,53315,87583370,7,9,11607,62,4380,0,1658.142857,1762.272815,...,20,4219842.0,0.0,4219842,4219842,82600000.0,0.0,82600000,82600000,BENIGN
74750,80,11095658,4,0,24,0,6,6,6.0,0.0,...,20,1004.0,0.0,1004,1004,11100000.0,0.0,11100000,11100000,DDoS
29454,80,71170688,8,6,56,11601,20,0,7.0,5.656854,...,20,1016.0,0.0,1016,1016,35100000.0,39900000.0,63300000,6908953,DDoS
132954,80,855997,3,5,26,11601,20,0,8.666667,10.263203,...,20,0.0,0.0,0,0,0.0,0.0,0,0,DDoS
10641,53,23940,1,1,59,75,59,59,59.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
146195,443,60466365,13,11,1070,3929,697,0,82.307692,193.904781,...,20,122418.0,81954.97124,288633,84425,,,10000000,9716694,BENIGN


In [24]:
# Show the dataset dimensions
print(f"Dimensões do Dataset: {ds.shape}")

Dimensões do Dataset: (225745, 79)


In [25]:
# Show the data type of each column
ds.info()

# We can verify that 'y' is the only categorical variable

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 225745 entries, 0 to 225744
Data columns (total 79 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   Destination Port             225745 non-null  int64  
 1   Flow Duration                225745 non-null  int64  
 2   Total Fwd Packets            225745 non-null  int64  
 3   Total Backward Packets       225745 non-null  int64  
 4   Total Length of Fwd Packets  225745 non-null  int64  
 5   Total Length of Bwd Packets  225745 non-null  int64  
 6   Fwd Packet Length Max        225745 non-null  int64  
 7   Fwd Packet Length Min        225745 non-null  int64  
 8   Fwd Packet Length Mean       225745 non-null  float64
 9   Fwd Packet Length Std        225745 non-null  float64
 10  Bwd Packet Length Max        225745 non-null  int64  
 11  Bwd Packet Length Min        225745 non-null  int64  
 12  Bwd Packet Length Mean       225745 non-null  float64
 13 

In [26]:
# Count missing values per column and express them as a share of all rows
null_values = ds.isna().sum()
null_percentage = null_values / len(ds) * 100

# Report only the columns that actually contain missing values
for column, count in null_values[null_values != 0].items():
    print(f"A Coluna {column}: tem {count} valores nulos, o que faz ({null_percentage[column]:.2f}% de valores em falta)")

A Coluna Flow Bytes/s: tem 4 valores nulos, o que faz (0.00% de valores em falta)
A Coluna Active Mean: tem 4355 valores nulos, o que faz (1.93% de valores em falta)
A Coluna Idle Mean: tem 2150 valores nulos, o que faz (0.95% de valores em falta)
A Coluna Idle Std: tem 5445 valores nulos, o que faz (2.41% de valores em falta)


# 2. Limpeza de Dados

In [27]:
# Strip leading/trailing whitespace from the column names.
# DataFrame.rename(..., inplace=True) returns None, so assigning its result
# back to `ds` would discard the dataset; use the returned frame instead.
col_name = {col: col.strip() for col in ds.columns}
ds = ds.rename(columns=col_name)

# 2.1. Dados Duplicados

In [31]:
# Flag duplicated rows and count them
duplicates = ds.duplicated()
duplicates_count = duplicates.sum()

print(f"Número de Linhas Duplicadas: {duplicates_count}")

Número de Linhas Duplicadas: 2633


In [32]:
# Drop duplicated rows, keeping the first occurrence of each
data = ds.drop_duplicates(keep='first')
# The boolean duplicate mask is no longer needed
del duplicates
data.shape

(223112, 79)

In [33]:
# Find columns whose contents are identical to an earlier column.
# identical_columns maps the first column of each group to the later
# columns that duplicate it.
identical_columns = {}
columns = data.columns
matched = set()  # columns already recorded as a duplicate of an earlier one

# Equality is symmetric, so each column is compared only with the columns
# that come after it; columns already matched are skipped entirely.
for position, col1 in enumerate(columns):
    if col1 in matched:
        continue
    for col2 in columns[position + 1:]:
        if col2 not in matched and data[col1].equals(data[col2]):
            identical_columns.setdefault(col1, []).append(col2)
            matched.add(col2)

# Columns that are not duplicates of an earlier column
list_contol = [col for col in columns if col not in matched]

if identical_columns:
    print("Colunas Identicas Encontradas:")
    for key, value in identical_columns.items():
        print(f"{key}: é idêntico a {value}")
else:
    print("Não foram encontradas colunas idênticas")
                

Colunas Identicas Encontradas:
Total Fwd Packets: é idêntico a ['Subflow Fwd Packets']
Total Backward Packets: é idêntico a ['Subflow Bwd Packets']
Total Length of Fwd Packets: é idêntico a ['Subflow Fwd Bytes']
Total Length of Bwd Packets: é idêntico a ['Subflow Bwd Bytes']
Fwd Packet Length Mean: é idêntico a ['Avg Fwd Segment Size']
Fwd PSH Flags: é idêntico a ['SYN Flag Count']
Bwd PSH Flags: é idêntico a ['Fwd URG Flags', 'Bwd URG Flags', 'CWE Flag Count', 'Fwd Avg Bytes/Bulk', 'Fwd Avg Packets/Bulk', 'Fwd Avg Bulk Rate', 'Bwd Avg Bytes/Bulk', 'Bwd Avg Packets/Bulk', 'Bwd Avg Bulk Rate']
Fwd Header Length: é idêntico a ['Fwd Header Length_1']
RST Flag Count: é idêntico a ['ECE Flag Count']


In [34]:
# Remover Colunas com Valores Duplicados
for key, value in identical_columns.items():
    data.drop(columns=value, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.drop(columns=value, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.drop(columns=value, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.drop(columns=value, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.drop(columns=value, inplace=True)
A value is t

In [35]:
# List the columns of `data` and show its dimensions
print(data.columns)
data.shape

Index(['Destination Port', 'Flow Duration', 'Total Fwd Packets',
       'Total Backward Packets', 'Total Length of Fwd Packets',
       'Total Length of Bwd Packets', 'Fwd Packet Length Max',
       'Fwd Packet Length Min', 'Fwd Packet Length Mean',
       'Fwd Packet Length Std', 'Bwd Packet Length Max',
       'Bwd Packet Length Min', 'Bwd Packet Length Mean',
       'Bwd Packet Length Std', 'Flow Bytes/s', 'Flow Packets/s',
       'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min',
       'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max',
       'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean', 'Bwd IAT Std',
       'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags', 'Bwd PSH Flags',
       'Fwd Header Length', 'Bwd Header Length', 'Fwd Packets/s',
       'Bwd Packets/s', 'Min Packet Length', 'Max Packet Length',
       'Packet Length Mean', 'Packet Length Std', 'Packet Length Variance',
       'FIN Flag Count', 'RST Flag Count', 'PSH Flag Count', 'ACK Flag Count',

(223112, 62)