# Tratamento de Inconsist√™ncias

In [5]:
import pandas as pd
import numpy as np

# Carregar o dataset (ajuste o caminho se necess√°rio)
df = pd.read_csv("/content/dataset_artigo.csv")

# Selecionar apenas colunas num√©ricas
num_cols = df.select_dtypes(include=[np.number]).columns

# Criar uma m√°scara booleana para detectar valores infinitos
mask_inf = np.isinf(df[num_cols])

# Filtrar linhas que possuem pelo menos um valor infinito
df_inf = df[mask_inf.any(axis=1)]

# ---- Parte 1: visualizar dados infinitos ----
print(f"N√∫mero de linhas com infinito: {df_inf.shape[0]}")
print(f"Colunas com infinito: {list(mask_inf.any()[mask_inf.any()].index)}")

# Salvar os dados com infinito (opcional)
#df_inf.to_csv("dados_infinito.csv", index=False)

# Exibir as primeiras linhas com infinito
print("\nPrimeiras linhas com infinito:")
print(df_inf.head())


df_inf.columns

N√∫mero de linhas com infinito: 2986
Colunas com infinito: ['Flow Bytes/s', 'Flow Packets/s']

Primeiras linhas com infinito:
                                           Flow ID           Src IP  Src Port  \
408      35.175.71.102-192.168.137.148-443-54072-6    35.175.71.102       443   
3149  192.168.137.253-173.198.192.103-42496-4431-6  192.168.137.253     42496   
3225  192.168.137.253-173.198.192.103-42540-4431-6  192.168.137.253     42540   
4037  192.168.137.253-173.198.192.103-42674-4431-6  192.168.137.253     42674   
4122  192.168.137.253-173.198.192.103-42722-4431-6  192.168.137.253     42722   

               Dst IP  Dst Port  Protocol               Timestamp  \
408   192.168.137.148     54072         6  08/10/2022 06:51:31 AM   
3149  173.198.192.103      4431         6  08/10/2022 07:22:31 AM   
3225  173.198.192.103      4431         6  08/10/2022 07:23:31 AM   
4037  173.198.192.103      4431         6  08/10/2022 07:32:32 AM   
4122  173.198.192.103      4431         6 

Index(['Flow ID', 'Src IP', 'Src Port', 'Dst IP', 'Dst Port', 'Protocol',
       'Timestamp', 'Flow Duration', 'Total Fwd Packet', 'Total Bwd packets',
       'Total Length of Fwd Packet', 'Total Length of Bwd Packet',
       'Fwd Packet Length Max', 'Fwd Packet Length Min',
       'Fwd Packet Length Mean', 'Fwd Packet Length Std',
       'Bwd Packet Length Max', 'Bwd Packet Length Min',
       'Bwd Packet Length Mean', 'Bwd Packet Length Std', 'Flow Bytes/s',
       'Flow Packets/s', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max',
       'Flow IAT Min', 'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std',
       'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean',
       'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags',
       'Bwd PSH Flags', 'Fwd URG Flags', 'Bwd URG Flags', 'Fwd Header Length',
       'Bwd Header Length', 'Fwd Packets/s', 'Bwd Packets/s',
       'Packet Length Min', 'Packet Length Max', 'Packet Length Mean',
       'Packet Length Std', 'Packet Len

In [6]:
df_inf[[ 'Flow Bytes/s', 'Flow Packets/s', 'Label']][df_inf.Label==1].count()/df_inf[[ 'Flow Packets/s', 'Label']].count()

Unnamed: 0,0
Flow Bytes/s,
Flow Packets/s,0.98493
Label,0.98493


In [7]:
# ---- Parte 2: tratar os infinitos substituindo pelo maior valor finito da coluna ----
for col in num_cols:
    if np.isinf(df[col]).any():
        max_val = df.loc[~np.isinf(df[col]), col].max()
        df[col] = df[col].replace([np.inf, -np.inf], max_val)
        print(f" Coluna '{col}' corrigida: ‚àû substitu√≠do por {max_val}")

# Agora df est√° limpo
print("\nüöÄ Dataset final corrigido!")


 Coluna 'Flow Bytes/s' corrigida: ‚àû substitu√≠do por 2896000000.0
 Coluna 'Flow Packets/s' corrigida: ‚àû substitu√≠do por 3000000.0

üöÄ Dataset final corrigido!


In [8]:
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.isna().sum()[df.isna().sum() > 0]


Unnamed: 0,0
Flow Bytes/s,2898


In [9]:
# Taxa de valores ausentes por label com valor 1
df['Label'][df['Flow Bytes/s'].isna()][df['Label']==1].count()/df['Label'][df['Flow Bytes/s'].isna()].count()

np.float64(0.9951690821256038)

In [10]:
# Seleciona apenas colunas num√©ricas que t√™m pelo menos um NaN
cols_com_nan = [col for col in num_cols if df[col].isna().any()]

for col in cols_com_nan:
    # encontra o maior valor finito da coluna
    max_val = df.loc[np.isfinite(df[col]), col].max()

    # substitui infinitos e NaN pelo maior valor finito
    df[col] = df[col].replace([np.inf, -np.inf, np.nan], max_val)

    print(f"Coluna '{col}' corrigida: infinitos e NaN substitu√≠dos por {max_val}")

print("\nüöÄ Dataset final corrigido!")


Coluna 'Flow Bytes/s' corrigida: infinitos e NaN substitu√≠dos por 2896000000.0

üöÄ Dataset final corrigido!


# Redu√ß√£o de Dimensionalidade por Correla√ß√£o de Pearson e Kbest para a Sele√ß√£o de Features


In [11]:
# Seleciona apenas as colunas num√©ricas
num_cols = df.select_dtypes(include=[np.number]).columns
df_num = df[num_cols]



In [12]:
val = df_num[df_num.columns[1:]].corr().abs()['Label'][:].sort_values(ascending=False)[1:7]
val

Unnamed: 0,Label
RST Flag Count,0.282452
Protocol,0.202148
Fwd IAT Total,0.201089
SYN Flag Count,0.193475
Flow Duration,0.192673
FWD Init Win Bytes,0.171927


In [13]:
v1 = df_num[val.index.to_list()+['Label']]
v1

Unnamed: 0,RST Flag Count,Protocol,Fwd IAT Total,SYN Flag Count,Flow Duration,FWD Init Win Bytes,Label
0,1,6,31334.0,0,31334,1365,0
1,0,6,4.0,0,109939,2542,0
2,2,6,1.0,0,1,0,0
3,1,6,0.0,0,114090,0,0
4,2,6,181.0,0,181,0,0
...,...,...,...,...,...,...,...
220401,2,6,0.0,0,0,0,1
220402,1,6,2575.0,0,25145,434,1
220403,2,6,0.0,0,0,0,1
220404,1,6,34779202.0,6,34779202,64240,1


##Estrat√©gia 1: Correla√ß√£o de Pearson

In [14]:
#@title M√©todos de correla√ß√£o gr√°fica
import plotly.graph_objects as go
def matriz_corr_graf(df):
  print("""
An√°lise de Covari√¢ncia:
A covari√¢ncia, ou vari√¢ncia conjunta, √© a medida do grau de interdepend√™ncia (ou inter-rela√ß√£o) num√©rica entre duas vari√°veis.
√â definida da seguinte maneira:

                                          Covari√¢ncia Populacional:

                                          œÉxy = 1/n ‚àë (Xi - Œºx)(Yi - Œºy)

                                          onde:
                                          - œÉxy √© a covari√¢ncia populacional entre X e Y,
                                          - n √© o n√∫mero total de pares de dados,
                                          - ‚àë √© a soma de todos os valores nos pares de dados,
                                          - Xi √© o valor da vari√°vel X no i-√©simo par de dados,
                                          - Yi √© o valor da vari√°vel Y no i-√©simo par de dados,
                                          - Œºx √© a m√©dia de todos os valores de X,
                                          - Œºy √© a m√©dia de todos os valores de Y.

                                          Covari√¢ncia Amostral:

                                          Sxy = 1/(n-1) ‚àë (Xi - X¬Ø)(Yi - Y¬Ø)

                                          onde:
                                          - Sxy √© a covari√¢ncia amostral entre X e Y,
                                          - n √© o n√∫mero total de pares de dados na amostra,
                                          - ‚àë √© a soma de todos os valores nos pares de dados,
                                          - Xi √© o valor da vari√°vel X no i-√©simo par de dados,
                                          - Yi √© o valor da vari√°vel Y no i-√©simo par de dados,
                                          - X¬Ø √© a m√©dia dos valores de X na amostra,
                                          - Y¬Ø √© a m√©dia dos valores de Y na amostra.

Valor de  Sxy  positivo indica uma associa√ß√£o linear positiva entre x e y, ou seja, √† medida que o valor de x aumenta,
o valor de y tamb√©m aumenta. Neste caso, podemos ver na figura abaixo que os pontos que t√™m a maior influ√™ncia sobre  Sxy  devem estar nos quadrantes I e III.
Se o valor de  Sxy  for negativo temos um indicativo de associa√ß√£o linear negativa entre x e y, ou seja, √† medida que x aumenta,
o valor de y diminui. Neste caso, podemos ver na figura abaixo que os pontos que t√™m a maior influ√™ncia sobre  Sxy  devem estar nos quadrantes II e IV.
Finalmente, se os pontos estiverem uniformemente distribu√≠dos pelos quadrantes, o valor de  Sxy  se aproximar√° de zero,
indicando que n√£o existe nenhuma associa√ß√£o linear entre x e y.

Na pr√°tica, a covari√¢ncia populacional e a covari√¢ncia amostral s√£o medidas muito semelhantes.
Ambas buscam medir a rela√ß√£o linear entre duas vari√°veis. A principal diferen√ßa est√° em como a m√©dia √© calculada.
Na covari√¢ncia populacional, a m√©dia √© calculada considerando todos os membros da popula√ß√£o.
Na covari√¢ncia amostral, a m√©dia √© calculada apenas para a amostra dos dados selecionados.

####################################################################################################################################################
                                          An√°lise de Correla√ß√£o co Coeficiente  de Pearson com base na covari√¢ncia
####################################################################################################################################################
Para avaliar a relev√¢ncia de nossos atributos, utilizamos um mapa de correla√ß√£o baseado no Coeficiente de Correla√ß√£o de Pearson.

O Coeficiente de Correla√ß√£o de Pearson √© uma medida estat√≠stica que calcula o grau de rela√ß√£o entre duas vari√°veis num√©ricas.
Esse coeficiente √© obtido dividindo a covari√¢ncia de duas vari√°veis pelo produto de seus desvios padr√µes.

A covari√¢ncia √© uma medida de como as duas vari√°veis variam conjuntamente - se elas tendem a aumentar ou diminuir juntas.
O desvio padr√£o, por outro lado, √© uma medida de qu√£o dispersos est√£o os valores de cada vari√°vel em torno da m√©dia.
Portanto, ao dividir a covari√¢ncia pelo produto dos desvios padr√µes, o Coeficiente de Correla√ß√£o de Pearson normaliza a medida de associa√ß√£o linear,
colocando-a em uma escala de -1 a 1.

Nesta escala, um valor de -1 indica uma rela√ß√£o linear perfeita negativa, ou seja, quando uma vari√°vel aumenta, a outra diminui na mesma propor√ß√£o,e vice-versa.
Um valor de 1, por outro lado, indica uma rela√ß√£o linear perfeita positiva, ou seja, ambas as vari√°veis aumentam ou diminuem juntas na mesma propor√ß√£o.

No entanto, √© crucial entender que o Coeficiente de Correla√ß√£o de Pearson √© uma medida de associa√ß√£o linear e n√£o necessariamente indica causalidade.
Uma alta correla√ß√£o entre duas vari√°veis n√£o significa, necessariamente, que varia√ß√µes em uma delas provocar√£o altera√ß√µes na outra.
A exist√™ncia de uma correla√ß√£o apenas sugere que as vari√°veis tendem a se mover juntas, mas n√£o especifica por que isso acontece.
Assim, enquanto a correla√ß√£o pode ser √∫til para identificar padr√µes nos dados, ela deve ser interpretada com cautela ao inferir rela√ß√µes causais.
####################################################################################################################################################
  """)

  # Calcular a matriz de correla√ß√£o
  matriz_corr = df.corr(numeric_only=True)

  # Converter a matriz de correla√ß√£o para uma matriz numpy e arredondar os valores
  matriz_corr = matriz_corr.round(2)

  # Definir a escala de cores
  colorscale = [(0, 'rgba(255,255,255,1)'), (1, 'rgba(0,0,255,1)')]

  # Gerar as anota√ß√µes para cada c√©lula
  annotations = []
  for i, row in enumerate(matriz_corr.index):
      for j, col in enumerate(matriz_corr.columns):
          annotations.append(
              go.layout.Annotation(
                  text=str(matriz_corr.loc[row, col]),
                  x=col, y=row,
                  xref='x1', yref='y1',
                  showarrow=False)
          )

  # Criar a figura com um mapa de calor
  fig = go.Figure(data=go.Heatmap(
      z=matriz_corr.values,
      x=matriz_corr.columns,
      y=matriz_corr.index,
      colorscale=colorscale
  ))

  # Ajustar o layout
  fig.update_layout(
      title='Matriz de Correla√ß√£o',
      annotations=annotations,
      xaxis=dict(tickfont=dict(size=10), side='top'),
      yaxis=dict(tickfont=dict(size=10), autorange="reversed"),
      autosize=True,
      margin=dict(t=100, l=200),
  )

  # Mostrar a figura
  figure = fig.show()

  return None





In [15]:
matriz_corr_graf(v1)


An√°lise de Covari√¢ncia:
A covari√¢ncia, ou vari√¢ncia conjunta, √© a medida do grau de interdepend√™ncia (ou inter-rela√ß√£o) num√©rica entre duas vari√°veis.
√â definida da seguinte maneira:

                                          Covari√¢ncia Populacional:

                                          œÉxy = 1/n ‚àë (Xi - Œºx)(Yi - Œºy)

                                          onde:
                                          - œÉxy √© a covari√¢ncia populacional entre X e Y,
                                          - n √© o n√∫mero total de pares de dados,
                                          - ‚àë √© a soma de todos os valores nos pares de dados,
                                          - Xi √© o valor da vari√°vel X no i-√©simo par de dados,
                                          - Yi √© o valor da vari√°vel Y no i-√©simo par de dados,
                                          - Œºx √© a m√©dia de todos os valores de X,
                                          - Œºy 

In [16]:
import pandas as pd
from itertools import combinations
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_auc_score
from imblearn.over_sampling import SMOTE
import statsmodels.formula.api as smf
import statsmodels.api as sm
from tqdm import tqdm

def testar_combinacoes(df, y_column, predictors, n_val=27000, random_state=42, max_vars=6, normalizar=True):
    results = []
    smote = SMOTE(random_state=random_state)
    y = df[y_column].values.ravel()

    for i in range(1, min(max_vars, len(predictors)) + 1):
        for combo in tqdm(list(combinations(predictors, i)), desc=f"Testando combina√ß√µes de {i} vari√°veis"):
            X = df[list(combo)]
            X_columns = list(combo)

            from sklearn.model_selection import train_test_split

            X_train_raw, X_test_raw, y_train_raw, y_test = train_test_split(
                X, y, test_size=n_val, stratify=y, random_state=random_state
              )


            # Normaliza√ß√£o Min-Max
            if normalizar:
                scaler = MinMaxScaler()
                X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train_raw), columns=X_columns)
                X_test_scaled = pd.DataFrame(scaler.transform(X_test_raw), columns=X_columns)
            else:
                X_train_scaled, X_test_scaled = X_train_raw, X_test_raw

            # Balanceamento com SMOTE
            X_train, y_train = smote.fit_resample(X_train_scaled, y_train_raw)
            X_train = pd.DataFrame(X_train, columns=X_columns)
            y_train = pd.Series(y_train, name=y_column)


            # Treinar modelo GLM (Regress√£o Log√≠stica)
            formula = f"{y_column} ~ " + " + ".join(X_columns)
            modelo = smf.glm(formula=formula,
                            data=pd.concat([X_train, y_train], axis=1),
                            family=sm.families.Binomial()).fit()

            # Previs√µes
            # Cria DataFrame para teste com as mesmas colunas do modelo
            X_test_scaled_with_const = sm.add_constant(X_test_scaled, has_constant='add')
            cols_modelo = modelo.model.exog_names  # nomes esperados pelo modelo (inclui intercepto)
            X_test_scaled_with_const = X_test_scaled_with_const.reindex(columns=cols_modelo, fill_value=1)

            y_train_pred_proba = modelo.fittedvalues
            y_test_pred_proba = modelo.predict(X_test_scaled_with_const)


            # M√©tricas
            auc_train = roc_auc_score(y_train, y_train_pred_proba)
            auc_test = roc_auc_score(y_test, y_test_pred_proba)

            results.append({
                'Vari√°veis': combo,
                'AUC Treino': auc_train,
                'AUC Teste': auc_test
            })

    return pd.DataFrame(results)

In [17]:
# Substituir espa√ßos por underline
df_renamed = v1.copy()#.iloc[00:]
df_renamed.columns = [col.replace(' ', '_').replace('/', '_').replace('-', '_') for col in df_renamed.columns]

# Atualiza predictors
predictors = df_renamed.drop(columns='Label').columns.to_list()
y_column = 'Label'

# Chama a fun√ß√£o
results_df = testar_combinacoes(df_renamed, y_column, predictors)

results_df.sort_values(by='AUC Teste', ascending=True)


Testando combina√ß√µes de 1 vari√°veis: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6/6 [03:48<00:00, 38.02s/it]
Testando combina√ß√µes de 2 vari√°veis: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 15/15 [05:25<00:00, 21.73s/it]
Testando combina√ß√µes de 3 vari√°veis: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [05:03<00:00, 15.19s/it]
Testando combina√ß√µes de 4 vari√°veis: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 15/15 [03:18<00:00, 13.21s/it]
Testando combina√ß√µes de 5 vari√°veis: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6/6 [00:56<00:00,  9.46s/it]
Testando combina√ß√µes de 6 vari√°veis: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:07<00:00,  7.22s/it]


Unnamed: 0,Vari√°veis,AUC Treino,AUC Teste
2,"(Fwd_IAT_Total,)",0.537337,0.543932
16,"(Fwd_IAT_Total, Flow_Duration)",0.540667,0.546455
4,"(Flow_Duration,)",0.569291,0.577003
5,"(FWD_Init_Win_Bytes,)",0.582379,0.577412
19,"(SYN_Flag_Count, FWD_Init_Win_Bytes)",0.593165,0.589632
...,...,...,...
28,"(RST_Flag_Count, SYN_Flag_Count, Flow_Duration)",0.692448,0.697301
50,"(RST_Flag_Count, SYN_Flag_Count, Flow_Duration...",0.693767,0.697331
46,"(RST_Flag_Count, Protocol, Flow_Duration, FWD_...",0.700516,0.702937
59,"(RST_Flag_Count, Protocol, SYN_Flag_Count, Flo...",0.701434,0.704564


In [18]:
results_df.sort_values(by='AUC Teste', ascending=True).to_latex()

"\\begin{tabular}{llrr}\n\\toprule\n & Vari√°veis & AUC Treino & AUC Teste \\\\\n\\midrule\n2 & ('Fwd_IAT_Total',) & 0.537337 & 0.543932 \\\\\n16 & ('Fwd_IAT_Total', 'Flow_Duration') & 0.540667 & 0.546455 \\\\\n4 & ('Flow_Duration',) & 0.569291 & 0.577003 \\\\\n5 & ('FWD_Init_Win_Bytes',) & 0.582379 & 0.577412 \\\\\n19 & ('SYN_Flag_Count', 'FWD_Init_Win_Bytes') & 0.593165 & 0.589632 \\\\\n3 & ('SYN_Flag_Count',) & 0.591562 & 0.592476 \\\\\n14 & ('Protocol', 'FWD_Init_Win_Bytes') & 0.606058 & 0.601741 \\\\\n1 & ('Protocol',) & 0.603958 & 0.601761 \\\\\n35 & ('Protocol', 'SYN_Flag_Count', 'FWD_Init_Win_Bytes') & 0.616554 & 0.613857 \\\\\n37 & ('Fwd_IAT_Total', 'SYN_Flag_Count', 'Flow_Duration') & 0.611313 & 0.616167 \\\\\n39 & ('Fwd_IAT_Total', 'Flow_Duration', 'FWD_Init_Win_Bytes') & 0.615465 & 0.617278 \\\\\n0 & ('RST_Flag_Count',) & 0.621158 & 0.622913 \\\\\n15 & ('Fwd_IAT_Total', 'SYN_Flag_Count') & 0.620121 & 0.624982 \\\\\n55 & ('Fwd_IAT_Total', 'SYN_Flag_Count', 'Flow_Duration', '

###A partir daqui c√©lulas como a abaixo ser√£o mais comuns. Ela n√£o precisa ser executada, est√° apenas ali para exibir de maneira mais f√°cil a sa√≠da da execu√ß√£o da c√©lula acima. N√≥s utilizamos vers√µes adaptadas dessas tabelas em nosso artigo.

In [19]:
\begin{tabular}{llrr}
\toprule
 & Vari√°veis & AUC Treino & AUC Teste \\
\midrule
2 & ('Fwd_IAT_Total',) & 0.537337 & 0.543932 \\
16 & ('Fwd_IAT_Total', 'Flow_Duration') & 0.540667 & 0.546455 \\
4 & ('Flow_Duration',) & 0.569291 & 0.577003 \\
5 & ('FWD_Init_Win_Bytes',) & 0.582379 & 0.577412 \\
19 & ('SYN_Flag_Count', 'FWD_Init_Win_Bytes') & 0.593165 & 0.589632 \\
3 & ('SYN_Flag_Count',) & 0.591562 & 0.592476 \\
14 & ('Protocol', 'FWD_Init_Win_Bytes') & 0.606058 & 0.601741 \\
1 & ('Protocol',) & 0.603958 & 0.601761 \\
35 & ('Protocol', 'SYN_Flag_Count', 'FWD_Init_Win_Bytes') & 0.616554 & 0.613857 \\
37 & ('Fwd_IAT_Total', 'SYN_Flag_Count', 'Flow_Duration') & 0.611313 & 0.616167 \\
39 & ('Fwd_IAT_Total', 'Flow_Duration', 'FWD_Init_Win_Bytes') & 0.615465 & 0.617278 \\
0 & ('RST_Flag_Count',) & 0.621158 & 0.622913 \\
15 & ('Fwd_IAT_Total', 'SYN_Flag_Count') & 0.620121 & 0.624982 \\
55 & ('Fwd_IAT_Total', 'SYN_Flag_Count', 'Flow_Duration', 'FWD_Init_Win_Bytes') & 0.622778 & 0.626059 \\
12 & ('Protocol', 'SYN_Flag_Count') & 0.627548 & 0.626143 \\
17 & ('Fwd_IAT_Total', 'FWD_Init_Win_Bytes') & 0.627911 & 0.629438 \\
38 & ('Fwd_IAT_Total', 'SYN_Flag_Count', 'FWD_Init_Win_Bytes') & 0.635782 & 0.639160 \\
7 & ('RST_Flag_Count', 'Fwd_IAT_Total') & 0.634413 & 0.639697 \\
6 & ('RST_Flag_Count', 'Protocol') & 0.646240 & 0.645604 \\
9 & ('RST_Flag_Count', 'Flow_Duration') & 0.646706 & 0.651319 \\
10 & ('RST_Flag_Count', 'FWD_Init_Win_Bytes') & 0.653088 & 0.652428 \\
26 & ('RST_Flag_Count', 'Fwd_IAT_Total', 'Flow_Duration') & 0.647398 & 0.652848 \\
20 & ('Flow_Duration', 'FWD_Init_Win_Bytes') & 0.650885 & 0.653087 \\
24 & ('RST_Flag_Count', 'Protocol', 'FWD_Init_Win_Bytes') & 0.656605 & 0.655719 \\
53 & ('Protocol', 'Fwd_IAT_Total', 'Flow_Duration', 'FWD_Init_Win_Bytes') & 0.654297 & 0.656595 \\
29 & ('RST_Flag_Count', 'SYN_Flag_Count', 'FWD_Init_Win_Bytes') & 0.657131 & 0.656838 \\
18 & ('SYN_Flag_Count', 'Flow_Duration') & 0.651653 & 0.657099 \\
61 & ('Protocol', 'Fwd_IAT_Total', 'SYN_Flag_Count', 'Flow_Duration', 'FWD_Init_Win_Bytes') & 0.656306 & 0.659572 \\
51 & ('Protocol', 'Fwd_IAT_Total', 'SYN_Flag_Count', 'Flow_Duration') & 0.658375 & 0.662075 \\
40 & ('SYN_Flag_Count', 'Flow_Duration', 'FWD_Init_Win_Bytes') & 0.658160 & 0.662297 \\
32 & ('Protocol', 'Fwd_IAT_Total', 'Flow_Duration') & 0.661109 & 0.664720 \\
33 & ('Protocol', 'Fwd_IAT_Total', 'FWD_Init_Win_Bytes') & 0.663319 & 0.665862 \\
8 & ('RST_Flag_Count', 'SYN_Flag_Count') & 0.663941 & 0.666436 \\
45 & ('RST_Flag_Count', 'Protocol', 'SYN_Flag_Count', 'FWD_Init_Win_Bytes') & 0.663466 & 0.667616 \\
52 & ('Protocol', 'Fwd_IAT_Total', 'SYN_Flag_Count', 'FWD_Init_Win_Bytes') & 0.665699 & 0.669228 \\
22 & ('RST_Flag_Count', 'Protocol', 'SYN_Flag_Count') & 0.667048 & 0.672084 \\
31 & ('Protocol', 'Fwd_IAT_Total', 'SYN_Flag_Count') & 0.668273 & 0.672123 \\
11 & ('Protocol', 'Fwd_IAT_Total') & 0.670010 & 0.674352 \\
36 & ('Protocol', 'Flow_Duration', 'FWD_Init_Win_Bytes') & 0.673487 & 0.675142 \\
49 & ('RST_Flag_Count', 'Fwd_IAT_Total', 'Flow_Duration', 'FWD_Init_Win_Bytes') & 0.673957 & 0.678019 \\
54 & ('Protocol', 'SYN_Flag_Count', 'Flow_Duration', 'FWD_Init_Win_Bytes') & 0.677576 & 0.680427 \\
13 & ('Protocol', 'Flow_Duration') & 0.676283 & 0.680994 \\
60 & ('RST_Flag_Count', 'Fwd_IAT_Total', 'SYN_Flag_Count', 'Flow_Duration', 'FWD_Init_Win_Bytes') & 0.677941 & 0.682400 \\
58 & ('RST_Flag_Count', 'Protocol', 'Fwd_IAT_Total', 'Flow_Duration', 'FWD_Init_Win_Bytes') & 0.679826 & 0.682697 \\
34 & ('Protocol', 'SYN_Flag_Count', 'Flow_Duration') & 0.679465 & 0.682917 \\
47 & ('RST_Flag_Count', 'Fwd_IAT_Total', 'SYN_Flag_Count', 'Flow_Duration') & 0.677635 & 0.683216 \\
62 & ('RST_Flag_Count', 'Protocol', 'Fwd_IAT_Total', 'SYN_Flag_Count', 'Flow_Duration', 'FWD_Init_Win_Bytes') & 0.680248 & 0.683872 \\
27 & ('RST_Flag_Count', 'Fwd_IAT_Total', 'FWD_Init_Win_Bytes') & 0.680698 & 0.684027 \\
42 & ('RST_Flag_Count', 'Protocol', 'Fwd_IAT_Total', 'Flow_Duration') & 0.679319 & 0.684289 \\
56 & ('RST_Flag_Count', 'Protocol', 'Fwd_IAT_Total', 'SYN_Flag_Count', 'Flow_Duration') & 0.682345 & 0.686447 \\
25 & ('RST_Flag_Count', 'Fwd_IAT_Total', 'SYN_Flag_Count') & 0.681434 & 0.686969 \\
21 & ('RST_Flag_Count', 'Protocol', 'Fwd_IAT_Total') & 0.682912 & 0.687877 \\
48 & ('RST_Flag_Count', 'Fwd_IAT_Total', 'SYN_Flag_Count', 'FWD_Init_Win_Bytes') & 0.684371 & 0.688518 \\
30 & ('RST_Flag_Count', 'Flow_Duration', 'FWD_Init_Win_Bytes') & 0.689119 & 0.691728 \\
43 & ('RST_Flag_Count', 'Protocol', 'Fwd_IAT_Total', 'FWD_Init_Win_Bytes') & 0.689056 & 0.692126 \\
57 & ('RST_Flag_Count', 'Protocol', 'Fwd_IAT_Total', 'SYN_Flag_Count', 'FWD_Init_Win_Bytes') & 0.690042 & 0.693595 \\
23 & ('RST_Flag_Count', 'Protocol', 'Flow_Duration') & 0.691766 & 0.696000 \\
41 & ('RST_Flag_Count', 'Protocol', 'Fwd_IAT_Total', 'SYN_Flag_Count') & 0.691747 & 0.696046 \\
28 & ('RST_Flag_Count', 'SYN_Flag_Count', 'Flow_Duration') & 0.692448 & 0.697301 \\
50 & ('RST_Flag_Count', 'SYN_Flag_Count', 'Flow_Duration', 'FWD_Init_Win_Bytes') & 0.693767 & 0.697331 \\
46 & ('RST_Flag_Count', 'Protocol', 'Flow_Duration', 'FWD_Init_Win_Bytes') & 0.700516 & 0.702937 \\
59 & ('RST_Flag_Count', 'Protocol', 'SYN_Flag_Count', 'Flow_Duration', 'FWD_Init_Win_Bytes') & 0.701434 & 0.704564 \\
44 & ('RST_Flag_Count', 'Protocol', 'SYN_Flag_Count', 'Flow_Duration') & 0.702184 & 0.705656 \\
\bottomrule
\end{tabular}


SyntaxError: unexpected character after line continuation character (ipython-input-2661023305.py, line 1)

##Estrat√©gia 2: KBest

In [20]:
from sklearn.feature_selection import SelectKBest, mutual_info_classif

X = df_num.drop(columns='Label')
y = df_num['Label']

selector = SelectKBest(score_func=mutual_info_classif, k=6)
selector.fit(X, y)

top_features = X.columns[selector.get_support()]
print(top_features)


Index(['Src Port', 'Total Length of Fwd Packet', 'Total Length of Bwd Packet',
       'Packet Length Max', 'Packet Length Mean', 'Average Packet Size'],
      dtype='object')


In [21]:
v2 = df_num[top_features.to_list()+['Label']]
v2

Unnamed: 0,Src Port,Total Length of Fwd Packet,Total Length of Bwd Packet,Packet Length Max,Packet Length Mean,Average Packet Size,Label
0,51438,31.0,0.0,31.0,15.500000,20.666667,0
1,44438,115.0,31.0,73.0,37.600000,47.000000,0
2,443,0.0,0.0,0.0,0.000000,0.000000,0
3,51438,0.0,31.0,31.0,7.750000,10.333333,0
4,41834,0.0,0.0,0.0,0.000000,0.000000,0
...,...,...,...,...,...,...,...
220401,50884,0.0,0.0,0.0,0.000000,0.000000,1
220402,80,9480.0,0.0,1436.0,1212.888889,1364.500000,1
220403,51216,0.0,0.0,0.0,0.000000,0.000000,1
220404,50812,3628.0,4586.0,2896.0,410.700000,432.315789,1


In [22]:
# Substituir espa√ßos por underline
df_renamed = v2.copy()
df_renamed.columns = [col.replace(' ', '_').replace('/', '_').replace('-', '_') for col in df_renamed.columns]

# Atualiza predictors
predictors = df_renamed.drop(columns='Label').columns.to_list()
y_column = 'Label'

# Chama a fun√ß√£o
results_df2 = testar_combinacoes(df_renamed, y_column, predictors).sort_values(by='AUC Teste', ascending=True)
results_df2.to_latex()

Testando combina√ß√µes de 1 vari√°veis: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6/6 [00:29<00:00,  4.85s/it]
Testando combina√ß√µes de 2 vari√°veis: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 15/15 [00:46<00:00,  3.11s/it]
Testando combina√ß√µes de 3 vari√°veis: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [00:58<00:00,  2.92s/it]
Testando combina√ß√µes de 4 vari√°veis: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 15/15 [00:46<00:00,  3.07s/it]
Testando combina√ß√µes de 5 vari√°veis: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6/6 [00:17<00:00,  2.88s/it]
Testando combina√ß√µes de 6 vari√°veis: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:02<00:00,  2.55s/it]


"\\begin{tabular}{llrr}\n\\toprule\n & Vari√°veis & AUC Treino & AUC Teste \\\\\n\\midrule\n38 & ('Total_Length_of_Bwd_Packet', 'Packet_Length_Max', 'Average_Packet_Size') & 0.422683 & 0.422048 \\\\\n19 & ('Packet_Length_Max', 'Average_Packet_Size') & 0.422731 & 0.422071 \\\\\n52 & ('Total_Length_of_Fwd_Packet', 'Total_Length_of_Bwd_Packet', 'Packet_Length_Max', 'Average_Packet_Size') & 0.424136 & 0.423277 \\\\\n35 & ('Total_Length_of_Fwd_Packet', 'Packet_Length_Max', 'Average_Packet_Size') & 0.424166 & 0.423340 \\\\\n37 & ('Total_Length_of_Bwd_Packet', 'Packet_Length_Max', 'Packet_Length_Mean') & 0.425561 & 0.425200 \\\\\n18 & ('Packet_Length_Max', 'Packet_Length_Mean') & 0.425558 & 0.425216 \\\\\n34 & ('Total_Length_of_Fwd_Packet', 'Packet_Length_Max', 'Packet_Length_Mean') & 0.425991 & 0.425678 \\\\\n51 & ('Total_Length_of_Fwd_Packet', 'Total_Length_of_Bwd_Packet', 'Packet_Length_Max', 'Packet_Length_Mean') & 0.425974 & 0.425682 \\\\\n5 & ('Average_Packet_Size',) & 0.429311 & 0.4257

In [None]:
\begin{tabular}{llrr}
\toprule
 & Vari√°veis & AUC Treino & AUC Teste \\
\midrule
38 & ('Total_Length_of_Bwd_Packet', 'Packet_Length_Max', 'Average_Packet_Size') & 0.422683 & 0.422048 \\
19 & ('Packet_Length_Max', 'Average_Packet_Size') & 0.422731 & 0.422071 \\
52 & ('Total_Length_of_Fwd_Packet', 'Total_Length_of_Bwd_Packet', 'Packet_Length_Max', 'Average_Packet_Size') & 0.424136 & 0.423277 \\
35 & ('Total_Length_of_Fwd_Packet', 'Packet_Length_Max', 'Average_Packet_Size') & 0.424166 & 0.423340 \\
37 & ('Total_Length_of_Bwd_Packet', 'Packet_Length_Max', 'Packet_Length_Mean') & 0.425561 & 0.425200 \\
18 & ('Packet_Length_Max', 'Packet_Length_Mean') & 0.425558 & 0.425216 \\
34 & ('Total_Length_of_Fwd_Packet', 'Packet_Length_Max', 'Packet_Length_Mean') & 0.425991 & 0.425678 \\
51 & ('Total_Length_of_Fwd_Packet', 'Total_Length_of_Bwd_Packet', 'Packet_Length_Max', 'Packet_Length_Mean') & 0.425974 & 0.425682 \\
5 & ('Average_Packet_Size',) & 0.429311 & 0.425715 \\
17 & ('Total_Length_of_Bwd_Packet', 'Average_Packet_Size') & 0.429357 & 0.425775 \\
4 & ('Packet_Length_Mean',) & 0.430940 & 0.427406 \\
16 & ('Total_Length_of_Bwd_Packet', 'Packet_Length_Mean') & 0.430978 & 0.427446 \\
33 & ('Total_Length_of_Fwd_Packet', 'Total_Length_of_Bwd_Packet', 'Average_Packet_Size') & 0.435828 & 0.431942 \\
14 & ('Total_Length_of_Fwd_Packet', 'Average_Packet_Size') & 0.435849 & 0.431949 \\
32 & ('Total_Length_of_Fwd_Packet', 'Total_Length_of_Bwd_Packet', 'Packet_Length_Mean') & 0.437738 & 0.433531 \\
13 & ('Total_Length_of_Fwd_Packet', 'Packet_Length_Mean') & 0.437747 & 0.433535 \\
2 & ('Total_Length_of_Bwd_Packet',) & 0.447360 & 0.442781 \\
28 & ('Src_Port', 'Packet_Length_Max', 'Packet_Length_Mean') & 0.505369 & 0.502651 \\
47 & ('Src_Port', 'Total_Length_of_Bwd_Packet', 'Packet_Length_Max', 'Packet_Length_Mean') & 0.505380 & 0.502676 \\
23 & ('Src_Port', 'Total_Length_of_Fwd_Packet', 'Packet_Length_Mean') & 0.509703 & 0.503072 \\
42 & ('Src_Port', 'Total_Length_of_Fwd_Packet', 'Total_Length_of_Bwd_Packet', 'Packet_Length_Mean') & 0.509776 & 0.503151 \\
44 & ('Src_Port', 'Total_Length_of_Fwd_Packet', 'Packet_Length_Max', 'Packet_Length_Mean') & 0.505791 & 0.503159 \\
56 & ('Src_Port', 'Total_Length_of_Fwd_Packet', 'Total_Length_of_Bwd_Packet', 'Packet_Length_Max', 'Packet_Length_Mean') & 0.505801 & 0.503161 \\
29 & ('Src_Port', 'Packet_Length_Max', 'Average_Packet_Size') & 0.507021 & 0.503908 \\
48 & ('Src_Port', 'Total_Length_of_Bwd_Packet', 'Packet_Length_Max', 'Average_Packet_Size') & 0.507023 & 0.503924 \\
45 & ('Src_Port', 'Total_Length_of_Fwd_Packet', 'Packet_Length_Max', 'Average_Packet_Size') & 0.508073 & 0.504943 \\
57 & ('Src_Port', 'Total_Length_of_Fwd_Packet', 'Total_Length_of_Bwd_Packet', 'Packet_Length_Max', 'Average_Packet_Size') & 0.508061 & 0.504944 \\
24 & ('Src_Port', 'Total_Length_of_Fwd_Packet', 'Average_Packet_Size') & 0.513610 & 0.507400 \\
43 & ('Src_Port', 'Total_Length_of_Fwd_Packet', 'Total_Length_of_Bwd_Packet', 'Average_Packet_Size') & 0.513705 & 0.507501 \\
9 & ('Src_Port', 'Packet_Length_Mean') & 0.518271 & 0.513876 \\
26 & ('Src_Port', 'Total_Length_of_Bwd_Packet', 'Packet_Length_Mean') & 0.518445 & 0.514066 \\
10 & ('Src_Port', 'Average_Packet_Size') & 0.521519 & 0.517355 \\
27 & ('Src_Port', 'Total_Length_of_Bwd_Packet', 'Average_Packet_Size') & 0.521666 & 0.517507 \\
40 & ('Packet_Length_Max', 'Packet_Length_Mean', 'Average_Packet_Size') & 0.519737 & 0.520857 \\
55 & ('Total_Length_of_Bwd_Packet', 'Packet_Length_Max', 'Packet_Length_Mean', 'Average_Packet_Size') & 0.519769 & 0.520898 \\
61 & ('Total_Length_of_Fwd_Packet', 'Total_Length_of_Bwd_Packet', 'Packet_Length_Max', 'Packet_Length_Mean', 'Average_Packet_Size') & 0.520191 & 0.521220 \\
54 & ('Total_Length_of_Fwd_Packet', 'Packet_Length_Max', 'Packet_Length_Mean', 'Average_Packet_Size') & 0.520202 & 0.521231 \\
30 & ('Src_Port', 'Packet_Length_Mean', 'Average_Packet_Size') & 0.526831 & 0.522182 \\
49 & ('Src_Port', 'Total_Length_of_Bwd_Packet', 'Packet_Length_Mean', 'Average_Packet_Size') & 0.526857 & 0.522219 \\
39 & ('Total_Length_of_Bwd_Packet', 'Packet_Length_Mean', 'Average_Packet_Size') & 0.526681 & 0.525215 \\
20 & ('Packet_Length_Mean', 'Average_Packet_Size') & 0.526674 & 0.525220 \\
58 & ('Src_Port', 'Total_Length_of_Fwd_Packet', 'Total_Length_of_Bwd_Packet', 'Packet_Length_Mean', 'Average_Packet_Size') & 0.531790 & 0.526999 \\
46 & ('Src_Port', 'Total_Length_of_Fwd_Packet', 'Packet_Length_Mean', 'Average_Packet_Size') & 0.531790 & 0.527003 \\
8 & ('Src_Port', 'Packet_Length_Max') & 0.529193 & 0.528941 \\
25 & ('Src_Port', 'Total_Length_of_Bwd_Packet', 'Packet_Length_Max') & 0.529396 & 0.529074 \\
53 & ('Total_Length_of_Fwd_Packet', 'Total_Length_of_Bwd_Packet', 'Packet_Length_Mean', 'Average_Packet_Size') & 0.531583 & 0.529461 \\
36 & ('Total_Length_of_Fwd_Packet', 'Packet_Length_Mean', 'Average_Packet_Size') & 0.531588 & 0.529466 \\
22 & ('Src_Port', 'Total_Length_of_Fwd_Packet', 'Packet_Length_Max') & 0.531619 & 0.531305 \\
41 & ('Src_Port', 'Total_Length_of_Fwd_Packet', 'Total_Length_of_Bwd_Packet', 'Packet_Length_Max') & 0.531689 & 0.531321 \\
7 & ('Src_Port', 'Total_Length_of_Bwd_Packet') & 0.535125 & 0.532776 \\
0 & ('Src_Port',) & 0.535504 & 0.533253 \\
6 & ('Src_Port', 'Total_Length_of_Fwd_Packet') & 0.538442 & 0.535432 \\
21 & ('Src_Port', 'Total_Length_of_Fwd_Packet', 'Total_Length_of_Bwd_Packet') & 0.538536 & 0.535465 \\
60 & ('Src_Port', 'Total_Length_of_Bwd_Packet', 'Packet_Length_Max', 'Packet_Length_Mean', 'Average_Packet_Size') & 0.540397 & 0.539697 \\
50 & ('Src_Port', 'Packet_Length_Max', 'Packet_Length_Mean', 'Average_Packet_Size') & 0.540407 & 0.539709 \\
62 & ('Src_Port', 'Total_Length_of_Fwd_Packet', 'Total_Length_of_Bwd_Packet', 'Packet_Length_Max', 'Packet_Length_Mean', 'Average_Packet_Size') & 0.540631 & 0.539953 \\
59 & ('Src_Port', 'Total_Length_of_Fwd_Packet', 'Packet_Length_Max', 'Packet_Length_Mean', 'Average_Packet_Size') & 0.540641 & 0.539969 \\
15 & ('Total_Length_of_Bwd_Packet', 'Packet_Length_Max') & 0.541175 & 0.545520 \\
3 & ('Packet_Length_Max',) & 0.542061 & 0.546537 \\
31 & ('Total_Length_of_Fwd_Packet', 'Total_Length_of_Bwd_Packet', 'Packet_Length_Max') & 0.543143 & 0.547861 \\
12 & ('Total_Length_of_Fwd_Packet', 'Packet_Length_Max') & 0.543095 & 0.547879 \\
11 & ('Total_Length_of_Fwd_Packet', 'Total_Length_of_Bwd_Packet') & 0.568550 & 0.573607 \\
1 & ('Total_Length_of_Fwd_Packet',) & 0.569280 & 0.574628 \\
\bottomrule
\end{tabular}


# Avalia√ß√£o de modelos de ML

In [None]:

import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, recall_score, accuracy_score, confusion_matrix
from imblearn.over_sampling import SMOTE
from tqdm import tqdm

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression


def testar_modelo(df, y_column, grupos, modelo_nome, n_val=5000, random_state=42, normalizar=True):
    """
    Testa um modelo (KNN, SVM, RF ou Regress√£o Log√≠stica) para grupos espec√≠ficos de vari√°veis.
    Retorna DataFrame com m√©tricas detalhadas.
    """
    results = []
    smote = SMOTE(random_state=random_state)
    y = df[y_column].values.ravel()

    for combo in tqdm(grupos, desc=f"Testando {modelo_nome} para grupos definidos"):
        X = df[list(combo)]
        X_columns = list(combo)

        # Split estratificado
        X_train_raw, X_test_raw, y_train_raw, y_test = train_test_split(
            X, y, test_size=n_val, stratify=y, random_state=random_state
        )

        # Normaliza√ß√£o
        if normalizar:
            scaler = MinMaxScaler()
            X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train_raw), columns=X_columns)
            X_test_scaled = pd.DataFrame(scaler.transform(X_test_raw), columns=X_columns)
        else:
            X_train_scaled, X_test_scaled = X_train_raw, X_test_raw

        # Balanceamento
        X_train, y_train = smote.fit_resample(X_train_scaled, y_train_raw)

        # ====== Modelo selecionado ======
        if modelo_nome == 'KNN':

            k_otimo = int(min(15, max(3, int(np.sqrt(len(X_train)))))/3)  # evita k muito grande
            print(k_otimo)
            modelo = KNeighborsClassifier(
                n_neighbors=k_otimo,
                weights='distance',
                algorithm='ball_tree',  # mais r√°pido que brute
                n_jobs=-1
            )


        elif modelo_nome == 'SVM':
            from sklearn.svm import LinearSVC
            from sklearn.calibration import CalibratedClassifierCV

            base_svm = LinearSVC(C=0.5, random_state=random_state, dual=False, max_iter=5000)
            modelo = CalibratedClassifierCV(base_svm, method='sigmoid', cv=3)


        elif modelo_nome == 'RandomForest':

            modelo = RandomForestClassifier(
                n_estimators=100,        # ‚Üì de 200 para 100
                max_depth=8,             # ‚Üì menor profundidade
                min_samples_split=20,    # evita √°rvores muito grandes
                min_samples_leaf=10,     # idem
                max_features='sqrt',
                n_jobs=-1,               # usa todos os n√∫cleos
                class_weight='balanced',
                random_state=random_state
            )

        elif modelo_nome == 'LogisticRegression':
            modelo = LogisticRegression(max_iter=1000, solver='lbfgs', random_state=random_state)

        else:
            raise ValueError("Modelo n√£o reconhecido. Use: 'KNN', 'SVM', 'RandomForest' ou 'LogisticRegression'.")

        # ====== Treino ======
        modelo.fit(X_train, y_train)

        # Previs√µes e probabilidades
        y_train_pred = modelo.predict(X_train)
        y_test_pred = modelo.predict(X_test_scaled)
        y_train_proba = modelo.predict_proba(X_train)[:, 1]
        y_test_proba = modelo.predict_proba(X_test_scaled)[:, 1]

        # ====== M√©tricas ======
        auc_train = roc_auc_score(y_train, y_train_proba)
        auc_test = roc_auc_score(y_test, y_test_proba)
        acc_train = accuracy_score(y_train, y_train_pred)
        acc_test = accuracy_score(y_test, y_test_pred)
        recall_train = recall_score(y_train, y_train_pred, zero_division=0)
        recall_test = recall_score(y_test, y_test_pred, zero_division=0)

        tn_train, fp_train, fn_train, tp_train = confusion_matrix(y_train, y_train_pred).ravel()
        tn_test, fp_test, fn_test, tp_test = confusion_matrix(y_test, y_test_pred).ravel()

        results.append({
            'Modelo': modelo_nome,
            'Vari√°veis': combo,
            'AUC Treino': auc_train,
            'AUC Teste': auc_test,
            'Recall Treino': recall_train,
            'Recall Teste': recall_test,
            'Acur√°cia Treino': acc_train,
            'Acur√°cia Teste': acc_test,
            'TP Teste': tp_test, 'TN Teste': tn_test, 'FP Teste': fp_test, 'FN Teste': fn_test
        })

    return pd.DataFrame(results)


##Avalia√ß√£o Estrat√©gia 1:

In [None]:

df_renamed = v1.copy()
df_renamed.columns = [col.replace(' ', '_').replace('/', '_').replace('-', '_') for col in df_renamed.columns]

# # --- Grupo de vari√°veis que voc√™ quer testar ---
# grupo_unico = [['RST_Flag_Count', 'Protocol', 'SYN_Flag_Count', 'Flow_Duration']]

# # --- Chamada da fun√ß√£o ---
# df_results, ranking = testar_grupos(
#     df=df_renamed,
#     y_column='Label',
#     grupos=grupo_unico,
#     n_val=27000,
#     random_state=42,
#     normalizar=True
# )



# # Ver os melhores
# ranking.head(10)


grupo_unico = [['RST_Flag_Count', 'Protocol', 'SYN_Flag_Count', 'Flow_Duration']]

df_renamed


Unnamed: 0,RST_Flag_Count,Protocol,Fwd_IAT_Total,SYN_Flag_Count,Flow_Duration,FWD_Init_Win_Bytes,Label
0,1,6,31334.0,0,31334,1365,0
1,0,6,4.0,0,109939,2542,0
2,2,6,1.0,0,1,0,0
3,1,6,0.0,0,114090,0,0
4,2,6,181.0,0,181,0,0
...,...,...,...,...,...,...,...
220401,2,6,0.0,0,0,0,1
220402,1,6,2575.0,0,25145,434,1
220403,2,6,0.0,0,0,0,1
220404,1,6,34779202.0,6,34779202,64240,1


###Logistic Regression:

In [None]:
df_lr = testar_modelo(df_renamed, 'Label', grupo_unico, modelo_nome='LogisticRegression')
print(df_lr.to_latex())

Testando LogisticRegression para grupos definidos: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:04<00:00,  4.12s/it]

\begin{tabular}{lllrrrrrrrrrr}
\toprule
 & Modelo & Vari√°veis & AUC Treino & AUC Teste & Recall Treino & Recall Teste & Acur√°cia Treino & Acur√°cia Teste & TP Teste & TN Teste & FP Teste & FN Teste \\
\midrule
0 & LogisticRegression & ['RST_Flag_Count', 'Protocol', 'SYN_Flag_Count', 'Flow_Duration'] & 0.702427 & 0.710566 & 0.454286 & 0.442472 & 0.652701 & 0.618600 & 1296 & 1797 & 274 & 1633 \\
\bottomrule
\end{tabular}






In [None]:
\begin{tabular}{lllrrrrrrrrrr}
\toprule
 & Modelo & Vari√°veis & AUC Treino & AUC Teste & Recall Treino & Recall Teste & Acur√°cia Treino & Acur√°cia Teste & TP Teste & TN Teste & FP Teste & FN Teste \\
\midrule
0 & LogisticRegression & ['RST_Flag_Count', 'Protocol', 'SYN_Flag_Count', 'Flow_Duration'] & 0.702427 & 0.710566 & 0.454286 & 0.442472 & 0.652701 & 0.618600 & 1296 & 1797 & 274 & 1633 \\
\bottomrule
\end{tabular}

###Random Forest:

In [None]:
df_rf = testar_modelo(df_renamed, 'Label', grupo_unico, modelo_nome='RandomForest')
print(df_rf.to_latex())


Testando RandomForest para grupos definidos: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:32<00:00, 32.96s/it]

\begin{tabular}{lllrrrrrrrrrr}
\toprule
 & Modelo & Vari√°veis & AUC Treino & AUC Teste & Recall Treino & Recall Teste & Acur√°cia Treino & Acur√°cia Teste & TP Teste & TN Teste & FP Teste & FN Teste \\
\midrule
0 & RandomForest & ['RST_Flag_Count', 'Protocol', 'SYN_Flag_Count', 'Flow_Duration'] & 0.833133 & 0.833274 & 0.649931 & 0.636736 & 0.756846 & 0.732200 & 1865 & 1796 & 275 & 1064 \\
\bottomrule
\end{tabular}






In [None]:
\begin{tabular}{lllrrrrrrrrrr}
\toprule
 & Modelo & Vari√°veis & AUC Treino & AUC Teste & Recall Treino & Recall Teste & Acur√°cia Treino & Acur√°cia Teste & TP Teste & TN Teste & FP Teste & FN Teste \\
\midrule
0 & RandomForest & ['RST_Flag_Count', 'Protocol', 'SYN_Flag_Count', 'Flow_Duration'] & 0.833133 & 0.833274 & 0.649931 & 0.636736 & 0.756846 & 0.732200 & 1865 & 1796 & 275 & 1064 \\
\bottomrule
\end{tabular}

###KNN:

In [None]:
df_knn = testar_modelo(df_renamed, 'Label', grupo_unico, modelo_nome='KNN')
print(df_knn.to_latex())

Testando KNN para grupos definidos:   0%|          | 0/1 [00:00<?, ?it/s]

5


Testando KNN para grupos definidos: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [01:43<00:00, 103.27s/it]

\begin{tabular}{lllrrrrrrrrrr}
\toprule
 & Modelo & Vari√°veis & AUC Treino & AUC Teste & Recall Treino & Recall Teste & Acur√°cia Treino & Acur√°cia Teste & TP Teste & TN Teste & FP Teste & FN Teste \\
\midrule
0 & KNN & ['RST_Flag_Count', 'Protocol', 'SYN_Flag_Count', 'Flow_Duration'] & 0.997650 & 0.782272 & 0.961687 & 0.722431 & 0.975737 & 0.718800 & 2116 & 1478 & 593 & 813 \\
\bottomrule
\end{tabular}






In [None]:
\begin{tabular}{lllrrrrrrrrrr}
\toprule
 & Modelo & Vari√°veis & AUC Treino & AUC Teste & Recall Treino & Recall Teste & Acur√°cia Treino & Acur√°cia Teste & TP Teste & TN Teste & FP Teste & FN Teste \\
\midrule
0 & KNN & ['RST_Flag_Count', 'Protocol', 'SYN_Flag_Count', 'Flow_Duration'] & 0.997650 & 0.782272 & 0.961687 & 0.722431 & 0.975737 & 0.718800 & 2116 & 1478 & 593 & 813 \\
\bottomrule
\end{tabular}

###SVM:

In [None]:
# Rodar cada modelo separadamente

df_svm = testar_modelo(df_renamed, 'Label', grupo_unico, modelo_nome='SVM')
print(df_svm.to_latex())

Testando SVM para grupos definidos: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:04<00:00,  4.96s/it]

\begin{tabular}{lllrrrrrrrrrr}
\toprule
 & Modelo & Vari√°veis & AUC Treino & AUC Teste & Recall Treino & Recall Teste & Acur√°cia Treino & Acur√°cia Teste & TP Teste & TN Teste & FP Teste & FN Teste \\
\midrule
0 & SVM & ['RST_Flag_Count', 'Protocol', 'SYN_Flag_Count', 'Flow_Duration'] & 0.705633 & 0.714916 & 0.496624 & 0.482417 & 0.651365 & 0.624600 & 1413 & 1710 & 361 & 1516 \\
\bottomrule
\end{tabular}






In [None]:
\begin{tabular}{lllrrrrrrrrrr}
\toprule
 & Modelo & Vari√°veis & AUC Treino & AUC Teste & Recall Treino & Recall Teste & Acur√°cia Treino & Acur√°cia Teste & TP Teste & TN Teste & FP Teste & FN Teste \\
\midrule
0 & SVM & ['RST_Flag_Count', 'Protocol', 'SYN_Flag_Count', 'Flow_Duration'] & 0.705633 & 0.714916 & 0.496624 & 0.482417 & 0.651365 & 0.624600 & 1413 & 1710 & 361 & 1516 \\
\bottomrule
\end{tabular}

##Avalia√ß√£o Estrat√©gia 2:

In [None]:
# Ver os resultados
# display(df_knn, df_rf, df_svm, df_lr)
df_renamed2 = v2.copy()
df_renamed2.columns = [col.replace(' ', '_').replace('/', '_').replace('-', '_') for col in df_renamed.columns]

# # --- Grupo de vari√°veis que voc√™ quer testar ---
# grupo_unico = [['RST_Flag_Count', 'Protocol', 'SYN_Flag_Count', 'Flow_Duration']]

# # --- Chamada da fun√ß√£o ---
# df_results, ranking = testar_grupos(
#     df=df_renamed,
#     y_column='Label',
#     grupos=grupo_unico,
#     n_val=27000,
#     random_state=42,
#     normalizar=True
# )



# # Ver os melhores
# ranking.head(10)


grupo_unico = [['Total_Length_of_Fwd_Packet', 'Total_Length_of_Bwd_Packet']]

df_renamed2

Unnamed: 0,Src_Port,Total_Length_of_Fwd_Packet,Total_Length_of_Bwd_Packet,Packet_Length_Max,Packet_Length_Mean,Average_Packet_Size,Label
0,51438,31.0,0.0,31.0,15.500000,20.666667,0
1,44438,115.0,31.0,73.0,37.600000,47.000000,0
2,443,0.0,0.0,0.0,0.000000,0.000000,0
3,51438,0.0,31.0,31.0,7.750000,10.333333,0
4,41834,0.0,0.0,0.0,0.000000,0.000000,0
...,...,...,...,...,...,...,...
220401,50884,0.0,0.0,0.0,0.000000,0.000000,1
220402,80,9480.0,0.0,1436.0,1212.888889,1364.500000,1
220403,51216,0.0,0.0,0.0,0.000000,0.000000,1
220404,50812,3628.0,4586.0,2896.0,410.700000,432.315789,1


###Logistic Regression:

In [None]:
df_lr = testar_modelo(df_renamed2, 'Label', grupo_unico, modelo_nome='LogisticRegression')
print(df_lr.to_latex())

Testando LogisticRegression para grupos definidos: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:02<00:00,  2.96s/it]

\begin{tabular}{lllrrrrrrrrrr}
\toprule
 & Modelo & Vari√°veis & AUC Treino & AUC Teste & Recall Treino & Recall Teste & Acur√°cia Treino & Acur√°cia Teste & TP Teste & TN Teste & FP Teste & FN Teste \\
\midrule
0 & LogisticRegression & ['Total_Length_of_Fwd_Packet', 'Total_Length_of_Bwd_Packet'] & 0.500000 & 0.500000 & 0.000000 & 0.000000 & 0.500000 & 0.414200 & 0 & 2071 & 0 & 2929 \\
\bottomrule
\end{tabular}






In [None]:
\begin{tabular}{lllrrrrrrrrrr}
\toprule
 & Modelo & Vari√°veis & AUC Treino & AUC Teste & Recall Treino & Recall Teste & Acur√°cia Treino & Acur√°cia Teste & TP Teste & TN Teste & FP Teste & FN Teste \\
\midrule
0 & LogisticRegression & ['Total_Length_of_Fwd_Packet', 'Total_Length_of_Bwd_Packet'] & 0.500000 & 0.500000 & 0.000000 & 0.000000 & 0.500000 & 0.414200 & 0 & 2071 & 0 & 2929 \\
\bottomrule
\end{tabular}

###Random Forest:

In [None]:
df_rf = testar_modelo(df_renamed2, 'Label', grupo_unico, modelo_nome='RandomForest')
print(df_rf.to_latex())


Testando RandomForest para grupos definidos: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:23<00:00, 23.05s/it]

\begin{tabular}{lllrrrrrrrrrr}
\toprule
 & Modelo & Vari√°veis & AUC Treino & AUC Teste & Recall Treino & Recall Teste & Acur√°cia Treino & Acur√°cia Teste & TP Teste & TN Teste & FP Teste & FN Teste \\
\midrule
0 & RandomForest & ['Total_Length_of_Fwd_Packet', 'Total_Length_of_Bwd_Packet'] & 0.837717 & 0.843384 & 0.683402 & 0.680437 & 0.761038 & 0.753600 & 1993 & 1775 & 296 & 936 \\
\bottomrule
\end{tabular}






In [None]:
\begin{tabular}{lllrrrrrrrrrr}
\toprule
 & Modelo & Vari√°veis & AUC Treino & AUC Teste & Recall Treino & Recall Teste & Acur√°cia Treino & Acur√°cia Teste & TP Teste & TN Teste & FP Teste & FN Teste \\
\midrule
0 & RandomForest & ['Total_Length_of_Fwd_Packet', 'Total_Length_of_Bwd_Packet'] & 0.837717 & 0.843384 & 0.683402 & 0.680437 & 0.761038 & 0.753600 & 1993 & 1775 & 296 & 936 \\
\bottomrule
\end{tabular}

###KNN:

In [None]:
df_knn = testar_modelo(df_renamed2, 'Label', grupo_unico, modelo_nome='KNN')
print(df_knn.to_latex())

Testando KNN para grupos definidos:   0%|          | 0/1 [00:00<?, ?it/s]

5


Testando KNN para grupos definidos: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [01:55<00:00, 115.09s/it]

\begin{tabular}{lllrrrrrrrrrr}
\toprule
 & Modelo & Vari√°veis & AUC Treino & AUC Teste & Recall Treino & Recall Teste & Acur√°cia Treino & Acur√°cia Teste & TP Teste & TN Teste & FP Teste & FN Teste \\
\midrule
0 & KNN & ['Total_Length_of_Fwd_Packet', 'Total_Length_of_Bwd_Packet'] & 0.910448 & 0.889977 & 0.879863 & 0.854899 & 0.850616 & 0.836600 & 2504 & 1679 & 392 & 425 \\
\bottomrule
\end{tabular}






In [None]:
\begin{tabular}{lllrrrrrrrrrr}
\toprule
 & Modelo & Vari√°veis & AUC Treino & AUC Teste & Recall Treino & Recall Teste & Acur√°cia Treino & Acur√°cia Teste & TP Teste & TN Teste & FP Teste & FN Teste \\
\midrule
0 & KNN & ['Total_Length_of_Fwd_Packet', 'Total_Length_of_Bwd_Packet'] & 0.910448 & 0.889977 & 0.879863 & 0.854899 & 0.850616 & 0.836600 & 2504 & 1679 & 392 & 425 \\
\bottomrule
\end{tabular}

###SVM:

In [None]:
# Rodar cada modelo separadamente

df_svm = testar_modelo(df_renamed2, 'Label', grupo_unico, modelo_nome='SVM')
print(df_svm.to_latex())

Testando SVM para grupos definidos: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:03<00:00,  3.77s/it]

\begin{tabular}{lllrrrrrrrrrr}
\toprule
 & Modelo & Vari√°veis & AUC Treino & AUC Teste & Recall Treino & Recall Teste & Acur√°cia Treino & Acur√°cia Teste & TP Teste & TN Teste & FP Teste & FN Teste \\
\midrule
0 & SVM & ['Total_Length_of_Fwd_Packet', 'Total_Length_of_Bwd_Packet'] & 0.569928 & 0.564348 & 0.958700 & 0.954251 & 0.492112 & 0.567400 & 2795 & 42 & 2029 & 134 \\
\bottomrule
\end{tabular}






In [None]:
\begin{tabular}{lllrrrrrrrrrr}
\toprule
 & Modelo & Vari√°veis & AUC Treino & AUC Teste & Recall Treino & Recall Teste & Acur√°cia Treino & Acur√°cia Teste & TP Teste & TN Teste & FP Teste & FN Teste \\
\midrule
0 & SVM & ['Total_Length_of_Fwd_Packet', 'Total_Length_of_Bwd_Packet'] & 0.569928 & 0.564348 & 0.958700 & 0.954251 & 0.492112 & 0.567400 & 2795 & 42 & 2029 & 134 \\
\bottomrule
\end{tabular}
