# CSE-CIC-IDS2018 - Benchmarking 
## 0. Notebook initialization

In [3]:
# Mount your Google Drive

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [32]:
import os
import warnings
from google.colab import files

# Suppress all warning messages
warnings.filterwarnings("ignore")

# Check if the Kaggle API credentials file already exists
kaggle_credentials_path = os.path.expanduser("~/.kaggle/kaggle.json")

if not os.path.exists(kaggle_credentials_path):

    if not os.path.exists(os.path.join("/content/drive/MyDrive/.kaggle/", "kaggle.json")):

      # Upload your Kaggle API credentials file (kaggle.json)
      files.upload()

      !mv kaggle.json "/content/drive/MyDrive/.kaggle/"
      !chmod 600 "/content/drive/MyDrive/.kaggle/kaggle.json"

    # Move the Kaggle API Credentials File
    !mkdir -p ~/.kaggle
    !cp '/content/drive/MyDrive/.kaggle/kaggle.json' ~/.kaggle/

else:

    print("Kaggle API credentials file already exists.")

Kaggle API credentials file already exists.


In [33]:
import tensorflow as tf
print("GPU available:", tf.test.is_gpu_available())
print("GPU device name:", tf.test.gpu_device_name())

Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.


GPU available: False
GPU device name: 


In [4]:
import os
from psutil import virtual_memory
from tabulate import tabulate

# Function to get CPU information
def get_cpu_info():
    cpu_info = os.popen('lscpu').read()
    return cpu_info

# Function to get RAM information
def get_ram_info():
    ram = virtual_memory()
    total_ram = f"{ram.total / 1e9:.2f} GB"
    available_ram = f"{ram.available / 1e9:.2f} GB"
    return total_ram, available_ram

# Function to get GPU information
def get_gpu_info():
    # Execute nvidia-smi and get its output
    gpu_info = os.popen('nvidia-smi --query-gpu=name,memory.total,memory.used,memory.free --format=csv,noheader,nounits').read().strip()

    # Split the output to get individual GPU details
    details = gpu_info.split(", ")

    # Return GPU name, total, used, and free memory
    return details[0], f"{details[1]} MB", f"{details[2]} MB", f"{details[3]} MB"

# Collect system information
cpu_info = get_cpu_info()
total_ram, available_ram = get_ram_info()
try:
  gpu_name, gpu_total_memory, gpu_used_memory, gpu_free_memory = get_gpu_info()
except:
  gpu_name, gpu_total_memory, gpu_used_memory, gpu_free_memory = 'null',0,0,0

# Extract relevant CPU information
cpu_type = ""
cpu_architecture = ""

for line in cpu_info.splitlines():
    if "Model name:" in line:
        cpu_type = line.split(":")[1].strip()
    elif "Architecture:" in line:
        cpu_architecture = line.split(":")[1].strip()

# Create a table
table = [
    ["CPU Type", cpu_type],
    ["CPU Architecture", cpu_architecture],
    ["Total RAM", total_ram],
    ["Available RAM", available_ram],
    ["GPU Name", gpu_name],
    ["GPU Total Memory", gpu_total_memory],
    ["GPU Used Memory", gpu_used_memory],
    ["GPU Free Memory", gpu_free_memory]
]

# Display the table
print(tabulate(table, headers=["Characteristic", "Value"], tablefmt="pretty"))


+------------------+--------------------------------+
|  Characteristic  |             Value              |
+------------------+--------------------------------+
|     CPU Type     | Intel(R) Xeon(R) CPU @ 2.20GHz |
| CPU Architecture |             x86_64             |
|    Total RAM     |            54.76 GB            |
|  Available RAM   |            53.08 GB            |
|     GPU Name     |            Tesla T4            |
| GPU Total Memory |            15360 MB            |
| GPU Used Memory  |              0 MB              |
| GPU Free Memory  |            15101 MB            |
+------------------+--------------------------------+


## 1. Data Acquisition and Preprocessing

In this section, we focus on acquiring the above mentioned datasets.

### 1.6. CSE-CIC-IDS2018 dataset
CSE-CIC-IDS2018 [[1]](#1) is an extensively curated cybersecurity dataset developed by the Canadian Institute for Cybersecurity (CIC) researchers. This dataset stands out for its unique characteristics, as it was generated using an AWS testbed. Additionaly, what sets it apart is the innovative approach of using "profiles" to generate datasets systematically, allowing for a comprehensive evaluation of intrusion detection systems (IDS) under various network traffic scenarios. This approach ensures that CSE-CIC-IDS2018 offers a diverse and robust benchmark for assessing IDS performance and advancing cybersecurity research.

### Download and Unzip CSE-CIC-IDS2018 dataset

In [35]:
import os
import pandas as pd
import zipfile

# Specify the dataset name
dataset_name = "solarmainframe/ids-intrusion-csv"

# Specify the destination folder in your Google Drive
destination_folder = "/content/drive/MyDrive/CSE-CIC-IDS2018-BM"

# Check if the dataset file already exists in your Google Drive
dataset_file_path = os.path.join(destination_folder, "ids-intrusion-csv.zip")

if not os.path.exists(dataset_file_path):

  # Download the dataset and save it to your Google Drive
  !kaggle datasets download -d $dataset_name -p $destination_folder

  print("Download complete.")

else:

  print("Dataset already exists. Skipping download.")

dest_file = f"{destination_folder}/ids-intrusion-csv.zip"

# Check if the Dataset was downlaoded
if os.path.exists(dest_file) and len(os.listdir(destination_folder))==1:

  # Unzip the downloaded dataset
  with zipfile.ZipFile(dest_file, "r") as zip_ref:
      zip_ref.extractall(destination_folder)

  print("Unzip complete.")

else:

  print("Dataset already exists. Skipping unzip.")

Dataset already exists. Skipping download.
Dataset already exists. Skipping unzip.


In [1]:
!ls -ahl '/content/drive/MyDrive/CSE-CIC-IDS2018-BM'

total 15G
-rw------- 1 root root 342M Oct 11 11:09 02-14-2018.csv
-rw------- 1 root root 359M Oct 11 11:09 02-15-2018.csv
-rw------- 1 root root 319M Oct 11 11:09 02-16-2018.csv
-rw------- 1 root root 3.8G Oct 11 11:09 02-20-2018.csv
-rw------- 1 root root 314M Oct 11 11:09 02-21-2018.csv
-rw------- 1 root root 365M Oct 11 11:09 02-22-2018.csv
-rw------- 1 root root 366M Oct 11 11:09 02-23-2018.csv
-rw------- 1 root root 200M Oct 11 11:10 02-28-2018.csv
-rw------- 1 root root 103M Oct 11 11:10 03-01-2018.csv
-rw------- 1 root root 337M Oct 11 11:10 03-02-2018.csv
-rw------- 1 root root 6.9G Oct 11 13:08 cse-cic-ids2018.csv
-rw------- 1 root root 1.6G Oct  2  2020 ids-intrusion-csv.zip
drwx------ 2 root root 4.0K Oct 11 13:05 .ipynb_checkpoints


In [28]:
import os
import pandas as pd

# Specify the destination folder in your Google Drive
destination_folder = "/content/drive/MyDrive/CSE-CIC-IDS2018-BM"

# Check if the Dataset is saved
#df_file_path = os.path.join(destination_folder, "cse-cic-ids2018.csv")

encoding = 'ISO-8859-1'  # Specify the correct encoding

# Create an empty dictionary to store the dataframes
dfs = {}

# Iterate over the CSV files in the folder
for filename in os.listdir(destination_folder):
  if filename.endswith(".csv") and not filename.startswith("cse-cic-ids2018") :

    # Get the dataframe name from the file name
    df_name = os.path.splitext(filename)[0]

    # Read the CSV file with the specified encoding
    try:
      df = pd.read_csv(os.path.join(destination_folder, filename), encoding=encoding)
    except UnicodeDecodeError:
      print(f'Error: Unable to read {filename} with encoding {encoding}')
    # Store the dataframe in the dictionary with the same name as the file
    dfs[df_name] = df

print("Data stored in dfs dictionary")

  df = pd.read_csv(os.path.join(destination_folder, filename), encoding=encoding)
  df = pd.read_csv(os.path.join(destination_folder, filename), encoding=encoding)
  df = pd.read_csv(os.path.join(destination_folder, filename), encoding=encoding)


Data stored in dfs dictionary


In [29]:
for key in dfs.keys():
  # Get the dataframe corresponding to the key
  df = dfs[key]
  # Display memory usage and data types
  print("Memory Usage (MB):", df.memory_usage(deep=True).sum() / (1024 ** 2))
  #print("Data Types:")
  #print(df.dtypes)
  print()

Memory Usage (MB): 765.9058046340942

Memory Usage (MB): 763.7503843307495

Memory Usage (MB): 2743.7713899612427

Memory Usage (MB): 7612.453751564026

Memory Usage (MB): 769.5648126602173

Memory Usage (MB): 763.0027494430542

Memory Usage (MB): 763.0046405792236

Memory Usage (MB): 2069.126660346985

Memory Usage (MB): 983.4694261550903

Memory Usage (MB): 762.1805953979492



In [30]:
for key in dfs.keys():
    df = dfs[key]
    print(f"Dataframe: '{key}'\n")
    display(df.head())
    display(df.info())

Dataframe: '02-14-2018'



Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,0,0,14/02/2018 08:31:01,112641719,3,0,0,0,0,0,...,0,0.0,0.0,0,0,56320859.5,139.300036,56320958,56320761,Benign
1,0,0,14/02/2018 08:33:50,112641466,3,0,0,0,0,0,...,0,0.0,0.0,0,0,56320733.0,114.551299,56320814,56320652,Benign
2,0,0,14/02/2018 08:36:39,112638623,3,0,0,0,0,0,...,0,0.0,0.0,0,0,56319311.5,301.934596,56319525,56319098,Benign
3,22,6,14/02/2018 08:40:13,6453966,15,10,1239,2273,744,0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,Benign
4,22,6,14/02/2018 08:40:23,8804066,14,11,1143,2209,744,0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,Benign


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 80 columns):
 #   Column             Non-Null Count    Dtype  
---  ------             --------------    -----  
 0   Dst Port           1048575 non-null  int64  
 1   Protocol           1048575 non-null  int64  
 2   Timestamp          1048575 non-null  object 
 3   Flow Duration      1048575 non-null  int64  
 4   Tot Fwd Pkts       1048575 non-null  int64  
 5   Tot Bwd Pkts       1048575 non-null  int64  
 6   TotLen Fwd Pkts    1048575 non-null  int64  
 7   TotLen Bwd Pkts    1048575 non-null  int64  
 8   Fwd Pkt Len Max    1048575 non-null  int64  
 9   Fwd Pkt Len Min    1048575 non-null  int64  
 10  Fwd Pkt Len Mean   1048575 non-null  float64
 11  Fwd Pkt Len Std    1048575 non-null  float64
 12  Bwd Pkt Len Max    1048575 non-null  int64  
 13  Bwd Pkt Len Min    1048575 non-null  int64  
 14  Bwd Pkt Len Mean   1048575 non-null  float64
 15  Bwd Pkt Len Std    1048575 non-n

None

Dataframe: '02-15-2018'



Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,0,0,15/02/2018 08:25:18,112641158,3,0,0,0,0,0,...,0,0.0,0.0,0,0,56320579.0,704.2784,56321077,56320081,Benign
1,22,6,15/02/2018 08:29:05,37366762,14,12,2168,2993,712,0,...,32,1024353.0,649038.754495,1601183,321569,11431221.0,3644991.0,15617415,8960247,Benign
2,47514,6,15/02/2018 08:29:42,543,2,0,64,0,64,0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,Benign
3,0,0,15/02/2018 08:28:07,112640703,3,0,0,0,0,0,...,0,0.0,0.0,0,0,56320351.5,366.9884,56320611,56320092,Benign
4,0,0,15/02/2018 08:30:56,112640874,3,0,0,0,0,0,...,0,0.0,0.0,0,0,56320437.0,719.8347,56320946,56319928,Benign


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 80 columns):
 #   Column             Non-Null Count    Dtype  
---  ------             --------------    -----  
 0   Dst Port           1048575 non-null  int64  
 1   Protocol           1048575 non-null  int64  
 2   Timestamp          1048575 non-null  object 
 3   Flow Duration      1048575 non-null  int64  
 4   Tot Fwd Pkts       1048575 non-null  int64  
 5   Tot Bwd Pkts       1048575 non-null  int64  
 6   TotLen Fwd Pkts    1048575 non-null  int64  
 7   TotLen Bwd Pkts    1048575 non-null  int64  
 8   Fwd Pkt Len Max    1048575 non-null  int64  
 9   Fwd Pkt Len Min    1048575 non-null  int64  
 10  Fwd Pkt Len Mean   1048575 non-null  float64
 11  Fwd Pkt Len Std    1048575 non-null  float64
 12  Bwd Pkt Len Max    1048575 non-null  int64  
 13  Bwd Pkt Len Min    1048575 non-null  int64  
 14  Bwd Pkt Len Mean   1048575 non-null  float64
 15  Bwd Pkt Len Std    1048575 non-n

None

Dataframe: '02-16-2018'



Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,0,0,16/02/2018 08:27:23,112640768,3,0,0,0,0,0,...,0,0,0.0,0,0,56300000.0,138.592929,56300000,56300000,Benign
1,0,0,16/02/2018 08:30:12,112641773,3,0,0,0,0,0,...,0,0,0.0,0,0,56300000.0,263.750829,56300000,56300000,Benign
2,35605,6,16/02/2018 08:26:55,20784143,23,44,2416,1344,240,64,...,20,2624734,0.0,2624734,2624734,9058214.0,0.0,9058214,9058214,Benign
3,0,0,16/02/2018 08:33:01,112640836,3,0,0,0,0,0,...,0,0,0.0,0,0,56300000.0,82.024387,56300000,56300000,Benign
4,23,6,16/02/2018 08:27:59,20,1,1,0,0,0,0,...,20,0,0.0,0,0,0.0,0.0,0,0,Benign


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 80 columns):
 #   Column             Non-Null Count    Dtype 
---  ------             --------------    ----- 
 0   Dst Port           1048575 non-null  object
 1   Protocol           1048575 non-null  object
 2   Timestamp          1048575 non-null  object
 3   Flow Duration      1048575 non-null  object
 4   Tot Fwd Pkts       1048575 non-null  object
 5   Tot Bwd Pkts       1048575 non-null  object
 6   TotLen Fwd Pkts    1048575 non-null  object
 7   TotLen Bwd Pkts    1048575 non-null  object
 8   Fwd Pkt Len Max    1048575 non-null  object
 9   Fwd Pkt Len Min    1048575 non-null  object
 10  Fwd Pkt Len Mean   1048575 non-null  object
 11  Fwd Pkt Len Std    1048575 non-null  object
 12  Bwd Pkt Len Max    1048575 non-null  object
 13  Bwd Pkt Len Min    1048575 non-null  object
 14  Bwd Pkt Len Mean   1048575 non-null  object
 15  Bwd Pkt Len Std    1048575 non-null  object
 16  

None

Dataframe: '02-20-2018'



Unnamed: 0,Flow ID,Src IP,Src Port,Dst IP,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,172.31.69.25-94.231.103.172-22-45498-6,94.231.103.172,45498,172.31.69.25,22,6,20/02/2018 08:34:07,888751,11,11,...,32,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
1,8.0.6.4-8.6.0.1-0-0-0,8.6.0.1,0,8.0.6.4,0,0,20/02/2018 08:33:22,112642816,3,0,...,0,0.0,0.0,0.0,0.0,56300000.0,7.071068,56300000.0,56300000.0,Benign
2,8.0.6.4-8.6.0.1-0-0-0,8.6.0.1,0,8.0.6.4,0,0,20/02/2018 08:36:11,112642712,3,0,...,0,0.0,0.0,0.0,0.0,56300000.0,18.384776,56300000.0,56300000.0,Benign
3,8.0.6.4-8.6.0.1-0-0-0,8.6.0.1,0,8.0.6.4,0,0,20/02/2018 08:39:00,112642648,3,0,...,0,0.0,0.0,0.0,0.0,56300000.0,5.656854,56300000.0,56300000.0,Benign
4,8.0.6.4-8.6.0.1-0-0-0,8.6.0.1,0,8.0.6.4,0,0,20/02/2018 08:41:49,112642702,3,0,...,0,0.0,0.0,0.0,0.0,56300000.0,65.053824,56300000.0,56300000.0,Benign


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7948748 entries, 0 to 7948747
Data columns (total 84 columns):
 #   Column             Dtype  
---  ------             -----  
 0   Flow ID            object 
 1   Src IP             object 
 2   Src Port           int64  
 3   Dst IP             object 
 4   Dst Port           int64  
 5   Protocol           int64  
 6   Timestamp          object 
 7   Flow Duration      int64  
 8   Tot Fwd Pkts       int64  
 9   Tot Bwd Pkts       int64  
 10  TotLen Fwd Pkts    float64
 11  TotLen Bwd Pkts    float64
 12  Fwd Pkt Len Max    float64
 13  Fwd Pkt Len Min    float64
 14  Fwd Pkt Len Mean   float64
 15  Fwd Pkt Len Std    float64
 16  Bwd Pkt Len Max    float64
 17  Bwd Pkt Len Min    float64
 18  Bwd Pkt Len Mean   float64
 19  Bwd Pkt Len Std    float64
 20  Flow Byts/s        float64
 21  Flow Pkts/s        float64
 22  Flow IAT Mean      float64
 23  Flow IAT Std       float64
 24  Flow IAT Max       float64
 25  Flow IAT Min      

None

Dataframe: '02-21-2018'



Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,80,6,21/02/2018 08:33:25,37953,5,3,135,127,135,0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,Benign
1,500,17,21/02/2018 08:33:06,117573474,3,0,1500,0,500,500,...,8,0.0,0.0,0,0,58800000.0,23800000.0,75600000,42000000,Benign
2,500,17,21/02/2018 08:33:06,117573474,3,0,1500,0,500,500,...,8,0.0,0.0,0,0,58800000.0,23800000.0,75600000,42000000,Benign
3,500,17,21/02/2018 08:33:11,99743998,5,0,2500,0,500,500,...,8,4000290.0,0.0,4000290,4000290,31900000.0,37900000.0,75600000,7200397,Benign
4,500,17,21/02/2018 08:33:11,99743999,5,0,2500,0,500,500,...,8,4000286.0,0.0,4000286,4000286,31900000.0,37900000.0,75600000,7200399,Benign


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 80 columns):
 #   Column             Non-Null Count    Dtype  
---  ------             --------------    -----  
 0   Dst Port           1048575 non-null  int64  
 1   Protocol           1048575 non-null  int64  
 2   Timestamp          1048575 non-null  object 
 3   Flow Duration      1048575 non-null  int64  
 4   Tot Fwd Pkts       1048575 non-null  int64  
 5   Tot Bwd Pkts       1048575 non-null  int64  
 6   TotLen Fwd Pkts    1048575 non-null  int64  
 7   TotLen Bwd Pkts    1048575 non-null  int64  
 8   Fwd Pkt Len Max    1048575 non-null  int64  
 9   Fwd Pkt Len Min    1048575 non-null  int64  
 10  Fwd Pkt Len Mean   1048575 non-null  float64
 11  Fwd Pkt Len Std    1048575 non-null  float64
 12  Bwd Pkt Len Max    1048575 non-null  int64  
 13  Bwd Pkt Len Min    1048575 non-null  int64  
 14  Bwd Pkt Len Mean   1048575 non-null  float64
 15  Bwd Pkt Len Std    1048575 non-n

None

Dataframe: '02-22-2018'



Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,22,6,22/02/2018 08:26:03,20553406,10,7,1063,1297,744,0,...,20,1027304.0,0.0,1027304,1027304,19526080.0,0.0,19526080,19526080,Benign
1,34989,6,22/02/2018 08:26:24,790,2,0,848,0,848,0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,Benign
2,500,17,22/02/2018 08:25:10,99745913,5,0,2500,0,500,500,...,8,4000203.0,0.0,4000203,4000203,31915240.0,37927870.0,75584115,7200679,Benign
3,500,17,22/02/2018 08:25:10,99745913,5,0,2500,0,500,500,...,8,4000189.0,0.0,4000189,4000189,31915240.0,37927880.0,75584130,7200693,Benign
4,500,17,22/02/2018 08:24:59,89481361,6,0,3000,0,500,500,...,8,4000554.0,0.0,4000554,4000554,21370200.0,15281090.0,41990741,7200848,Benign


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 80 columns):
 #   Column             Non-Null Count    Dtype  
---  ------             --------------    -----  
 0   Dst Port           1048575 non-null  int64  
 1   Protocol           1048575 non-null  int64  
 2   Timestamp          1048575 non-null  object 
 3   Flow Duration      1048575 non-null  int64  
 4   Tot Fwd Pkts       1048575 non-null  int64  
 5   Tot Bwd Pkts       1048575 non-null  int64  
 6   TotLen Fwd Pkts    1048575 non-null  int64  
 7   TotLen Bwd Pkts    1048575 non-null  int64  
 8   Fwd Pkt Len Max    1048575 non-null  int64  
 9   Fwd Pkt Len Min    1048575 non-null  int64  
 10  Fwd Pkt Len Mean   1048575 non-null  float64
 11  Fwd Pkt Len Std    1048575 non-null  float64
 12  Bwd Pkt Len Max    1048575 non-null  int64  
 13  Bwd Pkt Len Min    1048575 non-null  int64  
 14  Bwd Pkt Len Mean   1048575 non-null  float64
 15  Bwd Pkt Len Std    1048575 non-n

None

Dataframe: '02-23-2018'



Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,22,6,23/02/2018 08:18:29,1532698,11,11,1179,1969,648,0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,Benign
1,500,17,23/02/2018 08:17:45,117573855,3,0,1500,0,500,500,...,8,0.0,0.0,0,0,58786927.5,23753240.0,75583006,41990849,Benign
2,500,17,23/02/2018 08:17:45,117573848,3,0,1500,0,500,500,...,8,0.0,0.0,0,0,58786924.0,23753250.0,75583007,41990841,Benign
3,22,6,23/02/2018 08:19:55,1745392,11,11,1179,1969,648,0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,Benign
4,500,17,23/02/2018 08:18:17,89483474,6,0,3000,0,500,500,...,8,4000364.0,0.0,4000364,4000364,21370777.5,15280920.0,41989576,7200485,Benign


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 80 columns):
 #   Column             Non-Null Count    Dtype  
---  ------             --------------    -----  
 0   Dst Port           1048575 non-null  int64  
 1   Protocol           1048575 non-null  int64  
 2   Timestamp          1048575 non-null  object 
 3   Flow Duration      1048575 non-null  int64  
 4   Tot Fwd Pkts       1048575 non-null  int64  
 5   Tot Bwd Pkts       1048575 non-null  int64  
 6   TotLen Fwd Pkts    1048575 non-null  int64  
 7   TotLen Bwd Pkts    1048575 non-null  int64  
 8   Fwd Pkt Len Max    1048575 non-null  int64  
 9   Fwd Pkt Len Min    1048575 non-null  int64  
 10  Fwd Pkt Len Mean   1048575 non-null  float64
 11  Fwd Pkt Len Std    1048575 non-null  float64
 12  Bwd Pkt Len Max    1048575 non-null  int64  
 13  Bwd Pkt Len Min    1048575 non-null  int64  
 14  Bwd Pkt Len Mean   1048575 non-null  float64
 15  Bwd Pkt Len Std    1048575 non-n

None

Dataframe: '02-28-2018'



Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,443,6,28/02/2018 08:22:13,94658,6,7,708,3718,387,0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,Benign
1,443,6,28/02/2018 08:22:13,206,2,0,0,0,0,0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,Benign
2,445,6,28/02/2018 08:22:15,165505,3,1,0,0,0,0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,Benign
3,443,6,28/02/2018 08:22:16,102429,6,7,708,3718,387,0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,Benign
4,443,6,28/02/2018 08:22:16,167,2,0,0,0,0,0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,Benign


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 613104 entries, 0 to 613103
Data columns (total 80 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   Dst Port           613104 non-null  object
 1   Protocol           613104 non-null  object
 2   Timestamp          613104 non-null  object
 3   Flow Duration      613104 non-null  object
 4   Tot Fwd Pkts       613104 non-null  object
 5   Tot Bwd Pkts       613104 non-null  object
 6   TotLen Fwd Pkts    613104 non-null  object
 7   TotLen Bwd Pkts    613104 non-null  object
 8   Fwd Pkt Len Max    613104 non-null  object
 9   Fwd Pkt Len Min    613104 non-null  object
 10  Fwd Pkt Len Mean   613104 non-null  object
 11  Fwd Pkt Len Std    613104 non-null  object
 12  Bwd Pkt Len Max    613104 non-null  object
 13  Bwd Pkt Len Min    613104 non-null  object
 14  Bwd Pkt Len Mean   613104 non-null  object
 15  Bwd Pkt Len Std    613104 non-null  object
 16  Flow Byts/s        6

None

Dataframe: '03-01-2018'



Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,0,0,01/03/2018 08:17:11,115307855,5,0,0,0,0,0,...,0,1812348,0,1812348,1812348,56700000,6010057.622,61000000,52500000,Benign
1,0,0,01/03/2018 08:20:07,60997457,2,0,0,0,0,0,...,0,0,0,0,0,61000000,0.0,61000000,61000000,Benign
2,67,17,01/03/2018 08:17:18,61149019,5,0,1500,0,300,300,...,8,3530939,0,3530939,3530939,19200000,12500000.0,32600000,7999725,Benign
3,0,0,01/03/2018 08:22:09,60997555,2,0,0,0,0,0,...,0,0,0,0,0,61000000,0.0,61000000,61000000,Benign
4,0,0,01/03/2018 08:24:11,61997503,3,0,0,0,0,0,...,0,999909,0,999909,999909,61000000,0.0,61000000,61000000,Benign


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 331125 entries, 0 to 331124
Data columns (total 80 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   Dst Port           331125 non-null  object
 1   Protocol           331125 non-null  object
 2   Timestamp          331125 non-null  object
 3   Flow Duration      331125 non-null  object
 4   Tot Fwd Pkts       331125 non-null  object
 5   Tot Bwd Pkts       331125 non-null  object
 6   TotLen Fwd Pkts    331125 non-null  object
 7   TotLen Bwd Pkts    331125 non-null  object
 8   Fwd Pkt Len Max    331125 non-null  object
 9   Fwd Pkt Len Min    331125 non-null  object
 10  Fwd Pkt Len Mean   331125 non-null  object
 11  Fwd Pkt Len Std    331125 non-null  object
 12  Bwd Pkt Len Max    331125 non-null  object
 13  Bwd Pkt Len Min    331125 non-null  object
 14  Bwd Pkt Len Mean   331125 non-null  object
 15  Bwd Pkt Len Std    331125 non-null  object
 16  Flow Byts/s        3

None

Dataframe: '03-02-2018'



Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,443,6,02/03/2018 08:47:38,141385,9,7,553,3773.0,202,0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
1,49684,6,02/03/2018 08:47:38,281,2,1,38,0.0,38,0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
2,443,6,02/03/2018 08:47:40,279824,11,15,1086,10527.0,385,0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
3,443,6,02/03/2018 08:47:40,132,2,0,0,0.0,0,0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
4,443,6,02/03/2018 08:47:41,274016,9,13,1285,6141.0,517,0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 80 columns):
 #   Column             Non-Null Count    Dtype  
---  ------             --------------    -----  
 0   Dst Port           1048575 non-null  int64  
 1   Protocol           1048575 non-null  int64  
 2   Timestamp          1048575 non-null  object 
 3   Flow Duration      1048575 non-null  int64  
 4   Tot Fwd Pkts       1048575 non-null  int64  
 5   Tot Bwd Pkts       1048575 non-null  int64  
 6   TotLen Fwd Pkts    1048575 non-null  int64  
 7   TotLen Bwd Pkts    1048575 non-null  float64
 8   Fwd Pkt Len Max    1048575 non-null  int64  
 9   Fwd Pkt Len Min    1048575 non-null  int64  
 10  Fwd Pkt Len Mean   1048575 non-null  float64
 11  Fwd Pkt Len Std    1048575 non-null  float64
 12  Bwd Pkt Len Max    1048575 non-null  int64  
 13  Bwd Pkt Len Min    1048575 non-null  int64  
 14  Bwd Pkt Len Mean   1048575 non-null  float64
 15  Bwd Pkt Len Std    1048575 non-n

None

In [31]:
# Shape and columns
for key in dfs.keys():
  print(f"Dataframe: '{key}'\n")
  display(dfs[key].shape, dfs[key].columns)

Dataframe: '02-14-2018'



(1048575, 80)

Index(['Dst Port', 'Protocol', 'Timestamp', 'Flow Duration', 'Tot Fwd Pkts',
       'Tot Bwd Pkts', 'TotLen Fwd Pkts', 'TotLen Bwd Pkts', 'Fwd Pkt Len Max',
       'Fwd Pkt Len Min', 'Fwd Pkt Len Mean', 'Fwd Pkt Len Std',
       'Bwd Pkt Len Max', 'Bwd Pkt Len Min', 'Bwd Pkt Len Mean',
       'Bwd Pkt Len Std', 'Flow Byts/s', 'Flow Pkts/s', 'Flow IAT Mean',
       'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Tot',
       'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min',
       'Bwd IAT Tot', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max',
       'Bwd IAT Min', 'Fwd PSH Flags', 'Bwd PSH Flags', 'Fwd URG Flags',
       'Bwd URG Flags', 'Fwd Header Len', 'Bwd Header Len', 'Fwd Pkts/s',
       'Bwd Pkts/s', 'Pkt Len Min', 'Pkt Len Max', 'Pkt Len Mean',
       'Pkt Len Std', 'Pkt Len Var', 'FIN Flag Cnt', 'SYN Flag Cnt',
       'RST Flag Cnt', 'PSH Flag Cnt', 'ACK Flag Cnt', 'URG Flag Cnt',
       'CWE Flag Count', 'ECE Flag Cnt', 'Down/Up Ratio', 'Pkt Size Avg',
      

Dataframe: '02-15-2018'



(1048575, 80)

Index(['Dst Port', 'Protocol', 'Timestamp', 'Flow Duration', 'Tot Fwd Pkts',
       'Tot Bwd Pkts', 'TotLen Fwd Pkts', 'TotLen Bwd Pkts', 'Fwd Pkt Len Max',
       'Fwd Pkt Len Min', 'Fwd Pkt Len Mean', 'Fwd Pkt Len Std',
       'Bwd Pkt Len Max', 'Bwd Pkt Len Min', 'Bwd Pkt Len Mean',
       'Bwd Pkt Len Std', 'Flow Byts/s', 'Flow Pkts/s', 'Flow IAT Mean',
       'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Tot',
       'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min',
       'Bwd IAT Tot', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max',
       'Bwd IAT Min', 'Fwd PSH Flags', 'Bwd PSH Flags', 'Fwd URG Flags',
       'Bwd URG Flags', 'Fwd Header Len', 'Bwd Header Len', 'Fwd Pkts/s',
       'Bwd Pkts/s', 'Pkt Len Min', 'Pkt Len Max', 'Pkt Len Mean',
       'Pkt Len Std', 'Pkt Len Var', 'FIN Flag Cnt', 'SYN Flag Cnt',
       'RST Flag Cnt', 'PSH Flag Cnt', 'ACK Flag Cnt', 'URG Flag Cnt',
       'CWE Flag Count', 'ECE Flag Cnt', 'Down/Up Ratio', 'Pkt Size Avg',
      

Dataframe: '02-16-2018'



(1048575, 80)

Index(['Dst Port', 'Protocol', 'Timestamp', 'Flow Duration', 'Tot Fwd Pkts',
       'Tot Bwd Pkts', 'TotLen Fwd Pkts', 'TotLen Bwd Pkts', 'Fwd Pkt Len Max',
       'Fwd Pkt Len Min', 'Fwd Pkt Len Mean', 'Fwd Pkt Len Std',
       'Bwd Pkt Len Max', 'Bwd Pkt Len Min', 'Bwd Pkt Len Mean',
       'Bwd Pkt Len Std', 'Flow Byts/s', 'Flow Pkts/s', 'Flow IAT Mean',
       'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Tot',
       'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min',
       'Bwd IAT Tot', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max',
       'Bwd IAT Min', 'Fwd PSH Flags', 'Bwd PSH Flags', 'Fwd URG Flags',
       'Bwd URG Flags', 'Fwd Header Len', 'Bwd Header Len', 'Fwd Pkts/s',
       'Bwd Pkts/s', 'Pkt Len Min', 'Pkt Len Max', 'Pkt Len Mean',
       'Pkt Len Std', 'Pkt Len Var', 'FIN Flag Cnt', 'SYN Flag Cnt',
       'RST Flag Cnt', 'PSH Flag Cnt', 'ACK Flag Cnt', 'URG Flag Cnt',
       'CWE Flag Count', 'ECE Flag Cnt', 'Down/Up Ratio', 'Pkt Size Avg',
      

Dataframe: '02-20-2018'



(7948748, 84)

Index(['Flow ID', 'Src IP', 'Src Port', 'Dst IP', 'Dst Port', 'Protocol',
       'Timestamp', 'Flow Duration', 'Tot Fwd Pkts', 'Tot Bwd Pkts',
       'TotLen Fwd Pkts', 'TotLen Bwd Pkts', 'Fwd Pkt Len Max',
       'Fwd Pkt Len Min', 'Fwd Pkt Len Mean', 'Fwd Pkt Len Std',
       'Bwd Pkt Len Max', 'Bwd Pkt Len Min', 'Bwd Pkt Len Mean',
       'Bwd Pkt Len Std', 'Flow Byts/s', 'Flow Pkts/s', 'Flow IAT Mean',
       'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Tot',
       'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min',
       'Bwd IAT Tot', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max',
       'Bwd IAT Min', 'Fwd PSH Flags', 'Bwd PSH Flags', 'Fwd URG Flags',
       'Bwd URG Flags', 'Fwd Header Len', 'Bwd Header Len', 'Fwd Pkts/s',
       'Bwd Pkts/s', 'Pkt Len Min', 'Pkt Len Max', 'Pkt Len Mean',
       'Pkt Len Std', 'Pkt Len Var', 'FIN Flag Cnt', 'SYN Flag Cnt',
       'RST Flag Cnt', 'PSH Flag Cnt', 'ACK Flag Cnt', 'URG Flag Cnt',
       'CWE Flag Count', 'ECE 

Dataframe: '02-21-2018'



(1048575, 80)

Index(['Dst Port', 'Protocol', 'Timestamp', 'Flow Duration', 'Tot Fwd Pkts',
       'Tot Bwd Pkts', 'TotLen Fwd Pkts', 'TotLen Bwd Pkts', 'Fwd Pkt Len Max',
       'Fwd Pkt Len Min', 'Fwd Pkt Len Mean', 'Fwd Pkt Len Std',
       'Bwd Pkt Len Max', 'Bwd Pkt Len Min', 'Bwd Pkt Len Mean',
       'Bwd Pkt Len Std', 'Flow Byts/s', 'Flow Pkts/s', 'Flow IAT Mean',
       'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Tot',
       'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min',
       'Bwd IAT Tot', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max',
       'Bwd IAT Min', 'Fwd PSH Flags', 'Bwd PSH Flags', 'Fwd URG Flags',
       'Bwd URG Flags', 'Fwd Header Len', 'Bwd Header Len', 'Fwd Pkts/s',
       'Bwd Pkts/s', 'Pkt Len Min', 'Pkt Len Max', 'Pkt Len Mean',
       'Pkt Len Std', 'Pkt Len Var', 'FIN Flag Cnt', 'SYN Flag Cnt',
       'RST Flag Cnt', 'PSH Flag Cnt', 'ACK Flag Cnt', 'URG Flag Cnt',
       'CWE Flag Count', 'ECE Flag Cnt', 'Down/Up Ratio', 'Pkt Size Avg',
      

Dataframe: '02-22-2018'



(1048575, 80)

Index(['Dst Port', 'Protocol', 'Timestamp', 'Flow Duration', 'Tot Fwd Pkts',
       'Tot Bwd Pkts', 'TotLen Fwd Pkts', 'TotLen Bwd Pkts', 'Fwd Pkt Len Max',
       'Fwd Pkt Len Min', 'Fwd Pkt Len Mean', 'Fwd Pkt Len Std',
       'Bwd Pkt Len Max', 'Bwd Pkt Len Min', 'Bwd Pkt Len Mean',
       'Bwd Pkt Len Std', 'Flow Byts/s', 'Flow Pkts/s', 'Flow IAT Mean',
       'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Tot',
       'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min',
       'Bwd IAT Tot', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max',
       'Bwd IAT Min', 'Fwd PSH Flags', 'Bwd PSH Flags', 'Fwd URG Flags',
       'Bwd URG Flags', 'Fwd Header Len', 'Bwd Header Len', 'Fwd Pkts/s',
       'Bwd Pkts/s', 'Pkt Len Min', 'Pkt Len Max', 'Pkt Len Mean',
       'Pkt Len Std', 'Pkt Len Var', 'FIN Flag Cnt', 'SYN Flag Cnt',
       'RST Flag Cnt', 'PSH Flag Cnt', 'ACK Flag Cnt', 'URG Flag Cnt',
       'CWE Flag Count', 'ECE Flag Cnt', 'Down/Up Ratio', 'Pkt Size Avg',
      

Dataframe: '02-23-2018'



(1048575, 80)

Index(['Dst Port', 'Protocol', 'Timestamp', 'Flow Duration', 'Tot Fwd Pkts',
       'Tot Bwd Pkts', 'TotLen Fwd Pkts', 'TotLen Bwd Pkts', 'Fwd Pkt Len Max',
       'Fwd Pkt Len Min', 'Fwd Pkt Len Mean', 'Fwd Pkt Len Std',
       'Bwd Pkt Len Max', 'Bwd Pkt Len Min', 'Bwd Pkt Len Mean',
       'Bwd Pkt Len Std', 'Flow Byts/s', 'Flow Pkts/s', 'Flow IAT Mean',
       'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Tot',
       'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min',
       'Bwd IAT Tot', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max',
       'Bwd IAT Min', 'Fwd PSH Flags', 'Bwd PSH Flags', 'Fwd URG Flags',
       'Bwd URG Flags', 'Fwd Header Len', 'Bwd Header Len', 'Fwd Pkts/s',
       'Bwd Pkts/s', 'Pkt Len Min', 'Pkt Len Max', 'Pkt Len Mean',
       'Pkt Len Std', 'Pkt Len Var', 'FIN Flag Cnt', 'SYN Flag Cnt',
       'RST Flag Cnt', 'PSH Flag Cnt', 'ACK Flag Cnt', 'URG Flag Cnt',
       'CWE Flag Count', 'ECE Flag Cnt', 'Down/Up Ratio', 'Pkt Size Avg',
      

Dataframe: '02-28-2018'



(613104, 80)

Index(['Dst Port', 'Protocol', 'Timestamp', 'Flow Duration', 'Tot Fwd Pkts',
       'Tot Bwd Pkts', 'TotLen Fwd Pkts', 'TotLen Bwd Pkts', 'Fwd Pkt Len Max',
       'Fwd Pkt Len Min', 'Fwd Pkt Len Mean', 'Fwd Pkt Len Std',
       'Bwd Pkt Len Max', 'Bwd Pkt Len Min', 'Bwd Pkt Len Mean',
       'Bwd Pkt Len Std', 'Flow Byts/s', 'Flow Pkts/s', 'Flow IAT Mean',
       'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Tot',
       'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min',
       'Bwd IAT Tot', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max',
       'Bwd IAT Min', 'Fwd PSH Flags', 'Bwd PSH Flags', 'Fwd URG Flags',
       'Bwd URG Flags', 'Fwd Header Len', 'Bwd Header Len', 'Fwd Pkts/s',
       'Bwd Pkts/s', 'Pkt Len Min', 'Pkt Len Max', 'Pkt Len Mean',
       'Pkt Len Std', 'Pkt Len Var', 'FIN Flag Cnt', 'SYN Flag Cnt',
       'RST Flag Cnt', 'PSH Flag Cnt', 'ACK Flag Cnt', 'URG Flag Cnt',
       'CWE Flag Count', 'ECE Flag Cnt', 'Down/Up Ratio', 'Pkt Size Avg',
      

Dataframe: '03-01-2018'



(331125, 80)

Index(['Dst Port', 'Protocol', 'Timestamp', 'Flow Duration', 'Tot Fwd Pkts',
       'Tot Bwd Pkts', 'TotLen Fwd Pkts', 'TotLen Bwd Pkts', 'Fwd Pkt Len Max',
       'Fwd Pkt Len Min', 'Fwd Pkt Len Mean', 'Fwd Pkt Len Std',
       'Bwd Pkt Len Max', 'Bwd Pkt Len Min', 'Bwd Pkt Len Mean',
       'Bwd Pkt Len Std', 'Flow Byts/s', 'Flow Pkts/s', 'Flow IAT Mean',
       'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Tot',
       'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min',
       'Bwd IAT Tot', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max',
       'Bwd IAT Min', 'Fwd PSH Flags', 'Bwd PSH Flags', 'Fwd URG Flags',
       'Bwd URG Flags', 'Fwd Header Len', 'Bwd Header Len', 'Fwd Pkts/s',
       'Bwd Pkts/s', 'Pkt Len Min', 'Pkt Len Max', 'Pkt Len Mean',
       'Pkt Len Std', 'Pkt Len Var', 'FIN Flag Cnt', 'SYN Flag Cnt',
       'RST Flag Cnt', 'PSH Flag Cnt', 'ACK Flag Cnt', 'URG Flag Cnt',
       'CWE Flag Count', 'ECE Flag Cnt', 'Down/Up Ratio', 'Pkt Size Avg',
      

Dataframe: '03-02-2018'



(1048575, 80)

Index(['Dst Port', 'Protocol', 'Timestamp', 'Flow Duration', 'Tot Fwd Pkts',
       'Tot Bwd Pkts', 'TotLen Fwd Pkts', 'TotLen Bwd Pkts', 'Fwd Pkt Len Max',
       'Fwd Pkt Len Min', 'Fwd Pkt Len Mean', 'Fwd Pkt Len Std',
       'Bwd Pkt Len Max', 'Bwd Pkt Len Min', 'Bwd Pkt Len Mean',
       'Bwd Pkt Len Std', 'Flow Byts/s', 'Flow Pkts/s', 'Flow IAT Mean',
       'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Tot',
       'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min',
       'Bwd IAT Tot', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max',
       'Bwd IAT Min', 'Fwd PSH Flags', 'Bwd PSH Flags', 'Fwd URG Flags',
       'Bwd URG Flags', 'Fwd Header Len', 'Bwd Header Len', 'Fwd Pkts/s',
       'Bwd Pkts/s', 'Pkt Len Min', 'Pkt Len Max', 'Pkt Len Mean',
       'Pkt Len Std', 'Pkt Len Var', 'FIN Flag Cnt', 'SYN Flag Cnt',
       'RST Flag Cnt', 'PSH Flag Cnt', 'ACK Flag Cnt', 'URG Flag Cnt',
       'CWE Flag Count', 'ECE Flag Cnt', 'Down/Up Ratio', 'Pkt Size Avg',
      

In [32]:
# Drop Timestamp before normalization as it is in string format (object)
# Removing the "Timestamp" column is essential as it is irrelevant to the learning process,
# helps reduce dimensionality, enhances model interpretability, prevents overfitting, and
# contributes to improved data cleaning

# Loop through each DataFrame in the df_copy dictionary
for key in dfs.keys():
  df = dfs[key]
  # Check if 'Timestamp' column exists in the DataFrame
  if 'Timestamp' in df.columns:
    print(f"Dataframe: '{key}'\n")
    # Drop the 'Timestamp' column if it exists
    df.drop(columns=['Timestamp'], inplace=True)

Dataframe: '02-14-2018'

Dataframe: '02-15-2018'

Dataframe: '02-16-2018'

Dataframe: '02-20-2018'

Dataframe: '02-21-2018'

Dataframe: '02-22-2018'

Dataframe: '02-23-2018'

Dataframe: '02-28-2018'

Dataframe: '03-01-2018'

Dataframe: '03-02-2018'



In [33]:
# Dataframe '02-20-2018' has 4 extra features ('Flow ID', 'Src IP', 'Src Port', 'Dst IP') that we will remove

# Columns to drop by name
columns_to_drop = ['Flow ID', 'Src IP', 'Src Port', 'Dst IP']

for key in dfs.keys():
  if key == '02-20-2018':
    dfs[key] = dfs[key].drop(columns=columns_to_drop)

In [34]:
# Check if we have the same shape (7948748, 80)
display(dfs['02-20-2018'].shape)

(7948748, 79)

In [35]:
# Converting object-type columns to numeric formats is essential before normalization to enable compatibility
# with machine learning algorithms and avoid data type errors during the scaling process

# Loop through each DataFrame in the dictionary
for key in dfs.keys():
  df = dfs[key]

  # Select columns with object data type, excluding 'Label'
  object_columns = df.select_dtypes(include=['object']).columns.difference(['Label'])

  # Convert object columns to numeric (int64 or float64)
  df[object_columns] = df[object_columns].apply(pd.to_numeric, errors='coerce')

  # Print the updated DataFrame
  print(f"DataFrame '{key}' after transformation:")
  display(df.info())


DataFrame '02-14-2018' after transformation:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 79 columns):
 #   Column             Non-Null Count    Dtype  
---  ------             --------------    -----  
 0   Dst Port           1048575 non-null  int64  
 1   Protocol           1048575 non-null  int64  
 2   Flow Duration      1048575 non-null  int64  
 3   Tot Fwd Pkts       1048575 non-null  int64  
 4   Tot Bwd Pkts       1048575 non-null  int64  
 5   TotLen Fwd Pkts    1048575 non-null  int64  
 6   TotLen Bwd Pkts    1048575 non-null  int64  
 7   Fwd Pkt Len Max    1048575 non-null  int64  
 8   Fwd Pkt Len Min    1048575 non-null  int64  
 9   Fwd Pkt Len Mean   1048575 non-null  float64
 10  Fwd Pkt Len Std    1048575 non-null  float64
 11  Bwd Pkt Len Max    1048575 non-null  int64  
 12  Bwd Pkt Len Min    1048575 non-null  int64  
 13  Bwd Pkt Len Mean   1048575 non-null  float64
 14  Bwd Pkt Len Std    1048575 non-null  

None

DataFrame '02-15-2018' after transformation:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 79 columns):
 #   Column             Non-Null Count    Dtype  
---  ------             --------------    -----  
 0   Dst Port           1048575 non-null  int64  
 1   Protocol           1048575 non-null  int64  
 2   Flow Duration      1048575 non-null  int64  
 3   Tot Fwd Pkts       1048575 non-null  int64  
 4   Tot Bwd Pkts       1048575 non-null  int64  
 5   TotLen Fwd Pkts    1048575 non-null  int64  
 6   TotLen Bwd Pkts    1048575 non-null  int64  
 7   Fwd Pkt Len Max    1048575 non-null  int64  
 8   Fwd Pkt Len Min    1048575 non-null  int64  
 9   Fwd Pkt Len Mean   1048575 non-null  float64
 10  Fwd Pkt Len Std    1048575 non-null  float64
 11  Bwd Pkt Len Max    1048575 non-null  int64  
 12  Bwd Pkt Len Min    1048575 non-null  int64  
 13  Bwd Pkt Len Mean   1048575 non-null  float64
 14  Bwd Pkt Len Std    1048575 non-null  

None

DataFrame '02-16-2018' after transformation:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 79 columns):
 #   Column             Non-Null Count    Dtype  
---  ------             --------------    -----  
 0   Dst Port           1048574 non-null  float64
 1   Protocol           1048574 non-null  float64
 2   Flow Duration      1048574 non-null  float64
 3   Tot Fwd Pkts       1048574 non-null  float64
 4   Tot Bwd Pkts       1048574 non-null  float64
 5   TotLen Fwd Pkts    1048574 non-null  float64
 6   TotLen Bwd Pkts    1048574 non-null  float64
 7   Fwd Pkt Len Max    1048574 non-null  float64
 8   Fwd Pkt Len Min    1048574 non-null  float64
 9   Fwd Pkt Len Mean   1048574 non-null  float64
 10  Fwd Pkt Len Std    1048574 non-null  float64
 11  Bwd Pkt Len Max    1048574 non-null  float64
 12  Bwd Pkt Len Min    1048574 non-null  float64
 13  Bwd Pkt Len Mean   1048574 non-null  float64
 14  Bwd Pkt Len Std    1048574 non-null  

None

DataFrame '02-20-2018' after transformation:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7948748 entries, 0 to 7948747
Data columns (total 79 columns):
 #   Column             Dtype  
---  ------             -----  
 0   Dst Port           int64  
 1   Protocol           int64  
 2   Flow Duration      int64  
 3   Tot Fwd Pkts       int64  
 4   Tot Bwd Pkts       int64  
 5   TotLen Fwd Pkts    float64
 6   TotLen Bwd Pkts    float64
 7   Fwd Pkt Len Max    float64
 8   Fwd Pkt Len Min    float64
 9   Fwd Pkt Len Mean   float64
 10  Fwd Pkt Len Std    float64
 11  Bwd Pkt Len Max    float64
 12  Bwd Pkt Len Min    float64
 13  Bwd Pkt Len Mean   float64
 14  Bwd Pkt Len Std    float64
 15  Flow Byts/s        float64
 16  Flow Pkts/s        float64
 17  Flow IAT Mean      float64
 18  Flow IAT Std       float64
 19  Flow IAT Max       float64
 20  Flow IAT Min       float64
 21  Fwd IAT Tot        float64
 22  Fwd IAT Mean       float64
 23  Fwd IAT Std        float64
 24  Fwd I

None

DataFrame '02-21-2018' after transformation:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 79 columns):
 #   Column             Non-Null Count    Dtype  
---  ------             --------------    -----  
 0   Dst Port           1048575 non-null  int64  
 1   Protocol           1048575 non-null  int64  
 2   Flow Duration      1048575 non-null  int64  
 3   Tot Fwd Pkts       1048575 non-null  int64  
 4   Tot Bwd Pkts       1048575 non-null  int64  
 5   TotLen Fwd Pkts    1048575 non-null  int64  
 6   TotLen Bwd Pkts    1048575 non-null  int64  
 7   Fwd Pkt Len Max    1048575 non-null  int64  
 8   Fwd Pkt Len Min    1048575 non-null  int64  
 9   Fwd Pkt Len Mean   1048575 non-null  float64
 10  Fwd Pkt Len Std    1048575 non-null  float64
 11  Bwd Pkt Len Max    1048575 non-null  int64  
 12  Bwd Pkt Len Min    1048575 non-null  int64  
 13  Bwd Pkt Len Mean   1048575 non-null  float64
 14  Bwd Pkt Len Std    1048575 non-null  

None

DataFrame '02-22-2018' after transformation:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 79 columns):
 #   Column             Non-Null Count    Dtype  
---  ------             --------------    -----  
 0   Dst Port           1048575 non-null  int64  
 1   Protocol           1048575 non-null  int64  
 2   Flow Duration      1048575 non-null  int64  
 3   Tot Fwd Pkts       1048575 non-null  int64  
 4   Tot Bwd Pkts       1048575 non-null  int64  
 5   TotLen Fwd Pkts    1048575 non-null  int64  
 6   TotLen Bwd Pkts    1048575 non-null  int64  
 7   Fwd Pkt Len Max    1048575 non-null  int64  
 8   Fwd Pkt Len Min    1048575 non-null  int64  
 9   Fwd Pkt Len Mean   1048575 non-null  float64
 10  Fwd Pkt Len Std    1048575 non-null  float64
 11  Bwd Pkt Len Max    1048575 non-null  int64  
 12  Bwd Pkt Len Min    1048575 non-null  int64  
 13  Bwd Pkt Len Mean   1048575 non-null  float64
 14  Bwd Pkt Len Std    1048575 non-null  

None

DataFrame '02-23-2018' after transformation:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 79 columns):
 #   Column             Non-Null Count    Dtype  
---  ------             --------------    -----  
 0   Dst Port           1048575 non-null  int64  
 1   Protocol           1048575 non-null  int64  
 2   Flow Duration      1048575 non-null  int64  
 3   Tot Fwd Pkts       1048575 non-null  int64  
 4   Tot Bwd Pkts       1048575 non-null  int64  
 5   TotLen Fwd Pkts    1048575 non-null  int64  
 6   TotLen Bwd Pkts    1048575 non-null  int64  
 7   Fwd Pkt Len Max    1048575 non-null  int64  
 8   Fwd Pkt Len Min    1048575 non-null  int64  
 9   Fwd Pkt Len Mean   1048575 non-null  float64
 10  Fwd Pkt Len Std    1048575 non-null  float64
 11  Bwd Pkt Len Max    1048575 non-null  int64  
 12  Bwd Pkt Len Min    1048575 non-null  int64  
 13  Bwd Pkt Len Mean   1048575 non-null  float64
 14  Bwd Pkt Len Std    1048575 non-null  

None

DataFrame '02-28-2018' after transformation:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 613104 entries, 0 to 613103
Data columns (total 79 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Dst Port           613071 non-null  float64
 1   Protocol           613071 non-null  float64
 2   Flow Duration      613071 non-null  float64
 3   Tot Fwd Pkts       613071 non-null  float64
 4   Tot Bwd Pkts       613071 non-null  float64
 5   TotLen Fwd Pkts    613071 non-null  float64
 6   TotLen Bwd Pkts    613071 non-null  float64
 7   Fwd Pkt Len Max    613071 non-null  float64
 8   Fwd Pkt Len Min    613071 non-null  float64
 9   Fwd Pkt Len Mean   613071 non-null  float64
 10  Fwd Pkt Len Std    613071 non-null  float64
 11  Bwd Pkt Len Max    613071 non-null  float64
 12  Bwd Pkt Len Min    613071 non-null  float64
 13  Bwd Pkt Len Mean   613071 non-null  float64
 14  Bwd Pkt Len Std    613071 non-null  float64
 15  Flow B

None

DataFrame '03-01-2018' after transformation:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 331125 entries, 0 to 331124
Data columns (total 79 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Dst Port           331100 non-null  float64
 1   Protocol           331100 non-null  float64
 2   Flow Duration      331100 non-null  float64
 3   Tot Fwd Pkts       331100 non-null  float64
 4   Tot Bwd Pkts       331100 non-null  float64
 5   TotLen Fwd Pkts    331100 non-null  float64
 6   TotLen Bwd Pkts    331100 non-null  float64
 7   Fwd Pkt Len Max    331100 non-null  float64
 8   Fwd Pkt Len Min    331100 non-null  float64
 9   Fwd Pkt Len Mean   331100 non-null  float64
 10  Fwd Pkt Len Std    331100 non-null  float64
 11  Bwd Pkt Len Max    331100 non-null  float64
 12  Bwd Pkt Len Min    331100 non-null  float64
 13  Bwd Pkt Len Mean   331100 non-null  float64
 14  Bwd Pkt Len Std    331100 non-null  float64
 15  Flow B

None

DataFrame '03-02-2018' after transformation:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 79 columns):
 #   Column             Non-Null Count    Dtype  
---  ------             --------------    -----  
 0   Dst Port           1048575 non-null  int64  
 1   Protocol           1048575 non-null  int64  
 2   Flow Duration      1048575 non-null  int64  
 3   Tot Fwd Pkts       1048575 non-null  int64  
 4   Tot Bwd Pkts       1048575 non-null  int64  
 5   TotLen Fwd Pkts    1048575 non-null  int64  
 6   TotLen Bwd Pkts    1048575 non-null  float64
 7   Fwd Pkt Len Max    1048575 non-null  int64  
 8   Fwd Pkt Len Min    1048575 non-null  int64  
 9   Fwd Pkt Len Mean   1048575 non-null  float64
 10  Fwd Pkt Len Std    1048575 non-null  float64
 11  Bwd Pkt Len Max    1048575 non-null  int64  
 12  Bwd Pkt Len Min    1048575 non-null  int64  
 13  Bwd Pkt Len Mean   1048575 non-null  float64
 14  Bwd Pkt Len Std    1048575 non-null  

None

In [36]:
# Display the counts with labels for dfs

for key in dfs.keys():
  df = dfs[key]
  label_counts_df = df["Label"].value_counts()
  print(f"\nLabel counts for '{key}' :")
  print(label_counts_df)


Label counts for '02-14-2018' :
Benign            667626
FTP-BruteForce    193360
SSH-Bruteforce    187589
Name: Label, dtype: int64

Label counts for '02-15-2018' :
Benign                   996077
DoS attacks-GoldenEye     41508
DoS attacks-Slowloris     10990
Name: Label, dtype: int64

Label counts for '02-16-2018' :
DoS attacks-Hulk            461912
Benign                      446772
DoS attacks-SlowHTTPTest    139890
Label                            1
Name: Label, dtype: int64

Label counts for '02-20-2018' :
Benign                    7372557
DDoS attacks-LOIC-HTTP     576191
Name: Label, dtype: int64

Label counts for '02-21-2018' :
DDOS attack-HOIC        686012
Benign                  360833
DDOS attack-LOIC-UDP      1730
Name: Label, dtype: int64

Label counts for '02-22-2018' :
Benign              1048213
Brute Force -Web        249
Brute Force -XSS         79
SQL Injection            34
Name: Label, dtype: int64

Label counts for '02-23-2018' :
Benign              1048009
B

In [37]:
dfs_copy = {}

for key in dfs.keys():
  df = dfs[key]

  print(f"Dataframe: '{key}'\n")

  df_copy = df.copy()

  # Print the shape of the DataFrame 'df_copy' after removing rows with missing values
  print(df_copy.shape)

  # Remove duplicate rows from the DataFrame 'df_copy' while resetting the index
  df_copy = df_copy.drop_duplicates()
  df_copy.reset_index(inplace=True, drop=True)

  # Print the shape of the DataFrame 'df_copy' after removing duplicates and resetting the index
  print(df_copy.shape)
  print("Memory Usage (MB):", df_copy.memory_usage(deep=True).sum() / (1024 ** 2), " vs ", df.memory_usage(deep=True).sum() / (1024 ** 2))
  dfs_copy[key] = df_copy

Dataframe: '02-14-2018'

(1048575, 79)
(674815, 79)
Memory Usage (MB): 442.8394784927368  vs  689.9058771133423
Dataframe: '02-15-2018'

(1048575, 79)
(884658, 79)
Memory Usage (MB): 580.3393211364746  vs  687.7504568099976
Dataframe: '02-16-2018'

(1048575, 79)
(591902, 79)
Memory Usage (MB): 389.18474292755127  vs  693.8059730529785
Dataframe: '02-20-2018'

(7948748, 79)
(6120535, 79)
Memory Usage (MB): 4018.796441078186  vs  5216.607151031494
Dataframe: '02-21-2018'

(1048575, 79)
(561405, 79)
Memory Usage (MB): 369.7378091812134  vs  693.5648851394653
Dataframe: '02-22-2018'

(1048575, 79)
(906926, 79)
Memory Usage (MB): 594.1978435516357  vs  687.0028219223022
Dataframe: '02-23-2018'

(1048575, 79)
(910662, 79)
Memory Usage (MB): 596.6474523544312  vs  687.0047130584717
Dataframe: '02-28-2018'

(613104, 79)
(518171, 79)
Memory Usage (MB): 339.9066286087036  vs  402.1498107910156
Dataframe: '03-01-2018'

(331125, 79)
(290884, 79)
Memory Usage (MB): 191.1351022720337  vs  217.565936

In [9]:
import pandas as pd

# Create an empty DataFrame to store duplicates
duplicate_rows = pd.DataFrame()

# Create a variable to count removed duplicates
removed_duplicates_count = 0

# Iterate through the DataFrames in the dictionary
for key1, df1 in dfs_copy.items():
    for key2, df2 in dfs_copy.items():
        # Skip comparing a DataFrame with itself
        if key1 == key2:
            continue

        # Find duplicate rows between df1 and df2
        duplicates = df1[df1.isin(df2.to_dict(orient='list')).all(1)]

        # Append duplicate rows to the result DataFrame
        duplicate_rows = pd.concat([duplicate_rows, duplicates])

        # Remove duplicates from df2 and count the removed duplicates
        num_removed = len(duplicates)
        df2 = df2[~df2.isin(duplicates.to_dict(orient='list')).all(1)]
        removed_duplicates_count += num_removed

# Display the removed duplicates count
print("Removed duplicates count:", removed_duplicates_count)


Removed duplicates count: 6


In [38]:
for key in dfs_copy.keys():
  df = dfs[key]

  print(f"Dataframe: '{key}'\n")
  print(dfs[key].shape)
  print(dfs_copy[key].shape)
  print("Memory Usage (MB):", dfs_copy[key].memory_usage(deep=True).sum() / (1024 ** 2), " vs ", dfs[key].memory_usage(deep=True).sum() / (1024 ** 2))


Dataframe: '02-14-2018'

(1048575, 79)
(674815, 79)
Memory Usage (MB): 442.8394784927368  vs  689.9058771133423
Dataframe: '02-15-2018'

(1048575, 79)
(884658, 79)
Memory Usage (MB): 580.3393211364746  vs  687.7504568099976
Dataframe: '02-16-2018'

(1048575, 79)
(591902, 79)
Memory Usage (MB): 389.18474292755127  vs  693.8059730529785
Dataframe: '02-20-2018'

(7948748, 79)
(6120535, 79)
Memory Usage (MB): 4018.796441078186  vs  5216.607151031494
Dataframe: '02-21-2018'

(1048575, 79)
(561405, 79)
Memory Usage (MB): 369.7378091812134  vs  693.5648851394653
Dataframe: '02-22-2018'

(1048575, 79)
(906926, 79)
Memory Usage (MB): 594.1978435516357  vs  687.0028219223022
Dataframe: '02-23-2018'

(1048575, 79)
(910662, 79)
Memory Usage (MB): 596.6474523544312  vs  687.0047130584717
Dataframe: '02-28-2018'

(613104, 79)
(518171, 79)
Memory Usage (MB): 339.9066286087036  vs  402.1498107910156
Dataframe: '03-01-2018'

(331125, 79)
(290884, 79)
Memory Usage (MB): 191.1351022720337  vs  217.565936

In [None]:
# Concatenate all DataFrames into one
#df_concat = pd.concat(list(dfs_copy.values()), ignore_index=True)

# Skipped concatenation as it takes a lot of memory and may crash the session

In [39]:
# Print out the DataFrames loaded in the memory
%whos DataFrame

Variable                  Type         Data/Info
------------------------------------------------
df                        DataFrame             Dst Port  Protoc<...>048575 rows x 79 columns]
df_copy                   DataFrame            Dst Port  Protoco<...>823732 rows x 79 columns]
missing_values_as_empty   DataFrame             Dst Port  Protoc<...>043116 rows x 80 columns]


In [40]:
try:
  del df
  del df_copy
  del missing_values_as_empty
  del df1
  del df2
  del duplicate_rows
  del duplicates
except:
  pass

In [42]:
# Explicitly call Python's garbage collector to free up memory
import gc
gc.collect()

3323

### Preprocessing of the CSE-CIC-IDS2018 dataset

In [43]:
# Check if the Dataset was not preprocess do:
  # 1 # Handling Missing Values
  # 2 # Normalization (Min-Max Scaling)
  # 3 # Encode Categorical Label
  # 4 # Removing duplicate records

from sklearn.impute import SimpleImputer

for key in dfs_copy.keys():
  df = dfs_copy[key]

  print(f"Dataframe: '{key}'\n")

  # Step 1: Handling Missing Values

  # Check for missing values, NAN
  check_nan = df.isna().sum().sum()
  print(check_nan)

  # Check if missing values are represented as empty values (",,")
  missing_values_as_empty = df.applymap(lambda x: x == '')

  # Count the number of missing values in each column
  missing_values_count = missing_values_as_empty.sum()

  # Check if all elements in the missing_values_count Series are different from 0
  check_null = (missing_values_count != 0).all()

  # Replace empty values with NaN
  if (check_null):
    df.replace("", np.nan, inplace=True)

  # Impute missing values with the most frequent value for categorical columns and mean for numerical columns
  if (check_null or check_nan !=0):
    imputer = SimpleImputer(strategy='most_frequent', missing_values=pd.NA)
    for col in df.columns:
        if df[col].dtype == 'object':
            df[col] = imputer.fit_transform(df[[col]])
        else:
            df[col] = df[col].fillna(df[col].mean())

Dataframe: '02-14-2018'

2209
Dataframe: '02-15-2018'

4759
Dataframe: '02-16-2018'

78
Dataframe: '02-20-2018'

30719
Dataframe: '02-21-2018'

0
Dataframe: '02-22-2018'

3428
Dataframe: '02-23-2018'

3662
Dataframe: '02-28-2018'

2498
Dataframe: '03-01-2018'

1892
Dataframe: '03-02-2018'

2515


In [44]:
# Check again for missing values, NAN
for key in dfs_copy.keys():
  df = dfs_copy[key]
  print(f"Dataframe: '{key}'\n")
  print(df.isna().sum().sum())

Dataframe: '02-14-2018'

0
Dataframe: '02-15-2018'

0
Dataframe: '02-16-2018'

0
Dataframe: '02-20-2018'

0
Dataframe: '02-21-2018'

0
Dataframe: '02-22-2018'

0
Dataframe: '02-23-2018'

0
Dataframe: '02-28-2018'

0
Dataframe: '03-01-2018'

0
Dataframe: '03-02-2018'

0


In [45]:
# Step 2: Normalization (Scaling)

from sklearn.preprocessing import MinMaxScaler
import numpy as np

for key in dfs_copy.keys():
  df = dfs_copy[key]
  print(f"Dataframe: '{key}'\n")

  # Specify the columns to fix
  float64_columns = [col for col in df.columns if df[col].dtype == 'float64']

  # Clean the data by replacing infinite or extremely large values
  max_allowed_value = 1e15  # Define a threshold for allowed values
  for col in float64_columns:
      df[col] = df[col].clip(lower=None, upper=max_allowed_value)

  # Create a Min-Max scaler
  scaler = MinMaxScaler()

  # Specify the columns to scale
  columns = [col for col in df.columns if col not in ['Label']]

  # Fit and transform the selected columns in the subset
  df[columns] = scaler.fit_transform(df[columns])

  # Display the DataFrame with the scaled subset
  display(df)

Dataframe: '02-14-2018'



Unnamed: 0,Dst Port,Protocol,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,Fwd Pkt Len Mean,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,0.000000,0.000000,0.999992,0.000391,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,...,0.000000,0.000000,0.0,0.000000,0.000000,0.000166,5.726192e-10,0.000057,0.004469,Benign
1,0.000000,0.000000,0.999992,0.000391,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,...,0.000000,0.000000,0.0,0.000000,0.000000,0.000166,4.708848e-10,0.000057,0.004469,Benign
2,0.000000,0.000000,0.999992,0.000391,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,...,0.000000,0.000000,0.0,0.000000,0.000000,0.000166,1.241159e-09,0.000057,0.004469,Benign
3,0.000336,0.352941,0.999876,0.002738,0.001087,0.000144,0.000170,0.011546,0.0,0.007364,...,0.666667,0.000000,0.0,0.000000,0.000000,0.000000,0.000000e+00,0.000000,0.000000,Benign
4,0.000336,0.352941,0.999879,0.002542,0.001196,0.000133,0.000165,0.011546,0.0,0.007278,...,0.666667,0.000000,0.0,0.000000,0.000000,0.000000,0.000000e+00,0.000000,0.000000,Benign
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
674810,0.001221,0.352941,0.999880,0.000782,0.000544,0.000132,0.000199,0.008970,0.0,0.020255,...,0.416667,0.000000,0.0,0.000000,0.000000,0.000000,0.000000e+00,0.000000,0.000000,Benign
674811,0.001221,0.352941,0.999880,0.000782,0.000544,0.000127,0.000144,0.009109,0.0,0.019417,...,0.416667,0.000000,0.0,0.000000,0.000000,0.000000,0.000000e+00,0.000000,0.000000,Benign
674812,0.001221,0.352941,0.999875,0.000391,0.000109,0.000000,0.000000,0.000000,0.0,0.000000,...,0.416667,0.000000,0.0,0.000000,0.000000,0.000000,0.000000e+00,0.000000,0.000000,Benign
674813,0.001221,0.352941,0.999875,0.000391,0.000109,0.000000,0.000000,0.000000,0.0,0.000000,...,0.416667,0.000000,0.0,0.000000,0.000000,0.000000,0.000000e+00,0.000000,0.000000,Benign


Dataframe: '02-15-2018'



Unnamed: 0,Dst Port,Protocol,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,Fwd Pkt Len Mean,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,0.000000,0.000000,9.386763e-01,0.000222,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.469371,0.000010,0.469375,0.469367,Benign
1,0.000336,0.352941,3.113897e-01,0.001441,0.000626,0.000248,0.000107,0.011049,0.0,0.009369,...,0.571429,0.009147,0.010785,0.014297,0.002871,0.095267,0.050267,0.130154,0.074674,Benign
2,0.725028,0.352941,4.525000e-06,0.000111,0.000000,0.000007,0.000000,0.000993,0.0,0.001936,...,0.571429,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,Benign
3,0.000000,0.000000,9.386725e-01,0.000222,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.469369,0.000005,0.469372,0.469367,Benign
4,0.000000,0.000000,9.386739e-01,0.000222,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.469370,0.000010,0.469374,0.469366,Benign
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
884653,0.764657,0.352941,1.833333e-07,0.000222,0.000000,0.000004,0.000000,0.000481,0.0,0.000625,...,0.357143,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,Benign
884654,0.006760,0.352941,4.556899e-01,0.000443,0.000052,0.000014,0.000002,0.000714,0.0,0.001488,...,0.357143,0.001418,0.000000,0.001418,0.001418,0.454397,0.000000,0.454397,0.454397,Benign
884655,0.006760,0.352941,4.473508e-01,0.000443,0.000052,0.000014,0.000002,0.000714,0.0,0.001488,...,0.357143,0.002319,0.000000,0.002319,0.002319,0.445213,0.000000,0.445213,0.445213,Benign
884656,0.006760,0.352941,4.556947e-01,0.000443,0.000052,0.000014,0.000002,0.000714,0.0,0.001488,...,0.357143,0.001419,0.000000,0.001419,0.001419,0.454395,0.000000,0.454395,0.454395,Benign


Dataframe: '02-16-2018'



Unnamed: 0,Dst Port,Protocol,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,Fwd Pkt Len Mean,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,0.000000,0.000000,9.999911e-01,0.008772,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.616648,0.000006,0.616648,0.616648,Benign
1,0.000000,0.000000,1.000000e+00,0.008772,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.616648,0.000011,0.616648,0.616648,Benign
2,0.583708,0.352941,1.845154e-01,0.096491,0.049494,0.146035,0.000858,0.082418,0.213333,0.150578,...,0.500000,0.298335,0.000000,0.153493,0.362174,0.099214,0.000000,0.099214,0.099214,Benign
3,0.000000,0.000000,9.999917e-01,0.008772,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.616648,0.000003,0.616648,0.616648,Benign
4,0.000377,0.352941,1.686763e-07,0.000000,0.001125,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.500000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,Benign
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
591897,0.727237,0.352941,3.659576e-02,0.017544,0.003375,0.056516,0.000264,0.321085,0.000000,0.268062,...,0.800000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,Benign
591898,0.727204,0.352941,3.660037e-02,0.017544,0.003375,0.056516,0.000214,0.321085,0.000000,0.268062,...,0.800000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,Benign
591899,0.727499,0.352941,3.671526e-02,0.017544,0.003375,0.056516,0.000197,0.321085,0.000000,0.268062,...,0.800000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,Benign
591900,0.619955,0.352941,4.679412e-02,0.017544,0.004499,0.056516,0.000247,0.321085,0.000000,0.268062,...,0.800000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,Benign


Dataframe: '02-20-2018'



Unnamed: 0,Dst Port,Protocol,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,Fwd Pkt Len Mean,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,0.000336,0.352941,7.406258e-03,0.000036,0.000426,0.000139,0.000052,0.130335,0.0,0.077771,...,0.666667,0.0,0.0,0.0,0.0,0.000000,0.000000e+00,0.000000,0.000000,Benign
1,0.000000,0.000000,9.386901e-01,0.000007,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.469167,9.256057e-08,0.469167,0.469167,Benign
2,0.000000,0.000000,9.386893e-01,0.000007,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.469167,2.406575e-07,0.469167,0.469167,Benign
3,0.000000,0.000000,9.386887e-01,0.000007,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.469167,7.404846e-08,0.469167,0.469167,Benign
4,0.000000,0.000000,9.386892e-01,0.000007,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.469167,8.515573e-07,0.469167,0.469167,Benign
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6120530,0.009506,0.352941,7.836833e-04,0.000004,0.000039,0.000000,0.000000,0.000000,0.0,0.000000,...,0.416667,0.0,0.0,0.0,0.0,0.000000,0.000000e+00,0.000000,0.000000,Benign
6120531,0.000336,0.352941,2.094008e-03,0.000011,0.000078,0.000000,0.000001,0.000000,0.0,0.000000,...,0.416667,0.0,0.0,0.0,0.0,0.000000,0.000000e+00,0.000000,0.000000,Benign
6120532,0.000351,0.352941,1.750000e-07,0.000000,0.000039,0.000000,0.000000,0.000000,0.0,0.000000,...,0.416667,0.0,0.0,0.0,0.0,0.000000,0.000000e+00,0.000000,0.000000,Benign
6120533,0.046372,0.352941,1.516283e-03,0.000004,0.000039,0.000000,0.000000,0.000000,0.0,0.000000,...,0.416667,0.0,0.0,0.0,0.0,0.000000,0.000000e+00,0.000000,0.000000,Benign


Dataframe: '02-21-2018'



Unnamed: 0,Dst Port,Protocol,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,Fwd Pkt Len Mean,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,0.001221,0.352941,0.000316,0.000013,0.024194,0.000014,0.013924,0.060701,0.000000,0.027664,...,0.727273,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,Benign
1,0.007630,1.000000,0.979779,0.000006,0.000000,0.000151,0.000000,0.224820,0.512295,0.512295,...,0.181818,0.000000,0.0,0.000000,0.000000,0.554717,0.473161,0.713208,0.396226,Benign
2,0.007630,1.000000,0.831200,0.000013,0.000000,0.000252,0.000000,0.224820,0.512295,0.512295,...,0.181818,0.200014,0.0,0.142867,0.200014,0.300943,0.753479,0.713208,0.067928,Benign
3,0.007630,1.000000,0.831200,0.000013,0.000000,0.000252,0.000000,0.224820,0.512295,0.512295,...,0.181818,0.200014,0.0,0.142867,0.200014,0.300943,0.753479,0.713208,0.067928,Benign
4,0.007630,1.000000,0.745663,0.000016,0.000000,0.000303,0.000000,0.224820,0.512295,0.512295,...,0.181818,0.200015,0.0,0.142868,0.200015,0.201887,0.304175,0.396226,0.067928,Benign
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
561400,0.846644,0.352941,0.000010,0.000013,0.016129,0.000094,0.030041,0.420414,0.000000,0.191598,...,0.454545,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,Benign
561401,0.879299,0.352941,0.000159,0.000013,0.016129,0.000094,0.037386,0.420414,0.000000,0.191598,...,0.454545,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,Benign
561402,0.879284,0.352941,0.000306,0.000013,0.016129,0.000094,0.037386,0.420414,0.000000,0.191598,...,0.454545,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,Benign
561403,0.879315,0.352941,0.000015,0.000013,0.016129,0.000094,0.037386,0.420414,0.000000,0.191598,...,0.454545,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,Benign


Dataframe: '02-22-2018'



Unnamed: 0,Dst Port,Protocol,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,Fwd Pkt Len Mean,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,0.000336,0.352941,0.999880,0.000784,0.000182,0.000267,2.313201e-05,0.195584,0.000000,0.058781,...,0.454545,0.009472,0.0,0.009472,0.009472,0.000049,0.000000,0.000020,0.000081,Benign
1,0.533914,0.352941,0.999855,0.000087,0.000000,0.000213,0.000000e+00,0.222923,0.000000,0.234459,...,0.454545,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,Benign
2,0.007630,1.000000,0.999976,0.000349,0.000000,0.000628,0.000000e+00,0.131441,0.342466,0.276485,...,0.181818,0.036881,0.0,0.036881,0.036881,0.000081,0.000145,0.000078,0.000030,Benign
3,0.007630,1.000000,0.999976,0.000349,0.000000,0.000628,0.000000e+00,0.131441,0.342466,0.276485,...,0.181818,0.036881,0.0,0.036881,0.036881,0.000081,0.000145,0.000078,0.000030,Benign
4,0.007630,1.000000,0.999963,0.000436,0.000000,0.000753,0.000000e+00,0.131441,0.342466,0.276485,...,0.181818,0.036884,0.0,0.036884,0.036884,0.000054,0.000058,0.000043,0.000030,Benign
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
906921,0.000809,1.000000,0.999855,0.000000,0.000026,0.000009,9.274205e-07,0.009464,0.024658,0.019907,...,0.181818,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,Benign
906922,0.000809,1.000000,0.999855,0.000087,0.000052,0.000020,4.530093e-06,0.010252,0.026712,0.021566,...,0.181818,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,Benign
906923,0.022889,0.352941,0.999959,0.000087,0.000000,0.000000,0.000000e+00,0.000000,0.000000,0.000000,...,0.454545,0.000000,0.0,0.000000,0.000000,0.000218,0.000000,0.000089,0.000359,Benign
906924,0.000809,1.000000,0.999855,0.000000,0.000026,0.000008,2.158036e-06,0.008412,0.021918,0.017695,...,0.181818,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,Benign


Dataframe: '02-23-2018'



Unnamed: 0,Dst Port,Protocol,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,Fwd Pkt Len Mean,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,0.000336,0.352941,0.012772,0.000172,0.000089,0.000745,1.259270e-05,0.098301,0.000000,0.059231,...,0.666667,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,Benign
1,0.007630,1.000000,0.979782,0.000034,0.000000,0.000948,0.000000e+00,0.075850,0.345066,0.276311,...,0.166667,0.000000,0.0,0.000000,0.000000,0.489925,0.331849,0.629902,0.349948,Benign
2,0.007630,1.000000,0.979782,0.000034,0.000000,0.000948,0.000000e+00,0.075850,0.345066,0.276311,...,0.166667,0.000000,0.0,0.000000,0.000000,0.489925,0.331849,0.629902,0.349948,Benign
3,0.000336,0.352941,0.014545,0.000172,0.000089,0.000745,1.259270e-05,0.098301,0.000000,0.059231,...,0.666667,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,Benign
4,0.007630,1.000000,0.745696,0.000086,0.000000,0.001896,0.000000e+00,0.075850,0.345066,0.276311,...,0.166667,0.035348,0.0,0.035348,0.035348,0.178102,0.213485,0.349938,0.060008,Benign
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
910657,0.825770,0.352941,0.716604,0.000017,0.000000,0.000000,0.000000e+00,0.000000,0.000000,0.000000,...,0.416667,0.000000,0.0,0.000000,0.000000,0.716654,0.000000,0.716654,0.716654,Benign
910658,0.517502,0.352941,0.000001,0.000017,0.000000,0.000000,0.000000e+00,0.000000,0.000000,0.000000,...,0.416667,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,Benign
910659,0.000809,1.000000,0.000659,0.000000,0.000008,0.000026,6.267571e-07,0.006220,0.028295,0.022658,...,0.166667,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,Benign
910660,0.051714,0.352941,0.083782,0.000172,0.000097,0.000835,1.061010e-05,0.102700,0.000000,0.066365,...,0.416667,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,Benign


Dataframe: '02-28-2018'



Unnamed: 0,Dst Port,Protocol,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,Fwd Pkt Len Mean,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,0.006760,0.352941,7.888169e-04,0.000041,0.000132,0.000005,0.000106,0.038607,0.0,0.040045,...,0.454545,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
1,0.006760,0.352941,1.716667e-06,0.000008,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,...,0.454545,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
2,0.006790,0.352941,1.379209e-03,0.000016,0.000019,0.000000,0.000000,0.000000,0.0,0.000000,...,0.454545,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
3,0.006760,0.352941,8.535753e-04,0.000041,0.000132,0.000005,0.000106,0.038607,0.0,0.040045,...,0.454545,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
4,0.006760,0.352941,1.391667e-06,0.000008,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,...,0.454545,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
518166,0.006790,0.352941,6.115669e-03,0.000008,0.000038,0.000000,0.000000,0.000000,0.0,0.000000,...,0.727273,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
518167,0.015763,0.352941,5.833335e-08,0.000000,0.000019,0.000000,0.000000,0.000000,0.0,0.000000,...,0.545455,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Infilteration
518168,0.169533,0.352941,2.500001e-08,0.000000,0.000019,0.000000,0.000000,0.000000,0.0,0.000000,...,0.545455,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Infilteration
518169,0.006790,0.352941,6.106069e-03,0.000008,0.000038,0.000000,0.000000,0.000000,0.0,0.000000,...,0.727273,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign


Dataframe: '03-01-2018'



Unnamed: 0,Dst Port,Protocol,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,Fwd Pkt Len Mean,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,0.000000,0.000000,0.960899,0.000219,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.015898,0.0,0.015898,0.015898,0.472500,0.108878,0.508333,0.437500,Benign
1,0.000000,0.000000,0.508312,0.000055,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.0,0.000000,0.000000,0.508333,0.000000,0.508333,0.508333,Benign
2,0.001022,1.000000,0.509575,0.000219,0.000000,0.000153,0.000000,0.152439,0.205479,0.205479,...,0.166667,0.030973,0.0,0.030973,0.030973,0.160000,0.226449,0.271667,0.066664,Benign
3,0.000000,0.000000,0.508313,0.000055,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.0,0.000000,0.000000,0.508333,0.000000,0.508333,0.508333,Benign
4,0.000000,0.000000,0.516646,0.000109,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.008771,0.0,0.008771,0.008771,0.508333,0.000000,0.508333,0.508333,Benign
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
290879,0.000000,0.000000,0.987451,0.001640,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,Benign
290880,0.040192,0.352941,0.000953,0.000055,0.000028,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.416667,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,Benign
290881,0.051713,0.352941,0.011707,0.000383,0.000197,0.000116,0.000038,0.335874,0.000000,0.096918,...,0.416667,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,Benign
290882,0.051713,0.352941,0.033056,0.000711,0.000197,0.000142,0.000009,0.345528,0.000000,0.067808,...,0.416667,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,Benign


Dataframe: '03-02-2018'



Unnamed: 0,Dst Port,Protocol,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,Fwd Pkt Len Mean,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,0.006760,0.352941,0.001178,0.000185,0.000101,0.000502,0.000037,0.118060,0.0,0.042085,...,0.454545,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
1,0.758141,0.352941,0.000002,0.000023,0.000014,0.000035,0.000000,0.022209,0.0,0.013014,...,0.454545,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
2,0.006760,0.352941,0.002332,0.000232,0.000217,0.000987,0.000104,0.225015,0.0,0.067621,...,0.454545,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
3,0.006760,0.352941,0.000001,0.000023,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,...,0.454545,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
4,0.006760,0.352941,0.002283,0.000185,0.000188,0.001168,0.000061,0.302162,0.0,0.097793,...,0.454545,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
823727,0.051714,0.352941,0.033185,0.000301,0.000116,0.001310,0.000017,0.423729,0.0,0.070548,...,0.454545,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
823728,0.051714,0.352941,0.031686,0.000301,0.000116,0.001308,0.000017,0.423729,0.0,0.070450,...,0.454545,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
823729,0.051714,0.352941,0.033369,0.000301,0.000116,0.001326,0.000017,0.433080,0.0,0.071380,...,0.454545,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
823730,0.051714,0.352941,0.033320,0.000301,0.000116,0.001326,0.000017,0.433080,0.0,0.071380,...,0.454545,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign


In [46]:
for key in dfs_copy.keys():
  df = dfs_copy[key]
  print(f"Dataframe: '{key}'\n")

  # Step 3: Encode Label

  # Identify categorical columns (non-numeric)
  categorical_columns = df.select_dtypes(include=['object']).columns.tolist()
  categorical_columns.remove('Label')

  # Encode the Label with 0 value for normal and 1 for the rest of the attacks
  df['Label'] = df['Label'].apply(lambda x: 0 if x == 'Benign' else 1)

  # Encode categorical columns using one-hot encoding (get_dummies)
  df = pd.get_dummies(df, columns=categorical_columns)

  # Now, df contains the encoded categorical features and label

  unique_labels = df['Label'].unique()

  # Print the unique labels
for label in unique_labels:
      print(label)

Dataframe: '02-14-2018'

Dataframe: '02-15-2018'

Dataframe: '02-16-2018'

Dataframe: '02-20-2018'

Dataframe: '02-21-2018'

Dataframe: '02-22-2018'

Dataframe: '02-23-2018'

Dataframe: '02-28-2018'

Dataframe: '03-01-2018'

Dataframe: '03-02-2018'

0
1


In [47]:
for key in dfs_copy.keys():
  df = dfs_copy[key]
  print(f"Dataframe: '{key}'\n")

  label_counts_df = df['Label'].value_counts()

  # Display the counts with labels for df
  print("\nLabel counts for df:")
  print(label_counts_df)

Dataframe: '02-14-2018'


Label counts for df:
0    580713
1     94102
Name: Label, dtype: int64
Dataframe: '02-15-2018'


Label counts for df:
0    833344
1     51314
Name: Label, dtype: int64
Dataframe: '02-16-2018'


Label counts for df:
0    446647
1    145255
Name: Label, dtype: int64
Dataframe: '02-20-2018'


Label counts for df:
0    5545171
1     575364
Name: Label, dtype: int64
Dataframe: '02-21-2018'


Label counts for df:
0    360814
1    200591
Name: Label, dtype: int64
Dataframe: '02-22-2018'


Label counts for df:
0    906585
1       341
Name: Label, dtype: int64
Dataframe: '02-23-2018'


Label counts for df:
0    910119
1       543
Name: Label, dtype: int64
Dataframe: '02-28-2018'


Label counts for df:
0    456123
1     62048
Name: Label, dtype: int64
Dataframe: '03-01-2018'


Label counts for df:
0    207705
1     83179
Name: Label, dtype: int64
Dataframe: '03-02-2018'


Label counts for df:
0    679197
1    144535
Name: Label, dtype: int64


In [49]:
# 4 # Removing duplicate records after Label encoding if collisions

for key in dfs_copy.keys():
  df = dfs_copy[key]
  print(f"Dataframe: '{key}'\n")

  # Print the shape of the DataFrame 'df' after removing rows with missing values
  print(df.shape)

  # Remove duplicate rows from the DataFrame 'df' while resetting the index
  df = df.drop_duplicates()
  df.reset_index(inplace=True, drop=True)

  # Print the shape of the DataFrame 'df' after removing duplicates and resetting the index
  print(df.shape)

Dataframe: '02-14-2018'

(674815, 79)
(674808, 79)
Dataframe: '02-15-2018'

(884658, 79)
(884658, 79)
Dataframe: '02-16-2018'

(591902, 79)
(591902, 79)
Dataframe: '02-20-2018'

(6120535, 79)
(6120535, 79)
Dataframe: '02-21-2018'

(561405, 79)
(561405, 79)
Dataframe: '02-22-2018'

(906926, 79)
(906923, 79)
Dataframe: '02-23-2018'

(910662, 79)
(910660, 79)
Dataframe: '02-28-2018'

(518171, 79)
(518171, 79)
Dataframe: '03-01-2018'

(290884, 79)
(290884, 79)
Dataframe: '03-02-2018'

(823732, 79)
(823732, 79)


Some feature vectores collisions on Label have occurred for the following Data Frames:
- 02-14-2018
- 02-22-2018
- 02-23-2018
- 02-28-2018

In [50]:
# Print out the DataFrames loaded in the memory
%whos DataFrame

Variable                  Type         Data/Info
------------------------------------------------
df                        DataFrame            Dst Port  Protoco<...>823732 rows x 79 columns]
missing_values_as_empty   DataFrame            Dst Port  Protoco<...>823732 rows x 79 columns]


In [51]:
del df
del missing_values_as_empty

In [52]:
import gc
gc.collect()

10155

In [56]:
import os

# Define the directory path you want to create
directory_path = "/content/drive/MyDrive/CSE-CIC-IDS2018-BM/Encoded/"

# Check if the directory already exists
if not os.path.exists(directory_path):
    # If it doesn't exist, create the directory
    os.makedirs(directory_path)


for key in dfs_copy.keys():
  df = dfs_copy[key]

  print(f"Dataframe: '{key}'\n")

  # Convert your Pandas DataFrames to a CSV file
  df.to_csv(os.path.join(directory_path, f"{key}.csv"), index=False)

Dataframe: '02-14-2018'

Dataframe: '02-15-2018'

Dataframe: '02-16-2018'

Dataframe: '02-20-2018'

Dataframe: '02-21-2018'

Dataframe: '02-22-2018'

Dataframe: '02-23-2018'

Dataframe: '02-28-2018'

Dataframe: '03-01-2018'

Dataframe: '03-02-2018'



## 2. Algorithm Evaluation

In this section, we assess the performance of various machine learning algorithms on the upper mentioned datasets.

### 2.6. CSE-CIC-IDS2018 dataset evaluation with baseline and traditional ML algorithms

In this section, we evaluate the precision and F1 scores, which serve as crucial benchmarks for assessing classification accuracy. Our evaluation covers a wide range of machine learning algorithms, encompassing fundamental classifiers like Zero Rule and One Rule, statistical approaches such as Naive Bayes, and more advanced models like Random Forest. This comprehensive analysis utilizes a robust 10-fold cross-validation methodology, considering both 10 and 20 of the best-selected features from the CSE-CIC-IDS2018 dataset.
The outcomes of this evaluation provide invaluable insights into devising an optimal dataset generation strategy. They inform the selection of efficient feature extraction techniques from raw data and help identify the most appropriate methodology tailored to the unique characteristics of cybersecurity datasets.

In [3]:
import os
import pandas as pd

# Specify the destination folder in your Google Drive
destination_folder = "/content/drive/MyDrive/CSE-CIC-IDS2018-BM/Encoded"

# Create an empty dictionary to store the dataframes
dfs_copy = {}

# Iterate over the CSV files in the folder
for filename in os.listdir(destination_folder):

    # Get the dataframe name from the file name
    df_name = os.path.splitext(filename)[0]

    # Read the CSV files
    df = pd.read_csv(os.path.join(destination_folder, filename))
    dfs_copy[df_name] = df

In [5]:
import warnings

# Suppress all warning messages
warnings.filterwarnings("ignore")

In [15]:
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif
import pandas as pd

# Initialize an empty DataFrame for X
X = pd.DataFrame()

# Create an empty dictionary to store the 20 features for each DataFrame
features_20_dict = {}

# Iterate through each DataFrame in the dictionary
for key in dfs_copy.keys():
    df = dfs_copy[key]

    # Separate features (X) and labels (y)
    X_part = df.drop('Label', axis=1)  # Exclude the label column
    y = df['Label']

    # Create a pipeline for feature selection on the preprocessed data
    pipeline_20_features = Pipeline([
        ('selector_20', SelectKBest(score_func=f_classif, k=20))
    ])

    # Fit and transform the data for 20 features
    X_selected_20 = pipeline_20_features.fit_transform(X_part, y)

    # Store the selected 20 features in the dictionary
    selected_feature_indices_20 = pipeline_20_features.named_steps['selector_20'].get_support(indices=True)
    selected_features_20 = X_part.columns[selected_feature_indices_20]
    features_20_dict[key] = selected_features_20
    print(f"20 best-selected features for the data frame '{key}':\n'{selected_features_20}'\n")

# Create a list to collect all the 20-feature groups
all_20_features = [list(features_20) for features_20 in features_20_dict.values()]

# Flatten the list of lists into a single list
all_20_features_flat = [feature for sublist in all_20_features for feature in sublist]

# Find the most common 20 features among all DataFrames
from collections import Counter
common_20_features = [feature for feature, count in Counter(all_20_features_flat).most_common(20)]

# Iterate through each DataFrame and select the common 20 features
for key in dfs_copy.keys():
    X_part = df[common_20_features]

    # Concatenate all X_part 20 features to the main X DataFrame
    X = pd.concat([X, X_part], axis=1)

# Initialize an empty list to store the final selected 10 features
selected_features_10 = []

# Create a pipeline for feature selection on the preprocessed data
pipeline_10_features = Pipeline([
    ('selector_10', SelectKBest(score_func=f_classif, k=10))
])

# Fit and transform the data for 10 and 20 features
X_selected_10 = pipeline_10_features.fit_transform(X, y)

# Display the selected features
print(X_selected_10.shape)  # Check the shape of the selected 10 features

selected_feature_indices_10 = pipeline_10_features.named_steps['selector_10'].get_support(indices=True)
selected_features_10 = X.columns[selected_feature_indices_10]

# Display the selected 10 features
print("\nCommon 10 features:")
print(selected_features_10)

# Display the shape of the final X DataFrame
print("\nThe final shape of the X DataFrame:")
print(X.shape)


20 best-selected features for the data frame '02-14-2018':
'Index(['Dst Port', 'Protocol', 'Fwd Pkt Len Max', 'Fwd Pkt Len Min',
       'Fwd Pkt Len Mean', 'Fwd Pkt Len Std', 'Bwd Pkt Len Max',
       'Bwd Pkt Len Min', 'Bwd Pkt Len Std', 'Fwd Header Len', 'Pkt Len Min',
       'Pkt Len Max', 'Pkt Len Std', 'PSH Flag Cnt', 'ACK Flag Cnt',
       'Fwd Seg Size Avg', 'Init Fwd Win Byts', 'Init Bwd Win Byts',
       'Fwd Act Data Pkts', 'Fwd Seg Size Min'],
      dtype='object')'

20 best-selected features for the data frame '02-15-2018':
'Index(['Protocol', 'Fwd Pkt Len Std', 'Bwd Pkt Len Min', 'Flow IAT Std',
       'Flow IAT Max', 'Fwd IAT Mean', 'Fwd IAT Max', 'Fwd IAT Min',
       'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min',
       'Pkt Len Min', 'PSH Flag Cnt', 'Init Fwd Win Byts', 'Fwd Seg Size Min',
       'Active Mean', 'Idle Mean', 'Idle Std', 'Idle Max'],
      dtype='object')'

20 best-selected features for the data frame '02-16-2018':
'Index(['Dst Port', 'Tot 

In [16]:
%whos DataFrame

Variable   Type         Data/Info
---------------------------------
X          DataFrame            Fwd Pkt Len Mean <...>23732 rows x 200 columns]
X_part     DataFrame            Fwd Pkt Len Mean <...>823732 rows x 20 columns]
df         DataFrame            Dst Port  Protoco<...>823732 rows x 79 columns]


In [17]:
del X_part
del df

import gc
gc.collect()

10610

In [18]:
import pandas as pd
import numpy as np
from sklearn.metrics import precision_score, mean_absolute_error, f1_score
from sklearn.dummy import DummyClassifier
from tabulate import tabulate
import time
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

# Define the number of desired folds for Cross-Validation (e.g., 10)
num_folds = 10

# Initialize performance metrics lists for 10 and 20 features
results_10_features = []
results_20_features = []

In [20]:
# Define a file name for saving the results
results_file_name = os.path.join(destination_folder, "csecicids2018_results.pkl")

# Check for results before rerunning the code snippet
if not os.path.exists(results_file_name):

  # Define ZeroRule classifier
  zero_rule = DummyClassifier(strategy="most_frequent")

  # Evaluate ZeroRule classifier
  start_time = time.time()  # Start measuring execution time
  precision_scores_10 = cross_val_score(zero_rule, X[selected_features_10], y, cv=num_folds, scoring='precision')
  f1_scores_10 = cross_val_score(zero_rule, X[selected_features_10], y, cv=num_folds, scoring='f1')
  elapsed_time_10 = time.time() - start_time  # Calculate execution time

  start_time = time.time()  # Start measuring execution time
  precision_scores_20 = cross_val_score(zero_rule, X, y, cv=num_folds, scoring='precision')
  f1_scores_20 = cross_val_score(zero_rule, X, y, cv=num_folds, scoring='f1')
  elapsed_time_20 = time.time() - start_time  # Calculate execution time

  variance_10 = np.var(precision_scores_10)
  variance_20 = np.var(precision_scores_20)

  predictions_10 = cross_val_predict(zero_rule, X[selected_features_10], y, cv=num_folds)
  mae_10 = mean_absolute_error(y, predictions_10)

  predictions_20 = cross_val_predict(zero_rule, X, y, cv=num_folds)
  mae_20 = mean_absolute_error(y, predictions_20)

  # Display ZeroRule results for 10 features
  print("ZeroRule Precision (10 features):", np.mean(precision_scores_10))
  print("ZeroRule F1 Score (10 features):", np.mean(f1_scores_10))
  print("ZeroRule Variance (10 features):", variance_10)
  print("ZeroRule MAE (10 features):", mae_10)
  print("ZeroRule Execution Time:", elapsed_time_10)

  # Display ZeroRule results for 20 features
  print("ZeroRule Precision (20 features):", np.mean(precision_scores_20))
  print("ZeroRule F1 Score (20 features):", np.mean(f1_scores_20))
  print("ZeroRule Variance (20 features):", variance_20)
  print("ZeroRule MAE (20 features):", mae_20)
  print("ZeroRule Execution Time:", elapsed_time_20)

  results_10_features.append(["ZeroRule", np.mean(precision_scores_10), np.mean(f1_scores_10), variance_10, mae_10, elapsed_time_10])
  results_20_features.append(["ZeroRule", np.mean(precision_scores_20), np.mean(f1_scores_20), variance_20, mae_20, elapsed_time_20])

ZeroRule Precision (10 features): 0.0
ZeroRule F1 Score (10 features): 0.0
ZeroRule Variance (10 features): 0.0
ZeroRule MAE (10 features): 0.17546362166335652
ZeroRule Execution Time: 6.306824445724487
ZeroRule Precision (20 features): 0.0
ZeroRule F1 Score (20 features): 0.0
ZeroRule Variance (20 features): 0.0
ZeroRule MAE (20 features): 0.17546362166335652
ZeroRule Execution Time: 10.245831727981567


In [22]:
# Check for results before rerunning the code snippet
if not os.path.exists(results_file_name):

  # Define OneRule classifier
  one_rule = DummyClassifier(strategy="stratified")

  # Evaluate OneRule classifier
  start_time = time.time()  # Start measuring execution time
  precision_scores_10 = cross_val_score(one_rule, X[selected_features_10], y, cv=num_folds, scoring='precision')
  f1_scores_10 = cross_val_score(one_rule, X[selected_features_10], y, cv=num_folds, scoring='f1')
  elapsed_time_10 = time.time() - start_time  # Calculate execution time

  start_time = time.time()  # Start measuring execution time
  precision_scores_20 = cross_val_score(one_rule, X, y, cv=num_folds, scoring='precision')
  f1_scores_20 = cross_val_score(one_rule, X, y, cv=num_folds, scoring='f1')
  elapsed_time_20 = time.time() - start_time  # Calculate execution time

  variance_10 = np.var(precision_scores_10)
  variance_20 = np.var(precision_scores_20)

  predictions_10 = cross_val_predict(one_rule, X[selected_features_10], y, cv=num_folds)
  mae_10 = mean_absolute_error(y, predictions_10)

  predictions_20 = cross_val_predict(one_rule, X, y, cv=num_folds)
  mae_20 = mean_absolute_error(y, predictions_20)

  # Display OneRule results for 10 features
  print("OneRule Precision (10 features):", np.mean(precision_scores_10))
  print("OneRule F1 Score (10 features):", np.mean(f1_scores_10))
  print("OneRule Variance (10 features):", variance_10)
  print("OneRule MAE (10 features):", mae_10)
  print("OneRule Execution Time:", elapsed_time_10)

  # Display OneRule results for 20 features
  print("OneRule Precision (20 features):", np.mean(precision_scores_20))
  print("OneRule F1 Score (20 features):", np.mean(f1_scores_20))
  print("OneRule Variance (20 features):", variance_20)
  print("OneRule MAE (20 features):", mae_20)
  print("OneRule Execution Time:", elapsed_time_20)

  results_10_features.append(["OneRule", np.mean(precision_scores_10), np.mean(f1_scores_10), variance_10, mae_10, elapsed_time_10])
  results_20_features.append(["OneRule", np.mean(precision_scores_20), np.mean(f1_scores_20), variance_20, mae_20, elapsed_time_20])

OneRule Precision (10 features): 0.17517836333508166
OneRule F1 Score (10 features): 0.1767940417061407
OneRule Variance (10 features): 5.111938572730934e-06
OneRule MAE (10 features): 0.28907945788193246
OneRule Execution Time: 6.406144380569458
OneRule Precision (20 features): 0.17525141166993846
OneRule F1 Score (20 features): 0.17562117159784002
OneRule Variance (20 features): 9.781975488477432e-06
OneRule MAE (20 features): 0.2891073795846222
OneRule Execution Time: 10.43408989906311


In [23]:
# Check for results before rerunning the code snippet
if not os.path.exists(results_file_name):

  # Define Naive Bayes classifier
  naive_bayes = GaussianNB()

  # Evaluate Naive Bayes classifier
  start_time = time.time()  # Start measuring execution time
  precision_scores_10 = cross_val_score(naive_bayes, X[selected_features_10], y, cv=num_folds, scoring='precision')
  f1_scores_10 = cross_val_score(naive_bayes, X[selected_features_10], y, cv=num_folds, scoring='f1')
  elapsed_time_10 = time.time() - start_time  # Calculate execution time

  start_time = time.time()  # Start measuring execution time
  precision_scores_20 = cross_val_score(naive_bayes, X, y, cv=num_folds, scoring='precision')
  f1_scores_20 = cross_val_score(naive_bayes, X, y, cv=num_folds, scoring='f1')
  elapsed_time_20 = time.time() - start_time  # Calculate execution time

  variance_10 = np.var(precision_scores_10)
  variance_20 = np.var(precision_scores_20)

  predictions_10 = cross_val_predict(naive_bayes, X[selected_features_10], y, cv=num_folds)
  mae_10 = mean_absolute_error(y, predictions_10)

  predictions_20 = cross_val_predict(naive_bayes, X, y, cv=num_folds)
  mae_20 = mean_absolute_error(y, predictions_20)

  # Display Naive Bayes results for 10 features
  print("Naive Bayes Precision (10 features):", np.mean(precision_scores_10))
  print("Naive Bayes F1 Score (10 features):", np.mean(f1_scores_10))
  print("Naive Bayes Variance (10 features):", variance_10)
  print("Naive Bayes MAE (10 features):", mae_10)
  print("Naive Bayes Execution Time:", elapsed_time_10)

  # Display Naive Bayes results for 20 features
  print("Naive Bayes Precision (20 features):", np.mean(precision_scores_20))
  print("Naive Bayes F1 Score (20 features):", np.mean(f1_scores_20))
  print("Naive Bayes Variance (20 features):", variance_20)
  print("Naive Bayes MAE (20 features):", mae_20)
  print("Naive Bayes Execution Time:", elapsed_time_20)

  results_10_features.append(["Naive Bayes", np.mean(precision_scores_10), np.mean(f1_scores_10), variance_10, mae_10, elapsed_time_10])
  results_20_features.append(["Naive Bayes", np.mean(precision_scores_20), np.mean(f1_scores_20), variance_20, mae_20, elapsed_time_20])

Naive Bayes Precision (10 features): 0.41599489156053
Naive Bayes F1 Score (10 features): 0.5771099044320416
Naive Bayes Variance (10 features): 0.00356662793445076
Naive Bayes MAE (10 features): 0.25114964575857196
Naive Bayes Execution Time: 29.47931957244873
Naive Bayes Precision (20 features): 0.8649107572972674
Naive Bayes F1 Score (20 features): 0.90647044053673
Naive Bayes Variance (20 features): 0.002068585229766033
Naive Bayes MAE (20 features): 0.034690894611354176
Naive Bayes Execution Time: 56.40955948829651


In [24]:
# Check for results before rerunning the code snippet
if not os.path.exists(results_file_name):

  # Create a Random Forest classifier with optimized parameters
  rf_classifier = RandomForestClassifier(n_estimators=100, max_depth=10, n_jobs=-1)  # Adjust parameters for optimization

  # Evaluate Random Forest classifier
  start_time = time.time()  # Start measuring execution time
  precision_scores_10 = cross_val_score(rf_classifier, X[selected_features_10], y, cv=num_folds, scoring='precision')
  f1_scores_10 = cross_val_score(rf_classifier, X[selected_features_10], y, cv=num_folds, scoring='f1')
  elapsed_time_10 = time.time() - start_time  # Calculate execution time

  start_time = time.time()  # Start measuring execution time
  precision_scores_20 = cross_val_score(rf_classifier, X, y, cv=num_folds, scoring='precision')
  f1_scores_20 = cross_val_score(rf_classifier, X, y, cv=num_folds, scoring='f1')
  elapsed_time_20 = time.time() - start_time  # Calculate execution time

  variance_10 = np.var(precision_scores_10)
  variance_20 = np.var(precision_scores_20)

  predictions_10 = cross_val_predict(rf_classifier, X[selected_features_10], y, cv=num_folds)
  mae_10 = mean_absolute_error(y, predictions_10)

  predictions_20 = cross_val_predict(rf_classifier, X, y, cv=num_folds)
  mae_20 = mean_absolute_error(y, predictions_20)

  # Display Random Forest results for 10 features
  print("Random Forest Precision (10 features):", np.mean(precision_scores_10))
  print("Random Forest F1 Score (10 features):", np.mean(f1_scores_10))
  print("Random Forest Variance (10 features):", variance_10)
  print("Random Forest MAE (10 features):", mae_10)
  print("Random Forest Execution Time:", elapsed_time_10)

  # Display Random Forest results for 20 features
  print("Random Forest Precision (20 features):", np.mean(precision_scores_20))
  print("Random Forest F1 Score (20 features):", np.mean(f1_scores_20))
  print("Random Forest Variance (20 features):", variance_20)
  print("Random Forest MAE (20 features):", mae_20)
  print("Random Forest Execution Time:", elapsed_time_20)

  results_10_features.append(["Random Forest", np.mean(precision_scores_10), np.mean(f1_scores_10), variance_10, mae_10, elapsed_time_10])
  results_20_features.append(["Random Forest", np.mean(precision_scores_20), np.mean(f1_scores_20), variance_20, mae_20, elapsed_time_20])

Random Forest Precision (10 features): 0.999590366459798
Random Forest F1 Score (10 features): 0.9913011147144981
Random Forest Variance (10 features): 7.52042520972457e-08
Random Forest MAE (10 features): 0.00301675787756212
Random Forest Execution Time: 1010.446768283844
Random Forest Precision (20 features): 0.9945822263846985
Random Forest F1 Score (20 features): 0.996695095080631
Random Forest Variance (20 features): 0.00022476785402515898
Random Forest MAE (20 features): 0.0013050361039755648
Random Forest Execution Time: 436.4050407409668


In [25]:
import pickle
import os

if not os.path.exists(results_file_name):

  # Save the results lists to a file
  with open(results_file_name, 'wb') as file:
      results_dict = {
          'results_10_features': results_10_features,
          'results_20_features': results_20_features
      }
      pickle.dump(results_dict, file)


In [26]:
# Load the results from the file
with open(results_file_name, 'rb') as file:
    loaded_results = pickle.load(file)

# Access the loaded results lists
results_10_features = loaded_results['results_10_features']
results_20_features = loaded_results['results_20_features']


In [27]:
# Print the results in tabular format
headers_10 = ["Algorithm", "Precision (10 Features)", "F1 Score (10 Features)", "Variance (10 Features)", "MAE (10 Features)", "Execution Time"]
headers_20 = ["Precision (20 Features)", "F1 Score (20 Features)", "Variance (20 Features)", "MAE (20 Features)", "Execution Time"]

print(tabulate(results_10_features, headers_10, tablefmt="pretty"))
print(tabulate(results_20_features, headers_20, tablefmt="pretty"))

+---------------+-------------------------+------------------------+------------------------+---------------------+-------------------+
|   Algorithm   | Precision (10 Features) | F1 Score (10 Features) | Variance (10 Features) |  MAE (10 Features)  |  Execution Time   |
+---------------+-------------------------+------------------------+------------------------+---------------------+-------------------+
|   ZeroRule    |           0.0           |          0.0           |          0.0           | 0.17546362166335652 | 6.306824445724487 |
|    OneRule    |   0.17517836333508166   |   0.1767940417061407   | 5.111938572730934e-06  | 0.28907945788193246 | 6.406144380569458 |
|  Naive Bayes  |    0.41599489156053     |   0.5771099044320416   |  0.00356662793445076   | 0.25114964575857196 | 29.47931957244873 |
| Random Forest |    0.999590366459798    |   0.9913011147144981   |  7.52042520972457e-08  | 0.00301675787756212 | 1010.446768283844 |
+---------------+-------------------------+-----

## References

<a id="1">[1]</a> Iman Sharafaldin, Arash Habibi Lashkari, and Ali A. Ghorbani, “Toward Generating a New Intrusion Detection Dataset and Intrusion Traffic Characterization”, 4th International Conference on Information Systems Security and Privacy (ICISSP), Portugal, January 2018.