# Kyoto 2015 - Benchmarking
## 0. Notebook initialization

In [None]:
# Mount your Google Drive

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
from psutil import virtual_memory
from tabulate import tabulate

# Function to get CPU information
def get_cpu_info():
    """Return the raw text that the ``lscpu`` command writes to stdout.

    Returns:
        str: The unparsed ``lscpu`` output; an empty string when the command
        produces no output on stdout.
    """
    # The context manager closes the pipe deterministically instead of
    # leaving the open file object to the garbage collector.
    with os.popen('lscpu') as pipe:
        return pipe.read()

# Function to get RAM information
def get_ram_info():
    """Report the total and the currently available system RAM.

    Returns:
        tuple[str, str]: ``(total, available)``; each value is the byte count
        divided by 1e9, formatted with two decimals and a `` GB`` suffix.
    """
    memory = virtual_memory()
    return tuple(
        f"{amount / 1e9:.2f} GB" for amount in (memory.total, memory.available)
    )

# Function to get GPU information
def get_gpu_info():
    """Query ``nvidia-smi`` for the name and memory figures of the first GPU.

    Returns:
        tuple[str, str, str, str]: GPU name followed by total, used and free
        memory, each memory value suffixed with `` MB``.

    Raises:
        IndexError: If ``nvidia-smi`` printed fewer than four comma-separated
            fields, e.g. because the command is unavailable on this machine.
    """
    query = ('nvidia-smi --query-gpu=name,memory.total,memory.used,memory.free '
             '--format=csv,noheader,nounits')

    # Close the pipe deterministically once the output has been read.
    with os.popen(query) as pipe:
        gpu_info = pipe.read().strip()

    # nvidia-smi prints one CSV line per GPU. Splitting the whole output at
    # once would merge the last field of one GPU with the name of the next,
    # so only the first line is parsed.
    lines = gpu_info.splitlines()
    first_gpu = lines[0] if lines else ""
    details = [field.strip() for field in first_gpu.split(",")]

    # Keep raising IndexError (as the unguarded indexing did before) so that
    # existing callers catching it continue to work, but with a clear message.
    if len(details) < 4:
        raise IndexError("nvidia-smi returned no usable GPU information")

    name, total, used, free = details[:4]
    return name, f"{total} MB", f"{used} MB", f"{free} MB"

# Collect system information
cpu_info = get_cpu_info()
total_ram, available_ram = get_ram_info()
try:
    gpu_name, gpu_total_memory, gpu_used_memory, gpu_free_memory = get_gpu_info()
except Exception:
    # Best effort: when no GPU details can be obtained, fall back to
    # placeholders. Catching Exception (not a bare except) lets
    # KeyboardInterrupt and SystemExit propagate.
    gpu_name, gpu_total_memory, gpu_used_memory, gpu_free_memory = 'null', 0, 0, 0

# Extract relevant CPU information
cpu_type = ""
cpu_architecture = ""

for line in cpu_info.splitlines():
    # Split on the first colon only so values containing ':' stay intact, and
    # compare the whole key so that e.g. "BIOS Model name" is not mistaken
    # for "Model name".
    key, _, value = line.partition(":")
    key = key.strip()
    if key == "Model name":
        cpu_type = value.strip()
    elif key == "Architecture":
        cpu_architecture = value.strip()

# Create a table of (characteristic, value) rows
table = [
    ["CPU Type", cpu_type],
    ["CPU Architecture", cpu_architecture],
    ["Total RAM", total_ram],
    ["Available RAM", available_ram],
    ["GPU Name", gpu_name],
    ["GPU Total Memory", gpu_total_memory],
    ["GPU Used Memory", gpu_used_memory],
    ["GPU Free Memory", gpu_free_memory],
]

# Display the table
print(tabulate(table, headers=["Characteristic", "Value"], tablefmt="pretty"))


+------------------+--------------------------------+
|  Characteristic  |             Value              |
+------------------+--------------------------------+
|     CPU Type     | Intel(R) Xeon(R) CPU @ 2.20GHz |
| CPU Architecture |             x86_64             |
|    Total RAM     |            54.76 GB            |
|  Available RAM   |            53.26 GB            |
|     GPU Name     |              null              |
| GPU Total Memory |               0                |
| GPU Used Memory  |               0                |
| GPU Free Memory  |               0                |
+------------------+--------------------------------+


## 1. Data Acquisition and Preprocessing

In this section, we focus on acquiring the above mentioned datasets.

### 1.9. Kyoto 2006+ dataset

The Kyoto (Kyoto 2006+) dataset, often referred to as the Kyoto University Honeypot Dataset, stands out for its honeypot-based intrusion detection and analysis methodology. Developed by Kyoto University, this dataset plays a vital role in the field of network security. It captures a wide array of cyber threats and attacks through honeypot sensors, which mimic vulnerable systems, attracting malicious activity for analysis. This approach allows researchers to gather real-world data on various types of attacks, including but not limited to, network scanning, brute force, and distributed denial of service (DDoS) attacks. This resource is invaluable for honing intrusion detection systems and advancing cybersecurity research and practices.

Due to the large amount of data in the Kyoto dataset (126 GB across 3,257 files), we focus only on the data gathered in the last year of the experiment, 2015. You can download the entire dataset by executing the following routine:

### Download and Unzip Kyoto 2006+ dataset

***I'm using Bash commands for download, unzip, and initial preprocessing because they are much faster and the large dataset size may cause Python to crash.***

In [None]:
!wget --mirror -np -nH --cut-dirs=1 -P '/content/drive/MyDrive/Kyoto2006+' -N -r -A '*' https://www.takakura.com/Kyoto_data/new_data201704/

--2023-10-13 11:41:37--  https://www.takakura.com/Kyoto_data/new_data201704/
Resolving www.takakura.com (www.takakura.com)... 150.100.120.5, 2001:2f8:3a:2100::5
Connecting to www.takakura.com (www.takakura.com)|150.100.120.5|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 9913 (9.7K) [text/html]
Saving to: ‘/content/drive/MyDrive/Kyoto2006+/new_data201704/index.html’


2023-10-13 11:41:38 (10.5 MB/s) - ‘/content/drive/MyDrive/Kyoto2006+/new_data201704/index.html’ saved [9913/9913]

Loading robots.txt; please ignore errors.
--2023-10-13 11:41:38--  https://www.takakura.com/robots.txt
Reusing existing connection to www.takakura.com:443.
HTTP request sent, awaiting response... 404 Not Found
2023-10-13 11:41:38 ERROR 404: Not Found.

--2023-10-13 11:41:38--  https://www.takakura.com/Kyoto_data/new_data201704/2006/2006.zip
Reusing existing connection to www.takakura.com:443.
HTTP request sent, awaiting response... 200 OK
Length: 90748747 (87M) [application/zip]
Sav

In [None]:
!ls -ahl '/content/drive/MyDrive/Kyoto2006+'

total 4.0K
drwx------ 2 root root 4.0K Oct 13 11:41 new_data201704


In [None]:
!find '/content/drive/MyDrive/Kyoto2006+/new_data201704/' -type f  -name "*.zip"

/content/drive/MyDrive/Kyoto2006+/new_data201704/2006/2006.zip
/content/drive/MyDrive/Kyoto2006+/new_data201704/2006/200611.zip
/content/drive/MyDrive/Kyoto2006+/new_data201704/2006/200612.zip
/content/drive/MyDrive/Kyoto2006+/new_data201704/2007/2007.zip
/content/drive/MyDrive/Kyoto2006+/new_data201704/2007/200701.zip
/content/drive/MyDrive/Kyoto2006+/new_data201704/2007/200702.zip
/content/drive/MyDrive/Kyoto2006+/new_data201704/2007/200703.zip
/content/drive/MyDrive/Kyoto2006+/new_data201704/2007/200704.zip
/content/drive/MyDrive/Kyoto2006+/new_data201704/2007/200705.zip
/content/drive/MyDrive/Kyoto2006+/new_data201704/2007/200706.zip
/content/drive/MyDrive/Kyoto2006+/new_data201704/2007/200707.zip
/content/drive/MyDrive/Kyoto2006+/new_data201704/2007/200708.zip
/content/drive/MyDrive/Kyoto2006+/new_data201704/2007/200709.zip
/content/drive/MyDrive/Kyoto2006+/new_data201704/2007/200710.zip
/content/drive/MyDrive/Kyoto2006+/new_data201704/2007/200711.zip
/content/drive/MyDrive/Kyoto2

In [None]:
import os
import zipfile
import shutil

# Function to extract all TXT files from ZIP archives
def extract_txt_files_from_zips(source_folder, target_folder):
    """Extract every ``.txt`` member of every ZIP archive under *source_folder*.

    Archives are discovered recursively with ``os.walk``. Each matching member
    is written directly into *target_folder* under its base name, so the
    directory layout inside the archives is flattened and members sharing a
    base name overwrite one another. The path of every archive and of every
    written file is printed.

    Args:
        source_folder: Root of the directory tree searched for ``.zip`` files.
        target_folder: Directory receiving the extracted files; it is created
            when it does not exist yet.
    """
    os.makedirs(target_folder, exist_ok=True)

    for root, _, files in os.walk(source_folder):
        for file in files:
            if not file.endswith(".zip"):
                continue

            zip_path = os.path.join(root, file)
            print(zip_path)
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                for member in zip_ref.infolist():
                    if not member.filename.endswith(".txt"):
                        continue

                    txt_file_path = os.path.join(target_folder, os.path.basename(member.filename))
                    # Stream the member to disk in blocks rather than reading
                    # the whole decompressed file into memory first.
                    with zip_ref.open(member) as member_stream, open(txt_file_path, 'wb') as txt_file:
                        shutil.copyfileobj(member_stream, txt_file)
                    print(txt_file_path)

source_folder = '/content/drive/MyDrive/Kyoto2006+/new_data201704'
target_folder = '/content/drive/MyDrive/Kyoto2006+/extracted'

# Ensure the target folder exists. exist_ok=True replaces the separate
# existence check, which could race with another process creating the folder.
os.makedirs(target_folder, exist_ok=True)

# Call the function to extract and flatten the TXT files
extract_txt_files_from_zips(source_folder, target_folder)

print("Extraction and flattening completed.")


In [None]:
!head /content/drive/MyDrive/Kyoto2006+/extracted/20061101.txt

27.561208	smtp	3179	175	0	0.00	0.00	0.00	0	0	0.00	0.00	0.00	SF	0	0	0	1	fda2:69aa:1f1a:84b0:130d:2736:3fa0:42da	2161	fda2:69aa:1f1a:61a4:7dc5:27f2:0713:0f0e	25	00:00:09	tcp
0.000000	other	0	0	0	0.00	0.00	0.00	0	0	0.00	0.00	0.00	S0	0	0	0	-1	fda2:69aa:1f1a:0104:3fff:571a:ff2c:00a5	138	fda2:69aa:1f1a:2108:3f84:570e:ffe4:007b	138	00:00:14	udp
86366.249616	other	244776	0	0	0.00	0.00	0.00	0	0	0.00	0.00	0.00	S0	0	0	0	-1	fda2:69aa:1f1a:540c:7d80:2750:07a6:28a5	32770	fda2:69aa:1f1a:0d61:1001:01e2:02fb:2a22	8649	00:00:15	udp
2994.374758	other	15744	18154	0	0.00	0.00	0.00	0	0	0.00	0.00	0.00	RSTOS0	0	0	0	-1	fda2:69aa:1f1a:3aef:7af3:3027:3045:7ff2	1400	fda2:69aa:1f1a:e714:277f:10e1:03f2:425a	80	00:00:16	tcp
4.749378	smtp	7895	244	0	0.00	0.00	0.00	0	0	0.00	0.00	0.00	SF	0	0	0	1	fda2:69aa:1f1a:381e:25aa:0bff:12e8:0365	1806	fda2:69aa:1f1a:61a4:7dc5:27f2:0713:0f0e	25	00:00:17	tcp
10358.914673	other	2932	3500	0	0.00	0.00	0.00	0	0	0.00	0.00	0.00	SF	0	0	0	-1	fda2:69aa:1f1a:2c7e:7a23:304a:30ba:01e5	1723	fda2

In [None]:
# Count the resulting dataset files
!ls /content/drive/MyDrive/Kyoto2006+/extracted/ | wc -l

3257


In [None]:
# Size of the entire Kyoto2006+ dataset
!du -sh /content/drive/MyDrive/Kyoto2006+/extracted/

126G	/content/drive/MyDrive/Kyoto2006+/extracted/


In [None]:
%%bash

# Names of the 24 record fields, listed in file order
column_names=(
    "Duration"
    "Service"
    "Source bytes"
    "Destination bytes"
    "Count"
    "Same srv rate"
    "Serror rate"
    "Srv serror rate"
    "Dst host count"
    "Dst host srv count"
    "Dst host same src port rate"
    "Dst host serror rate"
    "Dst host srv serror rate"
    "Flag"
    "IDS detection"
    "Malware detection"
    "Ashula detection"
    "Label"
    "Source IP Address"
    "Source Port Number"
    "Destination IP Address"
    "Destination Port Number"
    "Start Time"
    "Protocol"
)

# Join the names with single spaces: append a space after every name, then
# strip the one trailing space.
# NOTE(review): several names contain spaces themselves, so this header cannot
# be split back into 24 columns; confirm whether a tab-joined header is wanted.
printf -v header_text '%s ' "${column_names[@]}"
header_text="${header_text% }"

# Write the header line to header.txt
printf '%s\n' "$header_text" > header.txt

# Confirm creation and show the result
echo "Header file 'header.txt' is created."

cat header.txt


Header file 'header.txt' is created.
Duration Service Source bytes Destination bytes Count Same srv rate Serror rate Srv serror rate Dst host count Dst host srv count Dst host same src port rate Dst host serror rate Dst host srv serror rate Flag IDS detection Malware detection Ashula detection Label Source IP Address Source Port Number Destination IP Address Destination Port Number Start Time Protocol


In [None]:
%%bash

# Directory holding the extracted per-day files
source_dir="/content/drive/MyDrive/Kyoto2006+/extracted"

# Directory (relative to the current working directory) that receives one
# concatenated file per year
destination_dir="concatenated/"

# Create the destination directory if it doesn't exist
mkdir -p "$destination_dir"

# Make a glob without matches expand to nothing instead of to the literal
# pattern, so a year without files can be detected below
shopt -s nullglob

# Loop through the years
for year in {2006..2015}; do
    # Collect the files of the current year in an array; the directory part is
    # quoted so the path is never subject to word splitting
    files_for_year=("$source_dir/$year"*.txt)

    # Do not report success (or leave an empty output file) for a year that
    # has no input files
    if (( ${#files_for_year[@]} == 0 )); then
        echo "No files found for $year, skipping." >&2
        continue
    fi

    # Concatenate the files and stop on failure (e.g. disk full) rather than
    # announcing a completed concatenation for a truncated file
    if ! cat -- "${files_for_year[@]}" > "${destination_dir}${year}_concatenated.txt"; then
        echo "Concatenation failed for $year." >&2
        exit 1
    fi

    # Print a message to indicate the concatenation is completed for the year
    echo "Concatenation completed for $year"
done

# Print a final message
echo "All concatenations completed."


Concatenation completed for 2006
Concatenation completed for 2007
Concatenation completed for 2008
Concatenation completed for 2009
Concatenation completed for 2010
Concatenation completed for 2011
Concatenation completed for 2012
Concatenation completed for 2013
Concatenation completed for 2014
Concatenation completed for 2015
All concatenations completed.


In [None]:
!ls /content/concatenated -ahl

total 126G
drwxr-xr-x 2 root root 4.0K Oct 14 07:58 .
drwxr-xr-x 1 root root 4.0K Oct 14 06:54 ..
-rw-r--r-- 1 root root 683M Oct 14 06:54 2006_concatenated.txt
-rw-r--r-- 1 root root 4.3G Oct 14 07:00 2007_concatenated.txt
-rw-r--r-- 1 root root 6.9G Oct 14 07:06 2008_concatenated.txt
-rw-r--r-- 1 root root  14G Oct 14 07:14 2009_concatenated.txt
-rw-r--r-- 1 root root  28G Oct 14 07:25 2010_concatenated.txt
-rw-r--r-- 1 root root  14G Oct 14 07:33 2011_concatenated.txt
-rw-r--r-- 1 root root 8.1G Oct 14 07:40 2012_concatenated.txt
-rw-r--r-- 1 root root  15G Oct 14 07:49 2013_concatenated.txt
-rw-r--r-- 1 root root  18G Oct 14 07:58 2014_concatenated.txt
-rw-r--r-- 1 root root  19G Oct 14 08:07 2015_concatenated.txt


In [None]:
!head /content/concatenated/2015_concatenated.txt

0.000000	dhcp	0	0	0	0.00	0.00	0.00	0	0	0.00	0.00	0.00	S0	0	0	0	-1	fd0e:9e00:6814:170a:4341:33f8:2fa1:0cf2	59619	fd0e:9e00:6814:71a4:7d24:27c6:60ef:310c	67	17:51:17	udp
0.000000	dhcp	0	0	0	0.00	0.00	0.00	0	0	0.00	0.00	0.00	S0	0	0	0	-1	fd0e:9e00:6814:170a:4341:33f8:2fa1:0cf2	59619	fd0e:9e00:6814:c1fd:7d27:27a7:601e:21fe	67	00:16:12	udp
0.000000	dhcp	0	0	0	0.00	0.00	0.00	0	0	0.00	0.00	0.00	S0	0	0	0	-1	fd0e:9e00:6814:170a:4341:33f8:2fa1:0cf2	59619	fd0e:9e00:6814:d7ee:7ded:27d9:0fbc:15f7	67	10:52:21	udp
0.000000	dhcp	0	0	0	0.00	0.00	0.00	0	0	0.00	0.00	0.00	S0	0	0	0	-1	fd0e:9e00:6814:2598:39c5:0fc6:2dbe:1b7e	59619	fd0e:9e00:6814:0681:7dec:2773:605d:1b1c	67	12:41:33	udp
0.000000	dhcp	0	0	0	0.00	0.00	0.00	0	0	0.00	0.00	0.00	S0	0	0	0	-1	fd0e:9e00:6814:2598:39c5:0fc6:2dbe:1b7e	59619	fd0e:9e00:6814:0a6e:7ddb:27b6:60eb:5941	67	13:19:07	udp
0.000000	dhcp	0	0	0	0.00	0.00	0.00	0	0	0.00	0.00	0.00	S0	0	0	0	-1	fd0e:9e00:6814:2598:39c5:0fc6:2dbe:1b7e	59619	fd0e:9e00:6814:149f:7d8a:273f:60cc:0858	67	20:10

In [None]:
!cp /content/concatenated/2015_concatenated.txt /content/drive/MyDrive/Kyoto2006+/2015_concatenated.txt

### Preprocessing of the Kyoto 2015 dataset

In [None]:
!wc -l /content/concatenated/2015_concatenated.txt

115574748 /content/concatenated/2015_concatenated.txt


In [None]:
!cut -f 18 -d $'\t' /content/concatenated/2015_concatenated.txt | sort | uniq

-1
1
-2


In [None]:
import pandas as pd
import warnings
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
import gc

# Silence every warning category from here on.
# NOTE(review): this blanket filter also hides pandas' DtypeWarning about
# mixed-type columns - confirm that is intended.
warnings.filterwarnings("ignore")

# Location of the concatenated 2015 records
file_path = '/content/drive/MyDrive/Kyoto2006+/2015_concatenated.txt'

# Names given to the 24 columns, in file order
column_names = [
    'Duration',
    'Service',
    'Source bytes',
    'Destination bytes',
    'Count',
    'Same srv rate',
    'Serror rate',
    'Srv serror rate',
    'Dst host count',
    'Dst host srv count',
    'Dst host same src port rate',
    'Dst host serror rate',
    'Dst host srv serror rate',
    'Flag',
    'IDS detection',
    'Malware detection',
    'Ashula detection',
    'Label',
    'Source IP Address',
    'Source Port Number',
    'Destination IP Address',
    'Destination Port Number',
    'Start Time',
    'Protocol',
]

# Rows read per chunk (10 million); raise or lower to fit the available memory
chunk_size = 10_000_000

# Counter of processed chunks
i = 0

# Define a custom function to map values to 1 if non-zero, otherwise to 0
def map_to_binary(value):
    """Map a detection field to a binary flag.

    Args:
        value: Raw cell value. It may be a string or a number, depending on
            the dtype pandas inferred for the column.

    Returns:
        int: 0 when *value* is the string ``'0'`` (surrounding whitespace
        ignored) or a numeric zero, otherwise 1.
    """
    if isinstance(value, str):
        return 0 if value.strip() == '0' else 1
    # A column that pandas parsed as numeric yields the integer 0, which never
    # equals the string '0'; it must still count as "no detection".
    return 0 if value == 0 else 1

# Mapping function to convert labels
def map_label(label):
    """Collapse a dataset label into a binary class.

    Returns:
        int: 0 (normal) when *label* equals 1, and 1 (attack) for every
        other value.
    """
    is_normal = label == 1
    return 0 if is_normal else 1

# Processed chunks are gathered in a list and concatenated once after the
# loop; calling pd.concat inside the loop re-copies all earlier rows on every
# iteration.
processed_chunks = []

# map_to_binary compares these columns against the string '0'. Reading them
# as str keeps pandas from inferring a numeric dtype for them, in which case
# the integer 0 would not match that string.
detection_columns = ['IDS detection', 'Malware detection', 'Ashula detection']

# Open the file and read it in chunks
for chunk in pd.read_csv(file_path, sep='\t', header=None, names=column_names,
                         chunksize=chunk_size,
                         dtype={col: str for col in detection_columns}):

    # Skip chunks whose in-memory size exceeds the threshold, and say so:
    # a silently dropped chunk would go unnoticed in the final row count.
    mem_usage = chunk.memory_usage(deep=True).sum() / (1024**2)  # in MB
    if mem_usage >= 47000:  # memory threshold, adjust as needed
        print(f"WARNING: skipping a chunk of {len(chunk)} rows "
              f"({mem_usage:.0f} MB exceeds the memory threshold)")
        continue

    # Print the chunk number
    i += 1
    print("Processing chunk", i)

    # Step 1 (imputation of missing values) is not performed; the per-column
    # NaN counts are displayed instead so missing data would be visible.
    display(chunk.isna().sum(axis=0))

    # 2 # Encode Categorical Features and Label

    # Binary-encode the detection columns: 0 - no detection / 1 - alert
    for col in detection_columns:
        chunk[col] = chunk[col].apply(map_to_binary).astype(int)
    for col in detection_columns:
        print(chunk[col].unique())

    # Before applying map_label
    print("Type of 'Label' column:", chunk['Label'].dtype)
    print("Unique values in 'Label' column before mapping:")
    print(chunk['Label'].unique())

    # Apply the mapping function to the 'Label' column
    chunk['Label'] = chunk['Label'].apply(map_label).astype(int)

    # After applying map_label
    print("Type of 'Label' column after mapping:", chunk['Label'].dtype)
    print("Unique values in 'Label' column after mapping:")
    print(chunk['Label'].unique())

    # NOTE(review): LabelEncoder and MinMaxScaler are re-fitted on every
    # chunk, so the same category or raw value can be encoded differently in
    # different chunks, and the scaler also converts 'Label' and the
    # detection flags to floats. Fitting once over the whole file would need
    # an additional pass; left unchanged here - confirm whether this is
    # acceptable for the downstream evaluation.
    categorical_columns = [col for col in chunk.columns if chunk[col].dtype == 'object']
    label_encoder = LabelEncoder()
    for col in categorical_columns:
        try:
            chunk[col] = label_encoder.fit_transform(chunk[col])
        except (TypeError, ValueError):
            # Show the column that could not be encoded and carry on with the
            # remaining columns.
            display(chunk[col])

    # 3 # Normalization (Min-Max Scaling)
    numeric_columns = chunk.select_dtypes(exclude=['object']).columns
    scaler = MinMaxScaler()
    chunk[numeric_columns] = scaler.fit_transform(chunk[numeric_columns])

    # Step 4 (duplicate removal) is not performed.
    # NOTE(review): no de-duplication step is visible in the earlier unzip and
    # concatenation cells - confirm that duplicates were really handled.

    # Keep the processed chunk for the final concatenation
    processed_chunks.append(chunk)
    gc.collect()

# Build the final DataFrame in a single concatenation
if processed_chunks:
    df = pd.concat(processed_chunks, ignore_index=True)
else:
    df = pd.DataFrame(columns=column_names)

# Drop the list so only `df` (and the last `chunk`) keep the data alive
del processed_chunks
gc.collect()

Processing chunk 1


Duration                       0
Service                        0
Source bytes                   0
Destination bytes              0
Count                          0
Same srv rate                  0
Serror rate                    0
Srv serror rate                0
Dst host count                 0
Dst host srv count             0
Dst host same src port rate    0
Dst host serror rate           0
Dst host srv serror rate       0
Flag                           0
IDS detection                  0
Malware detection              0
Ashula detection               0
Label                          0
Source IP Address              0
Source Port Number             0
Destination IP Address         0
Destination Port Number        0
Start Time                     0
Protocol                       0
dtype: int64

[1 0]
[1 0]
[1 0]
Type of 'Label' column: int64
Unique values in 'Label' column before mapping:
[-1  1 -2]
Type of 'Label' column after mapping: int64
Unique values in 'Label' column after mapping:
[1 0]
Processing chunk 2


Duration                       0
Service                        0
Source bytes                   0
Destination bytes              0
Count                          0
Same srv rate                  0
Serror rate                    0
Srv serror rate                0
Dst host count                 0
Dst host srv count             0
Dst host same src port rate    0
Dst host serror rate           0
Dst host srv serror rate       0
Flag                           0
IDS detection                  0
Malware detection              0
Ashula detection               0
Label                          0
Source IP Address              0
Source Port Number             0
Destination IP Address         0
Destination Port Number        0
Start Time                     0
Protocol                       0
dtype: int64

[1 0]
[1 0]
[1 0]
Type of 'Label' column: int64
Unique values in 'Label' column before mapping:
[-1  1 -2]
Type of 'Label' column after mapping: int64
Unique values in 'Label' column after mapping:
[1 0]
Processing chunk 3


Duration                       0
Service                        0
Source bytes                   0
Destination bytes              0
Count                          0
Same srv rate                  0
Serror rate                    0
Srv serror rate                0
Dst host count                 0
Dst host srv count             0
Dst host same src port rate    0
Dst host serror rate           0
Dst host srv serror rate       0
Flag                           0
IDS detection                  0
Malware detection              0
Ashula detection               0
Label                          0
Source IP Address              0
Source Port Number             0
Destination IP Address         0
Destination Port Number        0
Start Time                     0
Protocol                       0
dtype: int64

[1 0]
[1 0]
[1 0]
Type of 'Label' column: int64
Unique values in 'Label' column before mapping:
[-1  1]
Type of 'Label' column after mapping: int64
Unique values in 'Label' column after mapping:
[1 0]
Processing chunk 4


Duration                       0
Service                        0
Source bytes                   0
Destination bytes              0
Count                          0
Same srv rate                  0
Serror rate                    0
Srv serror rate                0
Dst host count                 0
Dst host srv count             0
Dst host same src port rate    0
Dst host serror rate           0
Dst host srv serror rate       0
Flag                           0
IDS detection                  0
Malware detection              0
Ashula detection               0
Label                          0
Source IP Address              0
Source Port Number             0
Destination IP Address         0
Destination Port Number        0
Start Time                     0
Protocol                       0
dtype: int64

[1 0]
[1 0]
[1 0]
Type of 'Label' column: int64
Unique values in 'Label' column before mapping:
[-1  1 -2]
Type of 'Label' column after mapping: int64
Unique values in 'Label' column after mapping:
[1 0]
Processing chunk 5


Duration                       0
Service                        0
Source bytes                   0
Destination bytes              0
Count                          0
Same srv rate                  0
Serror rate                    0
Srv serror rate                0
Dst host count                 0
Dst host srv count             0
Dst host same src port rate    0
Dst host serror rate           0
Dst host srv serror rate       0
Flag                           0
IDS detection                  0
Malware detection              0
Ashula detection               0
Label                          0
Source IP Address              0
Source Port Number             0
Destination IP Address         0
Destination Port Number        0
Start Time                     0
Protocol                       0
dtype: int64

[0 1]
[0 1]
[1 0]
Type of 'Label' column: int64
Unique values in 'Label' column before mapping:
[-1  1 -2]
Type of 'Label' column after mapping: int64
Unique values in 'Label' column after mapping:
[1 0]
Processing chunk 6


Duration                       0
Service                        0
Source bytes                   0
Destination bytes              0
Count                          0
Same srv rate                  0
Serror rate                    0
Srv serror rate                0
Dst host count                 0
Dst host srv count             0
Dst host same src port rate    0
Dst host serror rate           0
Dst host srv serror rate       0
Flag                           0
IDS detection                  0
Malware detection              0
Ashula detection               0
Label                          0
Source IP Address              0
Source Port Number             0
Destination IP Address         0
Destination Port Number        0
Start Time                     0
Protocol                       0
dtype: int64

[0 1]
[1 0]
[1 0]
Type of 'Label' column: int64
Unique values in 'Label' column before mapping:
[-1  1 -2]
Type of 'Label' column after mapping: int64
Unique values in 'Label' column after mapping:
[1 0]
Processing chunk 7


Duration                       0
Service                        0
Source bytes                   0
Destination bytes              0
Count                          0
Same srv rate                  0
Serror rate                    0
Srv serror rate                0
Dst host count                 0
Dst host srv count             0
Dst host same src port rate    0
Dst host serror rate           0
Dst host srv serror rate       0
Flag                           0
IDS detection                  0
Malware detection              0
Ashula detection               0
Label                          0
Source IP Address              0
Source Port Number             0
Destination IP Address         0
Destination Port Number        0
Start Time                     0
Protocol                       0
dtype: int64

[0 1]
[1 0]
[1]
Type of 'Label' column: int64
Unique values in 'Label' column before mapping:
[-1  1]
Type of 'Label' column after mapping: int64
Unique values in 'Label' column after mapping:
[1 0]
Processing chunk 8


Duration                       0
Service                        0
Source bytes                   0
Destination bytes              0
Count                          0
Same srv rate                  0
Serror rate                    0
Srv serror rate                0
Dst host count                 0
Dst host srv count             0
Dst host same src port rate    0
Dst host serror rate           0
Dst host srv serror rate       0
Flag                           0
IDS detection                  0
Malware detection              0
Ashula detection               0
Label                          0
Source IP Address              0
Source Port Number             0
Destination IP Address         0
Destination Port Number        0
Start Time                     0
Protocol                       0
dtype: int64

[0 1]
[1 0]
[1 0]
Type of 'Label' column: int64
Unique values in 'Label' column before mapping:
[ 1 -1]
Type of 'Label' column after mapping: int64
Unique values in 'Label' column after mapping:
[0 1]
Processing chunk 9


Duration                       0
Service                        0
Source bytes                   0
Destination bytes              0
Count                          0
Same srv rate                  0
Serror rate                    0
Srv serror rate                0
Dst host count                 0
Dst host srv count             0
Dst host same src port rate    0
Dst host serror rate           0
Dst host srv serror rate       0
Flag                           0
IDS detection                  0
Malware detection              0
Ashula detection               0
Label                          0
Source IP Address              0
Source Port Number             0
Destination IP Address         0
Destination Port Number        0
Start Time                     0
Protocol                       0
dtype: int64

[0 1]
[0 1]
[0 1]
Type of 'Label' column: int64
Unique values in 'Label' column before mapping:
[-1  1 -2]
Type of 'Label' column after mapping: int64
Unique values in 'Label' column after mapping:
[1 0]
Processing chunk 10


Duration                       0
Service                        0
Source bytes                   0
Destination bytes              0
Count                          0
Same srv rate                  0
Serror rate                    0
Srv serror rate                0
Dst host count                 0
Dst host srv count             0
Dst host same src port rate    0
Dst host serror rate           0
Dst host srv serror rate       0
Flag                           0
IDS detection                  0
Malware detection              0
Ashula detection               0
Label                          0
Source IP Address              0
Source Port Number             0
Destination IP Address         0
Destination Port Number        0
Start Time                     0
Protocol                       0
dtype: int64

[1 0]
[0 1]
[0 1]
Type of 'Label' column: int64
Unique values in 'Label' column before mapping:
[ 1 -1 -2]
Type of 'Label' column after mapping: int64
Unique values in 'Label' column after mapping:
[0 1]
Processing chunk 11


Duration                       0
Service                        0
Source bytes                   0
Destination bytes              0
Count                          0
Same srv rate                  0
Serror rate                    0
Srv serror rate                0
Dst host count                 0
Dst host srv count             0
Dst host same src port rate    0
Dst host serror rate           0
Dst host srv serror rate       0
Flag                           0
IDS detection                  0
Malware detection              0
Ashula detection               0
Label                          0
Source IP Address              0
Source Port Number             0
Destination IP Address         0
Destination Port Number        0
Start Time                     0
Protocol                       0
dtype: int64

[0 1]
[0 1]
[0 1]
Type of 'Label' column: int64
Unique values in 'Label' column before mapping:
[-1  1 -2]
Type of 'Label' column after mapping: int64
Unique values in 'Label' column after mapping:
[1 0]
Processing chunk 12


Duration                       0
Service                        0
Source bytes                   0
Destination bytes              0
Count                          0
Same srv rate                  0
Serror rate                    0
Srv serror rate                0
Dst host count                 0
Dst host srv count             0
Dst host same src port rate    0
Dst host serror rate           0
Dst host srv serror rate       0
Flag                           0
IDS detection                  0
Malware detection              0
Ashula detection               0
Label                          0
Source IP Address              0
Source Port Number             0
Destination IP Address         0
Destination Port Number        0
Start Time                     0
Protocol                       0
dtype: int64

[0 1]
[0 1]
[0 1]
Type of 'Label' column: int64
Unique values in 'Label' column before mapping:
[-1  1 -2]
Type of 'Label' column after mapping: int64
Unique values in 'Label' column after mapping:
[1 0]


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 115574748 entries, 0 to 115574747
Data columns (total 24 columns):
 #   Column                       Dtype  
---  ------                       -----  
 0   Duration                     float64
 1   Service                      float64
 2   Source bytes                 float64
 3   Destination bytes            float64
 4   Count                        float64
 5   Same srv rate                float64
 6   Serror rate                  float64
 7   Srv serror rate              float64
 8   Dst host count               float64
 9   Dst host srv count           float64
 10  Dst host same src port rate  float64
 11  Dst host serror rate         float64
 12  Dst host srv serror rate     float64
 13  Flag                         float64
 14  IDS detection                float64
 15  Malware detection            float64
 16  Ashula detection             float64
 17  Label                        float64
 18  Source IP Address            float64
 

In [None]:
# Save the final DataFrame to a Parquet file
df.to_parquet('/content/processed_data.parquet', index=False)

In [None]:
!cp /content/processed_data.parquet /content/drive/MyDrive/Kyoto2006+/Kyoto2015_encoded.parquet

In [None]:
df.shape

(115574748, 24)

In [None]:
# Print out the DataFrames loaded in the memory
%whos DataFrame

Variable   Type         Data/Info
---------------------------------
chunk      DataFrame               Duration   Ser<...>574748 rows x 24 columns]
df         DataFrame    

In [None]:
!du -sh /content/processed_data.parquet

1.4G	/content/processed_data.parquet


## 2. Algorithm Evaluation

In this section, we assess the performance of various machine learning algorithms on the above-mentioned datasets.

### 2.4. Kyoto2015 dataset evaluation with baseline and traditional ML algorithms

In this section, we evaluate the performance of various machine learning algorithms on the Kyoto2015 dataset. We assess the precision and F1 scores, essential indicators of classification accuracy, for a range of algorithms, including fundamental classifiers like Zero Rule and One Rule, statistical approaches like Naive Bayes, and more advanced models such as Random Forest.

Given that Kyoto2015 is a large dataset, we sampled it down to 10% to ensure a robust assessment and to be able to employ a 10-fold cross-validation methodology. These results offer valuable insights into the optimal dataset generation strategy, aiding in the selection of the most effective feature extraction methods for cybersecurity-specific datasets.

In [None]:
# Mount Google Drive at /content/drive; the dataset and results paths used
# below live under that mount point
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import pyarrow.parquet as pq

# Path of the encoded Kyoto 2015 Parquet file on the mounted Google Drive
parquet_file_path = '/content/drive/MyDrive/Kyoto2006+/Kyoto2015_encoded.parquet'

# Read the whole file as an Arrow table and convert it to a pandas DataFrame
df = pq.read_table(parquet_file_path).to_pandas()

In [None]:
# Show the (rows, columns) shape of the DataFrame loaded from the Parquet file
df.shape

(115574748, 24)

In [None]:
# Draw a random sample of df with a fixed seed so the draw is repeatable;
# part_df is the data that X and y are built from further down
part_df = df.sample(frac=0.01, random_state=42)  # frac=0.01 -> 1% of the rows

In [None]:
# Show the (rows, columns) shape of the sampled DataFrame
part_df.shape

(1155747, 24)

In [None]:
# Class distribution of the FULL dataset: this is computed on df (all rows),
# not on the sample. The variable name is kept unchanged so that any other
# cell referring to it keeps working.
label_counts_Kyoto_sample = df['Label'].value_counts()

# Display the counts per label for df
print("Label counts for the full Kyoto 2015 dataset:")
print(label_counts_Kyoto_sample)

# Class distribution of the sampled subset, to compare against the full data
label_counts_df = part_df['Label'].value_counts()

# Display the counts per label for part_df
print("\nLabel counts for part_df:")
print(label_counts_df)


Label counts for the sample from Kyoto 2015:
1.0    109256672
0.0      6318076
Name: Label, dtype: int64

Label counts for part_df:
1.0    1092952
0.0      62795
Name: Label, dtype: int64


In [None]:
import gc

# Drop the reference to the full DataFrame (part_df is used from here on),
# then trigger a garbage-collection pass; the cell displays its return value
del df
gc.collect()

0

In [None]:
# Split the sample into the target column (y) and all remaining columns (X)
y = part_df['Label']
X = part_df.drop(columns='Label')

In [None]:
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.pipeline import Pipeline

# One single-step pipeline per feature budget: each keeps the k columns that
# score highest under f_classif with respect to the label
pipeline_10_features = Pipeline(steps=[
    ('selector_10', SelectKBest(score_func=f_classif, k=10)),
])
pipeline_20_features = Pipeline(steps=[
    ('selector_20', SelectKBest(score_func=f_classif, k=20)),
])

# NOTE(review): the selectors are fitted on all of X/y, i.e. before any
# cross-validation split is made - confirm this is acceptable for the benchmark
X_selected_10 = pipeline_10_features.fit_transform(X, y)
X_selected_20 = pipeline_20_features.fit_transform(X, y)

# Shapes of the reduced feature matrices (10 columns, then 20 columns)
print(X_selected_10.shape)
print(X_selected_20.shape)

# Translate the positions kept by each selector back into column names of X
selected_feature_indices_10 = pipeline_10_features.named_steps['selector_10'].get_support(indices=True)
selected_features_10 = X.columns[selected_feature_indices_10]
print("Selected 10 features:")
print(selected_features_10)

selected_feature_indices_20 = pipeline_20_features.named_steps['selector_20'].get_support(indices=True)
selected_features_20 = X.columns[selected_feature_indices_20]
print("\nSelected 20 features:")
print(selected_features_20)

(1155747, 10)
(1155747, 20)
Selected 10 features:
Index(['Duration', 'Same srv rate', 'Srv serror rate', 'Dst host count',
       'Dst host srv count', 'Dst host srv serror rate', 'Flag',
       'IDS detection', 'Source Port Number', 'Protocol'],
      dtype='object')

Selected 20 features:
Index(['Duration', 'Service', 'Count', 'Same srv rate', 'Serror rate',
       'Srv serror rate', 'Dst host count', 'Dst host srv count',
       'Dst host same src port rate', 'Dst host serror rate',
       'Dst host srv serror rate', 'Flag', 'IDS detection', 'Ashula detection',
       'Source IP Address', 'Source Port Number', 'Destination IP Address',
       'Destination Port Number', 'Start Time', 'Protocol'],
      dtype='object')


In [None]:
# Names of the 10 selected features, written out as a plain list
# (presumably so later cells can run without repeating the selection step)
selected_features_10 = [
    'Duration',
    'Same srv rate',
    'Srv serror rate',
    'Dst host count',
    'Dst host srv count',
    'Dst host srv serror rate',
    'Flag',
    'IDS detection',
    'Source Port Number',
    'Protocol',
]
print(selected_features_10)

# Names of the 20 selected features, written out the same way
selected_features_20 = [
    'Duration',
    'Service',
    'Count',
    'Same srv rate',
    'Serror rate',
    'Srv serror rate',
    'Dst host count',
    'Dst host srv count',
    'Dst host same src port rate',
    'Dst host serror rate',
    'Dst host srv serror rate',
    'Flag',
    'IDS detection',
    'Ashula detection',
    'Source IP Address',
    'Source Port Number',
    'Destination IP Address',
    'Destination Port Number',
    'Start Time',
    'Protocol',
]
print(selected_features_20)

['Duration', 'Same srv rate', 'Srv serror rate', 'Dst host count', 'Dst host srv count', 'Dst host srv serror rate', 'Flag', 'IDS detection', 'Source Port Number', 'Protocol']
['Duration', 'Service', 'Count', 'Same srv rate', 'Serror rate', 'Srv serror rate', 'Dst host count', 'Dst host srv count', 'Dst host same src port rate', 'Dst host serror rate', 'Dst host srv serror rate', 'Flag', 'IDS detection', 'Ashula detection', 'Source IP Address', 'Source Port Number', 'Destination IP Address', 'Destination Port Number', 'Start Time', 'Protocol']


In [None]:
import warnings
import os
import pandas as pd
import numpy as np
from sklearn.metrics import precision_score, mean_absolute_error, f1_score
from sklearn.dummy import DummyClassifier
from tabulate import tabulate
import time
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

# Number of folds passed as cv= to every cross-validation call below
num_folds = 10

# Result rows collected per algorithm, one list for each feature subset
results_10_features, results_20_features = [], []

# Install a blanket "ignore" filter so that no warnings are shown
warnings.simplefilter("ignore")

In [None]:
# Google Drive folder that holds the benchmark results
destination_folder = "/content/drive/MyDrive/Kyoto2006+/"

# Full path of the pickle file the results are stored in
results_file_name = os.path.join(destination_folder, "kyoto2015_results.pkl")

# Skip the evaluation entirely when a results file already exists
if not os.path.exists(results_file_name):

  # ZeroRule baseline: always predicts the most frequent label
  zero_rule = DummyClassifier(strategy="most_frequent")

  # Timed section, 10 features: cross-validated precision and F1
  start_time = time.time()
  precision_scores_10 = cross_val_score(zero_rule, X[selected_features_10], y, scoring='precision', cv=num_folds)
  f1_scores_10 = cross_val_score(zero_rule, X[selected_features_10], y, scoring='f1', cv=num_folds)
  elapsed_time_10 = time.time() - start_time

  # Timed section, 20 features: same two measurements
  start_time = time.time()
  precision_scores_20 = cross_val_score(zero_rule, X[selected_features_20], y, scoring='precision', cv=num_folds)
  f1_scores_20 = cross_val_score(zero_rule, X[selected_features_20], y, scoring='f1', cv=num_folds)
  elapsed_time_20 = time.time() - start_time

  # Variance of the per-fold precision scores
  variance_10 = np.var(precision_scores_10)
  variance_20 = np.var(precision_scores_20)

  # Mean absolute error of the cross-validated predictions; these runs are
  # not part of the timed sections above
  predictions_10 = cross_val_predict(zero_rule, X[selected_features_10], y, cv=num_folds)
  predictions_20 = cross_val_predict(zero_rule, X[selected_features_20], y, cv=num_folds)
  mae_10 = mean_absolute_error(y, predictions_10)
  mae_20 = mean_absolute_error(y, predictions_20)

  # Result rows: [algorithm, mean precision, mean F1, variance, MAE, elapsed seconds]
  row_10 = ["ZeroRule", np.mean(precision_scores_10), np.mean(f1_scores_10), variance_10, mae_10, elapsed_time_10]
  row_20 = ["ZeroRule", np.mean(precision_scores_20), np.mean(f1_scores_20), variance_20, mae_20, elapsed_time_20]

  # Report the 10-feature figures
  print("ZeroRule Precision (10 features):", row_10[1])
  print("ZeroRule F1 Score (10 features):", row_10[2])
  print("ZeroRule Variance (10 features):", row_10[3])
  print("ZeroRule MAE (10 features):", row_10[4])
  print("ZeroRule Execution Time:", row_10[5])

  # Report the 20-feature figures
  print("ZeroRule Precision (20 features):", row_20[1])
  print("ZeroRule F1 Score (20 features):", row_20[2])
  print("ZeroRule Variance (20 features):", row_20[3])
  print("ZeroRule MAE (20 features):", row_20[4])
  print("ZeroRule Execution Time:", row_20[5])

  # Keep the rows for the summary tables
  results_10_features.append(row_10)
  results_20_features.append(row_20)

ZeroRule Precision (10 features): 0.9456671745736667
ZeroRule F1 Score (10 features): 0.9720749642355937
ZeroRule Variance (10 features): 1.7542188112392676e-11
ZeroRule MAE (10 features): 0.0543328254367089
ZeroRule Execution Time: 3.944232702255249
ZeroRule Precision (20 features): 0.9456671745736667
ZeroRule F1 Score (20 features): 0.9720749642355937
ZeroRule Variance (20 features): 1.7542188112392676e-11
ZeroRule MAE (20 features): 0.0543328254367089
ZeroRule Execution Time: 4.807303428649902


In [None]:
# Check for results before rerunning the code snippet
if not os.path.exists(results_file_name):

  # "OneRule" baseline as used in this benchmark: a DummyClassifier with the
  # "stratified" strategy, i.e. random predictions that follow the class
  # distribution seen during fit.
  # NOTE(review): this is a random guesser, not the OneR single-feature rule
  # learner; the "OneRule" label is kept so result rows keep the same name.
  # random_state is pinned so the random predictions - and with them the
  # precision, F1 and MAE figures - are repeatable from run to run.
  one_rule = DummyClassifier(strategy="stratified", random_state=42)

  # Evaluate on the 10-feature subset; only the precision and F1 runs are timed
  start_time = time.time()  # Start measuring execution time
  precision_scores_10 = cross_val_score(one_rule, X[selected_features_10], y, cv=num_folds, scoring='precision')
  f1_scores_10 = cross_val_score(one_rule, X[selected_features_10], y, cv=num_folds, scoring='f1')
  elapsed_time_10 = time.time() - start_time  # Calculate execution time

  # Same measurements on the 20-feature subset
  start_time = time.time()  # Start measuring execution time
  precision_scores_20 = cross_val_score(one_rule, X[selected_features_20], y, cv=num_folds, scoring='precision')
  f1_scores_20 = cross_val_score(one_rule, X[selected_features_20], y, cv=num_folds, scoring='f1')
  elapsed_time_20 = time.time() - start_time  # Calculate execution time

  # Variance of the per-fold precision scores
  variance_10 = np.var(precision_scores_10)
  variance_20 = np.var(precision_scores_20)

  # Mean absolute error between the labels and the cross-validated predictions
  predictions_10 = cross_val_predict(one_rule, X[selected_features_10], y, cv=num_folds)
  mae_10 = mean_absolute_error(y, predictions_10)

  predictions_20 = cross_val_predict(one_rule, X[selected_features_20], y, cv=num_folds)
  mae_20 = mean_absolute_error(y, predictions_20)

  # Display OneRule results for 10 features
  print("OneRule Precision (10 features):", np.mean(precision_scores_10))
  print("OneRule F1 Score (10 features):", np.mean(f1_scores_10))
  print("OneRule Variance (10 features):", variance_10)
  print("OneRule MAE (10 features):", mae_10)
  print("OneRule Execution Time:", elapsed_time_10)

  # Display OneRule results for 20 features
  print("OneRule Precision (20 features):", np.mean(precision_scores_20))
  print("OneRule F1 Score (20 features):", np.mean(f1_scores_20))
  print("OneRule Variance (20 features):", variance_20)
  print("OneRule MAE (20 features):", mae_20)
  print("OneRule Execution Time:", elapsed_time_20)

  # Result rows: [algorithm, mean precision, mean F1, variance, MAE, elapsed seconds]
  results_10_features.append(["OneRule", np.mean(precision_scores_10), np.mean(f1_scores_10), variance_10, mae_10, elapsed_time_10])
  results_20_features.append(["OneRule", np.mean(precision_scores_20), np.mean(f1_scores_20), variance_20, mae_20, elapsed_time_20])

OneRule Precision (10 features): 0.9456520870501178
OneRule F1 Score (10 features): 0.9456875593971354
OneRule Variance (10 features): 4.3971115713171974e-08
OneRule MAE (10 features): 0.10311253241410101
OneRule Execution Time: 3.9662866592407227
OneRule Precision (20 features): 0.9455930155581648
OneRule F1 Score (20 features): 0.9454977602039143
OneRule Variance (20 features): 1.9497232018266623e-08
OneRule MAE (20 features): 0.10294207988426533
OneRule Execution Time: 4.468806028366089


In [None]:
# Skip the evaluation entirely when a results file already exists
if not os.path.exists(results_file_name):

  # Gaussian Naive Bayes with default settings
  naive_bayes = GaussianNB()

  # Timed section, 10 features: cross-validated precision and F1
  start_time = time.time()
  precision_scores_10 = cross_val_score(naive_bayes, X[selected_features_10], y, scoring='precision', cv=num_folds)
  f1_scores_10 = cross_val_score(naive_bayes, X[selected_features_10], y, scoring='f1', cv=num_folds)
  elapsed_time_10 = time.time() - start_time

  # Timed section, 20 features: same two measurements
  start_time = time.time()
  precision_scores_20 = cross_val_score(naive_bayes, X[selected_features_20], y, scoring='precision', cv=num_folds)
  f1_scores_20 = cross_val_score(naive_bayes, X[selected_features_20], y, scoring='f1', cv=num_folds)
  elapsed_time_20 = time.time() - start_time

  # Variance of the per-fold precision scores
  variance_10 = np.var(precision_scores_10)
  variance_20 = np.var(precision_scores_20)

  # Mean absolute error of the cross-validated predictions; these runs are
  # not part of the timed sections above
  predictions_10 = cross_val_predict(naive_bayes, X[selected_features_10], y, cv=num_folds)
  predictions_20 = cross_val_predict(naive_bayes, X[selected_features_20], y, cv=num_folds)
  mae_10 = mean_absolute_error(y, predictions_10)
  mae_20 = mean_absolute_error(y, predictions_20)

  # Result rows: [algorithm, mean precision, mean F1, variance, MAE, elapsed seconds]
  row_10 = ["Naive Bayes", np.mean(precision_scores_10), np.mean(f1_scores_10), variance_10, mae_10, elapsed_time_10]
  row_20 = ["Naive Bayes", np.mean(precision_scores_20), np.mean(f1_scores_20), variance_20, mae_20, elapsed_time_20]

  # Report the 10-feature figures
  print("Naive Bayes Precision (10 features):", row_10[1])
  print("Naive Bayes F1 Score (10 features):", row_10[2])
  print("Naive Bayes Variance (10 features):", row_10[3])
  print("Naive Bayes MAE (10 features):", row_10[4])
  print("Naive Bayes Execution Time:", row_10[5])

  # Report the 20-feature figures
  print("Naive Bayes Precision (20 features):", row_20[1])
  print("Naive Bayes F1 Score (20 features):", row_20[2])
  print("Naive Bayes Variance (20 features):", row_20[3])
  print("Naive Bayes MAE (20 features):", row_20[4])
  print("Naive Bayes Execution Time:", row_20[5])

  # Keep the rows for the summary tables
  results_10_features.append(row_10)
  results_20_features.append(row_20)

Naive Bayes Precision (10 features): 0.9756340305288976
Naive Bayes F1 Score (10 features): 0.8469311043933752
Naive Bayes Variance (10 features): 3.871468417651549e-07
Naive Bayes MAE (10 features): 0.25576445363907496
Naive Bayes Execution Time: 8.66162395477295
Naive Bayes Precision (20 features): 0.9849695525937443
Naive Bayes F1 Score (20 features): 0.7888414911851874
Naive Bayes Variance (20 features): 3.0192939254192053e-07
Naive Bayes MAE (20 features): 0.3330529951624361
Naive Bayes Execution Time: 12.879241466522217


In [None]:
# Check for results before rerunning the code snippet
if not os.path.exists(results_file_name):

  # Random Forest with 100 trees, depth capped at 10 and n_jobs=-1 (all cores).
  # random_state is pinned because every cross_val_score / cross_val_predict
  # call below refits the forest: without a seed the precision, F1 and MAE
  # figures would each come from differently randomised forests and would
  # change from run to run.
  rf_classifier = RandomForestClassifier(n_estimators=100, max_depth=10, n_jobs=-1, random_state=42)

  # Evaluate on the 10-feature subset. Each figure is printed as soon as it is
  # available; only the precision and F1 runs fall inside the timed section.
  start_time = time.time()  # Start measuring execution time
  precision_scores_10 = cross_val_score(rf_classifier, X[selected_features_10], y, cv=num_folds, scoring='precision')
  print("Random Forest Precision (10 features):", np.mean(precision_scores_10))
  f1_scores_10 = cross_val_score(rf_classifier, X[selected_features_10], y, cv=num_folds, scoring='f1')
  print("Random Forest F1 Score (10 features):", np.mean(f1_scores_10))
  elapsed_time_10 = time.time() - start_time  # Calculate execution time
  print("Random Forest Execution Time:", elapsed_time_10)

  # Same measurements on the 20-feature subset
  start_time = time.time()  # Start measuring execution time
  precision_scores_20 = cross_val_score(rf_classifier, X[selected_features_20], y, cv=num_folds, scoring='precision')
  print("Random Forest Precision (20 features):", np.mean(precision_scores_20))
  f1_scores_20 = cross_val_score(rf_classifier, X[selected_features_20], y, cv=num_folds, scoring='f1')
  print("Random Forest F1 Score (20 features):", np.mean(f1_scores_20))
  elapsed_time_20 = time.time() - start_time  # Calculate execution time
  print("Random Forest Execution Time:", elapsed_time_20)

  # Variance of the per-fold precision scores
  variance_10 = np.var(precision_scores_10)
  print("Random Forest Variance (10 features):", variance_10)
  variance_20 = np.var(precision_scores_20)
  print("Random Forest Variance (20 features):", variance_20)

  # Mean absolute error between the labels and the cross-validated predictions
  predictions_10 = cross_val_predict(rf_classifier, X[selected_features_10], y, cv=num_folds)
  mae_10 = mean_absolute_error(y, predictions_10)
  print("Random Forest MAE (10 features):", mae_10)

  predictions_20 = cross_val_predict(rf_classifier, X[selected_features_20], y, cv=num_folds)
  mae_20 = mean_absolute_error(y, predictions_20)
  print("Random Forest MAE (20 features):", mae_20)

  # Result rows: [algorithm, mean precision, mean F1, variance, MAE, elapsed seconds]
  results_10_features.append(["Random Forest", np.mean(precision_scores_10), np.mean(f1_scores_10), variance_10, mae_10, elapsed_time_10])
  results_20_features.append(["Random Forest", np.mean(precision_scores_20), np.mean(f1_scores_20), variance_20, mae_20, elapsed_time_20])

Random Forest Precision (10 features): 0.9645078006289648
Random Forest F1 Score (10 features): 0.9800951847154471
Random Forest Execution Time: 546.3033177852631
Random Forest Precision (20 features): 0.9678073230157805
Random Forest F1 Score (20 features): 0.9831671487583508
Random Forest Execution Time: 757.6577804088593
Random Forest Variance (10 features): 1.9547026574542038e-07
Random Forest Variance (20 features): 6.89779269292472e-07
Random Forest MAE (10 features): 0.03820905440377522
Random Forest MAE (20 features): 0.032241485376989946


In [None]:
import pickle
import os

# Write the collected result rows to the results file, but only when that
# file does not exist yet (an existing file is never overwritten)
if not os.path.exists(results_file_name):
  with open(results_file_name, 'wb') as results_file:
    # Bundle both result lists under the keys the loading cell reads back
    results_dict = dict(
        results_10_features=results_10_features,
        results_20_features=results_20_features,
    )
    pickle.dump(results_dict, results_file)


In [None]:
import os
import pickle

# Google Drive folder that holds the benchmark results
destination_folder = "/content/drive/MyDrive/Kyoto2006+/"

# Full path of the pickle file the results were stored in
results_file_name = os.path.join(destination_folder, "kyoto2015_results.pkl")

# Read the stored dictionary back from disk.
# NOTE: unpickling can execute arbitrary code - only load files you created.
with open(results_file_name, 'rb') as results_file:
    loaded_results = pickle.load(results_file)

# Restore the two lists of result rows from the loaded dictionary
results_10_features = loaded_results['results_10_features']
results_20_features = loaded_results['results_20_features']


In [None]:
from tabulate import tabulate

# Column headers for the two summary tables. The rows appended by the
# evaluation cells hold six fields with the algorithm name first, so each
# header list needs six entries, starting with "Algorithm".
headers_10 = ["Algorithm", "Precision (10 Features)", "F1 Score (10 Features)", "Variance (10 Features)", "MAE (10 Features)", "Execution Time"]
headers_20 = ["Algorithm", "Precision (20 Features)", "F1 Score (20 Features)", "Variance (20 Features)", "MAE (20 Features)", "Execution Time"]

# Print the results in tabular format, one table per feature subset
print(tabulate(results_10_features, headers_10, tablefmt="pretty"))
print(tabulate(results_20_features, headers_20, tablefmt="pretty"))

+---------------+-------------------------+------------------------+------------------------+---------------------+--------------------+
|   Algorithm   | Precision (10 Features) | F1 Score (10 Features) | Variance (10 Features) |  MAE (10 Features)  |   Execution Time   |
+---------------+-------------------------+------------------------+------------------------+---------------------+--------------------+
|   ZeroRule    |   0.9456671745736667    |   0.9720749642355937   | 1.7542188112392676e-11 | 0.0543328254367089  | 3.944232702255249  |
|    OneRule    |   0.9456520870501178    |   0.9456875593971354   | 4.3971115713171974e-08 | 0.10311253241410101 | 3.9662866592407227 |
|  Naive Bayes  |   0.9756340305288976    |   0.8469311043933752   | 3.871468417651549e-07  | 0.25576445363907496 |  8.66162395477295  |
| Random Forest |   0.9645078006289648    |   0.9800951847154471   | 1.9547026574542038e-07 | 0.03820905440377522 | 546.3033177852631  |
+---------------+------------------------