# Holistic nEtwork featuRes Aggregator (HERA)

The user specifies the workspace paths and the program generates the HERA flow files. The user then selects features, which the program extracts into CSV files. Optionally, the resulting dataset can also be labelled using a ground-truth file.

## 1. Imports

Necessary Project Imports: \
**os** for file checking | **subprocess** for running argus | **ipywidgets** for selecting features and showing progress | **contextmanager** for dynamic widgets (showing progress) | **pandas** for dataframes for features | **numpy** for insertion of nan values | **utils** for necessary functions to execute the code

In [None]:
import os
import subprocess

import ipywidgets as widgets
from IPython.display import display, HTML
from contextlib import contextmanager

import pandas as pd
import numpy as np

import csv

import utils as u

## 2. Define Workspace

Define the paths where the pcap, HERA flow (argus) and CSV files are located. They can all be in the same folder or in different ones.

In [None]:
def _ask_for_folder(prompt):
    """Prompt for a folder path and validate the answer.

    Returns a tuple ``(typed_answer, checked_path)`` where ``checked_path`` is
    whatever ``u.check_valid_folder`` returns for the typed answer.
    """
    typed_answer = input(prompt)
    return typed_answer, u.check_valid_folder(typed_answer)


# Folder containing the input pcap captures.
folder_path_pcap, valid_folder_path_pcap = _ask_for_folder('Enter the path where your pcap files are: ')

# Folder for the HERA flow files.
folder_path_hera, valid_folder_path_hera = _ask_for_folder('Enter the path for the HERA flow files: ')

# Folder the CSV files are written to.
folder_path_csv, valid_folder_path_csv = _ask_for_folder('Enter the path where you wish to save your CSV files to: ')

# Name generation: pcap, HERA and CSV file names derived from the pcap folder.
pcap_file_name, hera_file_name, csv_file_name = u.get_file_names(valid_folder_path_pcap)

# Base paths used as string prefixes for the HERA and CSV file names.
# NOTE(review): check_last_digit presumably normalises the trailing path
# separator, since its result is concatenated directly with file names -
# confirm in utils.
hera_path = u.check_last_digit(valid_folder_path_hera)
csv_path = u.check_last_digit(valid_folder_path_csv)

## 3. Generate HERA flow File (skippable if argus file is already generated)

### 3.1. Choose arguments when creating the argus file

In [None]:
# Heading rendered above the selector.
argus_fields_description_label = widgets.HTML('<h4>Fields to generate HERA flow files:</h4>')

# Multi-select list of the argus fields offered by utils, pre-populated with
# the default selection; sized 600px x 150px.
argus_fields_select = widgets.SelectMultiple(
    options=u.argus_fields,
    value=u.default_argus_values,
    disabled=False,
    layout=widgets.Layout(width='600px', height='150px'),
)

display(widgets.VBox([argus_fields_description_label, argus_fields_select]))

### 3.2. Choose flow's intervals in seconds

In [None]:
# Integer input for the flow interval in seconds.
# NOTE(review): `min=0` is kept from the original cell but is not relied on;
# the confirm handler below performs the non-negative check itself.
S_value_textbox = widgets.IntText(
    value=60,
    min=0,  
    description='Value:',
    disabled=False
)

confirm_button = widgets.Button(description="Confirm")
output = widgets.Output()

# Interval passed to argus via -S; stays at 60 until a valid value is confirmed.
S_value = 60

def on_button_clicked(b):
    """Validate the textbox value and, if valid, store it in ``S_value``.

    Args:
        b: The button instance passed by ``on_click`` (unused).

    A negative entry is rejected with a message and leaves the previously
    confirmed ``S_value`` untouched, so an invalid interval can never reach
    the argus command line.
    """
    global S_value
    candidate = S_value_textbox.value
    with output:
        if candidate >= 0:
            # Commit only after validation.
            S_value = candidate
            print("Value for -S:", S_value)
        else:
            print("Please enter a non-negative value.")
        
confirm_button.on_click(on_button_clicked)

argus_flow_interval_description_label = widgets.HTML('<h4>Please select a value in seconds for the flow interval, default value is 60:</h4>')
display(widgets.VBox([argus_flow_interval_description_label, S_value_textbox, confirm_button, output]))

### 3.3. Create the argus files

In [None]:
# Command-line options for argus: the part before ' | ' of every selected entry.
argus_args = [item.split(' | ')[0] for item in argus_fields_select.value]

# Status line shown under the button. It is held under its own name because
# the labelling section of this notebook later defines a function called
# `label`; once that cell has run, the global `label` no longer refers to this
# widget. `label` is still bound here for backwards compatibility.
_status_label = widgets.Text('Waiting for button click.')
label = _status_label
argus_file_button = widgets.Button(description='Create HERA Flow File')

@contextmanager
def show_loading():
    """Show progress in the status widget while the wrapped block runs.

    The status reads 'Running Argus...' during the block and 'Ready!' after a
    successful run. If the block raises, the status reports the failure and
    the exception is re-raised, so the widget is never left on
    'Running Argus...'.
    """
    _status_label.value = 'Running Argus...'
    try:
        yield
    except Exception:
        _status_label.value = 'Argus failed - see the error output.'
        raise
    else:
        _status_label.value = 'Ready!'

def start_hera_file_creation_process():
    """Run argus once per pcap file to produce the matching HERA flow file.

    Every pcap file is attempted even if an earlier one fails.

    Raises:
        RuntimeError: If argus exits with a non-zero status for at least one
            file; the message lists each failing pcap with argus' stderr.
    """
    pcap_path = u.check_last_digit(valid_folder_path_pcap)
    hera_path = u.check_last_digit(valid_folder_path_hera)

    failures = []

    #+ ["-U 64"]
    for pcap_name, hera_name in zip(pcap_file_name, hera_file_name):
        argus_command_file = ["argus", "-S", str(S_value)] + argus_args + ["-r", pcap_path + pcap_name, "-w", hera_path + hera_name]
        completed = subprocess.run(argus_command_file, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        if completed.returncode != 0:
            reason = completed.stderr.decode(errors='replace').strip()
            failures.append(f"{pcap_name} (exit code {completed.returncode}): {reason}")

    if failures:
        raise RuntimeError("argus failed for: " + "; ".join(failures))

def create_hera_file(self):
    """Button callback: create the HERA flow files while showing progress.

    Args:
        self: The button instance passed by ``on_click`` (unused).
    """
    with show_loading():
        start_hera_file_creation_process()
        
argus_file_button.on_click(create_hera_file)

display(argus_file_button)    
display(_status_label)

## 4. Create Dataset

### 4.1. Select Features

Shows the user the full list of features argus can generate (for more information: https://manpages.ubuntu.com/manpages/noble/en/man1/ra.1.html). Hold Ctrl to select multiple features, or use the buttons to select all features, the default features proposed in the thesis, or one of the other preset feature sets (UNSW-NB15, Bot-IoT, CIC 10).
Default Features: 'sport', 'dport', 'saddr', 'daddr', 'proto', 'bytes', 'sbytes', 'dbytes', 'pkts', 'spkts', 'dpkts', 'dur', 'runtime', 'idle', 'flgs', 'tcpopt', 'Ssaddr', 'Sdaddr'. 

Calculated Features: 'Ssaddr' and 'Sdaddr'. \
'Ssaddr' - Nº of connections with the same service and source address; \
'Sdaddr' - Nº of connections with the same service and destination address.

In [None]:
# --- Preset callbacks -------------------------------------------------------
# Each callback replaces the current selection of `feature_selector`; the
# positional arguments supplied by `on_click` are ignored.

def select_all(*args):
    """Select every option offered by the selector."""
    feature_selector.value = feature_selector.options


def select_default(*args):
    """Select the default feature set defined in utils."""
    feature_selector.value = u.default_features


def select_cic_10(*args):
    """Select the CIC 10 feature set defined in utils."""
    feature_selector.value = u.cic_10_features


def select_unswnb15(*args):
    """Select the UNSW-NB15 feature set defined in utils."""
    feature_selector.value = u.unswnb15_features


def select_botiot(*args):
    """Select the Bot-IoT feature set defined in utils."""
    feature_selector.value = u.botiot_features


# --- Selector ---------------------------------------------------------------

feature_selector = widgets.SelectMultiple(options=u.features, disabled=False)
features_description_label = widgets.HTML('<h4>Select features:</h4>')

display(widgets.VBox([features_description_label, feature_selector]))

# --- Preset buttons ---------------------------------------------------------

all_feature_button = widgets.Button(description='Select all')
default_feature_button = widgets.Button(description='Default Features')
cic_10_feature_button = widgets.Button(description='CIC 10 Features')
unswnb15_feature_button = widgets.Button(description='UNSW-NB15')
botiot_feature_button = widgets.Button(description='Bot-IoT')

# Wire every button to its preset callback.
for _preset_button, _preset_callback in (
    (all_feature_button, select_all),
    (default_feature_button, select_default),
    (cic_10_feature_button, select_cic_10),
    (unswnb15_feature_button, select_unswnb15),
    (botiot_feature_button, select_botiot),
):
    _preset_button.on_click(_preset_callback)

# Button rows, shown below the selector.
display(widgets.HBox([all_feature_button, default_feature_button]))
display(widgets.HBox([unswnb15_feature_button, botiot_feature_button]))
display(widgets.HBox([cic_10_feature_button]))

### 4.2. Select Client

In [None]:
# Argus client programs the user can choose between.
client = ['ra', 'racluster']

client_description_label = widgets.HTML('<h4>Select client to use:</h4>')

# Single-choice list, 60px high.
client_selector = widgets.Select(
    options=client,
    disabled=False,
    layout=widgets.Layout(height='60px'),
)

display(widgets.VBox([client_description_label, client_selector]))

### 4.3. Select Management

man - print management records | noman - do not print management records

In [None]:
# Values offered for the client's -M option (see the cell description above).
man = ['noman', 'man']

man_description_label = widgets.HTML('<h4>Select which to use:</h4>')

# Single-choice list, 60px high.
man_selector = widgets.Select(
    options=man,
    disabled=False,
    layout=widgets.Layout(height='60px'),
)

display(widgets.VBox([man_description_label, man_selector]))

### 4.4. Extract Features into a CSV File

In [None]:
# Feature Logic

# Features computed by this notebook instead of being printed directly by the
# argus client, mapped to the pair of client fields each one is counted over.
calculated_features_dict = {'Ssaddr': 'saddr, sport', 'Sdaddr': 'daddr, dport'}
calculated_features = ['Ssaddr', 'Sdaddr']

# Fields that are always requested from the client, in output order.
_requested_fields = 'rank stime ltime proto saddr sport daddr dport'.split()
calculated_features_array = []

for feature in feature_selector.value:
    if feature in calculated_features:
        if feature not in calculated_features_array:
            calculated_features_array.append(feature)
    elif feature not in _requested_fields:
        # Skip fields that are already requested (e.g. 'sport' or 'proto' from
        # a preset): asking for them twice would duplicate columns.
        _requested_fields.append(feature)

# Space-separated field list handed to the client's -s option.
features_string = ' '.join(_requested_fields)
        
#-nn", "-u",- ip",
def calculate_features(valid_hera_path):
    """Build the feature table for one HERA flow file.

    The selected argus client is run several times on ``valid_hera_path``:
    once for the requested fields in ``features_string``, once for the fields
    that make up the flow identifier, and once per entry of
    ``calculated_features_array``.

    Args:
        valid_hera_path: Path of the flow file passed to the client via ``-r``.

    Returns:
        A DataFrame with a leading ``FlowID`` column, one column per field
        printed by the client (empty cells become NaN) and one count column
        per calculated feature.
    """

    def run_client(fields):
        """Run the client for ``fields`` and return its stripped stdout split into lines."""
        completed = subprocess.run(
            [client_selector.value, "-n", "-M", man_selector.value, "-u", "-r", valid_hera_path, "-c", ",", "-s", fields],
            capture_output=True,
            text=True,
        )
        return completed.stdout.strip().split('\n')

    # Regular features: the first output line holds the column titles.
    header_line, *record_lines = run_client(features_string)
    table_rows = []
    for record in record_lines:
        cells = []
        for raw_cell in record.split(','):
            cleaned = raw_cell.strip()
            cells.append(cleaned if cleaned else np.nan)
        table_rows.append(cells)
    features_df = pd.DataFrame(table_rows, columns=header_line.split(','))

    # Flow identifier: the five fields joined with '-', empty ones spelled "nan".
    _, *id_lines = run_client("daddr, saddr, dport, sport, proto")
    flow_ids = []
    for id_line in id_lines:
        parts = [piece.strip() or "nan" for piece in id_line.split(',')]
        flow_ids.append('-'.join(parts))
    features_df.insert(0, 'FlowID', flow_ids)

    def add_pair_count(feature_name):
        """Add ``feature_name``: for each row, the number of rows with the same field pair."""
        title_line, *pair_lines = run_client(calculated_features_dict[feature_name])
        titles = title_line.split(',')

        pairs = []
        for pair_line in pair_lines:
            # Exactly two fields are expected per line.
            first, second = pair_line.split(',')
            pairs.append([first, second])
        pair_df = pd.DataFrame(pairs, columns=titles)

        first_col = pair_df.columns[0]
        second_col = pair_df.columns[1]
        counts = pair_df.groupby([first_col, second_col])[first_col].transform('count')
        features_df[feature_name] = counts.astype(pd.Int64Dtype())

    # Calculated features requested by the user.
    for feat in calculated_features_array:
        add_pair_count(feat)

    return features_df

# Extract the features of every HERA flow file into its own CSV, skipping
# files whose CSV already exists.
for i, file in enumerate(hera_file_name):
    target_csv = csv_path + client_selector.value + '_' + csv_file_name[i]

    if os.path.exists(target_csv):
        print(f"The file '{target_csv}' already exists.")
        continue

    f = calculate_features(hera_path + file)
    f.to_csv(target_csv, index=False)
    print(f"Features extracted to CSV file: {target_csv}")

    # Store the racount output for the same flow file next to the CSV.
    racount_report = csv_path + "racount_" + csv_file_name[i] + ".txt"
    with open(racount_report, "w") as output_file:
        subprocess.run(["racount", "-r", os.path.join(hera_path, file), "-M", "addr", "proto"], stdout=output_file)

## 5. Label Dataset

### 5.1. Input Ground Truth

#### Expected ground-truth CSV columns: StartTime, LastTime, SrcAddr, Sport, DstAddr, Dport, Proto, Label

In [None]:
# Path of the ground-truth CSV; it is read with pandas in the following cell.
ground_truth_file = input('Enter the ground truth file: ')

In [None]:
# Ground truth; ports are kept as text so they compare equal to the CSV ports.
gt = pd.read_csv(ground_truth_file, dtype={'Sport': 'object', 'Dport': 'object'}, low_memory=False)

_ground_truth_abspath = os.path.abspath(ground_truth_file)

# Feature CSVs to label. Two kinds of .csv files are left out:
#   * '*_labelled.csv' - outputs of a previous run of this section, which
#     would otherwise be labelled again on every rerun;
#   * the ground-truth file itself, in case it is stored in the same folder.
csv_dir_files = os.listdir(valid_folder_path_csv)
csv_files = [
    file
    for file in csv_dir_files
    if os.path.splitext(os.path.basename(file))[1] == ".csv"
    and not file.endswith('_labelled.csv')
    and os.path.abspath(os.path.join(valid_folder_path_csv, file)) != _ground_truth_abspath
]

def label(data, gt):
    """Attach a ``GTLabel`` column to ``data`` based on the ground truth ``gt``.

    A flow matches a ground-truth row when its StartTime is not before the
    row's StartTime, its LastTime is not after the row's LastTime, and
    SrcAddr, DstAddr, Sport, Dport and Proto are equal. Any ground-truth field
    that is missing (NaN) acts as a wildcard. Rows are applied in order, so a
    later matching row overrides an earlier one; flows matched by no row are
    labelled "Benign".

    Args:
        data: Flow DataFrame with StartTime, LastTime, SrcAddr, DstAddr,
            Sport, Dport and Proto columns. It is modified in place: the two
            time columns are rounded to integers and ``GTLabel`` is added.
        gt: Ground-truth DataFrame with the same columns plus ``Label``.
            It is not modified.

    Returns:
        ``data`` itself, with the ``GTLabel`` column filled in.
    """
    data['StartTime'] = data['StartTime'].round().astype(int)
    data['LastTime'] = data['LastTime'].round().astype(int)

    # Work on a copy so the caller's ground truth is left untouched.
    gt = gt.copy()

    # Ports are compared as text, but a missing port must stay missing:
    # a blanket astype(str) would turn NaN into the string 'nan' and the
    # wildcard check below would never trigger.
    for port_column in ('Sport', 'Dport'):
        gt[port_column] = gt[port_column].map(
            lambda value: value if pd.isna(value) else str(value)
        )

    # Nullable integers keep missing times as <NA>; a plain int cast raises on NaN.
    for time_column in ('StartTime', 'LastTime'):
        gt[time_column] = gt[time_column].round().astype('Int64')

    latest_start = data['StartTime'].max()
    earliest_end = data['LastTime'].min()

    # Pre-filter: a row can only match some flow if it starts no later than
    # the latest flow start and ends no earlier than the earliest flow end.
    # A missing bound imposes no restriction.
    starts_early_enough = (gt['StartTime'] <= latest_start).fillna(True).astype(bool)
    ends_late_enough = (gt['LastTime'] >= earliest_end).fillna(True).astype(bool)
    gt_filtered = gt[starts_early_enough & ends_late_enough]

    print("Full: ", len(gt))
    print("Used: ", len(gt_filtered))

    data['GTLabel'] = "Labelling"

    for row in gt_filtered.itertuples(index=False, name='GT'):
        # Start from "every flow matches" so the mask is always a Series,
        # even for a row whose fields are all missing.
        matches = pd.Series(True, index=data.index)

        if not pd.isna(row.StartTime):
            matches &= data['StartTime'] >= row.StartTime
        if not pd.isna(row.LastTime):
            matches &= data['LastTime'] <= row.LastTime

        for column in ('SrcAddr', 'DstAddr', 'Sport', 'Dport', 'Proto'):
            expected = getattr(row, column)
            if not pd.isna(expected):
                matches &= data[column] == expected

        data.loc[matches, 'GTLabel'] = row.Label

    # Whatever is still carrying the placeholder was matched by no row.
    data.loc[data['GTLabel'] == "Labelling", 'GTLabel'] = "Benign"

    return data

# Label every feature CSV and write the labelled table plus a per-label count
# summary next to it.
for file in csv_files:
    data = pd.read_csv(csv_path + file, dtype={'Sport': 'object', 'Dport': 'object'}, low_memory=False)
    labelled_data = label(data, gt)

    data['GTLabel'] = labelled_data['GTLabel']

    # Shared prefix of both outputs: <csv folder>/<input name without extension>_labelled
    output_base = csv_path + os.path.splitext(os.path.basename(file))[0] + '_labelled'
    data.to_csv(output_base + '.csv', index=False)

    stats = labelled_data['GTLabel'].value_counts()
    with open(output_base + ".txt", "w") as output_file:
        output_file.write(stats.to_string())

    print(file)