In [3]:
import sys
sys.path.append("..")

import numpy as np
import pandas as pd 
import re
import os, glob
import copy


from utils.dataset_processing.dataset_processing import DatasetPreprocess
from utils.dataset_processing.pcap_to_flow import Pcap2Flow
from utils.dataset_processing.feature_selection import FeatureSelection

In [None]:
######################### Setup Paths, Parameters, Objects ##########################
#####################################################################################


############################# Setup Parameters #############################
# number of packets extracted within a flow; also embedded in the output file names
packet_count = 8


################################ Setup Paths ###############################
def extract_dataset_day(pcap_path, marker="-Working"):
    """Return the capture-day prefix of a pcap file name.

    Only the file name (not the directories in front of it) is inspected, so
    the result is independent of where the captures are stored and of the path
    separator that glob returns on the current platform.

    Args:
        pcap_path: Path to a capture named like ``<day>-WorkingHours.pcap``.
        marker: Text that terminates the day prefix inside the file name.

    Returns:
        The part of the file name that precedes the first ``marker``.

    Raises:
        ValueError: If the file name does not contain ``marker``.
    """
    file_name = os.path.basename(pcap_path)
    day, found, _ = file_name.partition(marker)
    if not found:
        raise ValueError(f"cannot derive dataset day from pcap file name: {file_name!r}")
    return day


# find all pcap paths; glob returns matches in arbitrary order, so sort them to keep
# the three parallel path lists below identical from run to run
dataset_path = "../../../datasets/pcaps"
pcap_path_list = sorted(glob.glob(os.path.join(dataset_path, "*-WorkingHours.pcap")))

# setup path list for flow-based datasets
raw_flow_path_list = []
raw_flow_base_path = "../../dataset/raw_flow_datasets/"
flow_preprocessed_path_list = []
# NOTE: "singal_datasets" (sic) is left unchanged so the output location stays the same
flow_preprocessed_single_base_path = "../../dataset/flow_preprocessed_datasets/singal_datasets/"
flow_preprocessed_merged_path = "../../dataset/flow_preprocessed_datasets/merged_datasets/" + f"flow_preprocessed_merged_{packet_count}.csv"
# swap the extension-less stem instead of slicing a fixed number of characters
flow_preprocessed_merged_balanced_path = os.path.splitext(flow_preprocessed_merged_path)[0] + "_balanced.csv"

# one raw-flow path and one preprocessed path per pcap, index-aligned with pcap_path_list
for pcap_path in pcap_path_list:
    dataset_day = extract_dataset_day(pcap_path)
    raw_flow_path = raw_flow_base_path + f"{dataset_day}_raw_flow_{packet_count}.csv"
    raw_flow_path_list.append(raw_flow_path)
    flow_preprocessed_path = flow_preprocessed_single_base_path + f"{dataset_day}_flow_preprocessed_{packet_count}.csv"
    flow_preprocessed_path_list.append(flow_preprocessed_path)
    

############################ Initialize Objects ############################
# Helper objects used by the later cells; the constructors run left to right.
pcap2Flow, dataProc, featureSelection = (
    Pcap2Flow(),
    DatasetPreprocess(),
    FeatureSelection(),
)

In [None]:
########################### Dataset Preprocessing ##########################
############################################################################

############################# Unused Features ##############################
# identification features, grouped as: flow ids / source side / destination side / link info
features_identificaiton = [
    'id', 'expiration_id',
    'src_ip', 'src_mac', 'src_oui', 'src_port',
    'dst_ip', 'dst_mac', 'dst_oui', 'dst_port',
    'ip_version', 'vlan_id', 'tunnel_id',
]

# per-direction 'mean' features ('bidirectional_mean_ps' is deliberately not listed)
features_mean = [
    'src2dst_mean_ps',
    'dst2src_mean_ps',
]

# every feature dropped by exact name: identification features first, then the means
features_drop_sum = [*features_identificaiton, *features_mean]

# wildcards handed to the preprocessing step as drop_features_wildcards
# (presumably matched against parts of column names -- confirm in DatasetPreprocess)
feature_wildcards = [
    'ms',
    'protocol',
    'stddev',
]


############### Aggregation, Preprocessing (single dataset) ################
# aggregate each packet-based dataset (.pcap) into a flow-based dataset (.csv);
# the two lists are index-aligned, so they are walked in lockstep
for pcap_file, raw_flow_file in zip(pcap_path_list, raw_flow_path_list):
    pcap2Flow.to_flow(
        pcap_path=pcap_file,
        save_path=raw_flow_file,
        flow_extract_type='sub-flow',
        limit=packet_count,
    )

# label flow entries and remove unused features, one raw flow file at a time
for raw_flow_file, preprocessed_file in zip(raw_flow_path_list, flow_preprocessed_path_list):
    dataProc.preprocess(
        dataset_path=raw_flow_file,
        save_path=preprocessed_file,
        packet_count=packet_count,
        drop_features=features_drop_sum,
        drop_features_wildcards=feature_wildcards,
    )

############################### Merge Datasets ##############################
# combine all preprocessed per-capture datasets into the single merged file
dataProc.merge(
    dataset_path_list=flow_preprocessed_path_list,
    save_path=flow_preprocessed_merged_path,
)

In [None]:
############################# Dataset Balancing ############################
############################################################################
# Balance the merged dataset: the merged file is the input, the "_balanced" path is
# passed as save_path, and whatever balance_dataset returns is kept in df_balanced.
df_balanced = dataProc.balance_dataset(
    dataset_path=flow_preprocessed_merged_path,
    save_path=flow_preprocessed_merged_balanced_path,
)


In [None]:
############################# Feature Selection #############################
#############################################################################
# paths
feature_importance_path = "../base_files/feature_importances.csv"
logging_path = "../../logs/computation_feature_importance_logger.log"
# Derive the logged description from packet_count so the log entry cannot drift from
# the dataset that is actually processed (identical text for packet_count = 8).
logging_info = f"Dataset: {packet_count} pkts of sub-flow, balanced"


# compute feature importances on the balanced dataset; repeat_time=5 is forwarded
# unchanged to FeatureSelection.compute_feature_importances
featureSelection.compute_feature_importances(
    dataset_path=flow_preprocessed_merged_balanced_path,
    feature_scores_save_path=feature_importance_path,
    logging_path=logging_path,
    logging_info=logging_info,
    repeat_time=5,
)

In [8]:
############################### Refine Dataset ##############################
#############################################################################
# number of features passed to refine_dataset as relevant_features_num
feature_num = 20
# re-declared here so this cell does not depend on the feature-selection cell having run
feature_importance_path = "../base_files/feature_importances.csv"

# drop the trailing ".csv" from the balanced path and tag the file with the feature count
balanced_stem = flow_preprocessed_merged_balanced_path[:-len(".csv")]
dataset_refined_save_path = balanced_stem + f"_feature_num_{feature_num}.csv"

# refine the balanced dataset using the stored feature scores
featureSelection.refine_dataset(
    feature_scores_path=feature_importance_path,
    dataset_path=flow_preprocessed_merged_balanced_path,
    dataset_relevant_save_path=dataset_refined_save_path,
    relevant_features_num=feature_num,
)
