In [1]:
import pandas as pd
import time
import plotly.graph_objects as go
import plotly.express as px

from pathlib import Path
from itertools import combinations
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from plotly.subplots import make_subplots
from copy import deepcopy

from utils.data_processing import clean_data, load_data

In [2]:
palette = px.colors.qualitative.Plotly

In [3]:

path=Path("./data")
if not path.joinpath("processed_data.csv").exists():
    clean_data(path.joinpath("cybersecurity_attacks.csv"))
cleaned_data = load_data(path.joinpath("processed_data.csv"))

In [4]:

relaunch = ""
while relaunch.lower() not in ["y","n"]:
    relaunch = input("Do you want to relaunch the analysis to find good combinations? (y/n) ")
if relaunch.lower() == "y":
    col_combinations = [col for col in cleaned_data.columns if col != "Attack Type"]
    col_combinations = list(combinations(col_combinations,3))

    nb_rows=cleaned_data.shape[0]
    good_combinations = []
    start_time = time.monotonic()
    for i,combination in enumerate(col_combinations):
        columns = [col for col in combination]+["Attack Type"]
        temp_data = cleaned_data[columns]
        temp_data = temp_data.pivot_table(
            index=combination,  
            columns="Attack Type",
            aggfunc='size',
            fill_value=0
        )
        temp_data["Total"]= temp_data.sum(axis=1)
        percentage_data = temp_data[["DDoS","Malware", "Intrusion"]].div(temp_data["Total"], axis=0)
        percentage_data = percentage_data.max(axis=1) - percentage_data.min(axis=1)
        stats = percentage_data.describe()
        if stats.loc["25%"] > 0.02 and stats.loc["75%"] < 0.99:
            print(f"Found good {combination}")
            output = [feature for feature in combination]
            output.extend([f"%.4f" % stats.loc[x] for x in stats.index])
            good_combinations.append(output)
        if i%100==0 and i!=0:
            print(f"Time since start for {i:d} combinations: {time.monotonic() - start_time:.2f} seconds")
    time.monotonic() - start_time
    good_combinations_df = pd.DataFrame(good_combinations, columns=["Feature 1", "Feature 2", "Feature 3", 'count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max'])
    good_combinations_df.to_csv(path.joinpath("good_combinations.csv"), index=False)
else:
    good_combinations_df = pd.read_csv(path.joinpath("good_combinations.csv"))

In [5]:
combination = good_combinations_df.iloc[0,0:3].tolist()
columns = deepcopy(combination)
columns.append("Attack Type")

temp_data = cleaned_data[columns]
temp_data = temp_data.pivot_table(
    index=combination,  
    columns="Attack Type",
    aggfunc='size',
    fill_value=0
)
temp_data["Total"]= temp_data.sum(axis=1)
# Get unique combinations of first 3 features
unique_combos = temp_data.index.unique()
attack_types = ["DDoS", "Malware", "Intrusion"]

# Create subplots - one for each (Traffic Type, Protocol, Packet Type) combination
num_combos = len(unique_combos)
nb_cols = 9
nb_rows = (num_combos + nb_cols -1) // nb_cols  # 3 columns per row

fig = make_subplots(
    rows=nb_rows,
    cols=nb_cols,
    subplot_titles=[f"{c[0]} | {c[1]} | {c[2]}" for c in unique_combos],
    horizontal_spacing=0.05,
    vertical_spacing=0.05,
)

for idx, combo in enumerate(unique_combos):
    row = (idx // nb_cols) + 1
    col = (idx % nb_cols) + 1
    for i, attack_type in enumerate(attack_types):
        fig.add_trace(
            go.Bar(
                x=[attack_type],
                y=[temp_data.loc[combo, attack_type]],
                name=attack_type,
                marker_color=palette[i % len(palette)],
                legendgroup=attack_type,
                showlegend=False
            ),
            row=row, col=col
        )
        
    fig.update_xaxes(title_text="Combo", row=row, col=col, tickangle=-45)
    fig.update_yaxes(title_text="Count", row=row, col=col)

fig.update_layout(
    height=400 * nb_rows,
    width=1600,
    title={
        'text': f"Combinations for each ({', '.join(combination)})",
        'y': 0.98,
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top'
    },
    barmode='group',
    showlegend=True
)

fig.show()

In [6]:
temp_data["Total"]= temp_data.sum(axis=1)
# Calculate percentages for each tuple (row-wise)
for attack_type in attack_types:
    temp_data[f"{attack_type} %"] = (temp_data[attack_type] / temp_data['Total'] * 100).round(2)


# Calculate difference between max and min percentage for each tuple (row-wise)
percentage_cols = [f"{attack_type} %" for attack_type in attack_types]
temp_data['Max-Min Diff %'] = (temp_data[percentage_cols].max(axis=1) - temp_data[percentage_cols].min(axis=1)).round(2)

# Reorder columns to show count and percentage together
new_order = []
for attack_type in attack_types:
    new_order.append(attack_type)
    new_order.append(f"{attack_type} %")
new_order.extend(['Total', 'Max-Min Diff %'])
temp_data = temp_data[new_order]
temp_data.describe()

Attack Type,DDoS,DDoS %,Malware,Malware %,Intrusion,Intrusion %,Total,Max-Min Diff %
count,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0
mean,186.5,16.785139,184.819444,16.642639,184.236111,16.572361,1111.111111,2.0525
std,23.975927,1.069657,22.762909,0.961923,24.023752,0.971761,124.292653,1.143762
min,132.0,14.18,136.0,13.95,128.0,13.45,836.0,0.08
25%,173.0,16.06,169.0,15.865,169.0,15.91,1045.0,1.3575
50%,191.0,16.885,186.0,16.725,188.0,16.64,1145.0,1.78
75%,203.0,17.3475,203.25,17.275,199.25,17.2025,1202.5,2.8125
max,236.0,19.38,241.0,19.44,234.0,18.54,1270.0,5.93


In [7]:
columns = ["IsWeekend", "Hour", "Attack Type"]
combination = ["IsWeekend", "Hour"]
temp_data = cleaned_data[columns]
temp_data = temp_data.pivot_table(
    index=combination,  
    columns="Attack Type",
    aggfunc='size',
    fill_value=0
)

In [8]:
# Get unique combinations of first 3 features
unique_combos = temp_data.index.unique()
attack_types = ["DDoS", "Malware", "Intrusion"]

# Create subplots - one for each (Traffic Type, Protocol, Packet Type) combination
num_combos = len(unique_combos)
nb_cols = 8
nb_rows = (num_combos + nb_cols -1) // nb_cols  # 3 columns per row

fig = make_subplots(
    rows=nb_rows,
    cols=nb_cols,
    subplot_titles=[f"{c[0]} | {c[1]}" for c in unique_combos],
    vertical_spacing=0.1,
    horizontal_spacing=0.05,
)

for idx, combo in enumerate(unique_combos):
    row = (idx // nb_cols) + 1
    col = (idx % nb_cols) + 1
    for i, attack_type in enumerate(attack_types):
        fig.add_trace(
            go.Bar(
                x=[attack_type],
                y=[temp_data.loc[combo, attack_type]],
                name=attack_type,
                marker_color=palette[i % len(palette)],
                legendgroup=attack_type,
                showlegend=False
            ),
            row=row, col=col
        )
        
    fig.update_xaxes(title_text="Combo", row=row, col=col, tickangle=-45)
    fig.update_yaxes(title_text="Count", row=row, col=col)

fig.update_layout(
    height=200 * nb_rows,
    width=1600,
    title={
        'text': f"Combinations for each ({', '.join(combination)})",
        'y': 0.98,
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top'
    },
    barmode='group',
    showlegend=True
)

fig.show()

In [9]:
temp_data["Total"]= temp_data.sum(axis=1)
# Calculate percentages for each tuple (row-wise)
for attack_type in attack_types:
    temp_data[f"{attack_type} %"] = (temp_data[attack_type] / temp_data['Total'] * 100).round(2)


# Calculate difference between max and min percentage for each tuple (row-wise)
percentage_cols = [f"{attack_type} %" for attack_type in attack_types]
temp_data['Max-Min Diff %'] = (temp_data[percentage_cols].max(axis=1) - temp_data[percentage_cols].min(axis=1)).round(2)

# Reorder columns to show count and percentage together
new_order = []
for attack_type in attack_types:
    new_order.append(attack_type)
    new_order.append(f"{attack_type} %")
new_order.extend(['Total', 'Max-Min Diff %'])
temp_data = temp_data[new_order]
temp_data.describe()

Attack Type,DDoS,DDoS %,Malware,Malware %,Intrusion,Intrusion %,Total,Max-Min Diff %
count,48.0,48.0,48.0,48.0,48.0,48.0,48.0,48.0
mean,279.75,33.38875,277.229167,33.325417,276.354167,33.285833,833.333333,3.541458
std,125.358804,1.891103,120.418992,1.585159,119.044152,1.700051,362.96101,1.977164
min,129.0,28.54,142.0,29.77,140.0,30.17,444.0,0.42
25%,159.75,32.1375,157.5,32.535,162.25,32.075,473.75,2.1275
50%,275.5,33.435,272.0,33.02,269.0,33.41,825.5,2.985
75%,401.25,34.6675,395.75,34.2025,393.75,34.4125,1197.25,4.795
max,442.0,37.55,429.0,37.68,436.0,36.77,1254.0,9.07


In [10]:
main_features = good_combinations_df.iloc[:,0:2].value_counts()

From this we can see a pattern: datetime information seems to give a good idea of how input data can be separated. The top combinations all include one part of a datetime in the first two features. <br>
We will use these features as combinations of their own as they are similar features. <br>
We then remove this features from the data and redo to filter to see if we can find other combinations.

In [11]:
relaunch = ""
while relaunch.lower() not in ["y","n"]:
    relaunch = input("Do you want to relaunch the analysis to find good combinations without date features? (y/n) ")
if relaunch.lower() == "y":
    data_wh_date = cleaned_data.copy().drop(columns=["Day", "Month", "Year","Hour","Minute", "Second","IsWeekend","DayOfWeek"])
    col_combinations = [col for col in data_wh_date.columns if col != "Attack Type"]
    col_combinations = list(combinations(col_combinations,3))

    nb_rows=data_wh_date.shape[0]
    good_combinations = []
    start_time = time.monotonic()
    for i,combination in enumerate(col_combinations):
        columns = [col for col in combination]+["Attack Type"]
        temp_data = data_wh_date[columns]
        temp_data = temp_data.pivot_table(
            index=combination,  
            columns="Attack Type",
            aggfunc='size',
            fill_value=0
        )
        temp_data["Total"]= temp_data.sum(axis=1)
        percentage_data = temp_data[["DDoS","Malware", "Intrusion"]].div(temp_data["Total"], axis=0)
        percentage_data = percentage_data.max(axis=1) - percentage_data.min(axis=1)
        stats = percentage_data.describe()
        if stats.loc["25%"] > 0.02 and stats.loc["75%"] < 0.99:
            print(f"Found good {combination}")
            output = [feature for feature in combination]
            output.extend([f"%.4f" % stats.loc[x] for x in stats.index])
            good_combinations.append(output)
        if i%100==0 and i!=0:
            print(f"Time since start for {i:d} combinations: {time.monotonic() - start_time:.2f} seconds")
    time.monotonic() - start_time
    combinations_wh_date_df = pd.DataFrame(good_combinations, columns=["Feature 1", "Feature 2", "Feature 3", 'count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max'])
    combinations_wh_date_df.to_csv(path.joinpath("combinations_wh_date.csv"), index=False)
else:
    combinations_wh_date_df = pd.read_csv(path.joinpath("combinations_wh_date.csv"))

In [12]:
main_features = combinations_wh_date_df.iloc[:,0:2].value_counts()

Same idea with Browser and device information.

In [13]:
relaunch = ""
while relaunch.lower() not in ["y","n"]:
    relaunch = input("Do you want to relaunch the analysis to find good combinations without device features? (y/n) ")
if relaunch.lower() == "y":
    data_wh_device = cleaned_data.copy().drop(columns=["Day", "Month", "Year","Hour","Minute", "Second","IsWeekend","DayOfWeek",
                                                     "Browser Name", "Browser Version", "Browser Minor","Browser Patch", "Browser Patch Minor",
                                                     "OS Name", "OS Version", "OS Version Minor", "OS Version Patch", "OS Version Patch Minor",
                                                     "Device Brand", "Device Model", "Device Type"])
    col_combinations = [col for col in data_wh_device.columns if col != "Attack Type"]
    col_combinations = list(combinations(col_combinations,3))

    nb_rows=data_wh_device.shape[0]
    good_combinations = []
    start_time = time.monotonic()
    for i,combination in enumerate(col_combinations):
        columns = [col for col in combination]+["Attack Type"]
        temp_data = data_wh_device[columns]
        temp_data = temp_data.pivot_table(
            index=combination,  
            columns="Attack Type",
            aggfunc='size',
            fill_value=0
        )
        temp_data["Total"]= temp_data.sum(axis=1)
        percentage_data = temp_data[["DDoS","Malware", "Intrusion"]].div(temp_data["Total"], axis=0)
        percentage_data = percentage_data.max(axis=1) - percentage_data.min(axis=1)
        stats = percentage_data.describe()
        if stats.loc["25%"] > 0.02 and stats.loc["75%"] < 0.99:
            print(f"Found good {combination}")
            output = [feature for feature in combination]
            output.extend([f"%.4f" % stats.loc[x] for x in stats.index])
            good_combinations.append(output)
        if i%100==0 and i!=0:
            print(f"Time since start for {i:d} combinations: {time.monotonic() - start_time:.2f} seconds")
    time.monotonic() - start_time
    combinations_wh_device_df = pd.DataFrame(good_combinations, columns=["Feature 1", "Feature 2", "Feature 3", 'count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max'])
    combinations_wh_device_df.to_csv(path.joinpath("combinations_wh_device.csv"), index=False)
else:
    combinations_wh_device_df = pd.read_csv(path.joinpath("combinations_wh_device.csv"))

In [14]:
main_features = combinations_wh_device_df.iloc[:,0:2].value_counts()

At first look, Packet Type seems to be a good feature to use in combination

In [15]:
unique_features = main_features.reset_index()[["Feature 1","Feature 2"]].stack().unique()
unique_features


<StringArray>
[             'Protocol',              'Packet_T',           'Packet Type',
      'Attack Signature',          'Action Taken',        'Severity Level',
       'Network Segment',         'Firewall Logs',        'IDS/IPS Alerts',
            'Log Source',            'Packet Bin',      'Global Source IP',
        'Anomaly Scores',     'Geo-location Data', 'Global Destination IP']
Length: 18, dtype: str

Geo location must be modified to represent city and state <br>

In [19]:
missing_features = set(cleaned_data.columns) - set(unique_features) - set(["Day", "Month", "Year","Hour","Minute", "Second","IsWeekend","DayOfWeek",
                                                                         "Browser Name", "Browser Version", "Browser Minor","Browser Patch", "Browser Patch Minor",
                                                                         "OS Name", "OS Version", "OS Version Minor", "OS Version Patch", "OS Version Patch Minor",
                                                                         "Device Brand", "Device Model", "Device Type", "Attack Type"])
missing_features

{'Destination Port',
 'Int Destination IP',
 'Int Source IP',
 'Is Proxy',
 'Source Port'}

In [None]:
IP-related features do not appear to provide much information about the attack type