In [1]:
from pathlib import Path
import pandas as pd
import numpy as np

import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.figure_factory as ff


from utils.data_processing import load_data, raw_columns, full_dtypes, transform_datetime, df_ua_parser, transform_ipinfo, transform_packetinfo, transform_proxyinfo

In [2]:
RANDOM_STATE = 124
data_path = Path("./data")
raw_csv = data_path / "cybersecurity_attacks.csv"
processed_csv = data_path / "first_ml_processing.csv"

if not processed_csv.exists():
    # No cached feature table yet: rebuild it from the raw export and cache it.
    # Only the dtypes that belong to raw columns are handed to load_data.
    dtypes = {
        col: col_type
        for col, col_type in full_dtypes.items()
        if col in raw_columns
    }
    raw_data = load_data(raw_csv, dtype=dtypes)

    # Each transform_* / parser helper returns the derived columns listed next to it.
    datetime_cols = [
        "Year", "Month", "Day", "Hour", "Minute", "Second", "DayOfWeek", "IsWeekend",
    ]
    raw_data[datetime_cols] = transform_datetime(raw_data["Timestamp"])

    device_cols = [
        "String", "Browser Name", "Browser Version", "Browser Minor", "Browser Patch",
        "Browser Patch Minor", "OS Name", "OS Version", "OS Version Minor",
        "OS Version Patch", "OS Version Patch Minor", "Device Brand", "Device Model",
        "Device Type",
    ]
    raw_data[device_cols] = df_ua_parser(raw_data["Device Information"])

    proxy_cols = ["Is Proxy"]
    raw_data[proxy_cols] = transform_proxyinfo(raw_data["Proxy Information"])

    ip_cols = ["Int Source IP", "Int Destination IP", "Global Source IP", "Global Destination IP"]
    raw_data[ip_cols] = transform_ipinfo(raw_data[["Source IP Address", "Destination IP Address"]])

    packet_cols = ["Packet Bin"]
    raw_data[packet_cols] = transform_packetinfo(raw_data["Packet Length"], scale=False)

    # The source columns that were expanded above (plus the payload) are dropped
    # from the modelling table; raw_data itself keeps them.
    processed_data = raw_data.drop(
        columns=[
            "Payload Data", "Timestamp", "String", "Device Information",
            "Proxy Information", "Source IP Address", "Destination IP Address",
        ]
    )
    processed_data.to_csv(processed_csv, index=False)
else:
    # Cache hit: reuse the stored feature table instead of recomputing it.
    processed_data = pd.read_csv(processed_csv)

    # NOTE(review): in this branch raw_data is read with plain pd.read_csv (no
    # dtype mapping, no load_data) and only receives the IP-derived columns, so
    # it differs from the raw_data produced by the rebuild branch. Confirm that
    # later cells do not depend on the other derived columns of raw_data.
    raw_data = pd.read_csv(raw_csv)
    ip_cols = ["Int Source IP", "Int Destination IP", "Global Source IP", "Global Destination IP"]
    raw_data[ip_cols] = transform_ipinfo(raw_data[["Source IP Address", "Destination IP Address"]])

In [3]:
# Untransformed view: packet length against anomaly score, one colour per attack type.
scatter_cols = ["Packet Length", "Anomaly Scores", "Attack Type"]
temp_data = processed_data[scatter_cols]
fig = px.scatter(
    temp_data,
    x="Packet Length",
    y="Anomaly Scores",
    color="Attack Type",
    title="Packet Length vs Anomaly Score",
)
fig.show()

The scatter plot above shows the relationship between packet length and anomaly scores, colored by attack type. <br>
We cannot observe any clear pattern or cluster in the data.

In [4]:
# Same scatter as the previous cell, but with both axes raised to a power
# (Packet Length ** 4, Anomaly Scores ** 7) to spread out the larger values.
fig = make_subplots(rows=1, cols=1)

# .copy() makes temp_data an independent frame. Assigning into a bare column
# slice of processed_data is chained assignment and triggers pandas'
# SettingWithCopyWarning.
temp_data = processed_data[["Packet Length", "Anomaly Scores", "Attack Type"]].copy()

# Cast to float64 before exponentiating. NOTE(review): the column dtypes are not
# visible from this cell; if either column is stored as a narrow integer type,
# the power would silently wrap around instead of raising.
temp_data["Packet Length"] = temp_data["Packet Length"].astype("float64") ** 4
temp_data["Anomaly Scores"] = temp_data["Anomaly Scores"].astype("float64") ** 7

# One trace per attack type so that each type gets its own legend entry.
for attack_type in temp_data["Attack Type"].unique():
    attack_data = temp_data[temp_data["Attack Type"] == attack_type]
    fig.add_trace(
        go.Scatter(
            x=attack_data["Packet Length"],
            y=attack_data["Anomaly Scores"],
            mode="markers",
            name=attack_type,
        )
    )

fig.update_layout(title="Packet Length vs Anomaly Score (Transformed)",
                  width=800, height=600)
fig.show()

Most of the data is still concentrated in the lower part of the graph, and the observations that are not close to that dense region do not seem to be clustered by attack type.

In [5]:
def create_displot(data, group_labels, bin_size=25, show_hist=False, show_rug=False):
    """Build a plotly distplot figure for several groups of values.

    Thin wrapper around ``ff.create_distplot``. The keyword defaults reproduce
    the settings this notebook uses (histogram and rug hidden).

    Args:
        data: Sequence of 1-D value collections, one per group.
        group_labels: One label per entry of ``data``, in the same order.
        bin_size: Forwarded as ``bin_size`` to ``ff.create_distplot``.
        show_hist: Forwarded as ``show_hist``; ``False`` hides the histogram bars.
        show_rug: Forwarded as ``show_rug``; ``False`` hides the rug plot.

    Returns:
        The figure object returned by ``ff.create_distplot``.
    """
    return ff.create_distplot(
        data,
        group_labels=group_labels,
        bin_size=bin_size,
        show_hist=show_hist,
        show_rug=show_rug,
    )

# Keep only the numeric columns that contain no missing value at all, then put
# the attack label back so the values can be grouped by attack type.
num_cols = processed_data.select_dtypes(include="number").columns.tolist()
temp_data = (
    processed_data[num_cols]
    .dropna(axis="columns", how="any")
    .assign(**{"Attack Type": processed_data["Attack Type"]})
)

In [6]:
# Grid of subplots, one per numeric feature, each holding the distplot traces of
# that feature split by attack type.
num_cols = temp_data.select_dtypes(include="number").columns.tolist()
nb_cols = 4
nb_rows = int(np.ceil(len(num_cols) / nb_cols))
fig = make_subplots(rows=nb_rows, cols=nb_cols)

# The attack types are the same for every feature, so compute them once
# instead of calling .unique() again on every iteration.
attack_types = temp_data["Attack Type"].unique()

for i, col in enumerate(num_cols):
    # Fill the grid row by row; plotly subplot coordinates are 1-based.
    row = i // nb_cols + 1
    col_pos = i % nb_cols + 1

    hist_data = [
        temp_data.loc[temp_data["Attack Type"] == attack_type, col].values
        for attack_type in attack_types
    ]
    displot = create_displot(hist_data, group_labels=attack_types)

    # Copy the traces of the stand-alone distplot figure into the shared grid.
    for trace in displot.select_traces():
        fig.add_trace(trace, row=row, col=col_pos)

    # The axis title belongs to the subplot, not to a trace: set it once per feature.
    fig.update_xaxes(title_text=f"{col} Distribution", row=row, col=col_pos)

fig.update_layout(title="Distribution of Numerical Features by Attack Type",
                  width=1600, height=nb_rows*400+200, showlegend=False)
fig.show()

All distribution/ kernel of the numerical distribution shows  