In [None]:
### Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import Birch
import hdbscan

In [None]:
### Load the raw dataset and configure how pandas renders frames
df_data = pd.read_csv("data.csv")

# Render every column and every row, and format floats with two decimals.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
pd.set_option("display.float_format", "{:.2f}".format)

# Columns removed by drop_unnecessary().
DROP_LABELS = [
    "Flow ID",
    "Src IP",
    "Src Port",
    "Dst IP",
    "Dst Port",
    "Timestamp",
]
# Target columns: grouped on in present() and dropped before box-plotting.
TARGET_LABELS = [
    "Label",
    "Traffic Type",
    "Traffic Subtype",
]

In [None]:
### Data introduction
# Sampling
def sampling(df: pd.DataFrame, cap: int, rate: float, notext=False, random_state=None) -> pd.DataFrame:
    """Randomly down-sample ``df``, optionally only for oversized traffic subtypes.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to sample. Must contain a "Traffic Subtype" column when ``cap != 0``.
    cap : int
        Row-count threshold per traffic subtype. When non-zero, all rows whose
        subtype has more than ``cap`` rows are pooled and sampled together at
        ``rate``; rows of the remaining subtypes are kept untouched. When 0,
        the whole frame is sampled at ``rate``.
    rate : float
        Fraction of rows to keep, forwarded to ``DataFrame.sample(frac=...)``.
    notext : bool, default False
        When true, the before/after shape messages are not printed.
    random_state : optional
        Forwarded to ``DataFrame.sample`` so a run can be reproduced. The
        default ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    pd.DataFrame
        The sampled frame. With ``cap != 0`` the index is reset
        (``ignore_index=True``); with ``cap == 0`` the sampled rows keep their
        original index labels.
    """
    if not notext:
        print("Dataset before sampling has", df.shape[0], "rows and", df.shape[1], "columns")
        print("Sampling ...")

    if cap != 0:
        subtype_counts = df["Traffic Subtype"].value_counts()  # Rows per traffic subtype
        subtypes_sampling = subtype_counts[subtype_counts > cap].index     # Subtypes with more than `cap` rows
        subtypes_nosampling = subtype_counts[subtype_counts <= cap].index  # Subtypes with at most `cap` rows

        # Oversized subtypes are pooled and sampled as one group, not per subtype.
        df_sampling = df[df["Traffic Subtype"].isin(subtypes_sampling)]
        df_sampled = df_sampling.sample(frac=rate, random_state=random_state)
        # Remaining subtypes are kept in full.
        df_notsampled = df[df["Traffic Subtype"].isin(subtypes_nosampling)]
        df_final = pd.concat([df_sampled, df_notsampled], ignore_index=True)
    else:
        df_final = df.sample(frac=rate, random_state=random_state)

    if not notext:
        print("Dataset after sampling has", df_final.shape[0], "rows and", df_final.shape[1], "columns")

    return df_final

# Drop unnecessary data
def drop_unnecessary(df: pd.DataFrame) -> pd.DataFrame:
    """Return ``df`` without the columns listed in ``DROP_LABELS``.

    Prints the frame's shape before and after the removal. The input frame is
    not modified; ``DataFrame.drop`` returns a new frame.
    """
    rows_before, cols_before = df.shape[0], df.shape[1]
    print("Dataset before removing unnecessary data has", rows_before, "rows and", cols_before, "columns")
    print("Removing unnecessary data ...")

    trimmed = df.drop(columns=DROP_LABELS)

    rows_after, cols_after = trimmed.shape[0], trimmed.shape[1]
    print("Dataset after removing unnecessary data has", rows_after, "rows and", cols_after, "columns")
    return trimmed

# Data presentation
def present(df: pd.DataFrame):
    """Display an overview of ``df``.

    Shows, in order: the first rows, the last rows, the transposed
    ``describe()`` summary, and the number of rows per combination of the
    ``TARGET_LABELS`` columns (in a "Counts" column).
    """
    first_rows = df.head()
    display(first_rows)

    last_rows = df.tail()
    display(last_rows)

    summary = df.describe().transpose()
    display(summary)

    target_counts = df.groupby(TARGET_LABELS).size().reset_index(name="Counts")
    display(target_counts)

In [None]:
### Correlation, target correlation and interesting columns
# Integer codes substituted for the "Label" and "Traffic Type" string values.
LABEL_MAPPER = dict(Malicious=1, Benign=0)
TYPES_MAPPER = {
    traffic_type: code
    for code, traffic_type in enumerate(
        [
            "Audio",
            "Background",
            "Text",
            "Video",
            "Bruteforce",
            "DoS",
            "Information Gathering",
            "Mirai",
        ]
    )
}

def labels_to_numerical(df: pd.DataFrame, mapper: dict) -> pd.DataFrame:
    """Return a copy of ``df`` whose "Label" column is translated through ``mapper``.

    Values that are not keys of ``mapper`` become NaN (``Series.map``
    behaviour). The input frame is left unchanged.
    """
    encoded_label = df["Label"].map(mapper)
    return df.assign(Label=encoded_label)

def ttypes_to_numerical(df: pd.DataFrame, mapper: dict) -> pd.DataFrame:
    """Return a copy of ``df`` whose "Traffic Type" column is translated through ``mapper``.

    Values that are not keys of ``mapper`` become NaN (``Series.map``
    behaviour). The input frame is left unchanged.
    """
    encoded_type = df["Traffic Type"].map(mapper)
    # The column name contains a space, so it is passed via dict unpacking.
    return df.assign(**{"Traffic Type": encoded_type})

def correlation(df_numerical: pd.DataFrame, target=False, threshold: float = 0.1):
    """Compute correlations between the numeric columns of ``df_numerical``.

    Parameters
    ----------
    df_numerical : pd.DataFrame
        Frame whose numeric columns are correlated (``numeric_only=True``).
        For ``target=True`` it must contain numeric "Label" and "Traffic Type"
        columns.
    target : bool, default False
        When false, return the full correlation matrix. When true, return only
        the "Label" and "Traffic Type" columns of that matrix together with the
        rows that correlate strongly with each of them.
    threshold : float, default 0.1
        Minimum absolute correlation (exclusive) for a row to be reported as
        important. The default reproduces the previously hard-coded value.

    Returns
    -------
    pd.DataFrame
        When ``target`` is false: the full correlation matrix.
    tuple (pd.DataFrame, list, list)
        When ``target`` is true: the two-column correlation frame, the row
        names with ``|corr with Label| > threshold`` (never "Traffic Type"),
        and the row names with ``|corr with Traffic Type| > threshold`` (never
        "Label"). Each target can appear in its own list, because its
        self-correlation is checked like any other row.
    """
    # DataFrame.corr does not modify its input, so no defensive copy is needed.
    full_corr = df_numerical.corr(numeric_only=True)
    if not target:
        return full_corr

    cor = full_corr.filter(["Label", "Traffic Type"])
    important_metrics_label = []
    important_metrics_ttype = []

    for r in cor.index.tolist():
        # NaN correlations (e.g. constant columns) compare false and are skipped.
        if abs(cor.at[r, "Label"]) > threshold and r != "Traffic Type":
            important_metrics_label.append(r)
            print(r, cor.at[r, "Label"])
        if abs(cor.at[r, "Traffic Type"]) > threshold and r != "Label":
            important_metrics_ttype.append(r)

    print(len(important_metrics_label), "Metrics with high correlation with Label")
    print(len(important_metrics_ttype), "Metrics with high correlation with Traffic Type")
    return cor, important_metrics_label, important_metrics_ttype

In [None]:
### Data Visualization
# Create heatmap from correlation Pandas Dataframe
def heatmap(corr: pd.DataFrame, title: str) -> None:
    """Draw ``corr`` as a square-celled "coolwarm" heatmap on a 30x20 figure and show it."""
    print("Generating heatmap ...")
    _, ax = plt.subplots(figsize=(30, 20))
    sns.heatmap(corr, cmap="coolwarm", square=True, ax=ax)
    ax.set_title(title)
    plt.show()

# Create boxplot diagram to showcase target differences
def boxplot(df: pd.DataFrame, title: str, y_lim_bot: float, y_lim_top: float) -> None:
    """Draw one box per column of ``df`` on a 15x5 figure and show it.

    The y-axis is limited to ``(y_lim_bot, y_lim_top)`` and the x tick labels
    are rotated 65 degrees, right-aligned.
    """
    print("Generating boxplot ...")
    _, ax = plt.subplots(figsize=(15, 5))
    sns.boxplot(data=df, ax=ax)
    ax.set_ylim((y_lim_bot, y_lim_top))
    # `ax` is the current axes, so pyplot's tick helper acts on it.
    plt.xticks(rotation=65, ha="right")
    ax.set_title(title)
    plt.show()

# Create barplot to showcase certain values
def barplot(x, y, title: str) -> None:
    """Draw a bar chart of ``y`` against ``x`` on a 15x5 figure and show it.

    The y-axis is fixed to (-1, 1), horizontal grid lines are enabled and the
    x tick labels are rotated 65 degrees, right-aligned.
    """
    print("Generating barplot ...")
    _, ax = plt.subplots(figsize=(15, 5))
    sns.barplot(x=x, y=y, ax=ax)
    ax.set_ylim((-1, 1))
    # `ax` is the current axes, so pyplot's tick helper acts on it.
    plt.xticks(rotation=65, ha="right")
    ax.set_title(title)
    ax.grid(True, axis="y")
    plt.show()

In [None]:
### Dimensionality Reduction
# Column names "PC1" through "PC15" for the principal-component frame.
PCA_COLS = [f"PC{component}" for component in range(1, 16)]

def dim_reduction(df: pd.DataFrame, N: int, method: str) -> pd.DataFrame:
    """Standardize ``df`` and project it onto ``N`` principal components.

    Parameters
    ----------
    df : pd.DataFrame
        Feature frame; it is passed to ``StandardScaler.fit_transform``.
    N : int
        Number of components, passed to ``PCA(n_components=N)``.
    method : str
        Reduction method. Only "PCA" is implemented.

    Returns
    -------
    pd.DataFrame
        One row per input row, with columns "PC1", "PC2", ... and the same
        index as ``df``, so targets can be re-attached by index alignment.

    Raises
    ------
    ValueError
        If ``method`` is not "PCA". Previously this path failed with an
        ``UnboundLocalError`` on the return statement.
    """
    if method != "PCA":
        raise ValueError(f"Unsupported dimensionality reduction method: {method!r}")

    df_scaled = StandardScaler().fit_transform(df)
    pca = PCA(n_components=N)
    principal_components = pca.fit_transform(df_scaled)

    # Name columns from the actual output width so more than 15 components work.
    component_names = [f"PC{i}" for i in range(1, principal_components.shape[1] + 1)]
    df_final = pd.DataFrame(data=principal_components, columns=component_names, index=df.index)
    print("PCA METHOD --> Cumulative variance:", pca.explained_variance_ratio_.cumsum()[-1])

    return df_final

In [None]:
### Clustering
# BIRCH Algorithm
def BIRCH_clustering(df: pd.DataFrame) -> pd.DataFrame:
    """Cluster the standardized rows of ``df`` with BIRCH.

    The features are scaled with ``StandardScaler`` and fitted with
    ``Birch(n_clusters=None)``.

    Returns
    -------
    pd.DataFrame
        A new frame with the columns of ``df`` plus a "Cluster" column holding
        the fitted ``labels_``. The caller's frame is not modified (the earlier
        version added the column to ``df`` in place).
    """
    df_scaled = StandardScaler().fit_transform(df)
    clusterer = Birch(n_clusters=None)
    clusterer.fit(df_scaled)
    # assign() returns a new frame, so the input is never mutated.
    return df.assign(Cluster=clusterer.labels_)

# HDBSCAN Algorithm
def HDBSCAN_clustering(df: pd.DataFrame, minimum_cluster_size: int) -> pd.DataFrame:
    """Cluster the standardized rows of ``df`` with HDBSCAN (euclidean metric).

    Parameters
    ----------
    df : pd.DataFrame
        Feature frame; it is scaled with ``StandardScaler`` before clustering.
    minimum_cluster_size : int
        Passed to ``hdbscan.HDBSCAN`` as ``min_cluster_size``.

    Returns
    -------
    pd.DataFrame
        A new frame with the columns of ``df`` plus a "Cluster" column holding
        the result of ``fit_predict``. The caller's frame is not modified (the
        earlier version added the column to ``df`` in place).
    """
    df_scaled = StandardScaler().fit_transform(df)
    clusterer = hdbscan.HDBSCAN(min_cluster_size=minimum_cluster_size, metric="euclidean")
    clusters = clusterer.fit_predict(df_scaled)
    # assign() returns a new frame, so the input is never mutated.
    return df.assign(Cluster=clusters)

# Present HDBSCAN results
def HDBSCAN_results(df: pd.DataFrame):
    """Plot the number of rows per HDBSCAN cluster as a count plot.

    ``HDBSCAN_clustering`` stores its labels in a "Cluster" column, so that is
    the column plotted. A frame that carries the older "HDBSCAN_Cluster"
    column name is still accepted and takes precedence when present.
    """
    cluster_column = "HDBSCAN_Cluster" if "HDBSCAN_Cluster" in df.columns else "Cluster"
    sns.countplot(x=cluster_column, data=df)
    

In [None]:
### Initial sampling and removal of the DROP_LABELS columns so plots are created in reasonable time, then present data
# NOTE(review): this cell was described as also removing duplicates, but no
# de-duplication is performed here — confirm whether that step is intended.
df = drop_unnecessary(sampling(df_data, cap=10000, rate=0.05))
present(df)

In [None]:
### Present distributions for all metrics, per traffic type and per Label (Benign / Malicious)
ylim_top = 1.4 * 1e8
ylim_bot = -0.1 * 1e8
df_benign = df.loc[df["Label"].eq("Benign")]
df_malicious = df.loc[df["Label"].eq("Malicious")]

benign_list = ["Audio", "Background", "Text", "Video"]
malicious_list = ["Bruteforce", "DoS", "Information Gathering", "Mirai"]

# One boxplot per traffic type: benign types first, then malicious types.
for label_frame, ttype_names in ((df_benign, benign_list), (df_malicious, malicious_list)):
    for ttype in ttype_names:
        ttype_rows = label_frame[label_frame["Traffic Type"] == ttype]
        boxplot(ttype_rows.drop(columns=TARGET_LABELS), "Boxplot for traffic type: " + ttype, ylim_bot, ylim_top)

# One boxplot per Label over all of its traffic types.
boxplot(df_benign.drop(columns=TARGET_LABELS), "Boxplot for Benign", ylim_bot, ylim_top)
boxplot(df_malicious.drop(columns=TARGET_LABELS), "Boxplot for Malicious", ylim_bot, ylim_top)

In [None]:
# Encode the "Label" and "Traffic Type" targets as integers, then compute correlations
df_num = ttypes_to_numerical(labels_to_numerical(df, LABEL_MAPPER), TYPES_MAPPER)

# cor1: full correlation matrix; cor2: correlations against the two targets only.
cor1 = correlation(df_num, target=False)
cor2, important_metrics_label, important_metrics_ttype = correlation(df_num, target=True)

In [None]:
### Create heatmaps
for corr_matrix, heatmap_title in (
    (cor1, "Correlation between metrics"),
    (cor2, "Correlation between metrics and target"),
):
    heatmap(corr_matrix, heatmap_title)

In [None]:
### Showcase top correlations
# Rows flagged as important for Label, sorted by their correlation with it.
df_important_metrics_label_count = (
    cor2.filter(items=important_metrics_label, axis="index")
    .filter(["Label"])
    .sort_values(by="Label")
)
barplot(
    df_important_metrics_label_count.index,
    df_important_metrics_label_count["Label"].to_numpy(),
    "Top metrics in terms of correlation with Label",
)

# Rows flagged as important for Traffic Type, sorted by their correlation with it.
df_important_metrics_ttype_count = (
    cor2.filter(items=important_metrics_ttype, axis="index")
    .filter(["Traffic Type"])
    .sort_values(by="Traffic Type")
)
barplot(
    df_important_metrics_ttype_count.index,
    df_important_metrics_ttype_count["Traffic Type"].to_numpy(),
    "Top metrics in terms of correlation with Traffic Type",
)

In [None]:
### DataFrames with metrics highly correlated with targets, dimension reduction
DIM_TARGET = 10  # Number of principal components kept per target
df_label = df_num[important_metrics_label]
df_ttype = df_num[important_metrics_ttype]

# The reduced frames keep the row order of their input, so each target is
# attached by position (.to_numpy()) rather than by index alignment; alignment
# would insert NaN or mismatched targets whenever the two indexes differ.
print("Target Label:")
df_labelDimReduction = dim_reduction(df_label.drop(columns=["Label"]), DIM_TARGET, "PCA")
df_labelDimReduction["Label"] = df_label["Label"].to_numpy()
print("Target Traffic Type:")
df_ttypeDimReduction = dim_reduction(df_ttype.drop(columns=["Traffic Type"]), DIM_TARGET, "PCA")
df_ttypeDimReduction["Traffic Type"] = df_ttype["Traffic Type"].to_numpy()


In [None]:
### Data reduction: sampling, clustering with BIRCH, clustering with HDBSCAN
# Data reduction through sampling (30% of all rows, no per-subtype cap)
df_label_sampling = sampling(df_labelDimReduction, cap=0, rate=0.3, notext=True)
df_ttype_sampling = sampling(df_ttypeDimReduction, cap=0, rate=0.3, notext=True)

# Data reduction through BIRCH clustering (targets removed before clustering)
df_label_BIRCH = BIRCH_clustering(df=df_labelDimReduction.drop(columns=["Label"]))
df_ttype_BIRCH = BIRCH_clustering(df=df_ttypeDimReduction.drop(columns=["Traffic Type"]))

# Data reduction through HDBSCAN clustering (targets removed before clustering)
df_label_HDBSCAN = HDBSCAN_clustering(
    df=df_labelDimReduction.drop(columns=["Label"]),
    minimum_cluster_size=10,
)
df_ttype_HDBSCAN = HDBSCAN_clustering(
    df=df_ttypeDimReduction.drop(columns=["Traffic Type"]),
    minimum_cluster_size=10,
)

In [None]:
# Show the number of rows assigned to each cluster, for every clustering result.
for clustered_frame in (df_label_BIRCH, df_ttype_BIRCH, df_label_HDBSCAN, df_ttype_HDBSCAN):
    display(clustered_frame.groupby("Cluster").size().reset_index(name="Counts"))