In [21]:
%load_ext autoreload
%autoreload 2 

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [22]:
import numpy as np 
import pandas as pd

In [23]:
dataset_path = "./dataset.csv"
full = True

df = pd.read_csv(
    dataset_path,
    # dtype is specified to prevent a DtypeWarning
    dtype={
        "full_query": str,
        "label": int,
        "statement_type": str,
        "query_template_id": str,
        "attack_payload": str,
        "attack_id": str,
        "attack_technique": str,
        "attack_desc": str,
        "split": str,
        "sqlmap_status" : str,
        "attack_stage" : str,
    },
)


## Generic stats   


In [24]:
def display_counts_recon(df : pd.DataFrame):
    result = df.groupby('attack_id')['attack_stage'].value_counts().unstack(fill_value=0)
    
    if full:
        with pd.option_context('display.max_rows', None, 
                        'display.max_columns', None,
                        'display.width', None,
                        'display.max_colwidth', None):
            display(result)
    else: 
        display(result)

        
_ = display_counts_recon(df)


attack_stage,exploit,recon
attack_id,Unnamed: 1_level_1,Unnamed: 2_level_1
error-0,0,10
error-1,0,4
error-2,0,15
error-3,0,9
error-4,0,3
error-5,0,25
error-6,0,10
error-7,2,40
error-8,0,8


In [25]:
def display_attack_normal_per_set(df : pd.DataFrame):
    df_train_atk = df[(df["label"] == 1) & (df["split"] == "train")]  
    df_train_normal = df[(df["label"] == 0) & (df["split"] == "train")]  

    df_test_atk = df[(df["label"] == 1) & (df["split"] == "test")]  
    df_test_normal = df[(df["label"] == 0) & (df["split"] == "test")]  

    print(f"Number of attacks in train: {df_train_atk.shape[0]}")
    print(f"Number of normal in train: {df_train_normal.shape[0]}")
    print(f"Number of attacks in test: {df_test_atk.shape[0]}")
    print(f"Number of normal in test: {df_test_normal.shape[0]}")

display_attack_normal_per_set(df)

Number of attacks in train: 81
Number of normal in train: 86
Number of attacks in test: 45
Number of normal in test: 495


In [26]:
def display_ids_per_set(df: pd.DataFrame):
    df_train = df[df["split"] == "train"]
    df_test = df[df["split"] == "test"]

    tids_train = df_train["query_template_id"].unique()
    tids_test = df_test["query_template_id"].unique()
    tids_complement = set(tids_test) - set(tids_train)
    print(f"Template IDS in df train: {len(tids_train)}, {tids_train}")
    print(f"Template IDS in df test:   {len(tids_test)}, {tids_test}")
    print(
        f"Template IDs in test but NOT in train: {len(tids_complement)}, {tids_complement}"
    )

    # Compute which template only have normal queries
    df_n = df.loc[df["label"] == 0,"query_template_id"].unique()
    df_a = df.loc[df["label"] == 1,"query_template_id"].unique()

    print(f"Templates IDs with no attacks: {set(df_n) - set(df_a)}")


display_ids_per_set(df)

Template IDS in df train: 9, ['airport-I9' 'airport-S8' 'airport-S10' 'airport-S11' 'airport-D6'
 'airport-U3' 'airport-I5' 'airport-D7' 'airport-admin10']
Template IDS in df test:   13, ['airport-I4' 'airport-U3' 'airport-I5' 'airport-S14' 'airport-S8'
 'airport-I9' 'airport-D7' 'airport-D6' 'airport-S10' 'airport-S11'
 'airport-admin8' 'airport-admin4' 'airport-admin11']
Template IDs in test but NOT in train: 5, {'airport-S14', 'airport-admin11', 'airport-I4', 'airport-admin4', 'airport-admin8'}
Templates IDs with no attacks: {'airport-D7', 'airport-admin11', 'airport-admin10', 'airport-admin4', 'airport-admin8'}


In [27]:
def _stmt_proportion(df: pd.DataFrame):
    return df['statement_type'].value_counts(normalize=True)

def display_ratio_per_stmt(df : pd.DataFrame):
    
    df_a = df[df["label"] == 1]
    df_n = df[df["label"] == 0]
    print(f"Proportion amongst normal: {_stmt_proportion(df=df_n)}")
    print(f"Proportion amongst attacks: {_stmt_proportion(df=df_a)}")
        
    df_train = df[df["split"] == "train"]
    df_test = df[df["split"] == "test"]
    print(f"Proportion amongst train: {_stmt_proportion(df=df_train)}")
    print(f"Proportion amongst test: {_stmt_proportion(df=df_test)}")

    dfntest = df_test[df_test["label"] == 0]

display_ratio_per_stmt(df=df)

Proportion amongst normal: statement_type
select    0.709122
delete    0.106713
insert    0.089501
update    0.084337
admin     0.010327
Name: proportion, dtype: float64
Proportion amongst attacks: statement_type
insert    0.595238
select    0.293651
update    0.079365
delete    0.031746
Name: proportion, dtype: float64
Proportion amongst train: statement_type
select    0.514970
insert    0.323353
update    0.101796
delete    0.053892
admin     0.005988
Name: proportion, dtype: float64
Proportion amongst test: statement_type
select    0.672222
insert    0.135185
delete    0.105556
update    0.077778
admin     0.009259
Name: proportion, dtype: float64


In [None]:
# ASR = Attack success rate
def display_asr_per_technique(df : pd.DataFrame):
    df_a = df[df["label"] == 1]
    # First keep a sample for each ID: 
    # dfs = df_a.groupby('attack_id').sample(n=1, random_state=42)
    # # Then, compute for each technique, the success 
    # r = dfs["attack_status"].value_counts(dropna=False)
    # print(r)

display_asr_per_technique(df=df)

NameError: name 'r' is not defined

In [None]:
def display_NAN_sqlmap_status(df : pd.DataFrame): 
    df_a = df[df["label"] == 1]
    dfa_nan = df_a[df_a["sqlmap_status"].isna()]
    display(dfa_nan)
