In [10]:
%load_ext autoreload
%autoreload 2 

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [11]:
import numpy as np 
import pandas as pd

In [12]:
dataset_path = "./dataset.csv"
# Read as a global by display_counts_recon: when truthy, its table is displayed
# with pandas' row/column/width truncation lifted.
full = True

df = pd.read_csv(
    dataset_path,
    # Explicit dtypes for the known columns, to prevent a DtypeWarning.
    dtype={
        "full_query": str,
        "label": int,
        "statement_type": str,
        "query_template_id": str,
        "attack_payload": str,
        "attack_id": str,
        "attack_technique": str,
        "attack_desc": str,
        "split": str,
        "sqlmap_status": str,
        "attack_stage": str,
    },
    # NOTE(review): this call still emitted a warning with `dtype` alone,
    # presumably a DtypeWarning for a column not listed above - confirm which
    # one and add it to `dtype`. Until then, parse the file in a single pass so
    # any unlisted column gets one inferred dtype instead of per-chunk guesses.
    low_memory=False,
)


  df = pd.read_csv(


## Generic stats   


In [13]:
def display_counts_recon(df: pd.DataFrame):
    """Display a table of row counts per ``attack_id`` and ``attack_stage``.

    Rows of the table are attack ids, columns are the attack stages, and
    combinations that never occur are shown as 0.

    Reads the module-level flag ``full``: when it is truthy the table is
    rendered with pandas' row, column, width and column-width limits lifted;
    otherwise the current display options apply.

    Returns nothing; the table is only displayed.
    """
    per_attack_stage = df.groupby('attack_id')['attack_stage']
    stage_counts = per_attack_stage.value_counts().unstack(fill_value=0)

    if not full:
        display(stage_counts)
        return

    # Option name / value pairs that lift every truncation limit.
    no_truncation = (
        'display.max_rows', None,
        'display.max_columns', None,
        'display.width', None,
        'display.max_colwidth', None,
    )
    with pd.option_context(*no_truncation):
        display(stage_counts)

        
# Called for its side effect only (the function returns None). The result is
# deliberately not bound to `_`: IPython stops refreshing its own `_` output
# variable once user code assigns to that name.
display_counts_recon(df)


attack_stage,exploit,recon
attack_id,Unnamed: 1_level_1,Unnamed: 2_level_1
boolean-0,8111,2463
boolean-102,21,153
boolean-108,6322,151
boolean-114,21,154
boolean-12,8134,2730
boolean-120,21,149
boolean-126,6326,1653
boolean-132,1872,50
boolean-138,6326,1035
boolean-144,8,173


In [14]:
def display_attack_normal_per_set(df: pd.DataFrame):
    """Print the number of attack and normal rows in the train and test splits.

    A row counts as an attack when ``label == 1`` and as normal when
    ``label == 0``; the split is read from the ``split`` column. Four lines
    are printed: attacks then normal for "train", followed by the same pair
    for "test".
    """
    is_attack = df["label"] == 1
    is_normal = df["label"] == 0

    for split_name in ("train", "test"):
        in_split = df["split"] == split_name
        attack_rows = int((is_attack & in_split).sum())
        normal_rows = int((is_normal & in_split).sum())
        print(f"Number of attacks in {split_name}: {attack_rows}")
        print(f"Number of normal in {split_name}: {normal_rows}")

# Print the attack / normal row counts of the train and test splits.
display_attack_normal_per_set(df)

Number of attacks in train: 174831
Number of normal in train: 172588
Number of attacks in test: 91881
Number of normal in test: 1010402


In [15]:
def display_ids_per_set(df: pd.DataFrame):
    """Print which query template ids appear in each split and which never carry an attack.

    Four lines are printed:
      * the distinct ``query_template_id`` values of the "train" rows,
      * the distinct ``query_template_id`` values of the "test" rows,
      * the ids present in test but absent from train,
      * the ids that occur on normal rows (``label == 0``) but on no attack
        row (``label == 1``).
    """
    in_train = df["split"] == "train"
    in_test = df["split"] == "test"
    template_ids = df["query_template_id"]

    train_ids = template_ids[in_train].unique()
    test_ids = template_ids[in_test].unique()
    test_only_ids = set(test_ids) - set(train_ids)

    print(f"Template IDS in df train: {len(train_ids)}, {train_ids}")
    print(f"Template IDS in df test:   {len(test_ids)}, {test_ids}")
    print(
        f"Template IDs in test but NOT in train: {len(test_only_ids)}, {test_only_ids}"
    )

    # Work out which templates only ever have normal queries.
    normal_ids = template_ids[df["label"] == 0].unique()
    attacked_ids = template_ids[df["label"] == 1].unique()

    print(f"Templates IDs with no attacks: {set(normal_ids) - set(attacked_ids)}")


# Print the template ids seen in train / test and the ones without any attack.
display_ids_per_set(df)

Template IDS in df train: 44, ['airport-I2' 'airport-D8' 'airport-U8' 'airport-S1' 'airport-S17'
 'airport-S2' 'airport-S15' 'airport-U9' 'airport-U5' 'airport-S10'
 'airport-S8' 'airport-S5' 'airport-D4' 'airport-S12' 'airport-S9'
 'airport-D5' 'airport-S14' 'airport-I1' 'airport-S4' 'airport-S11'
 'airport-I3' 'airport-U2' 'airport-I8' 'airport-D2' 'airport-I9'
 'airport-U1' 'airport-D6' 'airport-admin11' 'airport-I4' 'airport-D1'
 'airport-admin7' 'airport-D7' 'airport-admin2' 'airport-admin1'
 'airport-admin9' 'airport-admin10' 'airport-admin5' 'airport-admin8'
 'airport-admin6' 'airport-admin12' 'airport-admin13' 'airport-admin3'
 'airport-admin4' 'airport-admin14']
Template IDS in df test:   51, ['airport-S4' 'airport-S13' 'airport-I4' 'airport-I2' 'airport-S8'
 'airport-S3' 'airport-S9' 'airport-admin5' 'airport-I3' 'airport-S2'
 'airport-S10' 'airport-I5' 'airport-S11' 'airport-S15' 'airport-D2'
 'airport-S16' 'airport-U9' 'airport-S12' 'airport-I9' 'airport-U1'
 'airport-D3' '

In [16]:
def _stmt_proportion(df: pd.DataFrame):
    return df['statement_type'].value_counts(normalize=True)

def display_ratio_per_stmt(df: pd.DataFrame):
    """Print the statement-type proportions of four subsets of ``df``.

    The subsets are reported in this order: normal rows (``label == 0``),
    attack rows (``label == 1``), rows of the "train" split and rows of the
    "test" split. Each proportion table comes from ``_stmt_proportion``.
    """
    # (heading, column, value) triples, in reporting order.
    subsets = (
        ("normal", "label", 0),
        ("attacks", "label", 1),
        ("train", "split", "train"),
        ("test", "split", "test"),
    )
    for heading, column, wanted in subsets:
        subset = df[df[column] == wanted]
        print(f"Proportion amongst {heading}: {_stmt_proportion(df=subset)}")

# Print the statement-type proportions for normal, attack, train and test rows.
display_ratio_per_stmt(df=df)

Proportion amongst normal: statement_type
select    0.838212
update    0.073728
delete    0.045642
insert    0.039583
admin     0.002835
Name: proportion, dtype: float64
Proportion amongst attacks: statement_type
insert    0.498736
select    0.259823
update    0.164005
delete    0.077436
Name: proportion, dtype: float64
Proportion amongst train: statement_type
select    0.528621
insert    0.286818
update    0.104586
delete    0.073214
admin     0.006761
Name: proportion, dtype: float64
Proportion amongst test: statement_type
select    0.795840
update    0.085846
insert    0.072757
delete    0.044645
admin     0.000912
Name: proportion, dtype: float64


In [17]:
# ASR = Attack success rate
def display_asr_per_technique(df: pd.DataFrame):
    """Print, for each attack technique, how many attacks ended in each sqlmap status.

    Only attack rows (``label == 1``) are considered. One row is sampled per
    ``attack_id`` (fixed seed, so the choice is reproducible) so that every
    attack is counted once regardless of how many rows it has. The sampled
    rows are then grouped by ``attack_technique`` and their ``sqlmap_status``
    values counted; missing techniques and missing statuses are kept as their
    own entries.

    NOTE(review): the earlier version counted ``sqlmap_status`` over all
    techniques together although the name and comments announce a
    per-technique breakdown; grouping by ``attack_technique`` is assumed to be
    the intent - confirm.

    Returns nothing; the counts are printed.
    """
    attacks = df[df["label"] == 1]
    # First keep a single sample for each attack id.
    one_per_attack = attacks.groupby("attack_id").sample(n=1, random_state=42)
    # Then count the statuses separately for each technique.
    status_counts = (
        one_per_attack
        .groupby("attack_technique", dropna=False)["sqlmap_status"]
        .value_counts(dropna=False)
    )
    print(status_counts)

# display_asr_per_technique(df=df)

In [18]:
def display_NAN_sqlmap_status(df: pd.DataFrame):
    """Display the attack rows (``label == 1``) whose ``sqlmap_status`` is missing."""
    is_attack = df["label"] == 1
    status_missing = df["sqlmap_status"].isna()
    display(df[is_attack & status_missing])

def fix_NAN_sqlmap_status(df: pd.DataFrame):
    """Return a copy of ``df`` where attack rows lacking a ``sqlmap_status`` inherit one.

    For every ``attack_id``, the status is taken from that attack's rows whose
    ``attack_stage`` is "exploit" (the first non-missing value among them).
    It is written into the attack rows (``label == 1``) of the same
    ``attack_id`` whose ``sqlmap_status`` is missing. Rows that already have a
    status, non-attack rows, and attacks without any exploit row are left
    unchanged.

    NOTE(review): the earlier body looked the exploit status up for each
    attack and then discarded it, so the function had no effect (and raised
    IndexError for an attack without exploit rows). Writing the status back
    into the missing rows is assumed to be the intended fix - confirm.

    Args:
        df: frame with ``label``, ``attack_id``, ``attack_stage`` and
            ``sqlmap_status`` columns.

    Returns:
        A new DataFrame; ``df`` itself is not modified.
    """
    is_attack = df["label"] == 1
    is_exploit = df["attack_stage"] == "exploit"

    # GroupBy.first() skips missing values, so each attack id maps to the first
    # status actually recorded on one of its exploit rows.
    status_by_attack = (
        df[is_attack & is_exploit].groupby("attack_id")["sqlmap_status"].first()
    )

    needs_fix = is_attack & df["sqlmap_status"].isna()
    # Attack ids without an exploit row are absent from the mapping -> stay NaN.
    inherited = df["attack_id"].map(status_by_attack)

    fixed = df.copy()
    fixed["sqlmap_status"] = df["sqlmap_status"].where(~needs_fix, inherited)
    return fixed
# fix_NAN_sqlmap_status(df=df)