# Cleaning Dataset and Obtaining Samples for Testing:

## a preprocessing guide for raw NIDS data for use in QML testing

### Import dependencies

In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
import dask.dataframe as dd

### Load chosen datasets

In [4]:
# Dask is better for reading large CSVs
attack = dd.read_csv(
    "BCCC-CIC-CSE-IDS2018/Tuesday_20_02_2018/tuesday_20_02_2018_loic_http/tuesday_20_02_2018_loic_http.csv", 
    low_memory=False, 
    dtype={'delta_start': 'object',
       'handshake_duration': 'object'}
)
benign = dd.read_csv(
    "BCCC-CIC-CSE-IDS2018/Tuesday_20_02_2018/tuesday_20_02_2018_benign/tuesday_20_02_2018_benign.csv", 
    low_memory=False,
    dtype={'delta_start': 'object',
       'handshake_duration': 'object'}
)

# randomly sample 500 pts each
attack = attack.sample(frac=1, random_state=42).head(500)
benign = benign.sample(frac=1, random_state=42).head(500)


## DATA CLEANING

In [7]:
# Maintain only numeric columns for now
attack = attack.select_dtypes(include=[np.number])
benign = benign.select_dtypes(include=[np.number])

# Replace inf with NaN (to drop later)
attack.replace([np.inf, -np.inf], np.nan, inplace=True)
benign.replace([np.inf, -np.inf], np.nan, inplace=True)

# Drop columns with >20% missing in attack
cols = attack.isnull().mean() < 0.2
attack = attack.loc[:, cols]
benign = benign.loc[:, cols]

# Drop single-value columns (attack)
multi_cols = attack.nunique()[attack.nunique() > 1].index
shared_cols = [col for col in multi_cols if col in benign.columns]
attack = attack[shared_cols]
benign = benign[shared_cols]

# Drop highly collinear columns (corr > 0.95)
corr = attack.corr().abs()
upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
to_drop = [col for col in upper.columns if any(upper[col] > 0.95)]
attack = attack.drop(columns=to_drop)
benign = benign.drop(columns=to_drop)

# Drop any column that still has NaNs 
no_nan_cols = benign.columns[~benign.isnull().any()]
attack = attack[no_nan_cols]
benign = benign[no_nan_cols]

# Remove all 0 rows
benign = benign.loc[~(benign == 0.0).all(axis=1)]
attack = attack.loc[~(attack == 0.0).all(axis=1)]

## Get importances from GBF model

In [None]:
# 1. Combine and label
X = pd.concat([benign, attack], axis=0)
y = np.array([0] * len(benign) + [1] * len(attack))  # 0 = benign, 1 = attack

# 2. Train an XGBoost classifier
model = xgb.XGBClassifier(
    eval_metric='logloss', 
    n_estimators=100,
    max_depth=4,
    learning_rate=0.1,
    random_state=42
)
model.fit(X, y)

# 3. Pick top 8 most relevant features
importances = pd.Series(model.feature_importances_, index=X.columns)
top_features = importances.nlargest(8).index.tolist()
print(top_features)

['subflow_fwd_packets', 'packets_count', 'packets_IAT_median', 'max_fwd_packets_delta_len', 'duration', 'bwd_packets_IAT_median', 'packet_IAT_min', 'bwd_packets_IAT_min']


## Construct new cleaned dataset for testing using selected features

In [9]:
# Extract relevant features
attack = attack[top_features]
benign = benign[top_features]

# Save as numpy arrays
np.save('500-loic-attack.npy', attack.to_numpy())
np.save('500-loic-benign.npy', benign.to_numpy())