# 03: Perform Data Cleaning
This notebook drops redundant features, strips whitespace, handles missing values, and removes duplicate and impossible records.

## 3.1: Import Library

In [1]:
import pandas as pd
# Never truncate long cell contents when a DataFrame/Series is displayed
pd.options.display.max_colwidth = None

## 3.2: Load CSV File

In [2]:
# Read the processed UNSW-NB15 table into memory.
#   low_memory=False -> parse the file in one pass instead of in chunks, so each
#                       column gets a single inferred dtype rather than mixed types
#   na_values=['-']  -> a bare '-' cell is parsed as NaN (on top of pandas' defaults)
df = pd.read_csv(
    '../data/processed/UNSW-NB15.csv',
    low_memory=False,
    na_values=['-'],
)

## 3.3: Remove Redundant Features

In [3]:
# Columns discarded before modelling, grouped by the reason they are removed.
features_to_drop = [
    # Endpoint addresses: the model would learn this environment, not traffic behaviour
    'srcip',
    'dstip',
    # Source port: randomly assigned by the OS
    'sport',
    # Destination port: 'service' already captures its protocol-level meaning
    # in a cleaner categorical form
    'dsport',
    # Record start/end timestamps: the model would learn this environment, not behaviour
    'Stime',
    'Ltime',
    # Redundant: equals 'synack' + 'ackdat', so it adds no information
    'tcprtt',
    # TCP base sequence numbers: randomly initialised
    'stcpb',
    'dtcpb',
]

# Rebind `df` to the reduced frame rather than mutating it in place
df = df.drop(columns=features_to_drop)

## 3.4: Strip Whitespace

In [4]:
# Headers: remove leading/trailing whitespace from every column name
df.columns = [header.strip() for header in df.columns]

# Values: remove leading/trailing whitespace from every text (object/string) column,
# one column at a time
columns = df.select_dtypes(include=['object', 'string']).columns
for text_column in columns:
    df[text_column] = df[text_column].str.strip()

# NOTE(review): a cell that held only whitespace is now the empty string '' rather
# than NaN, so an isna()-based missing-value check will not count it -- confirm
# whether such cells exist and how they should be treated.

In [5]:
# Tally the labels in 'attack_cat' to confirm that stripping left no
# whitespace-padded variants of the same category.
# NOTE(review): the recorded output lists both 'Backdoor' and 'Backdoors' --
# presumably one class under two spellings; confirm whether they should be merged.
attack_cat_counts = df['attack_cat'].value_counts()
attack_cat_counts

attack_cat
Generic           215481
Exploits           44525
Fuzzers            24246
DoS                16353
Reconnaissance     13987
Analysis            2677
Backdoor            1795
Shellcode           1511
Backdoors            534
Worms                174
Name: count, dtype: int64

## 3.5: Handle Missing Values

In [6]:
# Count NaN entries per column, then display only the columns that have any
missing_values = df.isna().sum()
missing_values.loc[lambda counts: counts > 0]

service             1246397
ct_flw_http_mthd    1348145
is_ftp_login        1429879
attack_cat          2218764
dtype: int64