## DATA CLEANUP

In [12]:
####################################################################################
# Notebook to be used for cleaning up the raw data files,  special values, etc.    # 
####################################################################################

In [1]:
# Import libraries
import numpy as np
import pandas as pd
import os
import re
import datetime
from dateutil import parser

In [2]:
# Create a set of the raw data and processed files name
# CONFIG NEEDED: Uncomment only the specific files to be processed on your node

csv_files = {
 '02-14-2018.csv': '02-14-2018-bruteforce-ftp-ssh.csv',
 '02-15-2018.csv': '02-15-2018-dos-goldeneye-slowloris.csv',
 '02-16-2018.csv': '02-16-2018-dos-slowhttp-hulk.csv',
 '02-21-2018.csv': '02-21-2018-ddos-loic-udp.csv',
 '02-22-2018.csv': '02-22-2018-bruteforce-webxss.csv',
 '02-23-2018.csv': '02-23-2018-bruteforce-webxss-sql.csv',
 '02-28-2018.csv': '02-28-2018-infiltration.csv',
 '03-01-2018.csv': '03-01-2018-botnet.csv',
 '03-02-2018.csv': '03-02-2018-infiltration.csv',
 '02-20-2018.csv': '02-20-2018-ddos-loic-tcp.csv'   # WARNING: 4GB FILE.
}

In [3]:
# Set the folder name for raw data and processed files under the project directory
# CONFIG NEEDED: Change the './data' and 'processed' to what you named your directories
# Raw Data Files Location: final_project/data
# Processed Data Files Location: final_project/data/processed

rawdata_path = '../data/CSE-CIC-IDS2018'
processed_path = os.path.join(rawdata_path, 'processed')

In [4]:
# Remove duplicate headers
def remove_headers(f):    
    return f[~f['Dst Port'].str.contains('Dst Port', na=False)]

In [5]:
# Drop rows that have 'Infinity', 'infinity', or 'inf' as value
def drop_infinity(f):
    # Remove infinity and NaN values    
    num_of_raw_records = f.shape[0]
    
    print('Number of Infinity or NaN Values')
    print(f.isin([np.nan, np.inf, -np.inf]).sum().sum())

    # Replace infinity to NaN and drop NaN values
    f = f.replace([np.inf, -np.inf], np.nan)
    f = f.dropna()
    f = f.reset_index(drop=True)

    dropped_NaN_records = num_of_raw_records - f.shape[0]
    print('Number of NaN/Inf Records Dropped: ', dropped_NaN_records)

    # Check infinity and NaN values
    print('Remaining Infinity or NaN Values')
    print(f.isin([np.nan, np.inf, -np.inf]).sum().sum())
    
    return f

In [6]:
column_name_regex = re.compile(r"\W", re.IGNORECASE)

# Clean (spaces, special characters, etc.) column headers and lower case 
def remove_non_word_chars_from_column_names(f):
    return [column_name_regex.sub('_', c.lower()) for c in df.columns]

In [11]:
# Create folder for processed files if none exists
if not os.path.exists(processed_path):
    os.mkdir(processed_path)    

# Process the list of files specified
for f, out in csv_files.items():
    file_path = os.path.join(rawdata_path, f)
    output_path = os.path.join(processed_path, out)

    print('reading:', file_path)
    # One of the data files has four extra columns which need to be dropped. Checks each file for extra columns.
    df = pd.read_csv(file_path, dtype=str).drop(columns=['Flow ID', 'Src IP', 'Dst IP', 'Src Port'], errors='ignore')
    for i in range(len(df.columns)):
        print(df.columns[i], '-', df.dtypes[i])

    # Clean up the data files
    print('processing:', file_path)
    df = remove_headers(df)
    df.columns = remove_non_word_chars_from_column_names(df)
    df = drop_infinity(df)

    # Write it as a cleaned file in the processed directory
    print('writing:', output_path)
    df.to_csv(output_path, index=False)

reading: ../data/CSE-CIC-IDS2018/02-14-2018.csv
Dst Port - object
Protocol - object
Timestamp - object
Flow Duration - object
Tot Fwd Pkts - object
Tot Bwd Pkts - object
TotLen Fwd Pkts - object
TotLen Bwd Pkts - object
Fwd Pkt Len Max - object
Fwd Pkt Len Min - object
Fwd Pkt Len Mean - object
Fwd Pkt Len Std - object
Bwd Pkt Len Max - object
Bwd Pkt Len Min - object
Bwd Pkt Len Mean - object
Bwd Pkt Len Std - object
Flow Byts/s - object
Flow Pkts/s - object
Flow IAT Mean - object
Flow IAT Std - object
Flow IAT Max - object
Flow IAT Min - object
Fwd IAT Tot - object
Fwd IAT Mean - object
Fwd IAT Std - object
Fwd IAT Max - object
Fwd IAT Min - object
Bwd IAT Tot - object
Bwd IAT Mean - object
Bwd IAT Std - object
Bwd IAT Max - object
Bwd IAT Min - object
Fwd PSH Flags - object
Bwd PSH Flags - object
Fwd URG Flags - object
Bwd URG Flags - object
Fwd Header Len - object
Bwd Header Len - object
Fwd Pkts/s - object
Bwd Pkts/s - object
Pkt Len Min - object
Pkt Len Max - object
Pkt Len Mean 

KeyboardInterrupt: 

In [None]:
############################################################################################
# Cells below this are only needed if you want to test if the files were created correctly #
# Comment/Uncomment as needed
############################################################################################

In [8]:
print(os.listdir(processed_path))  # Print list of files in the processed directory

['02-20-2018-ddos-loic-tcp.csv', '02-22-2018-bruteforce-webxss.csv', '02-28-2018-infiltration.csv', '02-15-2018-dos-goldeneye-slowloris.csv', '03-01-2018-botnet.csv', '03-02-2018-infiltration.csv', '02-14-2018-bruteforce-ftp-ssh.csv', '02-16-2018-dos-slowhttp-hulk.csv', '02-21-2018-ddos-loic-udp.csv', '02-23-2018-bruteforce-webxss-sql.csv']


In [9]:
# Read a sample file and check label counts
# CONFIG NEEDED: Change file name to the file you want to check
df = pd.read_csv("../data/processed/03-02-2018-infiltration.csv")
df['label'].value_counts()

Benign    760892
Bot       286191
Name: label, dtype: int64