## DATA CLEANUP

In [12]:
####################################################################################
# Notebook to be used for cleaning up the raw data files,  special values, etc.    # 
####################################################################################

In [13]:
# Import libraries
import numpy as np
import pandas as pd
import os
import re
import datetime
from dateutil import parser

In [14]:
# Create a set of the raw data and processed files name
# CONFIG NEEDED: Uncomment only the specific files to be processed on your node

csv_files = {
 '02-14-2018.csv': '02-14-2018-bruteforce-ftp-ssh.csv',
 '02-15-2018.csv': '02-15-2018-dos-goldeneye-slowloris.csv',
 '02-16-2018.csv': '02-16-2018-dos-slowhttp-hulk.csv',
 '02-21-2018.csv': '02-21-2018-ddos-loic-udp.csv',
 '02-22-2018.csv': '02-22-2018-bruteforce-webxss.csv',
 '02-23-2018.csv': '02-23-2018-bruteforce-webxss-sql.csv',
 '02-28-2018.csv': '02-28-2018-infiltration.csv',
 '03-01-2018.csv': '03-01-2018-botnet.csv',
 '03-02-2018.csv': '03-02-2018-infiltration.csv',
 '02-20-2018.csv': '02-20-2018-ddos-loic-tcp.csv'   # WARNING: 4GB FILE.
}

In [15]:
# Set the folder name for raw data and processed files under the project directory
# CONFIG NEEDED: Change the './data' and 'processed' to what you named your directories
# Raw Data Files Location: final_project/data
# Processed Data Files Location: final_project/data/processed

rawdata_path = '../data'
processed_path = os.path.join(rawdata_path, 'processed')

In [16]:
# Remove duplicate headers
def remove_headers(f):    
    return f[~f['Dst Port'].str.contains('Dst Port', na=False)]

In [17]:
# Drop rows that have 'Infinity', 'infinity', or 'inf' as value
def drop_infinity(f):
    # Remove infinity and NaN values    
    num_of_raw_records = f.shape[0]
    
    print('Number of Infinity or NaN Values')
    print(f.isin([np.nan, np.inf, -np.inf]).sum().sum())

    # Replace infinity to NaN and drop NaN values
    f = f.replace([np.inf, -np.inf], np.nan)
    f = f.dropna()
    f = f.reset_index(drop=True)

    dropped_NaN_records = num_of_raw_records - f.shape[0]
    print('Number of NaN/Inf Records Dropped: ', dropped_NaN_records)

    # Check infinity and NaN values
    print('Remaining Infinity or NaN Values')
    print(f.isin([np.nan, np.inf, -np.inf]).sum().sum())
    
    return f

In [18]:
column_name_regex = re.compile(r"\W", re.IGNORECASE)

# Clean (spaces, special characters, etc.) column headers and lower case 
def remove_non_word_chars_from_column_names(f):
    return [column_name_regex.sub('_', c.lower()) for c in df.columns]

In [None]:
# Create folder for processed files if none exists
if not os.path.exists(processed_path):
    os.mkdir(processed_path)    

# Process the list of files specified
for f, out in csv_files.items():
    file_path = os.path.join(rawdata_path, f)
    output_path = os.path.join(processed_path, out)

    print('reading:', file_path)
    # One of the data files has four extra columns which need to be dropped. Checks each file for extra columns.
    df = pd.read_csv(file_path, dtype=str).drop(columns=['Flow ID', 'Src IP', 'Dst IP', 'Src Port'], errors='ignore')

    # Clean up the data files
    print('processing:', file_path)
    df = remove_headers(df)
    df.columns = remove_non_word_chars_from_column_names(df)
    df = drop_infinity(df)

    # Write it as a cleaned file in the processed directory
    print('writing:', output_path)
    df.to_csv(output_path, index=False)

reading: ../data/02-14-2018.csv
processing: ../data/02-14-2018.csv
Number of Infinity or NaN Values
2277
Number of NaN/Inf Records Dropped:  2277
Remaining Infinity or NaN Values
0
writing: ../data/processed/02-14-2018-bruteforce-ftp-ssh.csv
reading: ../data/02-15-2018.csv
processing: ../data/02-15-2018.csv
Number of Infinity or NaN Values
4921
Number of NaN/Inf Records Dropped:  4921
Remaining Infinity or NaN Values
0
writing: ../data/processed/02-15-2018-dos-goldeneye-slowloris.csv
reading: ../data/02-16-2018.csv
processing: ../data/02-16-2018.csv
Number of Infinity or NaN Values
0
Number of NaN/Inf Records Dropped:  0
Remaining Infinity or NaN Values
0
writing: ../data/processed/02-16-2018-dos-slowhttp-hulk.csv
reading: ../data/02-21-2018.csv
processing: ../data/02-21-2018.csv
Number of Infinity or NaN Values
0
Number of NaN/Inf Records Dropped:  0
Remaining Infinity or NaN Values
0
writing: ../data/processed/02-21-2018-ddos-loic-udp.csv
reading: ../data/02-22-2018.csv
processing: .

In [None]:
############################################################################################
# Cells below this are only needed if you want to test if the files were created correctly #
# Comment/Uncomment as needed
############################################################################################

In [None]:
print(os.listdir(processed_path))  # Print list of files in the processed directory

In [None]:
# Read a sample file and check label counts
# CONFIG NEEDED: Change file name to the file you want to check
df = pd.read_csv("../data/processed/03-02-2018-infiltration.csv")
df['label'].value_counts()