In [1]:
import zipfile
import os
import pandas as pd
import chardet


## Loading Data

You can download the dataset from [this link](https://www.unb.ca/cic/datasets/ids-2017.html). I opted to use only the processed data in the CSVs folder instead of the raw packet captures, as it allows for more direct investigation and easier insight generation.

To download the data via terminal, use the following command:
```
wget -r -np -nH --cut-dirs=3 -R "index.html*" http://205.174.165.80/CICDataset/CIC-IDS-2017/Dataset/CIC-IDS-2017/CSVs/
```

In [2]:
# Extract every member of the downloaded archive into the current directory.
path = "GeneratedLabelledFlows.zip"
with zipfile.ZipFile(path, mode='r') as archive:
    archive.extractall(path=".")


In [3]:
# Show what is in the working directory after extraction.
cwd_entries = os.listdir(".")
print(cwd_entries)


['train_nn.ipynb', 'CICDS2017.ipynb', 'all_data.csv', 'checkpoint', 'label_encoder.pkl', 'GeneratedLabelledFlows.md5', 'model_checkpoint.data-00000-of-00001', 'model_checkpoint.index', 'X_test.csv', 'y_test.csv', 'GeneratedLabelledFlows.zip', 'y_train_sample.npy', 'y_train.csv', 'X_test_sample.npy', 'sample_data.csv', 'TrafficLabelling ', 'nn_model.h5', 'X_train.csv', 'X_train_sample.npy', 'y_test_sample.npy', 'Loading_data.ipynb']


The folder 'TrafficLabelling ' (note the trailing space in its name) was created. Now let's see which files are in it:

In [4]:
# The extracted folder name ends with a space, so the literal must keep it.
folder_data = "TrafficLabelling /"
extracted_files = os.listdir(folder_data)
print(extracted_files)

['Tuesday-WorkingHours.pcap_ISCX.csv', 'Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv', 'Friday-WorkingHours-Morning.pcap_ISCX.csv', 'Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv', 'Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv', 'Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv', 'Monday-WorkingHours.pcap_ISCX.csv', 'Wednesday-workingHours.pcap_ISCX.csv']


Eight CSV files were extracted. To make visualization and preprocessing easier, we combine them into a single dataframe.

While reading the CSVs, one of them (`Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv`) failed due to the following error:

```
UnicodeDecodeError: 'utf-8' codec can't decode byte 0x96 in position 22398: invalid start byte
```

This can be fixed by detecting the encoding for each file and using the correct encoding while reading them into a pandas dataframe.

In [5]:
# Load every CSV in the extracted folder into one dataframe, all_data, to make
# the dataset easier to visualize and process.

folder_data = "TrafficLabelling /"
frames = []  # one dataframe per CSV, concatenated once after the loop

for file in os.listdir(folder_data):
    if file.endswith(".csv"):
        print("reading file: ", file)
        csv_path = os.path.join(folder_data, file)

        # Detect encoding: the files are not all UTF-8, so reading them with
        # the default codec raises UnicodeDecodeError for at least one file.
        with open(csv_path, 'rb') as f:
            result = chardet.detect(f.read())
        encoding = result['encoding']

        data = pd.read_csv(csv_path, encoding=encoding, low_memory=False, skipinitialspace=True)
        frames.append(data)

# Concatenate once instead of growing all_data inside the loop, which re-copies
# every previously loaded row on each iteration. Each file's original row
# labels are kept, so the combined index contains repeated labels.
# If no CSV was found, fall back to an empty dataframe (pd.concat rejects an
# empty list).
all_data = pd.concat(frames) if frames else pd.DataFrame()



reading file:  Tuesday-WorkingHours.pcap_ISCX.csv
reading file:  Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv
reading file:  Friday-WorkingHours-Morning.pcap_ISCX.csv
reading file:  Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv
reading file:  Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv
reading file:  Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv
reading file:  Monday-WorkingHours.pcap_ISCX.csv
reading file:  Wednesday-workingHours.pcap_ISCX.csv


In [6]:
# Preview the first five rows of the combined dataframe.
all_data.head(n=5)

Unnamed: 0,Flow ID,Source IP,Source Port,Destination IP,Destination Port,Protocol,Timestamp,Flow Duration,Total Fwd Packets,Total Backward Packets,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,192.168.10.3-192.168.10.5-88-49182-6,192.168.10.5,49182.0,192.168.10.3,88.0,6.0,4/7/2017 8:54,640.0,7.0,4.0,...,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN
1,192.168.10.3-192.168.10.5-88-49183-6,192.168.10.5,49183.0,192.168.10.3,88.0,6.0,4/7/2017 8:54,900.0,9.0,4.0,...,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN
2,192.168.10.3-192.168.10.5-88-49184-6,192.168.10.5,49184.0,192.168.10.3,88.0,6.0,4/7/2017 8:54,1205.0,7.0,4.0,...,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN
3,192.168.10.3-192.168.10.5-88-49185-6,192.168.10.5,49185.0,192.168.10.3,88.0,6.0,4/7/2017 8:54,511.0,7.0,4.0,...,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN
4,192.168.10.3-192.168.10.5-88-49186-6,192.168.10.5,49186.0,192.168.10.3,88.0,6.0,4/7/2017 8:54,773.0,9.0,4.0,...,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN


In [12]:
# Rows in which every column is NaN
all_nan_mask = all_data.isna().all(axis=1)
nan_rows = all_data[all_nan_mask]
print(f"Number of rows that contain only NaN values: {len(nan_rows)}")

# Remove those fully-empty rows; rows with at least one value are kept
all_data = all_data.dropna(axis=0, how='all')
print(f"Number of rows after dropping rows that contain only NaN values: {len(all_data)}")

Number of rows that contain only NaN values: 288602
Number of rows after dropping rows that contain only NaN values: 2830743


In [14]:
# Print the index range, column names and dtypes of the combined dataframe.
all_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2830743 entries, 0 to 692702
Data columns (total 85 columns):
 #   Column                       Dtype  
---  ------                       -----  
 0   Flow ID                      object 
 1   Source IP                    object 
 2   Source Port                  float64
 3   Destination IP               object 
 4   Destination Port             float64
 5   Protocol                     float64
 6   Timestamp                    object 
 7   Flow Duration                float64
 8   Total Fwd Packets            float64
 9   Total Backward Packets       float64
 10  Total Length of Fwd Packets  float64
 11  Total Length of Bwd Packets  float64
 12  Fwd Packet Length Max        float64
 13  Fwd Packet Length Min        float64
 14  Fwd Packet Length Mean       float64
 15  Fwd Packet Length Std        float64
 16  Bwd Packet Length Max        float64
 17  Bwd Packet Length Min        float64
 18  Bwd Packet Length Mean       float64
 19  Bwd Pa

In [None]:
# Change dtype of the columns to reduce memory usage
#
# int_columns lists the columns intended for conversion to an integer dtype.
# This cell only defines the list; no conversion is applied here.
# 'Source IP' and 'Destination IP' are deliberately not included: they hold
# dotted-quad address strings (object dtype) and cannot be cast to integers.
# NOTE(review): the listed columns are presumably whole-number valued and free
# of NaN after cleaning -- verify before casting.

int_columns = [
    'Source Port', 'Destination Port',
    'Flow Duration', 'Total Fwd Packets', 'Total Backward Packets',
    'Total Length of Fwd Packets', 'Total Length of Bwd Packets',
    'Fwd Packet Length Max', 'Fwd Packet Length Min',
    'Bwd Packet Length Max', 'Bwd Packet Length Min',
    'Flow IAT Max', 'Flow IAT Min',
    'Fwd IAT Total', 'Fwd IAT Max', 'Fwd IAT Min',
    'Bwd IAT Total', 'Bwd IAT Max', 'Bwd IAT Min',
]

In [16]:
# Split the column labels by dtype: string-like (object) vs. float64.
object_columns = all_data.select_dtypes(include=['object']).columns
numeric_columns = all_data.select_dtypes(include=['float64']).columns

# Run every float64 column through pd.to_numeric and write the result back.
# NOTE(review): no `downcast` is requested, so the float64 columns presumably
# keep their dtype and memory usage is unchanged -- confirm whether a downcast
# was intended.
numeric_converted = all_data[numeric_columns].apply(pd.to_numeric)
all_data[numeric_columns] = numeric_converted

In [18]:
# Draw a fixed-seed random sample of 600,000 rows so that visualization and
# processing are faster and more stable than on the full dataset.

sample_data = all_data.sample(n=600_000, random_state=42)

In [19]:
# Write the sampled rows to disk, without the index column.
sample_data.to_csv(path_or_buf="sample_data.csv", index=False)


In [20]:
# Write the full cleaned dataset to disk, without the index column.
all_data.to_csv(path_or_buf="all_data.csv", index=False)