# Analysis of the CSE-CIC-IDS2018 dataset

In [9]:
# Data Loading
import pandas as pd
import os
import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq

In [10]:
# Names of all CSV files found in the raw dataset directory.
filenames = [
    entry
    for entry in os.listdir('data/CSECICIDS2018_improved')
    if entry.endswith('.csv')
]

In [11]:
# Capture files that take part in this run, in processing order.
# Entries that are commented out are excluded from the run.
active_filenames = [
    "Thursday-22-02-2018.csv",
    "Friday-16-02-2018.csv",
    # "Thursday-15-02-2018.csv",
    "Wednesday-14-02-2018.csv",
    "Tuesday-20-02-2018.csv",
    "Wednesday-21-02-2018.csv",
    # "Friday-23-02-2018.csv",
    # "Wednesday-28-02-2018.csv",
    # "Friday-02-03-2018.csv",
    # "Thursday-01-03-2018.csv",
]

In [12]:
# Flow-level feature columns read from each CSV.
EDGE_COLS = [
    "Bwd Packet Length Min",
    "Protocol",
    "Bwd Packets/s",
    "FWD Init Win Bytes",
    "Packet Length Std",
    "FIN Flag Count",
    "Packet Length Min",
    "Fwd Seg Size Min",
    "Bwd IAT Total",
    "SYN Flag Count",
    "Bwd Packet Length Std",
]

# Column holding the traffic class of each flow.
LABEL_COL = "Label"

# Columns identifying a flow's endpoints and time.
ID_COLS = ["Src IP", "Dst IP", "Timestamp"]

# Everything that is read from the CSV files: features, then label, then identifiers.
COLS_TO_KEEP = [*EDGE_COLS, LABEL_COL, *ID_COLS]

In [13]:
# Locations used by the CSV -> Parquet conversion.
data_dir = "data/CSECICIDS2018_improved"   # directory the CSV files are read from
output_dir = "data/processed_chunks"       # directory for per-chunk Parquet files
final_path = "data/combined.parquet"       # single merged Parquet file

# Create the chunk directory up front; it is not an error if it already exists.
os.makedirs(output_dir, exist_ok=True)

In [None]:
chunksize = 200_000
temp_files = []

# Stage 1: stream every active CSV in fixed-size chunks and persist each chunk
# as its own Parquet file, so only one chunk is held in memory at a time.
for i, filename in enumerate(active_filenames, 1):
    file_path = os.path.join(data_dir, filename)
    print(f'[{i}/{len(active_filenames)}] Processing: {filename}')

    for j, chunk in enumerate(pd.read_csv(file_path, usecols=COLS_TO_KEEP, chunksize=chunksize)):
        # Convert pandas -> Arrow table (avoids dtype extension issues)
        table = pa.Table.from_pandas(chunk, preserve_index=False)

        temp_file = os.path.join(output_dir, f'temp_{i:02d}_{j:03d}.parquet')
        pq.write_table(table, temp_file, compression='snappy')
        temp_files.append(temp_file)

print(f"Finished writing {len(temp_files)} Parquet chunks")

# Nothing to merge means the inputs were empty or misconfigured; fail with a
# clear message instead of an opaque error from the merge step.
if not temp_files:
    raise RuntimeError(
        "No Parquet chunks were written; check active_filenames and data_dir"
    )

# Stage 2: merge all small Parquet files into one final combined file.
# The chunks are appended one by one through a ParquetWriter rather than being
# read into a list and concatenated, so the merge never needs the whole
# dataset in memory at once.
print("Merging all chunks into a single Parquet file...")

# The first chunk's schema defines the output file; every later chunk must
# match it, otherwise write_table raises.
merged_schema = pq.read_schema(temp_files[0])
with pq.ParquetWriter(final_path, merged_schema, compression='snappy') as writer:
    for temp_file in temp_files:
        writer.write_table(pq.read_table(temp_file))

print(f"Combined dataset written to {final_path}")

In [14]:
# Open the combined Parquet file and read it into an in-memory Arrow table.
dataset = pq.ParquetDataset(final_path)
table = dataset.read()

In [15]:
# In-memory size of the Arrow table in decimal gigabytes (10^9 bytes).
table.nbytes / 1e9

5.240525175

In [16]:
# Pull only the identifier columns and the label out of the Arrow table and
# convert that narrow selection to a pandas DataFrame.
COLS_TO_LEAVE = [*ID_COLS, LABEL_COL]
df = (
    table
    .select(COLS_TO_LEAVE)
    .to_pandas()
)

In [17]:
# Preview the first rows of the identifier/label frame.
df.head()

Unnamed: 0,Src IP,Dst IP,Timestamp,Label
0,172.31.66.11,23.15.8.121,2018-02-22 12:22:51.109054,BENIGN
1,172.31.66.11,23.52.91.27,2018-02-22 12:22:51.575427,BENIGN
2,172.31.66.11,72.21.91.29,2018-02-22 12:22:53.312634,BENIGN
3,172.31.66.11,108.174.11.1,2018-02-22 12:22:57.954629,BENIGN
4,172.31.66.11,52.179.17.38,2018-02-22 12:23:33.459731,BENIGN


In [18]:
# (rows, columns) of the frame.
df.shape

(32377064, 4)

In [19]:
# Binary target: 0 for rows labelled BENIGN, 1 for every other label.
# NOTE(review): labels such as 'FTP-BruteForce - Attempted' are therefore
# counted as malicious (1) -- confirm that this is the intended treatment.
benign_mask = df[LABEL_COL] == 'BENIGN'
df['target'] = np.where(benign_mask, 0, 1)

In [23]:
# Class balance: share of rows with target 0 versus target 1.
df['target'].value_counts(normalize=True)

target
0    0.889708
1    0.110292
Name: proportion, dtype: float64

In [24]:
# Parse the Timestamp column into pandas datetimes so it can be used with the
# .dt accessor and sorted chronologically.
df['Timestamp'] = pd.to_datetime(df['Timestamp'])

In [25]:
# Day-of-month of each flow.
# NOTE(review): this keeps only the day number (no month), so flows from
# different months would share values; fine while only February files are
# active -- confirm before enabling the commented-out March files.
df['day'] = df['Timestamp'].dt.day

In [38]:
# Show which capture files are active in this run.
active_filenames

['Thursday-22-02-2018.csv',
 'Friday-16-02-2018.csv',
 'Wednesday-14-02-2018.csv',
 'Tuesday-20-02-2018.csv',
 'Wednesday-21-02-2018.csv']

In [27]:
# Distinct day-of-month values present in the data.
# NOTE(review): the recorded output includes 15 and 23 even though the
# Thursday-15 and Friday-23 files are commented out in active_filenames --
# presumably some daily files contain flows from the adjacent day; verify.
df['day'].unique()

array([22, 23, 16, 14, 15, 20, 21], dtype=int32)

In [28]:
# Order the flows chronologically; the original index labels are kept.
df = df.sort_values(by='Timestamp')

In [34]:
# Relative frequency of every traffic label.
df[LABEL_COL].value_counts(normalize=True)

Label
BENIGN                                  8.897082e-01
DoS Hulk                                5.569251e-02
DDoS-HOIC                               3.342777e-02
FTP-BruteForce - Attempted              9.231041e-03
DDoS-LOIC-HTTP                          8.936202e-03
SSH-BruteForce                          2.909374e-03
DDoS-LOIC-UDP                           7.804908e-05
DDoS-LOIC-UDP - Attempted               7.752402e-06
DoS Hulk - Attempted                    2.656201e-06
Web Attack - Brute Force - Attempted    2.347341e-06
Web Attack - Brute Force                2.131138e-06
Web Attack - XSS                        1.235442e-06
Web Attack - SQL                        4.941770e-07
Web Attack - SQL - Attempted            1.235442e-07
Web Attack - XSS - Attempted            9.265819e-08
Name: proportion, dtype: float64

In [29]:
# Preview the earliest flows after the chronological sort; the index still
# shows the pre-sort row labels because the sort does not reset it.
df.head()

Unnamed: 0,Src IP,Dst IP,Timestamp,Label,target,day
17791854,172.31.66.58,239.255.255.250,2018-02-14 12:28:07.743746,BENIGN,0,14
17774250,172.31.66.46,239.255.255.250,2018-02-14 12:28:08.143839,BENIGN,0,14
17773964,172.31.66.46,169.254.169.254,2018-02-14 12:28:08.175858,BENIGN,0,14
17771218,107.217.94.48,172.31.66.46,2018-02-14 12:28:08.214861,BENIGN,0,14
17774447,172.31.66.46,172.31.0.2,2018-02-14 12:28:08.295304,BENIGN,0,14


In [30]:
# Chronological 80/20 split: the first 80% of rows go to train, the rest to test.
# Assumes df has already been sorted by Timestamp in an earlier cell.
TRAIN_FRACTION = 0.8
split_at = int(TRAIN_FRACTION * len(df))

# Index labels of each split, kept as plain lists.
train_ids = df.index[:split_at].tolist()
test_ids = df.index[split_at:].tolist()

# Slice by position rather than looking up tens of millions of labels with
# .loc: same rows, far cheaper, and correct even if the index had duplicates.
# .copy() keeps the splits independent of df, as the .loc lookup did.
train_df = df.iloc[:split_at].copy()
test_df = df.iloc[split_at:].copy()

In [None]:
# Share of malicious rows (target == 1) in each split.
# The mean of the boolean mask is used instead of value_counts(normalize=True)[1],
# which raises KeyError when a split contains no malicious rows at all.
# NOTE(review): the recorded output shows a test ratio of 0.0000 -- the
# chronological split leaves the test set almost entirely benign; confirm this
# is acceptable for evaluation.
malicious_ratio_train = (train_df['target'] == 1).mean()
malicious_ratio_test = (test_df['target'] == 1).mean()

print(f"Malicious ratio in train set: {malicious_ratio_train:.4f}")
print(f"Malicious ratio in test set: {malicious_ratio_test:.4f}")

Malicious ratio in train set: 0.1379
Malicious ratio in test set: 0.0000


In [33]:
# Absolute number of malicious rows (target == 1) in each split.
# Summing the boolean mask yields 0 for a split without malicious rows,
# whereas value_counts()[1] would raise KeyError in that case.
malicious_count_train = (train_df['target'] == 1).sum()
malicious_count_test = (test_df['target'] == 1).sum()

malicious_count_train, malicious_count_test

(np.int64(3570716), np.int64(208))

In [35]:
# Which day-of-month values each split covers; a value appearing in both lists
# means the split boundary falls inside that day.
train_days = train_df['day'].unique()
test_days = test_df['day'].unique()

print(f"Days in the training set: {train_days}")
print(f"Days in the test set: {test_days}")

Days in the training set: [14 15 16 20 21]
Days in the test set: [21 22 23]
