In [1]:
import pandas as pd
from tqdm import tqdm
from pyarrow import csv
import pyarrow as pa

import numpy as np

In [2]:
def default_transform(df):
    return df

def read_chunk(name, chunk_id, opts=csv.ConvertOptions(), transform=default_transform, extra=1):
    chunk=None
    chunksize = extra*10 ** 6
    with csv.open_csv(
        name,
        convert_options = opts,
        read_options=csv.ReadOptions(
            use_threads=True,
            block_size=chunksize
        )) as reader:

        i=0
        for next_chunk in reader:
            if next_chunk is None:
                break
            chunk=next_chunk
            if i == chunk_id:
                return transform(chunk.to_pandas())
            i+=1
            
    return None

def prep_NF_UQ(df):
    features_to_remove = ['L4_SRC_PORT', 'L4_DST_PORT', 'Attack', 'Dataset', 'IPV4_SRC_ADDR', 'IPV4_DST_ADDR']
    df.drop(columns=features_to_remove, axis=1, inplace=True) 
    df['mask'] = 1
    return df

def prep_ddos(df):
    features_to_remove = ['Unnamed: 0', 'Flow ID', ' Source IP', ' Source Port', ' Destination IP', ' Destination Port', ' Timestamp']
    df.drop(columns=features_to_remove, axis=1, inplace=True) 
    
    df = df.rename(columns=
                   {' Protocol': 'PROTOCOL', 
                   ' Total Fwd Packets': 'IN_PKTS', 
                   ' Total Backward Packets': 'OUT_PKTS',
                   'Total Length of Fwd Packets': 'IN_BYTES',
                   ' Total Length of Bwd Packets': 'OUT_BYTES',
                   'Fwd IAT Total': 'DURATION_IN',
                   'Bwd IAT Total': 'DURATION_OUT',
                   ' Label': 'Attack',
                   ' Flow Duration': 'FLOW_DURATION_MILLISECONDS',
                   }, errors="raise")
    
    df['PROTOCOL'] = df['PROTOCOL'].astype(np.int64)
    df['IN_PKTS'] = df['IN_PKTS'].astype(np.int64)
    df['OUT_PKTS'] = df['OUT_PKTS'].astype(np.int64)
    df['IN_BYTES'] = df['IN_BYTES'].astype(np.int64)
    df['OUT_BYTES'] = df['OUT_BYTES'].astype(np.int64)
    df['DURATION_IN']= (df['DURATION_IN']/1000000).astype(np.int64)
    df['DURATION_OUT']= (df['DURATION_OUT']/1000000).astype(np.int64)
    df['Label'] = 1
    df['mask'] = 2
    return df

In [3]:
prep_NF_UQ(read_chunk('NF-UQ-NIDS-v2.csv', chunk_id=0)).columns

Index(['PROTOCOL', 'L7_PROTO', 'IN_BYTES', 'IN_PKTS', 'OUT_BYTES', 'OUT_PKTS',
       'TCP_FLAGS', 'CLIENT_TCP_FLAGS', 'SERVER_TCP_FLAGS',
       'FLOW_DURATION_MILLISECONDS', 'DURATION_IN', 'DURATION_OUT', 'MIN_TTL',
       'MAX_TTL', 'LONGEST_FLOW_PKT', 'SHORTEST_FLOW_PKT', 'MIN_IP_PKT_LEN',
       'MAX_IP_PKT_LEN', 'SRC_TO_DST_SECOND_BYTES', 'DST_TO_SRC_SECOND_BYTES',
       'RETRANSMITTED_IN_BYTES', 'RETRANSMITTED_IN_PKTS',
       'RETRANSMITTED_OUT_BYTES', 'RETRANSMITTED_OUT_PKTS',
       'SRC_TO_DST_AVG_THROUGHPUT', 'DST_TO_SRC_AVG_THROUGHPUT',
       'NUM_PKTS_UP_TO_128_BYTES', 'NUM_PKTS_128_TO_256_BYTES',
       'NUM_PKTS_256_TO_512_BYTES', 'NUM_PKTS_512_TO_1024_BYTES',
       'NUM_PKTS_1024_TO_1514_BYTES', 'TCP_WIN_MAX_IN', 'TCP_WIN_MAX_OUT',
       'ICMP_TYPE', 'ICMP_IPV4_TYPE', 'DNS_QUERY_ID', 'DNS_QUERY_TYPE',
       'DNS_TTL_ANSWER', 'FTP_COMMAND_RET_CODE', 'Label', 'mask'],
      dtype='object')

In [4]:
read_chunk('01-12/DrDoS_UDP.csv', chunk_id=0).columns

Index(['Unnamed: 0', 'Flow ID', ' Source IP', ' Source Port',
       ' Destination IP', ' Destination Port', ' Protocol', ' Timestamp',
       ' Flow Duration', ' Total Fwd Packets', ' Total Backward Packets',
       'Total Length of Fwd Packets', ' Total Length of Bwd Packets',
       ' Fwd Packet Length Max', ' Fwd Packet Length Min',
       ' Fwd Packet Length Mean', ' Fwd Packet Length Std',
       'Bwd Packet Length Max', ' Bwd Packet Length Min',
       ' Bwd Packet Length Mean', ' Bwd Packet Length Std', 'Flow Bytes/s',
       ' Flow Packets/s', ' Flow IAT Mean', ' Flow IAT Std', ' Flow IAT Max',
       ' Flow IAT Min', 'Fwd IAT Total', ' Fwd IAT Mean', ' Fwd IAT Std',
       ' Fwd IAT Max', ' Fwd IAT Min', 'Bwd IAT Total', ' Bwd IAT Mean',
       ' Bwd IAT Std', ' Bwd IAT Max', ' Bwd IAT Min', 'Fwd PSH Flags',
       ' Bwd PSH Flags', ' Fwd URG Flags', ' Bwd URG Flags',
       ' Fwd Header Length', ' Bwd Header Length', 'Fwd Packets/s',
       ' Bwd Packets/s', ' Min Packet Len

In [5]:
read_chunk('01-12/DrDoS_UDP.csv', chunk_id=0)[' Label']

0       DrDoS_UDP
1       DrDoS_UDP
2       DrDoS_UDP
3       DrDoS_UDP
4       DrDoS_UDP
          ...    
2064    DrDoS_UDP
2065    DrDoS_UDP
2066    DrDoS_UDP
2067    DrDoS_UDP
2068    DrDoS_UDP
Name:  Label, Length: 2069, dtype: object

In [6]:
prep_ddos(read_chunk('01-12/DrDoS_UDP.csv', chunk_id=0))

Unnamed: 0,PROTOCOL,FLOW_DURATION_MILLISECONDS,IN_PKTS,OUT_PKTS,IN_BYTES,OUT_BYTES,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,SimillarHTTP,Inbound,Attack,Label,mask
0,17,218395,6,0,2088,0,393.0,321.0,348.0,35.088460,...,0.0,0.0,0.0,0.0,0.0,0,1,DrDoS_UDP,1,2
1,17,108219,4,0,1398,0,369.0,330.0,349.5,22.516660,...,0.0,0.0,0.0,0.0,0.0,0,1,DrDoS_UDP,1,2
2,17,104579,4,0,1438,0,389.0,330.0,359.5,34.063666,...,0.0,0.0,0.0,0.0,0.0,0,1,DrDoS_UDP,1,2
3,17,110967,4,0,1544,0,389.0,383.0,386.0,3.464102,...,0.0,0.0,0.0,0.0,0.0,0,1,DrDoS_UDP,1,2
4,17,1,2,0,766,0,383.0,383.0,383.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0,1,DrDoS_UDP,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2064,17,1,2,0,750,0,375.0,375.0,375.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0,1,DrDoS_UDP,1,2
2065,17,1,2,0,750,0,375.0,375.0,375.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0,1,DrDoS_UDP,1,2
2066,17,1,2,0,802,0,401.0,401.0,401.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0,1,DrDoS_UDP,1,2
2067,17,2,2,0,802,0,401.0,401.0,401.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0,1,DrDoS_UDP,1,2


In [89]:
cols = prep_NF_UQ(read_chunk('NF-UQ-NIDS-v2.csv', chunk_id=0)).columns
cols = np.unique(np.concatenate((cols, prep_ddos(read_chunk('01-12/DrDoS_UDP.csv', chunk_id=0)).columns), axis=0))

def append(dataset, df):
    df[cols[np.invert(np.isin(cols, df.columns))]] = 0
    return dataset.append(df, ignore_index=True)

In [90]:
dataset = pd.DataFrame(columns=cols)

In [91]:
dataset

Unnamed: 0,ACK Flag Count,Active Max,Active Min,Active Std,Average Packet Size,Avg Bwd Segment Size,Avg Fwd Segment Size,Bwd Avg Bytes/Bulk,Bwd Avg Packets/Bulk,Bwd Header Length,...,SERVER_TCP_FLAGS,SHORTEST_FLOW_PKT,SRC_TO_DST_AVG_THROUGHPUT,SRC_TO_DST_SECOND_BYTES,SimillarHTTP,Subflow Fwd Packets,TCP_FLAGS,TCP_WIN_MAX_IN,TCP_WIN_MAX_OUT,mask


In [92]:
tmp = prep_NF_UQ(read_chunk('NF-UQ-NIDS-v2.csv', chunk_id=0))

In [93]:
append(append(dataset, tmp), prep_ddos(read_chunk('01-12/DrDoS_UDP.csv', chunk_id=0)))

  return dataset.append(df, ignore_index=True)
  return dataset.append(df, ignore_index=True)


Unnamed: 0,ACK Flag Count,Active Max,Active Min,Active Std,Average Packet Size,Avg Bwd Segment Size,Avg Fwd Segment Size,Bwd Avg Bytes/Bulk,Bwd Avg Packets/Bulk,Bwd Header Length,...,SERVER_TCP_FLAGS,SHORTEST_FLOW_PKT,SRC_TO_DST_AVG_THROUGHPUT,SRC_TO_DST_SECOND_BYTES,SimillarHTTP,Subflow Fwd Packets,TCP_FLAGS,TCP_WIN_MAX_IN,TCP_WIN_MAX_OUT,mask
0,0,0,0,0,0,0,0,0,0,0,...,0,140,1120000,140280.0,0,0,2,512,0,1
1,0,0,0,0,0,0,0,0,0,0,...,20,40,0,280.0,0,0,22,512,0,1
2,0,0,0,0,0,0,0,0,0,0,...,20,40,352000,44.0,0,0,22,1024,0,1
3,0,0,0,0,0,0,0,0,0,0,...,20,40,352000,44.0,0,0,22,1024,0,1
4,0,0,0,0,0,0,0,0,0,0,...,20,40,352000,44.0,0,0,22,1024,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7590,0,0.0,0.0,0.0,562.5,0.0,375.0,0,0,0,...,0,0,0,0.0,0,2,0,0,0,2
7591,0,0.0,0.0,0.0,562.5,0.0,375.0,0,0,0,...,0,0,0,0.0,0,2,0,0,0,2
7592,0,0.0,0.0,0.0,601.5,0.0,401.0,0,0,0,...,0,0,0,0.0,0,2,0,0,0,2
7593,0,0.0,0.0,0.0,601.5,0.0,401.0,0,0,0,...,0,0,0,0.0,0,2,0,0,0,2


In [74]:
cols[np.isin(cols, tmp.columns)]

array(['CLIENT_TCP_FLAGS', 'DNS_QUERY_ID', 'DNS_QUERY_TYPE',
       'DNS_TTL_ANSWER', 'DST_TO_SRC_AVG_THROUGHPUT',
       'DST_TO_SRC_SECOND_BYTES', 'DURATION_IN', 'DURATION_OUT',
       'FLOW_DURATION_MILLISECONDS', 'FTP_COMMAND_RET_CODE',
       'ICMP_IPV4_TYPE', 'ICMP_TYPE', 'IN_BYTES', 'IN_PKTS', 'L7_PROTO',
       'LONGEST_FLOW_PKT', 'Label', 'MAX_IP_PKT_LEN', 'MAX_TTL',
       'MIN_IP_PKT_LEN', 'MIN_TTL', 'NUM_PKTS_1024_TO_1514_BYTES',
       'NUM_PKTS_128_TO_256_BYTES', 'NUM_PKTS_256_TO_512_BYTES',
       'NUM_PKTS_512_TO_1024_BYTES', 'NUM_PKTS_UP_TO_128_BYTES',
       'OUT_BYTES', 'OUT_PKTS', 'PROTOCOL', 'RETRANSMITTED_IN_BYTES',
       'RETRANSMITTED_IN_PKTS', 'RETRANSMITTED_OUT_BYTES',
       'RETRANSMITTED_OUT_PKTS', 'SERVER_TCP_FLAGS', 'SHORTEST_FLOW_PKT',
       'SRC_TO_DST_AVG_THROUGHPUT', 'SRC_TO_DST_SECOND_BYTES',
       'TCP_FLAGS', 'TCP_WIN_MAX_IN', 'TCP_WIN_MAX_OUT', 'mask'],
      dtype=object)

In [75]:
tmp.columns.shape

(41,)

In [76]:
np.isin(cols, tmp.columns)

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
       False,  True,  True, False, False, False, False,  True,  True,
        True,  True, False, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
       False, False,  True,  True,  True,  True])

In [77]:
tmp[cols[np.invert(np.isin(cols, tmp.columns))]] = 0

In [78]:
tmp

Unnamed: 0,PROTOCOL,L7_PROTO,IN_BYTES,IN_PKTS,OUT_BYTES,OUT_PKTS,TCP_FLAGS,CLIENT_TCP_FLAGS,SERVER_TCP_FLAGS,FLOW_DURATION_MILLISECONDS,...,Bwd Packet Length Max,FIN Flag Count,Flow Bytes/s,Fwd Avg Bytes/Bulk,Fwd PSH Flags,Fwd Packets/s,Idle Mean,Init_Win_bytes_forward,SimillarHTTP,Subflow Fwd Packets
0,6,7.0,420,3,0,0,2,2,0,4293092,...,0,0,0,0,0,0,0,0,0,0
1,6,7.0,280,2,40,1,22,2,20,4294499,...,0,0,0,0,0,0,0,0,0,0
2,6,0.0,44,1,40,1,22,2,20,0,...,0,0,0,0,0,0,0,0,0,0
3,6,0.0,44,1,40,1,22,2,20,0,...,0,0,0,0,0,0,0,0,0,0
4,6,0.0,44,1,40,1,22,2,20,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5521,6,0.0,44,1,0,0,2,2,0,0,...,0,0,0,0,0,0,0,0,0,0
5522,6,0.0,44,1,0,0,2,2,0,0,...,0,0,0,0,0,0,0,0,0,0
5523,6,7.0,511,5,1147,5,219,219,27,4294931,...,0,0,0,0,0,0,0,0,0,0
5524,6,0.0,180,3,0,0,2,2,0,0,...,0,0,0,0,0,0,0,0,0,0


In [83]:
dataset.append(tmp)

  dataset.append(tmp)


Unnamed: 0,ACK Flag Count,Active Max,Active Min,Active Std,Average Packet Size,Avg Bwd Segment Size,Avg Fwd Segment Size,Bwd Avg Bytes/Bulk,Bwd Avg Packets/Bulk,Bwd Header Length,...,SERVER_TCP_FLAGS,SHORTEST_FLOW_PKT,SRC_TO_DST_AVG_THROUGHPUT,SRC_TO_DST_SECOND_BYTES,SimillarHTTP,Subflow Fwd Packets,TCP_FLAGS,TCP_WIN_MAX_IN,TCP_WIN_MAX_OUT,mask
0,0,0,0,0,0,0,0,0,0,0,...,0,140,1120000,140280.0,0,0,2,512,0,1
1,0,0,0,0,0,0,0,0,0,0,...,20,40,0,280.0,0,0,22,512,0,1
2,0,0,0,0,0,0,0,0,0,0,...,20,40,352000,44.0,0,0,22,1024,0,1
3,0,0,0,0,0,0,0,0,0,0,...,20,40,352000,44.0,0,0,22,1024,0,1
4,0,0,0,0,0,0,0,0,0,0,...,20,40,352000,44.0,0,0,22,1024,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5521,0,0,0,0,0,0,0,0,0,0,...,0,44,352000,44.0,0,0,2,1024,0,1
5522,0,0,0,0,0,0,0,0,0,0,...,0,44,352000,44.0,0,0,2,1024,0,1
5523,0,0,0,0,0,0,0,0,0,0,...,27,40,104000,511.0,0,0,219,65535,26883,1
5524,0,0,0,0,0,0,0,0,0,0,...,0,60,1440000,180.0,0,0,2,26883,0,1
