In [1]:
import pyarrow as pa
import numpy as np
import pandas as pd

from pyarrow import csv
from os import listdir
from os.path import isfile, join

In [11]:
def default_transform(df):
    return df

def read_chunk(name, chunk_id, opts=csv.ConvertOptions(), transform=default_transform, extra=1):
    chunk=None
    chunksize = extra*10 ** 7
    with csv.open_csv(
        name,
        convert_options = opts,
        read_options=csv.ReadOptions(
            use_threads=True,
            block_size=chunksize
        )) as reader:

        i=0
        for next_chunk in reader:
            if next_chunk is None:
                break
            chunk=next_chunk
            if i == chunk_id:
                return transform(chunk.to_pandas())
            i+=1
            
    return None

def prep_NF_UQ(df):
    features_to_remove = ['L4_SRC_PORT', 'L4_DST_PORT', 'Attack', 'L4_SRC_PORT', 'L4_DST_PORT']
    df.drop(columns=features_to_remove, axis=1, inplace=True) 
    df['mask'] = 1
    return df

def prep_ddos(df):
    features_to_remove = ['Unnamed: 0', 'Flow ID', ' Source IP', ' Source Port', ' Destination IP', ' Destination Port', ' Timestamp']
    df.drop(columns=features_to_remove, axis=1, inplace=True) 
    
    df = df.rename(columns=
                   {' Protocol': 'PROTOCOL', 
                   ' Total Fwd Packets': 'IN_PKTS', 
                   ' Total Backward Packets': 'OUT_PKTS',
                   'Total Length of Fwd Packets': 'IN_BYTES',
                   ' Total Length of Bwd Packets': 'OUT_BYTES',
                   'Fwd IAT Total': 'DURATION_IN',
                   'Bwd IAT Total': 'DURATION_OUT',
                   ' Label': 'Attack',
                   ' Flow Duration': 'FLOW_DURATION_MILLISECONDS',
                   }, errors="raise")
    
    df['PROTOCOL'] = df['PROTOCOL'].astype(np.int64)
    df['IN_PKTS'] = df['IN_PKTS'].astype(np.int64)
    df['OUT_PKTS'] = df['OUT_PKTS'].astype(np.int64)
    df['IN_BYTES'] = df['IN_BYTES'].astype(np.int64)
    df['OUT_BYTES'] = df['OUT_BYTES'].astype(np.int64)
    df['DURATION_IN']= (df['DURATION_IN']/1000000).astype(np.int64)
    df['DURATION_OUT']= (df['DURATION_OUT']/1000000).astype(np.int64)
    df['Label'] = 1
    df['mask'] = 2
    return df

In [12]:
def append_dataset(dataset, df):
    df[cols[np.invert(np.isin(cols, df.columns))]] = 0
    return dataset.append(df, ignore_index=True)

In [13]:
cols = prep_NF_UQ(pd.read_parquet('data/NFUQ/NF-UNSW-NB15-V2.parquet')).columns
cols = np.unique(np.concatenate((cols, prep_ddos(read_chunk('data/CIC_DDOS2019/01-12/DrDoS_UDP.csv', chunk_id=0)).columns), axis=0))

In [14]:
valid_dataset = pd.DataFrame(columns=cols)

In [15]:
valid_dataset

Unnamed: 0,ACK Flag Count,Active Max,Active Min,Active Std,Average Packet Size,Avg Bwd Segment Size,Avg Fwd Segment Size,Bwd Avg Bytes/Bulk,Bwd Avg Packets/Bulk,Bwd Header Length,...,SERVER_TCP_FLAGS,SHORTEST_FLOW_PKT,SRC_TO_DST_AVG_THROUGHPUT,SRC_TO_DST_SECOND_BYTES,SimillarHTTP,Subflow Fwd Packets,TCP_FLAGS,TCP_WIN_MAX_IN,TCP_WIN_MAX_OUT,mask


In [10]:
df = prep_NF_UQ(pd.read_parquet('data/NFUQ/NF-UNSW-NB15-V2.parquet')).sample(frac=0.05)
valid_dataset = append_dataset(valid_dataset, df)
del df

valid_dataset

valid_dataset.to_csv('data/validation-set/nfuq-tiny.csv')

  return dataset.append(df, ignore_index=True)


In [8]:
files = ['data/CIC_DDOS2019/01-12','data/CIC_DDOS2019/03-11']
for file in files:
    for f in listdir(file):
        if f[0] == '.':
            continue
        filename=file+"/"+f
        valid_dataset = append_dataset(valid_dataset, prep_ddos(read_chunk(filename, 0, extra=1)).sample(frac=0.2))

  return dataset.append(df, ignore_index=True)
  return dataset.append(df, ignore_index=True)
  return dataset.append(df, ignore_index=True)
  return dataset.append(df, ignore_index=True)
  return dataset.append(df, ignore_index=True)
  return dataset.append(df, ignore_index=True)
  return dataset.append(df, ignore_index=True)
  return dataset.append(df, ignore_index=True)
  return dataset.append(df, ignore_index=True)
  return dataset.append(df, ignore_index=True)
  return dataset.append(df, ignore_index=True)
  return dataset.append(df, ignore_index=True)
  return dataset.append(df, ignore_index=True)
  return dataset.append(df, ignore_index=True)
  return dataset.append(df, ignore_index=True)
  return dataset.append(df, ignore_index=True)
  return dataset.append(df, ignore_index=True)
  return dataset.append(df, ignore_index=True)


In [9]:
valid_dataset

Unnamed: 0,ACK Flag Count,Active Max,Active Min,Active Std,Average Packet Size,Avg Bwd Segment Size,Avg Fwd Segment Size,Bwd Avg Bytes/Bulk,Bwd Avg Packets/Bulk,Bwd Header Length,...,SERVER_TCP_FLAGS,SHORTEST_FLOW_PKT,SRC_TO_DST_AVG_THROUGHPUT,SRC_TO_DST_SECOND_BYTES,SimillarHTTP,Subflow Fwd Packets,TCP_FLAGS,TCP_WIN_MAX_IN,TCP_WIN_MAX_OUT,mask
0,0,0,0,0,0,0,0,0,0,0,...,27,52,11360000,24156.0,0,0,27,65160,56472,1
1,0,0,0,0,0,0,0,0,0,0,...,27,52,13472000,1684.0,0,0,27,11584,8688,1
2,0,0,0,0,0,0,0,0,0,0,...,27,52,19504000,2438.0,0,0,27,23168,14480,1
3,0,0,0,0,0,0,0,0,0,0,...,0,52,4352000,544.0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,24,52,5576000,928.0,0,0,24,10136,10136,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
181038,0,0.0,0.0,0.0,750.0,0.0,500.0,0,0,0,...,0,0,0,0.0,0,2,0,0,0,2
181039,0,0.0,0.0,0.0,601.5,0.0,401.0,0,0,0,...,0,0,0,0.0,0,2,0,0,0,2
181040,0,0.0,0.0,0.0,921.0,0.0,614.0,0,0,0,...,0,0,0,0.0,0,2,0,0,0,2
181041,0,0.0,0.0,0.0,610.5,0.0,407.0,0,0,0,...,0,0,0,0.0,0,2,0,0,0,2


In [16]:
benign = pd.read_csv('data/synthetic/benign.csv')[['IN_PACKETS', 'IN_OCTETS', 'TCP_FLAGS', 'PROTO']]
benign = benign.rename(columns={'IN_PACKETS': 'IN_PKTS', 'IN_OCTETS': 'IN_BYTES', 'TCP_FLAGS': 'TCP_FLAGS', 'PROTO': 'PROTOCOL'})
benign['Label'] = 0

dos = pd.read_csv('data/synthetic/dos.csv')[['IN_PACKETS', 'IN_OCTETS', 'TCP_FLAGS', 'PROTO']]
dos = dos.rename(columns={'IN_PACKETS': 'IN_PKTS', 'IN_OCTETS': 'IN_BYTES', 'TCP_FLAGS': 'TCP_FLAGS', 'PROTO': 'PROTOCOL'})
dos['Label'] = 1
dos['Attack'] = 1

In [17]:
valid_dataset = append_dataset(valid_dataset, benign)
del benign
valid_dataset = append_dataset(valid_dataset, dos)
del dos

  df[cols[np.invert(np.isin(cols, df.columns))]] = 0
  df[cols[np.invert(np.isin(cols, df.columns))]] = 0
  df[cols[np.invert(np.isin(cols, df.columns))]] = 0
  df[cols[np.invert(np.isin(cols, df.columns))]] = 0
  df[cols[np.invert(np.isin(cols, df.columns))]] = 0
  df[cols[np.invert(np.isin(cols, df.columns))]] = 0
  df[cols[np.invert(np.isin(cols, df.columns))]] = 0
  df[cols[np.invert(np.isin(cols, df.columns))]] = 0
  df[cols[np.invert(np.isin(cols, df.columns))]] = 0
  df[cols[np.invert(np.isin(cols, df.columns))]] = 0
  df[cols[np.invert(np.isin(cols, df.columns))]] = 0
  return dataset.append(df, ignore_index=True)
  df[cols[np.invert(np.isin(cols, df.columns))]] = 0
  df[cols[np.invert(np.isin(cols, df.columns))]] = 0
  df[cols[np.invert(np.isin(cols, df.columns))]] = 0
  df[cols[np.invert(np.isin(cols, df.columns))]] = 0
  df[cols[np.invert(np.isin(cols, df.columns))]] = 0
  df[cols[np.invert(np.isin(cols, df.columns))]] = 0
  df[cols[np.invert(np.isin(cols, df.columns))]] = 0

In [18]:
valid_dataset

Unnamed: 0,ACK Flag Count,Active Max,Active Min,Active Std,Average Packet Size,Avg Bwd Segment Size,Avg Fwd Segment Size,Bwd Avg Bytes/Bulk,Bwd Avg Packets/Bulk,Bwd Header Length,...,SERVER_TCP_FLAGS,SHORTEST_FLOW_PKT,SRC_TO_DST_AVG_THROUGHPUT,SRC_TO_DST_SECOND_BYTES,SimillarHTTP,Subflow Fwd Packets,TCP_FLAGS,TCP_WIN_MAX_IN,TCP_WIN_MAX_OUT,mask
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,24,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,18,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,2,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11080,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11081,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11082,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11083,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
valid_dataset.to_csv('data/validation-set/nfuq-ddos-tiny.csv')