In [12]:
import numpy as np
import pandas as pd 
import scipy.stats as ss
import random
import pyarrow as pa

from pyarrow import csv

In [13]:
def default_transform(df):
    return df

def split_file(in_file, train_file ,test_file, chunk_indexes, opts=csv.ConvertOptions(), create_new=True, transform=default_transform, skip_first=False):
    chunksize = 10 ** 6
    with csv.open_csv(
        in_file,
        convert_options = opts,
        read_options=csv.ReadOptions(
            use_threads=True,
            block_size=chunksize
        )) as reader:
        i = 0
        flag_train = create_new
        flag_test = create_new
        for chunk in reader:
            chunk = transform(chunk.to_pandas())
            if skip_first:
                chunk = chunk[chunk.columns[1:]]

            if i in chunk_indexes:
                print(i)
                if flag_test:
                    chunk.to_csv(test_file)
                    flag_test = False
                else:
                    chunk.to_csv(test_file, mode='a', header=False)
            else:
                if flag_train:
                    chunk.to_csv(train_file)
                    flag_train = False
                else:
                    chunk.to_csv(train_file, mode='a', header=False)
            i+=1
            if chunk is None:
                break

def get_size(in_file, opts=csv.ConvertOptions()):
    chunksize = 10 ** 6
    with csv.open_csv(
        in_file,
        convert_options = opts,
        read_options=csv.ReadOptions(
            use_threads=True,
            block_size=chunksize
        )) as reader:
        i = 0
        for chunk in reader:
            i+=1
            if chunk is None:
                break
        return i
    
def split(in_file, train_file ,test_file, opts=csv.ConvertOptions(), transform=default_transform, create_new=True, skip_first=False):
    size = get_size(in_file, opts=opts)
    test_size = int(size/15)
    indexes = list(range(0, size))
    random.shuffle(indexes)
    split_file(in_file, train_file ,test_file, indexes[:test_size], opts=opts, transform=transform, create_new=create_new, skip_first=skip_first)

In [14]:
def prep_NF_UQ(df):
    features_to_remove = ['L4_SRC_PORT', 'L4_DST_PORT', 'Attack', 'L4_SRC_PORT', 'L4_DST_PORT']
    df.drop(columns=features_to_remove, axis=1, inplace=True) 
    df['mask'] = 1
    return df

In [151]:
split('data/NFUQ/NF-UNSW-NB15-V2.csv', 'data/split/NF-UNSW-NB15-V2-TRAIN.csv', 'data/split/NF-UNSW-NB15-V2-TEST.csv', skip_first=True)

18
22
30
45
52
142
144
150
158
176
209
244
247
251
254
255
282
285
300
301
310
336


In [15]:
def prep_ddos(df):
    features_to_remove = ['Unnamed: 0', 'Flow ID', ' Source IP', ' Source Port', ' Destination IP', ' Destination Port', ' Timestamp', 'SimillarHTTP']
    df.drop(columns=features_to_remove, axis=1, inplace=True) 
    
    df = df.rename(columns=
                   {' Protocol': 'PROTOCOL', 
                   ' Total Fwd Packets': 'IN_PKTS', 
                   ' Total Backward Packets': 'OUT_PKTS',
                   'Total Length of Fwd Packets': 'IN_BYTES',
                   ' Total Length of Bwd Packets': 'OUT_BYTES',
                   'Fwd IAT Total': 'DURATION_IN',
                   'Bwd IAT Total': 'DURATION_OUT',
                   ' Label': 'Attack',
                   ' Flow Duration': 'FLOW_DURATION_MILLISECONDS',
                   }, errors="raise")
    
    df['PROTOCOL'] = df['PROTOCOL'].astype(np.int64)
    df['IN_PKTS'] = df['IN_PKTS'].astype(np.int64)
    df['OUT_PKTS'] = df['OUT_PKTS'].astype(np.int64)
    df['IN_BYTES'] = df['IN_BYTES'].astype(np.int64)
    df['OUT_BYTES'] = df['OUT_BYTES'].astype(np.int64)
    df['DURATION_IN']= (df['DURATION_IN']/1000000).astype(np.int64)
    df['DURATION_OUT']= (df['DURATION_OUT']/1000000).astype(np.int64)
    df['Label'] = 1
    df['mask'] = 2
    return df

In [16]:
from os import listdir
from os.path import isfile, join

is_first=True
files = ['data/CIC_DDOS2019/01-12','data/CIC_DDOS2019/03-11']
for file in files:
    for f in listdir(file):
        if f[0] == '.':
            continue
        filename=file+"/"+f
        print(filename)
        split(filename, 'data/split/ddos2019-TRAIN.csv', 'data/split/ddos2019-TEST.csv', transform=prep_ddos, opts=csv.ConvertOptions(column_types={"SimillarHTTP": pa.string()}), create_new=is_first)
        is_first=False

data/CIC_DDOS2019/01-12/DrDoS_MSSQL.csv
6
42
70
104
106
127
142
152
180
201
222
224
233
237
283
285
314
321
327
340
342
353
357
376
387
392
394
435
436
444
466
468
481
530
538
556
612
624
636
643
670
672
676
686
688
710
738
744
746
748
760
779
791
823
831
832
839
847
848
873
907
932
967
1055
1066
1071
1075
1080
1084
1096
1119
1128
1135
1149
1184
1228
1232
1260
1284
1312
1333
1342
1353
1370
1379
1399
1410
1420
1432
1460
1479
1484
1495
1515
1529
1546
1548
1554
1565
1593
1594
1611
1612
1620
1626
1628
1645
1646
1648
1669
1673
1681
1682
1692
1701
1746
1748
1761
1767
1792
1845
1847
1851
1859
1872
1881
data/CIC_DDOS2019/01-12/UDPLag.csv
4
6
34
39
65
93
124
141
147
151
data/CIC_DDOS2019/01-12/Syn.csv
8
32
98
105
115
129
142
157
160
171
181
185
188
193
254
268
270
275
308
321
322
330
374
403
421
434
441
446
452
475
509
512
514
523
541
578
584
588
607
618
620
634
data/CIC_DDOS2019/01-12/TFTP.csv
2
38
42
53
65
85
86
104
106
136
152
165
170
178
179
193
196
208
236
262
269
271
330
347
351
354
366
3

In [5]:
split('data/NFUQ/NF-UQ-NIDS-v2.csv', 'data/split/NF-UQ-NIDS-v2-TRAIN.csv', 'data/split/NF-UQ-NIDS-v2-TEST.csv', transform=prep_NF_UQ, skip_first=False)

6
44
72
78
98
99
105
106
123
141
152
154
163
175
179
181
182
193
198
244
260
265
289
297
312
332
334
336
344
352
357
379
410
418
426
437
438
443
448
456
459
460
466
468
479
490
495
500
517
534
548
568
570
588
589
630
640
649
664
674
679
693
694
702
755
762
763
772
793
818
827
847
854
856
861
868
876
890
908
939
942
944
975
977
982
1002
1011
1021
1029
1039
1070
1086
1092
1106
1120
1124
1147
1150
1157
1164
1183
1199
1200
1216
1222
1246
1252
1275
1298
1307
1346
1353
1370
1379
1381
1400
1415
1426
1427
1429
1436
1467
1474
1477
1481
1492
1505
1514
1519
1525
1529
1533
1534
1540
1547
1550
1552
1559
1661
1673
1674
1680
1704
1708
1739
1745
1749
1774
1776
1821
1830
1840
1856
1870
1871
1900
1902
1914
1933
1939
1944
1958
1961
1962
1968
1971
1997
2029
2037
2038
2057
2059
2060
2067
2078
2079
2086
2101
2102
2106
2108
2132
2138
2143
2150
2183
2227
2253
2261
2295
2309
2317
2327
2340
2367
2393
2399
2405
2409
2421
2422
2423
2437
2498
2517
2563
2572
2587
2612
2618
2643
2649
2672
2675
2706
2711
2717
2718
27

In [7]:
def read_chunk(name, chunk_id, opts=csv.ConvertOptions(), transform=default_transform, extra=1):
    chunk=None
    chunksize = extra*10 ** 7
    with csv.open_csv(
        name,
        convert_options = opts,
        read_options=csv.ReadOptions(
            use_threads=True,
            block_size=chunksize
        )) as reader:

        i=0
        for next_chunk in reader:
            if next_chunk is None:
                break
            chunk=next_chunk
            if i == chunk_id:
                return transform(chunk.to_pandas())
            i+=1
            
    return None

In [10]:
read_chunk('data/split/NF-UNSW-NB15-V2-TRAIN.csv', 0)

Unnamed: 0,Unnamed: 1,L4_SRC_PORT,L4_DST_PORT,PROTOCOL,L7_PROTO,IN_BYTES,IN_PKTS,OUT_BYTES,OUT_PKTS,TCP_FLAGS,...,TCP_WIN_MAX_IN,TCP_WIN_MAX_OUT,ICMP_TYPE,ICMP_IPV4_TYPE,DNS_QUERY_ID,DNS_QUERY_TYPE,DNS_TTL_ANSWER,FTP_COMMAND_RET_CODE,Label,Attack
0,0,1305,21,6,1.0,9,1,193,3,24,...,0,7240,0,0,0,0,0,331.0,0,Benign
1,1,1305,21,6,1.0,261,5,469,7,24,...,8688,8688,18944,74,0,0,0,230.0,0,Benign
2,2,1305,21,6,1.0,481,9,750,11,24,...,10136,10136,33792,132,0,0,0,229.0,0,Benign
3,3,1305,21,6,1.0,701,13,1054,15,24,...,11584,11584,48640,190,0,0,0,125.0,0,Benign
4,4,1305,21,6,1.0,1031,19,1474,21,24,...,14480,13032,64256,251,0,0,0,230.0,0,Benign
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61056,371,56298,21,6,1.0,481,9,750,11,24,...,10136,10136,33792,132,0,0,0,229.0,0,Benign
61057,372,14236,21,6,1.0,2059,37,2816,39,24,...,21720,18824,63744,249,0,0,0,125.0,0,Benign
61058,373,56298,21,6,1.0,701,13,1054,15,24,...,11584,11584,48640,190,0,0,0,125.0,0,Benign
61059,374,14236,21,6,1.0,2383,43,3234,45,24,...,24616,20272,13056,51,0,0,0,221.0,0,Benign
