In [1]:
import pandas as pd
from tqdm import tqdm
from pyarrow import csv
import pyarrow as pa

import numpy as np

In [6]:
def default_transform(df):
    """Identity transform: return the given frame untouched.

    Used as the default ``transform`` / ``process`` callback of the chunk
    helpers in this notebook, for when no per-dataset preparation is wanted.
    """
    return df

def read_chunk(name, chunk_id, opts=csv.ConvertOptions(), transform=default_transform, extra=1):
    """Return a single chunk of a CSV file as a (transformed) pandas DataFrame.

    The file is streamed with ``pyarrow.csv.open_csv``; chunks that precede
    ``chunk_id`` are read and discarded, and only the requested chunk is
    converted to pandas.

    Parameters
    ----------
    name : str
        Path of the CSV file to read.
    chunk_id : int
        Zero-based index of the chunk to return.
    opts : pyarrow.csv.ConvertOptions
        Conversion options forwarded to the reader.
    transform : callable
        Applied to the chunk's DataFrame; its result is returned.
    extra : int
        Multiplier on the base ``block_size`` of 10**6. ``block_size`` is a
        size in bytes, not a row count, so the number of rows per chunk
        depends on how wide the rows are.

    Returns
    -------
    The value of ``transform(chunk_dataframe)``, or ``None`` when the file
    holds fewer than ``chunk_id + 1`` chunks (this includes negative ids).
    """
    chunksize = extra * 10 ** 6
    read_opts = csv.ReadOptions(use_threads=True, block_size=chunksize)

    with csv.open_csv(name, convert_options=opts, read_options=read_opts) as reader:
        # Iterating the reader ends by exhaustion and never yields None, so no
        # explicit sentinel check is needed.
        for i, chunk in enumerate(reader):
            if i == chunk_id:
                return transform(chunk.to_pandas())

    # Ran out of chunks before reaching chunk_id.
    return None

def prep_NF_UQ(df):
    """Keep only the columns that go into the merged dataset.

    The result has exactly these columns, in this order: PROTOCOL, IN_PKTS,
    OUT_PKTS, IN_BYTES, OUT_BYTES, DURATION_IN, DURATION_OUT, Label.
    Any other column of ``df`` is dropped; pandas raises ``KeyError`` if one
    of the listed columns is missing.
    """
    keep = (
        'PROTOCOL',
        'IN_PKTS', 'OUT_PKTS',
        'IN_BYTES', 'OUT_BYTES',
        'DURATION_IN', 'DURATION_OUT',
        'Label',
    )
    return df[list(keep)]

def prep_ddos(df):
    """Reshape a chunk of the DDoS CSV files into the merged-dataset schema.

    Selects eight source columns (several of their names start with a space),
    renames them to PROTOCOL, IN_PKTS, OUT_PKTS, IN_BYTES, OUT_BYTES,
    DURATION_IN, DURATION_OUT and Label, casts the numeric ones to int64 and
    returns a new frame; the input frame is not modified.

    Raises ``KeyError`` if a source column is missing, and whatever pandas
    raises when a value cannot be cast to int64 (e.g. NaN).
    """
    # Source column -> merged column. Insertion order is the output order.
    column_map = {
        ' Protocol': 'PROTOCOL',
        ' Total Fwd Packets': 'IN_PKTS',
        ' Total Backward Packets': 'OUT_PKTS',
        'Total Length of Fwd Packets': 'IN_BYTES',
        ' Total Length of Bwd Packets': 'OUT_BYTES',
        'Fwd IAT Total': 'DURATION_IN',
        'Bwd IAT Total': 'DURATION_OUT',
        ' Label': 'Label',
    }
    renamed = df[list(column_map)].rename(columns=column_map, errors="raise")

    count_cols = ('PROTOCOL', 'IN_PKTS', 'OUT_PKTS', 'IN_BYTES', 'OUT_BYTES')
    typed = renamed.astype({col: np.int64 for col in count_cols})

    # Scale by 1e6 and truncate to whole units.
    # NOTE(review): presumably microseconds -> seconds; confirm source units.
    for col in ('DURATION_IN', 'DURATION_OUT'):
        typed[col] = (typed[col] / 1000000).astype(np.int64)

    # The original label values are discarded and every row is marked 1.
    # NOTE(review): if these files also contain non-attack rows they are
    # labelled 1 as well -- confirm that this is intended.
    result = typed.drop(columns=['Label'])
    result['Label'] = 1
    return result

In [3]:
def process_file(file, outname, opts=csv.ConvertOptions(), create_new=True, process=default_transform):
    """Stream a CSV file chunk by chunk through ``process`` into ``outname``.

    Each chunk is converted to pandas, passed to ``process`` and written out
    with ``DataFrame.to_csv``. Progress is reported with tqdm.

    Parameters
    ----------
    file : str
        Path of the input CSV.
    outname : str
        Path of the output CSV.
    opts : pyarrow.csv.ConvertOptions
        Conversion options forwarded to the reader.
    create_new : bool
        When true, the first chunk overwrites ``outname`` and writes a header
        row. When false, every chunk (including the first) is appended
        without a header.
    process : callable
        Maps each chunk's DataFrame to the DataFrame that is written.

    Notes
    -----
    ``to_csv`` is called without ``index=False``, so the frame's index is
    written as the first column of every row. If the input yields no chunks,
    nothing is written and ``outname`` is left as it was.
    """
    chunksize = 10 ** 6  # reader block size, in bytes
    read_opts = csv.ReadOptions(use_threads=True, block_size=chunksize)

    with csv.open_csv(file, convert_options=opts, read_options=read_opts) as reader:
        # Iterating the reader ends by exhaustion and never yields None, so no
        # explicit sentinel check is needed.
        for i, chunk in enumerate(tqdm(reader)):
            frame = process(chunk.to_pandas())
            if i == 0 and create_new:
                frame.to_csv(outname)
            else:
                frame.to_csv(outname, mode='a', header=False)

In [4]:
# Start the merged file from the NF-UQ-NIDS-v2 flows. create_new is left at
# its default (True), so merged-ordered.csv is overwritten and gets a header.
process_file(
    'NF-UQ-NIDS-v2.csv',
    outname='merged-ordered.csv',
    process=prep_NF_UQ,
)

13730it [05:20, 42.91it/s]


In [7]:
from os import listdir
from os.path import isfile, join

# Append every CSV found in the two DDoS directories to merged-ordered.csv.
# create_new=False makes process_file append without writing another header.
files = ['./01-12','./03-11']
for file in files:
    # NOTE: os.listdir returns entries in arbitrary order, so the row order of
    # the merged file depends on the filesystem and is not reproducible.
    for f in listdir(file):
        # Skip hidden entries such as .DS_Store / .ipynb_checkpoints.
        if f.startswith('.'):
            continue
        filename = join(file, f)
        # Only regular files can be opened as CSV; skip sub-directories.
        if not isfile(filename):
            continue
        print(filename)
        process_file(filename,
                     outname='merged-ordered.csv',
                     process=prep_ddos,
                     create_new=False,
                     # Force this column to string instead of letting pyarrow infer its type.
                     opts=csv.ConvertOptions(column_types={"SimillarHTTP": pa.string()}))


./01-12/DrDoS_MSSQL.csv


1890it [00:36, 51.53it/s]


./01-12/UDPLag.csv


158it [00:02, 52.95it/s]


./01-12/Syn.csv


638it [00:12, 51.55it/s]


./01-12/TFTP.csv


9303it [02:50, 54.58it/s]


./01-12/DrDoS_DNS.csv


2134it [00:41, 51.76it/s]


./01-12/DrDoS_LDAP.csv


918it [00:17, 52.00it/s]


./01-12/DrDoS_NTP.csv


646it [00:11, 55.94it/s]


./01-12/DrDoS_SNMP.csv


2173it [00:41, 52.39it/s]


./01-12/DrDoS_UDP.csv


1507it [00:27, 54.01it/s]


./01-12/DrDoS_NetBIOS.csv


1698it [00:32, 52.04it/s]


./01-12/DrDoS_SSDP.csv


1253it [00:23, 54.17it/s]


./03-11/UDPLag.csv


320it [00:06, 52.20it/s]


./03-11/Portmap.csv


79it [00:01, 52.44it/s]


./03-11/LDAP.csv


872it [00:16, 52.12it/s]


./03-11/NetBIOS.csv


1419it [00:27, 51.79it/s]


./03-11/MSSQL.csv


2387it [00:46, 51.23it/s]


./03-11/Syn.csv


1878it [00:36, 51.43it/s]


./03-11/UDP.csv


1793it [00:33, 53.61it/s]


In [8]:
# Preview one chunk of the merged file.
# NOTE(review): the recorded run shows no output for this cell, i.e. the call
# returned None. The row-count cell further down iterates only 3613 chunks of
# this file at the same block size, so index 15000 is past the last chunk;
# use a chunk_id below the chunk count to actually display data.
read_chunk(
    'merged-ordered.csv',
    chunk_id=15000,
)

In [9]:

# Count the data rows of merged-ordered.csv by streaming it chunk by chunk.
chunksize = 1*10 ** 6  # reader block size, in bytes
with csv.open_csv(
    "merged-ordered.csv",
    read_options=csv.ReadOptions(
        use_threads=True,
        block_size=chunksize
    )) as reader:

    size = 0
    for next_chunk in tqdm(reader):
        # Each batch already knows its row count; converting it to pandas
        # just to call len() would copy the data for nothing.
        size += next_chunk.num_rows

3613it [00:51, 70.57it/s]


In [10]:
# Total number of rows summed over all chunks of merged-ordered.csv.
size

146415613