In [1]:
import math
import json
import pandas as pd
import configparser

In [2]:
# Load the notebook settings from the shared INI file.
config = configparser.RawConfigParser()
# RawConfigParser.read() silently skips files it cannot open and returns the
# list of files it actually parsed, so an empty result means the settings
# file was not found. Fail here with a clear message instead of a confusing
# NoSectionError on the lookups below.
if not config.read('../settings.ini'):
    raise FileNotFoundError("settings file '../settings.ini' not found or unreadable")
sample_config = dict(config.items('SAMPLE'))
data_config = dict(config.items('DATA'))
# Number of CSV rows read per chunk when streaming the data files.
chunk_size = int(data_config['chunk_size'])
# Target size used to derive the per-label sample quotas.
sample_size = int(sample_config['sample_size'])

In [3]:
# Read the precomputed label counts; the JSON holds a 'labels' mapping
# (label -> row count) and a 'total' row count.
with open('../metadata/label_frequency.json', 'r') as freq_file:
    label_freq = json.loads(freq_file.read())

# Share of the whole dataset taken up by each label.
label_percent = dict(
    (name, count / label_freq['total'])
    for name, count in label_freq['labels'].items()
)
print(label_percent)

{'LDAP': 0.06948956580131381, 'NetBIOS': 0.17960139016254983, 'BENIGN': 0.24552499996930938, 'MSSQL': 0.21053537953868307, 'Portmap': 0.006725420799159322, 'Syn': 0.1910921074761135, 'UDP': 0.14079164625740104, 'UDPLag': 9.197366498850329e-05}


In [4]:
# Per-label row quota: each label's share of the dataset scaled to the
# configured sample size and rounded up.
sample_freq = dict(
    (name, math.ceil(share * sample_size))
    for name, share in label_percent.items()
)

In [5]:
# Show the per-label sample quotas (label -> number of rows to draw).
print(sample_freq)

{'LDAP': 13898, 'NetBIOS': 35921, 'BENIGN': 49105, 'MSSQL': 42108, 'Portmap': 1346, 'Syn': 38219, 'UDP': 28159, 'UDPLag': 19}


In [6]:
def get_sample(filename, label, label_freq, sample_freq,
               chunksize=None, random_state=None, label_col=' Label'):
    """Draw a proportional random sample of the rows in one CSV that carry ``label``.

    The file is streamed in chunks. From every chunk that contains matching
    rows, ``ceil(matching_rows / label_freq[label] * sample_freq[label])`` rows
    are drawn at random, capped at the number of matching rows in that chunk.
    Because the count is rounded up per chunk, the total returned can be
    slightly larger than the share of ``sample_freq[label]`` that this file
    would account for.

    Parameters
    ----------
    filename : str or path-like
        CSV file to sample from.
    label : str
        Value of ``label_col`` that selects the rows to sample.
    label_freq : mapping
        Label -> total number of rows with that label in the whole dataset.
    sample_freq : mapping
        Label -> number of rows wanted for that label in the final sample.
    chunksize : int, optional
        Rows per chunk. Defaults to the notebook-level ``chunk_size`` setting.
    random_state : int or numpy.random.RandomState, optional
        Seed (or generator) for reproducible draws. With ``None`` pandas'
        default, unseeded behaviour is used.
    label_col : str, optional
        Name of the label column. The default keeps the leading space that
        the original code used.

    Returns
    -------
    pandas.DataFrame
        The sampled rows, keeping the index they had when read from the file.
        When no row matches, a zero-row frame with the file's columns.

    Raises
    ------
    KeyError
        If matching rows are found but ``label`` is missing from
        ``label_freq`` or ``sample_freq``.
    """
    # Local import: numpy is only needed to build a seeded generator and the
    # notebook does not import it at the top level.
    import numpy as np

    if chunksize is None:
        chunksize = chunk_size  # notebook-level setting read from settings.ini

    # A single generator shared by all chunks, so the whole call is
    # reproducible without every chunk reusing the same seed.
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    reader = pd.read_csv(filename, chunksize=chunksize, low_memory=False)
    pieces = []
    empty_template = None
    try:
        for chunk in reader:
            if empty_template is None:
                # Zero-row slice that keeps the columns for the no-match case.
                empty_template = chunk.iloc[0:0]
            matching = chunk.loc[chunk[label_col] == label]
            available = len(matching.index)
            if available == 0:
                continue
            share = available / label_freq[label]
            # Cap at the rows actually available: sampling without replacement
            # raises ValueError when asked for more rows than exist.
            wanted = min(math.ceil(share * sample_freq[label]), available)
            if wanted == 0:
                continue
            pieces.append(matching.sample(n=wanted, random_state=rng))
    finally:
        # Release the file handle even if a chunk fails half-way through.
        reader.close()

    if not pieces:
        return pd.DataFrame() if empty_template is None else empty_template
    # One concat at the end instead of re-copying the accumulated frame on
    # every chunk.
    return pd.concat(pieces)

In [7]:
# CSV files to draw the sample from.
data_files = ['./03-11/LDAP.csv', './03-11/MSSQL.csv', './03-11/NetBIOS.csv', './03-11/Portmap.csv',
              './03-11/Syn.csv', './03-11/UDP.csv', './03-11/UDPLag.csv']

# Sample every label from every file (label-major order, as before) and
# collect the pieces in a list; concatenating once at the end avoids
# re-copying the growing frame on every iteration.
label_frames = [
    get_sample(file, label, label_freq['labels'], sample_freq)
    for label in label_freq['labels'].keys()
    for file in data_files
]
# pd.concat raises on an empty list, so fall back to an empty frame.
sample_df = pd.concat(label_frames) if label_frames else pd.DataFrame()

# NOTE(review): the index is written out as the first CSV column (to_csv
# default) and holds the row labels the frames carried from get_sample, which
# are probably not unique across files - confirm downstream readers expect that.
sample_df.to_csv('sample-200k.csv')

In [8]:
# Sanity check: number of sampled rows, followed by the dataset label counts
# and the per-label quotas for comparison.
print(sample_df.shape[0])
for summary in (label_freq, sample_freq):
    print(summary)

200048
{'labels': {'LDAP': 1415122, 'NetBIOS': 3657497, 'BENIGN': 5000000, 'MSSQL': 4287453, 'Portmap': 136960, 'Syn': 3891500, 'UDP': 2867155, 'UDPLag': 1873}, 'total': 20364525}
{'LDAP': 13898, 'NetBIOS': 35921, 'BENIGN': 49105, 'MSSQL': 42108, 'Portmap': 1346, 'Syn': 38219, 'UDP': 28159, 'UDPLag': 19}
