In [1]:
import configparser
import json
import math
import random

import pandas as pd

In [2]:
# Load the notebook settings from ../settings.ini.
# read_file() on an explicitly opened handle is used instead of read():
# read() silently skips files it cannot open, which would only surface later
# as a NoSectionError; open() fails immediately with the offending path.
config = configparser.RawConfigParser()
with open('../settings.ini') as settings_file:
    config.read_file(settings_file)

# Keep each section as a plain dict of raw string values.
sample_config = dict(config.items('SAMPLE'))
data_config = dict(config.items('DATA'))

# Number of CSV rows read per chunk, and the overall target size of the sample.
chunk_size = int(data_config['chunk_size'])
sample_size = int(sample_config['sample_size'])

In [3]:
# Load the label counts. The file is expected to provide a 'labels' mapping
# (label -> row count) and a 'total' row count, which are the two keys read below.
# JSON is read as UTF-8 explicitly so the result does not depend on the
# platform's default locale encoding.
with open('../metadata/label_frequency.json', 'r', encoding='utf-8') as f:
    label_freq = json.load(f)

# Fraction of all rows carried by each label.
total_rows = label_freq['total']
label_percent = {l: freq / total_rows for l, freq in label_freq['labels'].items()}
print(label_percent)

{'LDAP': 0.09404206579824474, 'NetBIOS': 0.17960139016254983, 'BENIGN': 0.002797266324650342, 'MSSQL': 0.2841928795294759, 'Portmap': 0.009180670798852417, 'Syn': 0.24019710746997536, 'UDP': 0.18989664625126293, 'UDPLag': 9.197366498850329e-05}


In [4]:
# Target number of sampled rows per label: the label's share of the data
# applied to sample_size, rounded up to a whole row.
sample_freq = dict(
    (label_name, math.ceil(share * sample_size))
    for label_name, share in label_percent.items()
)

In [5]:
# Show the per-label target row counts computed for the sample.
print(sample_freq)

{'LDAP': 95, 'NetBIOS': 180, 'BENIGN': 3, 'MSSQL': 285, 'Portmap': 10, 'Syn': 241, 'UDP': 190, 'UDPLag': 1}


In [6]:
def get_sample(filename, label, label_freq, sample_freq, chunksize=None, random_state=None):
    """Draw a proportional random sample of the rows in one CSV file that carry ``label``.

    The file is streamed in chunks. In every chunk the rows whose ``' Label'``
    column (note the leading space) equals ``label`` are selected, and
    ``ceil(matching_rows / label_freq[label] * sample_freq[label])`` of them are
    drawn at random without replacement. Because the share is rounded up per
    chunk, every chunk that contains the label contributes at least one row
    (when the target is positive), so the total returned can exceed the
    proportional target.

    Parameters
    ----------
    filename : str or path-like
        CSV file to read.
    label : str
        Value of the ``' Label'`` column to sample.
    label_freq : mapping
        ``label -> row count`` used as the denominator of each chunk's share.
    sample_freq : mapping
        ``label -> target number of sampled rows``.
    chunksize : int, optional
        Rows per chunk. Defaults to the notebook-level ``chunk_size`` setting.
    random_state : int, optional
        Seed for reproducible sampling. ``None`` (the default) keeps the
        sampling unseeded.

    Returns
    -------
    pandas.DataFrame
        The sampled rows, in file order of the chunks they came from. Empty
        (but with the file's columns) when no row carries ``label``.

    Raises
    ------
    KeyError
        If ``label`` is missing from ``label_freq`` or ``sample_freq``.
    """
    if chunksize is None:
        chunksize = chunk_size  # notebook-level default read from the settings file
    label_total = label_freq[label]
    label_target = sample_freq[label]

    # One seeded generator hands out a distinct seed per chunk, so a fixed
    # random_state is reproducible without picking the same positions in every chunk.
    seed_source = None if random_state is None else random.Random(random_state)

    pieces = []
    empty_template = None
    for df in pd.read_csv(filename, chunksize=chunksize, low_memory=False):
        if empty_template is None:
            empty_template = df.iloc[:0]  # remembers the columns for the no-match case
        labeled_df = df.loc[df[' Label'] == label]
        if labeled_df.empty:
            continue  # nothing to draw from; avoids concatenating empty frames
        percent = len(labeled_df.index) / label_total
        # Never ask for more rows than the chunk holds: sample() without
        # replacement raises ValueError in that case.
        n = min(math.ceil(percent * label_target), len(labeled_df.index))
        chunk_seed = None if seed_source is None else seed_source.randrange(2 ** 32)
        pieces.append(labeled_df.sample(n=n, random_state=chunk_seed))

    if pieces:
        # Single concat instead of growing a frame inside the loop.
        return pd.concat(pieces)
    return empty_template if empty_template is not None else pd.DataFrame()

In [7]:
# CSV files to sample from (paths are relative to the notebook's working directory).
data_files = ['./03-11/LDAP.csv', './03-11/MSSQL.csv', './03-11/NetBIOS.csv', './03-11/Portmap.csv',
              './03-11/Syn.csv', './03-11/UDP.csv', './03-11/UDPLag.csv']

# Build the sample label by label, file by file. Frames are collected in lists
# and concatenated once per level instead of growing a DataFrame inside the
# loops, which re-copies every previously gathered row on each iteration.
# NOTE: every (label, file) pair is a separate get_sample call, so each file is
# read once per label; sampling is unseeded here, so sample.csv differs between runs.
label_frames = []
for label in label_freq['labels'].keys():
    file_samples = [get_sample(file, label, label_freq['labels'], sample_freq)
                    for file in data_files]
    label_df = pd.concat(file_samples)
    label_frames.append(label_df)

# pd.concat refuses an empty list, so fall back to an empty frame when there are no labels.
sample_df = pd.concat(label_frames) if label_frames else pd.DataFrame()
sample_df.to_csv('sample.csv')

In [8]:
# Summary: number of sampled rows, the source label counts, and the per-label
# targets, one per line.
print(len(sample_df), label_freq, sample_freq, sep='\n')

1024
{'labels': {'LDAP': 1915122, 'NetBIOS': 3657497, 'BENIGN': 56965, 'MSSQL': 5787453, 'Portmap': 186960, 'Syn': 4891500, 'UDP': 3867155, 'UDPLag': 1873}, 'total': 20364525}
{'LDAP': 95, 'NetBIOS': 180, 'BENIGN': 3, 'MSSQL': 285, 'Portmap': 10, 'Syn': 241, 'UDP': 190, 'UDPLag': 1}
