In [1]:
import math
import json
import pandas as pd
import configparser
from metadata.const import class_label

In [2]:
config = configparser.RawConfigParser()
config.read('../settings.ini')
data_config = dict(config.items('DATA'))
chunk_size = int(data_config['chunk_size'])

In [3]:
label_freq = {}
with open('../metadata/label_frequency.json', 'r') as f:
    label_freq = json.load(f)
print(label_freq)

{'labels': {'LDAP': 1915122, 'NetBIOS': 3657497, 'BENIGN': 56965, 'MSSQL': 5787453, 'Portmap': 186960, 'Syn': 4891500, 'UDP': 3867155, 'UDPLag': 1873}, 'total': 20364525}


In [4]:
sample_config = {}
with open('../metadata/sample_config.json', 'r') as f:
    sample_config = json.load(f)
print(sample_config)

{'labels': {'LDAP': 0.06, 'NetBIOS': 0.12, 'BENIGN': 0.3, 'MSSQL': 0.19, 'Portmap': 0.006, 'Syn': 0.16, 'UDP': 0.13, 'UDPLag': 6.43815655e-05}, 'total': 100000}


In [5]:
def get_sample(filename, label, label_freq, sample_freq):
    all_df = pd.read_csv(filename, chunksize=chunk_size, low_memory=False)
    final_sample = pd.DataFrame()
    for df in all_df:
        labeled_df = df.loc[df[class_label] == label]
        percent = len(labeled_df.index) / label_freq
        sample_df = labeled_df.sample(n=math.ceil(percent * sample_freq))
        final_sample = pd.concat([final_sample, sample_df])
    return final_sample

In [None]:
data_files = ['./03-11/LDAP.csv', './03-11/MSSQL.csv', './03-11/NetBIOS.csv', './03-11/Portmap.csv',
              './03-11/Syn.csv', './03-11/UDP.csv', './03-11/UDPLag.csv']
sample_df = pd.DataFrame()
for l in label_freq['labels'].keys():
    label_df = pd.DataFrame()
    for file in data_files:
        l_sample_freq = min(label_freq['labels'][l], sample_config['total'] * sample_config['labels'][l])
        df = get_sample(file, l, label_freq['labels'][l], l_sample_freq)
        label_df = pd.concat([label_df, df])
    sample_df = pd.concat([sample_df, label_df])
sample_df.to_csv(f"sample-{sample_config['total']}.csv", index=False)

In [7]:
print(len(sample_df.index))

198060
