In [1]:
import json
import pandas as pd
import configparser
from metadata.const import class_label

In [2]:
# Read the notebook settings from the INI file one directory above the notebook.
config = configparser.RawConfigParser()
config.read('../settings.ini')

# Snapshot the [DATA] section as a plain dict of option name -> raw string value.
data_config = {option: value for option, value in config.items('DATA')}

# Number of CSV rows handed to pandas per chunk when files are streamed below.
chunk_size = int(data_config['chunk_size'])

In [3]:
def get_count_rows(filename):
    """Return the number of data rows in a CSV file.

    The file is streamed in chunks of ``chunk_size`` rows (module-level
    setting) so it never has to be loaded into memory in one piece.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        The sum of the row counts of all chunks.
    """
    reader = pd.read_csv(filename, chunksize=chunk_size, low_memory=False)
    return sum(len(chunk.index) for chunk in reader)

In [4]:
def join_label_frequency(labels1, labels2):
    """Merge two label -> count mappings into a new dict.

    Counts of labels present in both mappings are added together; labels
    present in only one mapping are carried over unchanged. Neither input
    mapping is modified.

    Args:
        labels1: First mapping of label to count.
        labels2: Second mapping of label to count.

    Returns:
        A new dict holding the combined counts, with the keys of
        ``labels1`` first, followed by keys seen only in ``labels2``.
    """
    # Start from a shallow copy so the caller's first mapping stays untouched.
    merged = dict(labels1)
    for label, count in labels2.items():
        if label in merged:
            merged[label] += count
        else:
            merged[label] = count
    return merged

In [5]:
def get_label_frequency(filename):
    """Count how often each value of the ``class_label`` column occurs in a CSV.

    The file is streamed in chunks of ``chunk_size`` rows; the value counts
    of every chunk are folded into a running total with
    ``join_label_frequency``.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A dict mapping each label value to its number of occurrences.
    """
    totals = {}
    for chunk in pd.read_csv(filename, chunksize=chunk_size, low_memory=False):
        per_chunk = chunk[class_label].value_counts().to_dict()
        totals = join_label_frequency(totals, per_chunk)
    return totals

In [6]:
# The seven CSV files that make up the full dataset under ../dataset/03-11.
data_files = [
    '../dataset/03-11/LDAP.csv',
    '../dataset/03-11/MSSQL.csv',
    '../dataset/03-11/NetBIOS.csv',
    '../dataset/03-11/Portmap.csv',
    '../dataset/03-11/Syn.csv',
    '../dataset/03-11/UDP.csv',
    '../dataset/03-11/UDPLag.csv',
]

# Accumulate the per-file label counts into a single label -> count dict.
labels = {}
for file in data_files:
    file_labels = get_label_frequency(file)
    labels = join_label_frequency(labels, file_labels)
print(labels)

{'LDAP': 1915122, 'NetBIOS': 3657497, 'BENIGN': 56965, 'MSSQL': 5787453, 'Portmap': 186960, 'Syn': 4891500, 'UDP': 3867155, 'UDPLag': 1873}


In [7]:
# Total number of rows across every file listed in `data_files`.
count_rows = sum(get_count_rows(file) for file in data_files)
print(count_rows)

20364525


In [6]:
def display_as_json(labels_freq, count_rows, filename):
    """Write label frequencies and the total row count to a JSON file.

    Despite its name, this function prints nothing: it writes an object
    with the keys ``"labels"`` and ``"total"`` to ``filename``, indented
    by four spaces. An existing file at that path is overwritten.

    Args:
        labels_freq: Mapping of label to count, stored under ``"labels"``.
        count_rows: Total row count, stored under ``"total"``.
        filename: Path of the JSON file to write.
    """
    payload = {"labels": labels_freq, "total": count_rows}
    with open(filename, "w") as out_file:
        json.dump(payload, out_file, indent=4)

In [7]:
# Persist the full-dataset statistics. This relies on the globals `labels` and
# `count_rows` computed by the two aggregation cells above, so those cells must
# have been run in the current kernel session first.
display_as_json(
    labels_freq=labels,
    count_rows=count_rows,
    filename='label_frequency.json',
)

NameError: name 'labels' is not defined

In [8]:
# Compute the same statistics for the sample CSV and store them in their own JSON file.
sample_file = '../dataset/clean-sample-200000.csv'

sample_labels = get_label_frequency(sample_file)
sample_count_rows = get_count_rows(sample_file)

display_as_json(
    labels_freq=sample_labels,
    count_rows=sample_count_rows,
    filename='sample_label_frequency.json',
)