# Description
This notebook facilitates the generation of synthetic data through Agent-based Modeling, utilizing distributions derived from the KDD99 Cup dataset. The resulting synthetic data is exported for subsequent applications.

In [154]:
# Python
import os

# Agent based modeling
import mesa
from mesa.datacollection import DataCollector

# Data Handling
import pandas as pd
import numpy as np

# Helper functions

In [146]:
def get_distributions(directory_path):
    # Ensure the directory path ends with a '/'
    if not directory_path.endswith(os.sep):
        directory_path += os.sep

    try:
        all_files = os.listdir(directory_path)
    except FileNotFoundError:
        print(f"Directory {directory_path} not found.")
        return {}

    distributions = {}
    for file in all_files:
        if file.endswith('.parquet'):
            name = file.replace('.parquet', '')
            try:
                data = pd.read_parquet(os.path.join(directory_path, file))
            except Exception as e:
                print(f"Error reading {file}: {e}")
                continue

            var_type = 'cat' if ('service' in name or 'protocol_type' in name) else 'num'
            distributions[name] = {'data': data, 'var_type': var_type}

    return distributions

# Load distributions

## Normal

In [147]:
directory_path = f'distributions{os.sep}normal{os.sep}'
normal_agent_dist = get_distributions(directory_path)

## Smurf

In [148]:
directory_path = f'distributions{os.sep}smurf{os.sep}'
smurf_agent_dist = get_distributions(directory_path)

# Model

In [150]:
class NetworkModel(mesa.Model):
    def __init__(self):
        super().__init__()
        self.schedule = mesa.time.RandomActivation(self)

        self.datacollector = DataCollector(
            agent_reporters={
                "target": "type",
                "diff_srv_rate": "diff_srv_rate",
                "service": "service",
                "same_srv_rate": "same_srv_rate",
                "protocol_type": "protocol_type",
                "dst_host_same_src_port_rate": "dst_host_same_src_port_rate",
                "srv_count": "srv_count",
                "src_bytes": "src_bytes",
                "count": "count",
            }
        )

    def step(self):
        self.schedule.step()
        self.datacollector.collect(self)


# Agents

In [None]:
class KDDAgent(mesa.Agent):
    def __init__(self, unique_id, model, distributions, type_):
        super().__init__(unique_id, model)
        self.distributions = distributions
        self.type = type_

        # Variables
        self.conn_vars = None
        self.diff_srv_rate = None
        self.service = None
        self.same_srv_rate = None
        self.protocol_type = None
        self.dst_host_same_src_port_rate = None
        self.srv_count = None
        self.src_bytes = None
        self.count = None

    def step(self):
        self.conn_vars = self.get_conn_variables_from_dist()
        self.diff_srv_rate = self.conn_vars['diff_srv_rate']
        self.service = self.conn_vars['service']
        self.same_srv_rate = self.conn_vars['same_srv_rate']
        self.protocol_type = self.conn_vars['protocol_type']
        self.dst_host_same_src_port_rate = self.conn_vars['dst_host_same_src_port_rate']
        self.srv_count = self.conn_vars['srv_count']
        self.src_bytes = self.conn_vars['src_bytes']
        self.count = self.conn_vars['count']

    def get_conn_variables_from_dist(self):
        
        conn_variables = dict()
        
        for var in self.distributions:
        
            if self.distributions[var]['var_type'] == 'num':
                new_sample = np.random.choice(
                    self.distributions[var]['data']['values'], 
                    size=1, 
                    p=self.distributions[var]['data']['probs']
                )
                conn_variables[var] = new_sample[0]
        
            elif self.distributions[var]['var_type'] == 'cat':
                # Convert probabilities to cumulative probabilities
                temp_df = self.distributions[var]['data'].reset_index().copy()
                temp_df['cumulative_prob'] = temp_df['count'].cumsum()
                # Generate a random number between 0 and 1
                random_number = np.random.rand()
                # Find the index where the random number falls in the cumulative probabilities
                choice_index = temp_df['cumulative_prob'].searchsorted(random_number)
                # # Get the corresponding protocol type
                chosen_option = temp_df[var][choice_index]
            
                conn_variables[var] = chosen_option
                
        return conn_variables


# Instantiate Model & Agents

In [152]:
model = NetworkModel()

normal_agent = KDDAgent(unique_id=1, model=model, distributions=normal_agent_dist, type_=b'normal.')
smurf_agent = KDDAgent(unique_id=2, model=model, distributions=smurf_agent_dist, type_=b'smurf.')

model.schedule.add(normal_agent)
model.schedule.add(smurf_agent)

# Run ABM simulation & save results

In [153]:
iterations = 50000
for i in range(iterations):
    model.step()
    if i%5000==0:
        print(f'{i}/{iterations} simulations')

agent_data = model.datacollector.get_agent_vars_dataframe()
agent_data.to_parquet(f'data{os.sep}output{os.sep}simulated_data.parquet')

0/50000 simulations
5000/50000 simulations
10000/50000 simulations
15000/50000 simulations
20000/50000 simulations
25000/50000 simulations
30000/50000 simulations
35000/50000 simulations
40000/50000 simulations
45000/50000 simulations
