# Generating Multiple Benchmark Flow Traffic Sets

In this example, we write a script which will generate multiple benchmark traffic sets in a loop and save them in .pickle format. We will assume we are generating traffic for a `TrafPy` fat tree topology, although of course you can generate traffic for any arbitrary topology defined outside of `TrafPy` (see documentation and other examples).

We will generate the rack distribution sensitivity benchmark data set for loads 0.1-0.5.

In [1]:
import trafpy.generator as tpg
from trafpy.benchmarker import BenchmarkImporter

import numpy as np
import time
import os
from collections import defaultdict # use for initialising arbitrary length nested dict
from sqlitedict import SqliteDict
import json
from pathlib import Path
import gzip
import pickle

## 1. Define Generation Configuration

If you were writing this in a script rather than a Jupyter Notebook, you may want to e.g. put this next cell in a `config.py` file and import the file into a separate script for conciseness.

In [2]:
# -------------------------------------------------------------------------
# general configuration
# -------------------------------------------------------------------------
# define benchmark version
BENCHMARK_VERSION = 'v001'

# define minimum number of demands to generate (may generate more to meet jensen_shannon_distance_threshold and/or min_last_demand_arrival_time)
MIN_NUM_DEMANDS = None
MAX_NUM_DEMANDS = 5000 

# define maximum allowed Jenson-Shannon distance for flow size and interarrival time distributions (lower value -> distributions must be more similar -> higher number of demands will be generated) (must be between 0 and 1)
JENSEN_SHANNON_DISTANCE_THRESHOLD = 0.3

# define minimum time of last demand's arrival (helps define minimum simulation time)
MIN_LAST_DEMAND_ARRIVAL_TIME = None 

# define network load fractions
LOADS = [round(load, 3) for load in np.arange(0.1, 0.4, 0.1).tolist()] # ensure no python floating point arithmetic errors

# define whether or not to TrafPy packer should auto correct invalid node distribution(s)
AUTO_NODE_DIST_CORRECTION = True

# slot size (if None, won't generate slots_dict database)
# SLOT_SIZE = None 
SLOT_SIZE = 1000.0 # 50.0 1000.0 10.0


# -------------------------------------------------------------------------
# benchmark-specific configuration
# -------------------------------------------------------------------------

BENCHMARKS = ['rack_sensitivity_0',
              'rack_sensitivity_02',
              'rack_sensitivity_04',
              'rack_sensitivity_06',
              'rack_sensitivity_08']

# define network topology for each benchmark
net = tpg.gen_fat_tree(k=4, 
                       L=2, 
                       n=8, 
                       num_channels=1, 
                       server_to_rack_channel_capacity=1250, # 1250
                       rack_to_edge_channel_capacity=1000, 
                       edge_to_agg_channel_capacity=1000, 
                       agg_to_core_channel_capacity=2000)
NETS = {benchmark: net for benchmark in BENCHMARKS}

# define network capacity for each benchmark
NETWORK_CAPACITIES = {benchmark: net.graph['max_nw_capacity'] for benchmark in BENCHMARKS}
NETWORK_EP_LINK_CAPACITIES = {benchmark: net.graph['ep_link_capacity'] for benchmark in BENCHMARKS}

# define network racks for each benchmark
RACKS_DICTS = {benchmark: net.graph['rack_to_ep_dict'] for benchmark in BENCHMARKS}

## 2. Write a Function to Generate the Benchmark Traffic

This function should use the above configuration variables to generate traffic for each of our benchmarks as required.

In [3]:
def gen_benchmark_demands(path_to_save=None, 
                          load_prev_dists=True,
                          overwrite=False):
    '''
    If slot size is not None, will also generate an sqlite database for the slots_dict
    dictionary. This is useful if later during simulations want to have pre-computed
    slots_dict rather than computing & storing them in memory.

    '''
    if path_to_save[-1] == '/' or path_to_save[-1] == '\\':
        path_to_save = path_to_save[:-1]

    # init benchmark importer
    importer = BenchmarkImporter(BENCHMARK_VERSION, load_prev_dists=load_prev_dists)

    # load distributions for each benchmark
    benchmark_dists = {benchmark: {} for benchmark in BENCHMARKS}

    nested_dict = lambda: defaultdict(nested_dict)
    benchmark_demands = nested_dict()

    # begin generating data for each benchmark
    num_loads = len(LOADS)
    start_loops = time.time()
    print('\n~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*')
    print('Benchmarks to Generate: {}'.format(BENCHMARKS))
    print('Loads to generate: {}'.format(LOADS))
    for benchmark in BENCHMARKS:
        print('~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*')
        print('Generating demands for benchmark \'{}\'...'.format(benchmark))
        
        # get racks and endpoints
        racks_dict = RACKS_DICTS[benchmark]
        if racks_dict is not None:
            eps_racks_list = [eps for eps in racks_dict.values()]
            eps = []
            for rack in eps_racks_list:
                for ep in rack:
                    eps.append(ep)
        else:
            eps = NETS[benchmark].graph['endpoints']

        start_benchmark = time.time()
        load_counter = 1
        benchmark_dists[benchmark] = importer.get_benchmark_dists(benchmark, eps, racks_dict=racks_dict)
        for load in LOADS:
            start_load = time.time()
            network_load_config = {'network_rate_capacity': NETWORK_CAPACITIES[benchmark], 
                                   'ep_link_capacity': NETWORK_EP_LINK_CAPACITIES[benchmark],
                                   'target_load_fraction': load,
                                   'disable_timeouts': True}
            print('Generating demand data for benchmark {} load {}...'.format(benchmark, load))
            if benchmark_dists[benchmark]['num_ops_dist'] is not None:
                # job-centric
                use_multiprocessing = True
            else:
                # flow-centric
                use_multiprocessing = False
            demand_data = tpg.create_demand_data(min_num_demands=MIN_NUM_DEMANDS,
                                             max_num_demands=MAX_NUM_DEMANDS, 
                                             eps=eps,
                                             node_dist=benchmark_dists[benchmark]['node_dist'],
                                             flow_size_dist=benchmark_dists[benchmark]['flow_size_dist'],
                                             interarrival_time_dist=benchmark_dists[benchmark]['interarrival_time_dist'],
                                             num_ops_dist=benchmark_dists[benchmark]['num_ops_dist'],
                                             c=3,
                                             jensen_shannon_distance_threshold=JENSEN_SHANNON_DISTANCE_THRESHOLD,
                                             network_load_config=network_load_config,
                                             min_last_demand_arrival_time=MIN_LAST_DEMAND_ARRIVAL_TIME,
                                             auto_node_dist_correction=AUTO_NODE_DIST_CORRECTION,
                                             use_multiprocessing=use_multiprocessing,
                                             print_data=False)

            file_path = path_to_save + '/benchmark_{}_load_{}'.format(benchmark, load)
            tpg.pickle_data(path_to_save=file_path, data=demand_data, overwrite=overwrite)

            # reset benchmark demands dict to save memory
            benchmark_demands = nested_dict()

            if SLOT_SIZE is not None:
                # generate slots dict and save as database
                print('Creating slots_dict database with slot_size {}...'.format(SLOT_SIZE))
                s = time.time()
                demand = tpg.Demand(demand_data, eps=eps)
                with SqliteDict(file_path+'_slotsize_{}_slots_dict.sqlite'.format(SLOT_SIZE)) as slots_dict:
                    for key, val in demand.get_slots_dict(slot_size=SLOT_SIZE, include_empty_slots=True, print_info=True).items():
                        if type(key) is not str:
                            slots_dict[json.dumps(key)] = val
                        else:
                            slots_dict[key] = val
                    slots_dict.commit()
                    slots_dict.close()
                e = time.time()
                print('Created slots_dict database in {} s'.format(e-s))
            else:
                pass

            end_load = time.time()
            print('Generated \'{}\' demands for load {} of {} in {} seconds.'.format(benchmark, load_counter, num_loads, end_load-start_load))
            load_counter += 1

        end_benchmark = time.time()
        print('Generated demands for benchmark \'{}\' in {} seconds.'.format(benchmark, end_benchmark-start_benchmark))

    end_loops = time.time()
    print('Generated all benchmarks in {} seconds.'.format(end_loops-start_loops))

    return benchmark_demands


## 3. Generate the Benchmark Traffic

We will generate each of our traffic sets 2x to enable us to run 2 repeat experiments for each set

In [4]:
for _set in range(2):
    path_to_save = '../data/generate_multiple_benchmark_traffic_sets/set_{}_benchmark_data'.format(_set)
    Path(path_to_save).mkdir(exist_ok=True, parents=True)
    benchmark_demands = gen_benchmark_demands(path_to_save=path_to_save,
                                              load_prev_dists=False,
                                              overwrite=False)

load_prev_dist=False. Will re-generate dists with given network params and override any previously saved distributions.

~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*
Benchmarks to Generate: ['rack_sensitivity_0', 'rack_sensitivity_02', 'rack_sensitivity_04', 'rack_sensitivity_06', 'rack_sensitivity_08']
Loads to generate: [0.1, 0.2, 0.3, 0.4]
~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*
Generating demands for benchmark 'rack_sensitivity_0'...
Set to save benchmark rack_sensitivity_0 distribution data to /home/cwfparsonson/Insync/zciccwf@ucl.ac.uk/OneDriveBiz/ipes_cdt/phd_project/projects/trafpy/trafpy/benchmarker/versions/benchmark_v001/data/rack_sensitivity_0/
Saved node_dist distribution data to /home/cwfparsonson/Insync/zciccwf@ucl.ac.uk/OneDriveBiz/ipes_cdt/phd_project/projects/trafpy/trafpy/benchmarker/versions/benchmark_v001/data/rack_sensitivity_0/
Saved flow_size_dist distribution data to /home/cwfparsonson/Insync/zciccwf@ucl.ac.uk/OneDriveBiz/ipes_cdt/phd_proje

KeyError: 'num_ops_dist'