In [1]:
import os
import pickle
from natsort import natsorted

import sys
sys.path.append('..')


def load_data_as_dict(directory_path):
    """
    Load every pickle file located directly inside ``directory_path``.

    Files are visited in natural sort order (``natsorted``); sub-directories
    are ignored.

    Why do we also return the unique ids?
        Let's say a task is randomly mapped 100 times.
        All 100 entries (one dict per pickle file) share the same id,
        so the id list makes it easy to retrieve every mapping of one task.

    Args:
        directory_path: Directory holding one pickled dict per file. Each
            dict must contain a 'task_dag' entry exposing a hashable ``id``
            attribute.

    Returns:
        A tuple ``(list_of_dicts, list_of_uuids)``. ``list_of_dicts`` holds
        the unpickled dict of every file, in file order. ``list_of_uuids``
        holds each distinct ``task_dag.id`` exactly once, in order of first
        appearance.
    """
    entries = os.listdir(directory_path)
    files = natsorted([entry for entry in entries if os.path.isfile(
        os.path.join(directory_path, entry))])

    list_of_dicts = []
    list_of_uuids = []
    # Set used only for O(1) membership tests; the list keeps the order.
    seen_uuids = set()

    for file_name in files:
        file_path = os.path.join(directory_path, file_name)

        # SECURITY: pickle.load can execute arbitrary code while loading.
        # Only point this function at directories whose files you trust.
        with open(file_path, 'rb') as file:
            data_dict = pickle.load(file)

        list_of_dicts.append(data_dict)

        uuid = data_dict['task_dag'].id
        if uuid not in seen_uuids:
            seen_uuids.add(uuid)
            list_of_uuids.append(uuid)

    return list_of_dicts, list_of_uuids

### Loading the dataset 
1. #### task_from_graph
_task_from_graph_ is generated by first creating random dags with 9 nodes (including the 'Start' and the 'Exit' nodes). Around 812 unique dags were created. 
Once the dags were created, each dag was put through the simulation pipeline 100 times, each time with a unique random mapping, to create a dataset containing a total of 81,200 instances (812 dags × 100 mappings).  
  
Note:
<ul>
    <li>&emsp;Instances are stored in consecutive blocks of 100 per dag, so a new dag starts at every index that is a multiple of 100 (0, 100, 200, ...).</li>
    <li>&emsp;Each unique dag has a unique ID associated with it.</li>
</ul>

In [2]:
dataset, _ = load_data_as_dict('../data/task_from_graph')

In [3]:
# Grouping elements for training and testing.
# list_of_unique_graphs (len = 812) is a list of list_of_same_graphs
# (len = 100): one inner list per dag, holding all of its mappings.
#
# The grouping is purely positional: it relies on `dataset` being ordered so
# that each consecutive run of MAPPINGS_PER_DAG entries belongs to one dag.
MAPPINGS_PER_DAG = 100

list_of_unique_graphs = []
list_of_same_graphs = []
for idx, data in enumerate(dataset):
    list_of_same_graphs.append(data)
    position_in_group = idx % MAPPINGS_PER_DAG

    # Last entry of the current group: store the group and start a new one.
    if position_in_group == MAPPINGS_PER_DAG - 1:
        list_of_unique_graphs.append(list_of_same_graphs)
        list_of_same_graphs = []

    # Sanity print of the first two and the last entry of every group, so the
    # UUID can be seen changing exactly at the group boundary.
    if position_in_group in (0, 1, MAPPINGS_PER_DAG - 1):
        dag = data['task_dag']
        print(
            f"Idx: {idx}, UUID: {dag.id}, Latency: {data['network_processing_time']}")

# Entries left over here do not form a full group and are NOT part of
# list_of_unique_graphs; surface that instead of dropping them silently.
if list_of_same_graphs:
    print(
        f"Warning: {len(list_of_same_graphs)} trailing entries do not form a "
        f"complete group of {MAPPINGS_PER_DAG} and were not grouped.")

Idx: 0, UUID: 9c474c6c-ff6d-4aaa-a6a4-a9442755d2c6, Latency: 1696
Idx: 1, UUID: 9c474c6c-ff6d-4aaa-a6a4-a9442755d2c6, Latency: 2003
Idx: 99, UUID: 9c474c6c-ff6d-4aaa-a6a4-a9442755d2c6, Latency: 1705
Idx: 100, UUID: 222f7574-0e98-4a58-ae2c-5f337dbca1d9, Latency: 2078
Idx: 101, UUID: 222f7574-0e98-4a58-ae2c-5f337dbca1d9, Latency: 2377
Idx: 199, UUID: 222f7574-0e98-4a58-ae2c-5f337dbca1d9, Latency: 2085
Idx: 200, UUID: ffbd834b-2746-489d-be50-60302a5b56c9, Latency: 1714
Idx: 201, UUID: ffbd834b-2746-489d-be50-60302a5b56c9, Latency: 1713
Idx: 299, UUID: ffbd834b-2746-489d-be50-60302a5b56c9, Latency: 1705
Idx: 300, UUID: 709c2246-c813-4744-aeae-1590a40d3358, Latency: 1696
Idx: 301, UUID: 709c2246-c813-4744-aeae-1590a40d3358, Latency: 1406
Idx: 399, UUID: 709c2246-c813-4744-aeae-1590a40d3358, Latency: 1405
Idx: 400, UUID: 8903aac8-be9a-415a-bbc4-c8c1be7c0e86, Latency: 2041
Idx: 401, UUID: 8903aac8-be9a-415a-bbc4-c8c1be7c0e86, Latency: 2044
Idx: 499, UUID: 8903aac8-be9a-415a-bbc4-c8c1be7c0e86,

In [4]:
# Training Dataset and Testing Dataset
from sklearn.model_selection import train_test_split

random_seed = 42

# The split is done on list_of_unique_graphs, i.e. on whole groups: all
# mappings of a given dag land together in either the training or the
# testing side.
training_dataset, testing_dataset = train_test_split(
    list_of_unique_graphs,
    test_size=0.012,
    random_state=random_seed,
)

print(
    f"Training size = {len(training_dataset)}, "
    f"Testing size = {len(testing_dataset)}"
)

Training size = 802, Testing size = 10


In [5]:
from utils.Graph_Utils import GraphUtils
import torch

def process_dataset(dataset, folder_name, is_directed):
    """
    Convert every sample of ``dataset`` into a heterogeneous graph tensor and
    save it as ``../data/<folder_name>/<n>.pt``.

    Files are numbered consecutively across all groups, starting at 0.

    If the output folder already exists the user is asked for confirmation;
    any answer other than "yes" (case-insensitive) aborts via ``sys.exit()``.
    Files already present in the folder are not removed beforehand.

    Args:
        dataset: Iterable of groups; each group is an iterable of sample
            dicts with the keys 'network', 'task_dag', 'map' and
            'network_processing_time'.
        folder_name: Name of the output folder under ``../data``.
        is_directed: Forwarded to ``GraphUtils`` as ``directed_``.
    """
    output_dir = f'../data/{folder_name}'

    if os.path.exists(output_dir):
        # Tell the user what the confirmation is about before asking.
        print(
            f"Output folder '{output_dir}' already exists; "
            f"files with the same name will be overwritten.")
        continue_prompt = input("Do you want to continue? (yes/no): ")
        if continue_prompt.strip().lower() != "yes":
            sys.exit()
    else:
        os.makedirs(output_dir)

    count = 0
    for idx, list_of_same_graph in enumerate(dataset):
        print(f"Processing Dataset: {idx}")
        for sample in list_of_same_graph:
            data_network = sample['network']
            data_dag = sample['task_dag']
            data_map = sample['map']
            # The value stored as the target of the generated tensor.
            data_target = sample['network_processing_time']

            graph = GraphUtils(data_network, directed_=is_directed)
            dag_on_network, new_map = graph.dag_on_network(data_dag, data_map)

            graph_with_link_nodes = graph.create_link_nodes(
                dag_on_network, new_map)

            # Alternative (disabled): graph.generate_tensor(...) called with
            # the same arguments instead of generate_heterogeneous_tensor.
            graph_tensor = graph.generate_heterogeneous_tensor(
                graph_with_link_nodes, target_=data_target, debug_=True)

            torch.save(graph_tensor, f'{output_dir}/{count}.pt')
            count += 1
            

DIRECTED_GRAPH = True

# NOTE(review): 'driected' in the test folder name looks like a typo of
# 'directed'. It is kept unchanged because other code may already read from
# this exact path - confirm before renaming.
for split_dataset, split_folder in (
    (training_dataset, 'task_from_graph_dag_over_network_directed_hetero_train'),
    (testing_dataset, 'task_from_graph_dag_over_network_driected_hetero_test'),
):
    process_dataset(split_dataset, folder_name=split_folder, is_directed=DIRECTED_GRAPH)

Processing Dataset: 0
Processing Dataset: 1
Processing Dataset: 2
Processing Dataset: 3
Processing Dataset: 4
Processing Dataset: 5
Processing Dataset: 6
Processing Dataset: 7
Processing Dataset: 8
Processing Dataset: 9
Processing Dataset: 10
Processing Dataset: 11
Processing Dataset: 12
Processing Dataset: 13
Processing Dataset: 14
Processing Dataset: 15
Processing Dataset: 16
Processing Dataset: 17
Processing Dataset: 18
Processing Dataset: 19
Processing Dataset: 20
Processing Dataset: 21
Processing Dataset: 22
Processing Dataset: 23
Processing Dataset: 24
Processing Dataset: 25
Processing Dataset: 26
Processing Dataset: 27
Processing Dataset: 28
Processing Dataset: 29
Processing Dataset: 30
Processing Dataset: 31
Processing Dataset: 32
Processing Dataset: 33
Processing Dataset: 34
Processing Dataset: 35
Processing Dataset: 36
Processing Dataset: 37
Processing Dataset: 38
Processing Dataset: 39
Processing Dataset: 40
Processing Dataset: 41
Processing Dataset: 42
Processing Dataset: 4