In [1]:
import os 
import pickle 
from natsort import natsorted

def load_data_as_dict(directory_path):
    """
    Load every pickle file located directly inside ``directory_path``.

    Files are processed in the order produced by ``natsorted``. Each file is
    expected to contain one pickled dict with a ``'task_dag'`` entry whose
    object exposes an ``id`` attribute.

    Why do we need the unique ids?
        Let's say a task is randomly mapped 100 times.
        All 100 records (dicts in the pickle files) will share the same id,
        so the id makes it easy to retrieve every mapping of a single task.

    Args:
        directory_path: Directory to scan. Only regular files are read;
            sub-directories are skipped.

    Returns:
        tuple: ``(list_of_dicts, list_of_uuids)`` where ``list_of_dicts`` holds
        the unpickled objects in file order and ``list_of_uuids`` holds each
        distinct ``task_dag.id`` once, in order of first appearance.

    Warning:
        ``pickle.load`` can execute arbitrary code while deserialising.
        Only point this function at directories whose contents you trust.
    """
    entries = os.listdir(directory_path)
    files = natsorted([entry for entry in entries if os.path.isfile(os.path.join(directory_path, entry))])

    list_of_dicts = []
    list_of_uuids = []
    # Companion set gives O(1) membership checks while the list keeps
    # first-seen order. Assumes ids are hashable (e.g. str / UUID).
    seen_uuids = set()

    for file_name in files:
        file_path = os.path.join(directory_path, file_name)

        # Keep the file handle open only for the duration of the load.
        with open(file_path, 'rb') as file:
            data_dict = pickle.load(file)

        list_of_dicts.append(data_dict)

        task_id = data_dict['task_dag'].id
        if task_id not in seen_uuids:
            seen_uuids.add(task_id)
            list_of_uuids.append(task_id)

    return list_of_dicts, list_of_uuids

In [2]:
# The id list is not needed here. It is bound to a named throwaway rather than
# `_`, because IPython uses `_` for the most recent cell output.
dataset, _task_uuids = load_data_as_dict('data/task_from_graph')

In [3]:
import networkx as nx
# NOTE(review): bare expression in the middle of the cell; its value is not
# displayed because it is not the cell's last statement. Presumably left over
# from interactive inspection — confirm before removing.
dataset[0]['task_dag'].graph
def get_node_attribute_types(G):
    """
    Map every node attribute name found in ``G`` to the type of its value.

    Args:
        G: Graph-like object whose ``nodes(data=True)`` yields
            ``(node, attribute_dict)`` pairs.

    Returns:
        dict: ``{attribute_name: type}``. When the same attribute name occurs
        on several nodes, the type seen on the last such node is kept.
    """
    types_by_name = {}

    for _node, node_attrs in G.nodes(data=True):
        types_by_name.update({name: type(value) for name, value in node_attrs.items()})

    return types_by_name

# Show which node attributes (and value types) the first task graph carries.
first_task_graph = dataset[0]['task_dag'].graph
get_node_attribute_types(first_task_graph)

{'delay': int}

In [4]:
import pandas as pd 
# Build the frame and coerce the processing-time column to a numeric dtype in
# one chained expression, then summarise its distribution.
df = pd.DataFrame(dataset).assign(
    network_processing_time=lambda frame: pd.to_numeric(frame['network_processing_time'])
)
df['network_processing_time'].describe()

count    81200.000000
mean      2040.257993
std        354.290683
min       1027.000000
25%       1738.000000
50%       2033.000000
75%       2322.000000
max       3672.000000
Name: network_processing_time, dtype: float64

In [61]:
import os 
from torch_geometric.utils import from_networkx
from torch_geometric.data import Data, Batch
import torch

def convert_graph_to_tensor(graph, latency):
    """
    Convert a NetworkX graph with ``from_networkx`` and attach its latency as ``y``.

    Args:
        graph: Graph object handed to ``from_networkx``.
        latency: Target value for the graph; anything ``float()`` accepts
            (a number or a numeric string).

    Returns:
        The object returned by ``from_networkx(graph)`` with ``y`` set to a
        one-element floating-point tensor holding ``latency``.

    Raises:
        ValueError / TypeError: If ``latency`` cannot be converted by ``float()``.
    """
    graph_tensor = from_networkx(graph)
    # Cast explicitly: torch.tensor would infer an integer dtype from an int
    # latency and reject a numeric string, but the target must be a float.
    graph_tensor.y = torch.tensor([float(latency)])
    return graph_tensor

def convert_edge_index(edge_index, num_of_tasks):
    """
    Turn an edge list with 'Start'/'Exit' labels into integer node pairs.

    A source equal to 'Start' becomes 0 and a destination equal to 'Exit'
    becomes ``num_of_tasks``; every other endpoint is passed through ``int()``.

    Args:
        edge_index: Iterable of ``(src, dest)`` pairs.
        num_of_tasks: Integer id assigned to the 'Exit' destination.

    Returns:
        list: ``(int, int)`` tuples in the same order as ``edge_index``.
    """
    def _source_id(label):
        # Only the source side is checked for the 'Start' label.
        return int(0 if label == 'Start' else label)

    def _dest_id(label):
        # Only the destination side is checked for the 'Exit' label.
        return int(num_of_tasks if label == 'Exit' else label)

    return [(_source_id(src), _dest_id(dest)) for src, dest in edge_index]

def convert_directed_to_undirected_edge_index(edge_index):
    """
    Expand a directed edge list so every edge is followed by its reverse.

    Args:
        edge_index: Iterable of ``(src, dest)`` pairs.

    Returns:
        list: For each input edge, ``(src, dest)`` immediately followed by
        ``(dest, src)``, in input order. Duplicates are not removed.
    """
    return [
        pair
        for src, dest in edge_index
        for pair in ((src, dest), (dest, src))
    ]

def visualize_graph(graph, is_unidirected=False):
    """
    Draw a PyTorch Geometric graph with matplotlib.

    Args:
        graph: Object accepted by ``torch_geometric.utils.to_networkx``.
        is_unidirected: Forwarded to ``to_networkx`` as ``to_undirected``.

    Returns:
        None. The figure is shown via ``plt.show()``.
    """
    import matplotlib.pyplot as plt
    import networkx as nx
    from torch_geometric.utils import to_networkx

    nx_graph = to_networkx(graph, to_undirected=is_unidirected)

    # Fixed seed keeps the spring layout reproducible between calls.
    layout = nx.spring_layout(nx_graph, seed=42)
    nx.draw(nx_graph, layout, with_labels=True)
    plt.show()


directory_path = 'data/task_from_graph_tensor'

# Make sure the output directory exists and report which case applied.
if os.path.exists(directory_path):
    print(f"Directory '{directory_path}' already exists.")
else:
    os.makedirs(directory_path)
    print(f"Directory '{directory_path}' created successfully.")

def add_node_attributes(graph):
    """
    Attach a one-hot ``'node_type'`` tensor to every node of ``graph`` in place.

    Nodes named 'Start' or 'Exit' get their own encoding; every other node is
    encoded as 'processing'.

    Args:
        graph: NetworkX graph whose nodes are annotated.

    Returns:
        None. ``graph`` is modified through ``nx.set_node_attributes``.
    """
    one_hot_mapping = {'Start': [1.0, 0.0, 0.0], 'Exit': [0.0, 0.0, 1.0], 'processing': [0.0, 1.0, 0.0]}

    node_type_by_node = {}
    for node in graph.nodes():
        kind = node if node in ['Start', 'Exit'] else 'processing'
        node_type_by_node[node] = torch.tensor(one_hot_mapping[kind])

    nx.set_node_attributes(graph, node_type_by_node, 'node_type')

# Convert every dataset record into a PyTorch Geometric `Data` object and save
# it as `graph_<idx>.pt` inside `directory_path`.
for idx, record in enumerate(dataset):
    task_dag = record['task_dag']
    task_processing_time = float(record['network_processing_time'])
    # Regression target: one-element float32 tensor.
    target_value = torch.tensor([task_processing_time]).float()

    task_graph = task_dag.graph
    # Adds the one-hot 'node_type' attribute to the graph's nodes in place.
    add_node_attributes(task_graph)

    graph_tensor = from_networkx(task_graph)

    # Build a fresh Data object so only the wanted fields are kept and any
    # other node attributes picked up by from_networkx are dropped.
    data = Data(
        edge_index=graph_tensor.edge_index,
        y=target_value,
        x=graph_tensor.node_type,
    )

    # Progress marker every 100 graphs.
    if idx % 100 == 0:
        print(f"{idx}th graph data ->\t {data}")

    torch.save(data, f'{directory_path}/graph_{idx}.pt')

Directory 'data/task_from_graph_tensor' already exists.
0th graph data ->	 Data(x=[9, 3], edge_index=[2, 10], y=[1])
100th graph data ->	 Data(x=[9, 3], edge_index=[2, 10], y=[1])
200th graph data ->	 Data(x=[9, 3], edge_index=[2, 10], y=[1])
300th graph data ->	 Data(x=[9, 3], edge_index=[2, 11], y=[1])
400th graph data ->	 Data(x=[9, 3], edge_index=[2, 11], y=[1])
500th graph data ->	 Data(x=[9, 3], edge_index=[2, 12], y=[1])
600th graph data ->	 Data(x=[9, 3], edge_index=[2, 12], y=[1])
700th graph data ->	 Data(x=[9, 3], edge_index=[2, 13], y=[1])
800th graph data ->	 Data(x=[9, 3], edge_index=[2, 14], y=[1])
900th graph data ->	 Data(x=[9, 3], edge_index=[2, 10], y=[1])
1000th graph data ->	 Data(x=[9, 3], edge_index=[2, 10], y=[1])
1100th graph data ->	 Data(x=[9, 3], edge_index=[2, 10], y=[1])
1200th graph data ->	 Data(x=[9, 3], edge_index=[2, 14], y=[1])
1300th graph data ->	 Data(x=[9, 3], edge_index=[2, 13], y=[1])
1400th graph data ->	 Data(x=[9, 3], edge_index=[2, 13], y=[