In [20]:
%load_ext autoreload
from ddls.devices.processors.gpus.A100 import A100
from ddls.utils import ddls_graph_from_pbtxt_file
from ddls.plotting.plotting import plot_computation_graph
from ddls.environments.job_placing.job_placing_all_nodes_environment import JobPlacingAllNodesEnvironment
from ddls.demands.jobs.job import Job
from ddls.distributions.uniform import Uniform
from ddls.utils import seed_stochastic_modules_globally

import glob
import numpy as np

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Initialise a placement agent

In [21]:
class RandomJobPlacingAgent:
    """Baseline agent that picks uniformly at random among the valid actions."""

    def __init__(self, name: str = 'random'):
        """Store a label for the agent.

        Args:
            name: Identifier for this agent.
        """
        self.name = name

    def select_action(self, obs):
        """Sample one valid action uniformly at random.

        Args:
            obs: Observation exposing ``action_set`` (the candidate actions) and
                ``action_mask`` (one validity flag per candidate action).

        Returns:
            One element of ``obs.action_set`` whose mask entry is truthy.

        Raises:
            ValueError: If the mask marks no action as valid.
        """
        action_set = np.asarray(obs.action_set)
        # Coerce to bool so that a 0/1 integer mask (or a plain list) is applied
        # as a mask; without this NumPy would treat the values as fancy indices
        # and sample from action_set[0] / action_set[1] instead.
        action_mask = np.asarray(obs.action_mask, dtype=bool)
        valid_actions = action_set[action_mask]
        if valid_actions.size == 0:
            raise ValueError('No valid actions available: the action mask is all False.')
        return np.random.choice(valid_actions)

Initialise the DDLS jobs

In [22]:
%autoreload

# get file paths
path_to_files = '/scratch/datasets/ddls/jobs/tensorflow_synthetic_graphs/valid'
file_paths = glob.glob(path_to_files + '/*')
    
# create ddls graph
num_graphs = 1
ddls_computation_graphs = [ddls_graph_from_pbtxt_file(file_path, processor_type_profiled='A100', verbose=False) for file_path in file_paths[:num_graphs]]

# visualise
visualise = False
if visualise:
    for graph in ddls_computation_graphs:
        fig = plot_computation_graph(graph, scaling_factor=3, title='ddls_graph', show_fig=True, verbose=True)
        
# create ddls jobs from ddls graphs
jobs = [Job(computation_graph=graph, num_training_steps=2) for graph in ddls_computation_graphs]
for job in jobs:
    print(job)

Job ID: 139653723152880 | # nodes: 96 | # edges: 410 | # training steps: 2 | Total op mem cost: 0 | Total dep size: 13914


Initialise the DDLS environment

In [23]:
%autoreload


node_config = {'type_1':
                  {
                      'num_nodes': 16,
                      'workers_config': 
                          [
                              {
                               'num_workers': 4,
                               'worker': A100
                              }
                          ]
                  }
              }

topology_config = {'type':
                      'torus',
                   'kwargs':
                      {
                          'x_dims': 4,
                          'y_dims': 4
                      }
                  }

jobs_config = {'path_to_files': '/scratch/datasets/ddls/jobs/tensorflow_synthetic_graphs/valid',
               'job_interarrival_time_dist': Uniform(min_val=1, max_val=1000),
               'max_files': 1,
               'job_sampling_mode': 'remove'}

env = JobPlacingAllNodesEnvironment(topology_config=topology_config,
                                    node_config=node_config,
                                    jobs_config=jobs_config,
                                    continuous_action_mode=False,
                                    worker_selection='random',
                                    op_allocation='sequential',
                                    observation_function='default',
                                    reward_function='mean_job_completion_time',
                                    max_cluster_simulation_run_time=float('inf'),
                                    job_queue_capacity=10,
#                                     path_to_save='/scratch/datasets/ddls/sims',
                                    path_to_save=None,
                                    save_cluster_data=False,
                                    save_freq=100,
                                    use_sqlite_database=True)

Let's try using a random agent to place the nodes.

In [24]:
# One episode is run per seed; shrink this list (e.g. to [0]) for a quick check.
seeds = [0, 1, 2]

agent = RandomJobPlacingAgent()

for seed in seeds:
    print(f'\n\n\n~~~~~~~~~~~~~~~~~~~~~~~ Seed {seed} ~~~~~~~~~~~~~~~~~~~~~~~')
    step_counter = 0

    # reset() returns the same 4-tuple as step(), so `done` is defined before the loop
    obs, reward, done, info = env.reset(seed=seed)
    while not done:
        # keep the observation the action was chosen from, so the log below
        # pairs each action with the state that produced it
        prev_obs = obs
        action = agent.select_action(prev_obs)
        obs, reward, done, info = env.step(action)
        print(f'\nStep {step_counter}\nObs: {prev_obs}\nAction: {action}\nReward: {reward}\nDone: {done}\nInfo: {info}')
        step_counter += 1

print('\nFinished.')




~~~~~~~~~~~~~~~~~~~~~~~ Seed 0 ~~~~~~~~~~~~~~~~~~~~~~~

Step 0
Obs: Node feats: # nodes: 96 | # feats per node: 9 | # flattened feats per node: 16 || Edge feats: None || Global feats: # global features: 4 | # flattened global feats: 130 || Action info: action space: (65,) | # valid candidate actions: 65
Action: 47
Reward: -6572.0
Done: True
Info: None



~~~~~~~~~~~~~~~~~~~~~~~ Seed 1 ~~~~~~~~~~~~~~~~~~~~~~~

Step 0
Obs: Node feats: # nodes: 96 | # feats per node: 9 | # flattened feats per node: 16 || Edge feats: None || Global feats: # global features: 4 | # flattened global feats: 130 || Action info: action space: (65,) | # valid candidate actions: 65
Action: 12
Reward: -8282.0
Done: True
Info: None



~~~~~~~~~~~~~~~~~~~~~~~ Seed 2 ~~~~~~~~~~~~~~~~~~~~~~~

Step 0
Obs: Node feats: # nodes: 96 | # feats per node: 9 | # flattened feats per node: 16 || Edge feats: None || Global feats: # global features: 4 | # flattened global feats: 130 || Action info: action space: (65,) | # valid 