In [25]:
%load_ext autoreload
from ddls.devices.processors.gpus.A100 import A100
from ddls.plotting.plotting import plot_computation_graph
from ddls.environments.ramp_job_placement_shaping.ramp_job_placement_shaping_environment import RampJobPlacementShapingEnvironment
from ddls.demands.jobs.job import Job
from ddls.distributions.uniform import Uniform
from ddls.utils import seed_stochastic_modules_globally

import glob
import numpy as np

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [26]:
class RandomJobPlacementShapingAgent:
    """Baseline agent that samples uniformly at random from the valid actions.

    Args:
        name: Label stored on the instance as ``self.name``.
    """

    def __init__(self, name: str = 'random'):
        self.name = name

    def select_action(self, obs):
        """Return one randomly chosen action that the mask marks as valid.

        Args:
            obs: Mapping with two equally-sized sequences:
                ``'action_set'`` (the candidate actions) and
                ``'action_mask'`` (truthy where the matching action is valid).
                Both may be lists, tuples or numpy arrays.

        Returns:
            A single element of ``obs['action_set']`` whose mask entry is
            truthy, drawn with ``np.random.choice`` (global numpy RNG).

        Raises:
            ValueError: If the mask marks no action as valid.
        """
        # Coerce both fields so boolean-mask indexing also works when the
        # environment (or a test) hands over plain Python sequences.
        action_set = np.asarray(obs['action_set'])
        action_mask = np.asarray(obs['action_mask'], dtype=bool)

        valid_actions = action_set[action_mask]
        if valid_actions.size == 0:
            # np.random.choice would raise ValueError here as well; fail with
            # a message that points at the actual cause.
            raise ValueError(
                "Cannot select an action: obs['action_mask'] marks no action as valid."
            )
        return np.random.choice(valid_actions)

In [27]:
%autoreload

# Cluster node specification: a single node type made of A100 workers.
# NOTE(review): 16 equals 2 communication groups x 2 racks x 4 servers from
# topology_config below -- presumably these must agree; confirm.
node_config = {
    'type_1': {
        'num_nodes': 16,  # original note lists "8 16", presumably the values tried
        'workers_config': [
            {
                'num_workers': 1,  # RAMP needs exactly 1 worker per server
                'worker': A100,
            },
        ],
    },
}

# Network topology specification: a 'ramp' topology and its constructor kwargs.
topology_config = {
    'type': 'ramp',
    'kwargs': {
        'num_communication_groups': 2,
        'num_racks_per_communication_group': 2,
        'num_servers_per_rack': 4,  # original note lists "2 4", presumably the values tried
        'num_channels': 2,
    },
}

# Job workload specification: where the job profiles live, how inter-arrival
# times are drawn, how many files to use and how jobs are sampled.
jobs_config = {
    'path_to_files': '/scratch/datasets/ddls/jobs/pipedream_graphs/image_classification/profiles/alexnet/',
    'job_interarrival_time_dist': Uniform(min_val=1, max_val=1000),
    'max_files': 20,
    # Alternative sampling mode kept for reference: 'remove_and_replace'.
    'job_sampling_mode': 'remove',
}

In [29]:
%autoreload

# Build the RAMP job-placement-shaping environment from the topology, node and
# job configurations defined in the previous cell.
env = RampJobPlacementShapingEnvironment(
    topology_config=topology_config,
    node_config=node_config,
    jobs_config=jobs_config,
    op_partitioner='sip_ml_op_partitioner',
    op_partitioner_kwargs={
        'min_op_run_time_quantum': 10e-6,
        'max_partitions_per_op': 2,
    },
    # Unbounded simulated run time; a finite alternative (1e5) was used before.
    max_simulation_run_time=float('inf'),
    job_queue_capacity=100,
    path_to_save='/scratch/datasets/ddls/sims',
    save_freq=100,
    use_sqlite_database=True,
)
print(env)

<RampJobPlacementShapingEnvironment instance>


In [30]:
%autoreload

# Seeds to evaluate. Alternatives kept for reference: [0, 1, 2], range(100).
seeds = [0]

# Number of episodes (one env.reset() each) to run per seed.
num_epochs = 10

agent = RandomJobPlacementShapingAgent()

for seed in seeds:
    print(f'\n\n\n~~~~~~~~~~~~~~~~~~~~~~~ Seed {seed} ~~~~~~~~~~~~~~~~~~~~~~~')
    # Seeding is done once per seed, not once per epoch.
    seed_stochastic_modules_globally(seed)
    for epoch in range(num_epochs):
        print(f'\n\nEpoch {epoch+1}/{num_epochs}')
        step_counter = 0
        # Alternative reset call kept for reference:
        #   env.reset(seed=seed, verbose=False)
        obs = env.reset()
        done = False
        # Step until the environment reports the episode as finished.
        while True:
            action = agent.select_action(obs)
            prev_obs = obs  # observation the action above was chosen from
            obs, reward, done, info = env.step(action)
            # Per-step log, followed by a summary of the cluster's job counters.
            print(f'\nStep {step_counter}\nEnv cluster stopwatch time: {env.cluster.stopwatch.time()}\nAction: {action}\nReward: {reward}\nDone: {done}\nInfo: {info}')
            print(f'Cluster jobs arrived: {env.cluster.num_jobs_arrived} | completed: {len(env.cluster.jobs_completed)} | blocked: {len(env.cluster.jobs_blocked)} | running: {len(env.cluster.jobs_running)} | queued: {len(env.cluster.job_queue)}')
            step_counter += 1
            if done:
                break
print('\nFinished.')




~~~~~~~~~~~~~~~~~~~~~~~ Seed 0 ~~~~~~~~~~~~~~~~~~~~~~~


Epoch 1/10

Step 0
Env cluster stopwatch time: 735.0002393434885
Action: 14
Reward: 0.0013605437746431383
Done: True
Info: {}
Cluster jobs arrived: 1 | completed: 1 | blocked: 0 | running: 0 | queued: 0


Epoch 2/10

Step 0
Env cluster stopwatch time: 730.2511081976
Action: 6
Reward: 0.0013693919650025483
Done: True
Info: {}
Cluster jobs arrived: 1 | completed: 1 | blocked: 0 | running: 0 | queued: 0


Epoch 3/10

Step 0
Env cluster stopwatch time: 730.2511081976
Action: 1
Reward: 0.0013693919650025483
Done: True
Info: {}
Cluster jobs arrived: 1 | completed: 1 | blocked: 0 | running: 0 | queued: 0


Epoch 4/10

Step 0
Env cluster stopwatch time: inf
Action: 4
Reward: -1.0
Done: True
Info: {}
Cluster jobs arrived: 1 | completed: 0 | blocked: 1 | running: 0 | queued: 0


Epoch 5/10

Step 0
Env cluster stopwatch time: 735.1436491485123
Action: 13
Reward: 0.0013602783634984268
Done: True
Info: {}
Cluster jobs arrived: 1 | complete