From DeepMind paper: https://arxiv.org/pdf/2012.13349.pdf

**Calibrated time:** The total evaluation workload across all datasets and comparisons requires
more than 160,000 MIP solves and nearly a million CPU and GPU hours. To meet the compute
requirements, we use a shared, heterogeneous compute cluster. Accurate running time measurement on such a cluster is difficult because the tasks may be scheduled on machines with different
hardware, and interference from other unrelated tasks on the same machine increases the variance
of solve times. To improve accuracy, for each solve task, we periodically solve a small calibration
MIP on a different thread from the solve task on the same machine. We use an estimate of the
number of calibration MIP solves during the solve task on the same machine to measure time,
which is significantly less sensitive to hardware heterogeneity and interference. This quantity is then
converted into a time value using the calibration MIP’s solve time on a reference machine. Section
12.7 gives the details. Results for four instances from MIPLIB show a 1.5× to 30× reduction in
the coefficient of variation of time measurements compared to measuring wall clock time.

# CPU calibrated time

In [None]:
%load_ext autoreload
%autoreload

from retro_branching.environments import EcoleConfiguring

import ecole
import time
import numpy as np
import json

In [None]:
%autoreload

# Settings for the CPU calibration run.
seed = 5
num_solves = 1000  # number of times the calibration instance is solved

# Environment used for the calibration solves; created before seeding ecole,
# exactly as in the original cell order.
env = EcoleConfiguring(
    observation_function='default',
    information_function='default',
    scip_params='default',
)

# Seed ecole, then draw one set cover instance to act as the calibration MIP.
ecole.seed(seed)
instance = next(
    ecole.instance.SetCoverGenerator(n_rows=500, n_cols=1000, density=0.05)
)

# Keep a copy of the instance as generated; later cells hand fresh copies of
# it to env.reset() so that every solve starts from the same model.
instance_before_reset = instance.copy_orig()

In [None]:
# Solve the same calibration instance `num_solves` times and record how long
# each solve takes (seconds). Only env.step() is timed; env.reset() is not.
solving_times = []
for solve in range(1, num_solves + 1):
    env.seed(seed)
    env.reset(instance_before_reset.copy_orig())

    # perf_counter() is monotonic and high resolution, so the measured
    # duration cannot be distorted by system clock adjustments mid-solve.
    start_t = time.perf_counter()
    _, _, _, _, info = env.step({})
    solving_times.append(time.perf_counter() - start_t)

    print(f'Solved {solve} of {num_solves} completed in {solving_times[-1]:.3f} s')

In [None]:
%autoreload

# Summary of the CPU calibration run: the settings that identify the
# calibration instance plus the raw and mean solve times.
calibration_config = {
    'seed': seed,
    'device': 'cpu',
    'nrows': 500,
    'ncols': 1000,
    'num_solves': num_solves,
    'solving_times': solving_times,
    'mean_solving_time': np.mean(solving_times),
}

# Persist the summary so it can be reused as the reference calibration time.
with open('cpu_calibration_config.json', 'w') as f:
    f.write(json.dumps(calibration_config))

In [None]:
# Show the mean solve time stored under 'mean_solving_time' in calibration_config.
print(calibration_config['mean_solving_time'])

# GPU calibrated time

In [None]:
%load_ext autoreload
%autoreload

from retro_branching.environments import EcoleBranching
from retro_branching.agents import Agent

import ecole
import torch

import time
import numpy as np
import json

In [None]:
%autoreload

# Settings for the GPU calibration run.
seed = 5
nrows = 7500
ncols = 7500
num_warmup_inferences = 1000  # untimed inferences performed first
num_inferences = 10000        # timed inferences

env = EcoleBranching(
    observation_function='default',
    information_function='default',
    reward_function='default',
    scip_params='default',
)

# Seed ecole, then draw the set cover instance used for GPU calibration.
ecole.seed(seed)
instance = next(
    ecole.instance.SetCoverGenerator(n_rows=nrows, n_cols=ncols, density=0.05)
)
instance_before_reset = instance.copy_orig()

# init calibration agent
device = 'cuda:0'
path = '/scratch/datasets/retro_branching/supervised_learner/gnn/gnn_343/checkpoint_233/'
config = path + 'config.json'
calibration_agent = Agent(device=device, config=config)

# Load the saved parameters of each of the agent's networks from the checkpoint.
for network_name, network in calibration_agent.get_networks().items():
    if network_name == 'networks':
        # TEMPORARY: Fix typo
        network_name = 'network'
    if network is None:
        continue

    params_file = path + f'/{network_name}_params.pkl'
    agent_attrs = vars(calibration_agent)
    try:
        # see if network saved under same var as 'network_name'
        agent_attrs[network_name].load_state_dict(torch.load(params_file, map_location=device))
    except KeyError:
        # network saved under generic 'network' var (as in Agent class)
        agent_attrs['network'].load_state_dict(torch.load(params_file, map_location=device))

calibration_agent.eval()

In [None]:
def extract_state_tensors_from_ecole_obs(obs, device):
    """Convert an observation's numpy arrays into torch tensors on `device`.

    Args:
        obs: Object exposing `row_features`, `edge_features.indices` and
            `column_features` as numpy arrays.
        device: Torch device (or device string) to move the tensors to.

    Returns:
        Tuple `(row_features, edge_indices, column_features)` where the
        feature tensors are float32 and the edge indices are int64.
    """
    row_features = torch.from_numpy(obs.row_features.astype(np.float32)).to(device)
    # Cast indices straight to int64 (the dtype of a LongTensor). Going through
    # int16 silently wraps any index above 32767 to a wrong (negative) value.
    edge_indices = torch.from_numpy(obs.edge_features.indices.astype(np.int64)).to(device)
    column_features = torch.from_numpy(obs.column_features.astype(np.float32)).to(device)
    return (row_features, edge_indices, column_features)

In [None]:
# Time repeated action selections of the calibration agent on one fixed
# observation (the state returned by env.reset on the calibration instance).
inference_times = []

env.seed(seed)

# reset calibration env
calibration_agent.before_reset(instance_before_reset.copy_orig())
obs, action_set, reward, done, info = env.reset(instance_before_reset.copy_orig())

action_set = action_set.astype(int)
obs = extract_state_tensors_from_ecole_obs(obs, device)

# do num_warmup_inferences (untimed)
for inference in range(num_warmup_inferences):
    action, _ = calibration_agent.action_select(action_set=action_set, obs=obs, munchausen_tau=0, epsilon=0, model=env.model, done=done, agent_idx=0)
    print(f'Completed {inference+1} of {num_warmup_inferences} warm-up inferences')
print(f'Completed all {num_warmup_inferences} warm-up inferences.\n')

# Make sure no warm-up work is still queued on the device when timing starts,
# otherwise it would be attributed to the first timed inference.
torch.cuda.synchronize(device=device)

# do num_inferences (timed)
total_inference_time = 0.0  # running sum, avoids re-averaging the whole list every step
for inference in range(num_inferences):
    # perf_counter_ns() is monotonic, unlike the wall clock
    inference_start_t = time.perf_counter_ns()
    action, _ = calibration_agent.action_select(action_set=action_set, obs=obs, munchausen_tau=0, epsilon=0, model=env.model, done=done, agent_idx=0)
    # wait for the device to finish before stopping the timer
    torch.cuda.synchronize(device=device)
    inference_times.append((time.perf_counter_ns() - inference_start_t)*1e-9)
    total_inference_time += inference_times[-1]

    print(f'Completed {inference+1} of {num_inferences} inferences --> curr mean inference time: {total_inference_time / (inference + 1)} s')

In [None]:
# # warm-up solves
# for solve in range(1, num_warmup_solves+1):
#     env.seed(seed)
    
#     # reset calibration env
#     calibration_agent.before_reset(instance_before_reset.copy_orig())
#     obs, action_set, reward, done, info = env.reset(instance_before_reset.copy_orig())

#     # solve calibration instance
#     while not done:
#         action_set = action_set.astype(int)
#         obs = extract_state_tensors_from_ecole_obs(obs, device)
#         action, _ = calibration_agent.action_select(action_set=action_set, obs=obs, munchausen_tau=0, epsilon=0, model=env.model, done=done, agent_idx=0)
#         obs, action_set, reward, done, info = env.step(action)
#         print(info['num_nodes'])

#     print(f'Solved {solve} of {num_warmup_solves} warm-up instances with {info["num_nodes"]} nodes')
# print(f'Completed warm-up solves. Beginning to solve calibration instance {num_solves} times...\n')


# inference_times = []
# for solve in range(1, num_solves+1):
#     env.seed(seed)
    
#     # reset calibration env
#     calibration_agent.before_reset(instance_before_reset.copy_orig())
#     obs, action_set, reward, done, info = env.reset(instance_before_reset.copy_orig())

#     # solve calibration instance and save inference times
#     while not done:
#         action_set = action_set.astype(int)
#         obs = extract_state_tensors_from_ecole_obs(obs, device)
#         inference_start_t = time.time_ns()
#         action, _ = calibration_agent.action_select(action_set=action_set, obs=obs, munchausen_tau=0, epsilon=0, model=env.model, done=done, agent_idx=0)
#         torch.cuda.synchronize(device=device)
#         inference_times.append((time.time_ns() - inference_start_t)*1e-9)
#         obs, action_set, reward, done, info = env.step(action)

#     print(f'Solved {solve} of {num_solves} instances with {info["num_nodes"]} nodes --> curr mean inference time: {np.mean(inference_times)} s')

In [None]:
%autoreload

# Summary of the GPU calibration run: the settings that identify the
# calibration instance plus the raw and mean inference times.
calibration_config = {
    'seed': seed,
    'device': 'cuda',
    'nrows': nrows,
    'ncols': ncols,
    'num_inferences': num_inferences,
    'inference_times': inference_times,
    'mean_inference_time': np.mean(inference_times),
}

# Persist the summary, then print the file object that was written to.
with open('gpu_calibration_config.json', 'w') as f:
    f.write(json.dumps(calibration_config))
    print(f)

In [None]:
# Show the mean inference time stored under 'mean_inference_time' in calibration_config.
print(calibration_config['mean_inference_time'])

# Example of using calibrated time

N.B. Below we record the total calibrated solve time, but we could also record the per-step calibrated solve time, which is helpful for plotting how a metric (e.g. the primal-dual gap) evolves throughout the instance solving process.

In [None]:
%autoreload

from retro_branching.environments import EcoleBranching
from retro_branching.agents import PseudocostBranchingAgent

import threading

In [None]:
# Settings for the calibrated-time example.
seed = 0
num_episodes = 20

# num instances to solve with agent before starting new calibration threads
calibration_freq = 10

# num repeats of calibration to perform
num_calibrations = 10

In [None]:
%autoreload

# calibration
def solve_calibration_instance(calibration_config, calibration_solve_times):
    """Solve the calibration instance once and record how long the solve took.

    Intended to be used as a thread target: the result is appended to a list
    supplied by the caller rather than returned.

    Args:
        calibration_config: Mapping with the keys 'nrows', 'ncols' and 'seed'
            describing the set cover calibration instance to generate.
        calibration_solve_times: List to which the duration (seconds) of the
            timed `step` call is appended.
    """
    calibration_env = EcoleConfiguring(observation_function='default',
                                       information_function='default',
                                       scip_params='default')
    calibration_generator = ecole.instance.SetCoverGenerator(n_rows=calibration_config['nrows'],
                                                             n_cols=calibration_config['ncols'],
                                                             density=0.05)
    # seed the generator so that every call produces the same instance
    calibration_generator.seed(calibration_config['seed'])
    calibration_instance = next(calibration_generator)

    calibration_env.seed(calibration_config['seed'])
    calibration_env.reset(calibration_instance.copy_orig())

    # Only the step() call is timed. perf_counter() is monotonic, so the
    # measured duration is unaffected by system clock adjustments.
    start_t = time.perf_counter()
    calibration_env.step({})
    calibration_solve_times.append(time.perf_counter() - start_t)
    print('finished calibration')



# agent
# Branching environment in which the evaluated agent solves instances.
env = EcoleBranching(
    observation_function='default',
    information_function='default',
    reward_function='default',
    scip_params='default',
)

# Agent whose solve times are measured in the example.
agent = PseudocostBranchingAgent()

# Seed ecole, then create the generator the evaluation instances are drawn from.
ecole.seed(seed)
instances = ecole.instance.SetCoverGenerator(
    n_rows=500, n_cols=1000, density=0.05
)

In [None]:
# Load the CPU calibration reference from disk instead of relying on the
# in-memory `calibration_config`: the GPU section reuses that name for a dict
# that has no 'mean_solving_time' entry and describes a different instance size.
with open('cpu_calibration_config.json') as f:
    cpu_calibration_config = json.load(f)

eval_solve_times = []        # raw solve time (seconds) of each episode
calibrated_solve_times = []  # calibrated solve time of each episode
for episode_counter in range(num_episodes):
    run_calibration = episode_counter % calibration_freq == 0
    if run_calibration:
        # perform calibration: start the calibration solves in background
        # threads so that they run alongside the agent's solve
        threads = []
        calibration_solve_times = []
        for _ in range(num_calibrations):
            thread = threading.Thread(target=solve_calibration_instance,
                                      args=(cpu_calibration_config, calibration_solve_times))
            threads.append(thread)
            thread.start()

    # keep drawing instances until one is not already done after reset
    done = True
    while done:
        instance = next(instances)
        instance_before_reset = instance.copy_orig()
        agent.before_reset(instance_before_reset)
        env.seed(seed)
        obs, action_set, reward, done, info = env.reset(instance)

    # time the agent's solve with a monotonic clock
    start_t = time.perf_counter()
    while not done:
        action, _ = agent.action_select(action_set, env.model, done)
        obs, action_set, reward, done, info = env.step(action)
    solve_time = time.perf_counter() - start_t
    eval_solve_times.append(solve_time)

    if run_calibration:
        # wait for every calibration solve so that all times are available
        for thread in threads:
            thread.join()

    # calc calibrated time: express the solve as a number of calibration
    # solves on this machine, then convert back to seconds using the
    # reference mean solving time
    calibrated_solves = solve_time / np.mean(calibration_solve_times)
    calibrated_solve_time = calibrated_solves * cpu_calibration_config['mean_solving_time']
    calibrated_solve_times.append(calibrated_solve_time)
    print(f'Solved instance with {info["num_nodes"]} nodes in {solve_time:.3f} s --> calibrated_solves: {calibrated_solves} | calibrated_solve_time: {calibrated_solve_time}')

In [None]:
# Show the solve times collected by the most recent batch of calibration threads.
print(calibration_solve_times)