In [4]:
import numpy as np

In [5]:
# identify the architecture; this tag is the first component of every
# model id built further down (e.g. 'DMSLSTM_TPU_87')
architecture = 'DMSLSTM'

In [6]:
# identify the infrastructure; this tag is the second component of every
# model id built further down (e.g. 'DMSLSTM_TPU_87')
infrastructure = 'TPU'

In [7]:
# the required batch sizes: every multiple of 64 from 64 up to and including 1024
# (16 values, kept as a plain list of NumPy integers)
batch_sizes = list(np.arange(64, 1024 + 1, 64))

In [8]:
# the desired scaled learning rate for building the learning rate schedule;
# the per-batch-size base learning rates are derived from this value
scaled_learning_rate = 0.01

In [9]:
# base learning rate to pass to the training script for each batch size:
# scaled_learning_rate * 256 / batch_size, so both rates coincide at batch size 256
rate_times_reference = scaled_learning_rate*256
base_learning_rates = [rate_times_reference/size for size in batch_sizes]

In [10]:
# the desired number of epochs, taken from the 32/14062/0.08 combination:
# 14062 steps at batch size 32, divided by 4938
# (4938 is presumably the number of training examples - TODO confirm)
num_epochs = (32 * 14062) / 4938

In [17]:
# number of training steps per batch size that keeps the number of epochs constant,
# rounded to the nearest integer
exact_train_steps = np.array([num_epochs*4938/size for size in batch_sizes])
train_steps = list(np.round(exact_train_steps).astype(int))

In [23]:
# container for the basic training parameter combinations, initially empty
parameters = []

In [25]:
# build one (batch_size, train_steps, base_learning_rate) tuple per batch size;
# the list is rebuilt from scratch, so re-running this cell does not append a
# second copy of every combination
parameters = [
    # cast the NumPy scalars to built-in numbers so the tuples display the same way
    # under any NumPy version; the learning rate is rounded to 8 decimal places
    (int(batch_size), int(steps), float('%.8f' % learning_rate))
    for batch_size, steps, learning_rate
    in zip(batch_sizes, train_steps, base_learning_rates)
]

In [26]:
parameters

[(64, 7031, 0.04),
 (128, 3516, 0.02),
 (192, 2344, 0.01333333),
 (256, 1758, 0.01),
 (320, 1406, 0.008),
 (384, 1172, 0.00666667),
 (448, 1004, 0.00571429),
 (512, 879, 0.005),
 (576, 781, 0.00444444),
 (640, 703, 0.004),
 (704, 639, 0.00363636),
 (768, 586, 0.00333333),
 (832, 541, 0.00307692),
 (896, 502, 0.00285714),
 (960, 469, 0.00266667),
 (1024, 439, 0.0025)]

In [27]:
# some of the above combinations have already been executed as
# parameters[0]: DMSLSTM_82
# parameters[1]: DMSLSTM_83
# parameters[3]: DMSLSTM_84
# parameters[7]: DMSLSTM_85
# parameters[15]: DMSLSTM_86

In [30]:
# then iterate on the remaining items: the indices of the 16 parameter combinations
# (0 to 15) minus the ones already executed (0, 1, 3, 7 and 15)
remaining_items = [item for item in range(16) if item not in (0, 1, 3, 7, 15)]

In [32]:
# pick the parameter combinations that still have to be executed
combinations = [parameters[position] for position in remaining_items]

In [44]:
combinations

[(192, 2344, 0.01333333),
 (320, 1406, 0.008),
 (384, 1172, 0.00666667),
 (448, 1004, 0.00571429),
 (576, 781, 0.00444444),
 (640, 703, 0.004),
 (704, 639, 0.00363636),
 (768, 586, 0.00333333),
 (832, 541, 0.00307692),
 (896, 502, 0.00285714),
 (960, 469, 0.00266667)]

In [58]:
# experiment numbers for the combinations above: 87 to 98,
# leaving out 90 because it is already used
experiment_range_list = [number for number in range(87, 99) if number != 90]

In [37]:
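# define the experiment identifiers
# (earlier contiguous range, kept for reference only; the experiment_range_list
# assignment above is the one in use)
# experiment_range_list = [x for x in range(70, 80)]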

In [59]:
# model ids have the form <architecture>_<infrastructure>_<experiment number>,
# with the number zero-padded to at least two digits
model_id_template = '{}_{}_{:02}'
model_ids_list = [
    model_id_template.format(architecture, infrastructure, number)
    for number in experiment_range_list
]

In [60]:
model_ids_list

['DMSLSTM_TPU_87',
 'DMSLSTM_TPU_88',
 'DMSLSTM_TPU_89',
 'DMSLSTM_TPU_91',
 'DMSLSTM_TPU_92',
 'DMSLSTM_TPU_93',
 'DMSLSTM_TPU_94',
 'DMSLSTM_TPU_95',
 'DMSLSTM_TPU_96',
 'DMSLSTM_TPU_97',
 'DMSLSTM_TPU_98']

In [62]:
# pair every experiment identifier with its parameter combination as
# [model_id, batch_size, train_steps, base_learning_rate]

# both input lists are maintained by hand, so stop here when their lengths disagree
# instead of failing halfway through or silently dropping combinations
if len(model_ids_list) != len(combinations):
    raise ValueError('{} model ids for {} parameter combinations'.format(
        len(model_ids_list), len(combinations)))

experiments_list = [[model_id, batch_size, steps, learning_rate]
                    for model_id, (batch_size, steps, learning_rate)
                    in zip(model_ids_list, combinations)]

In [63]:
experiments_list

[['DMSLSTM_TPU_87', 192, 2344, 0.01333333],
 ['DMSLSTM_TPU_88', 320, 1406, 0.008],
 ['DMSLSTM_TPU_89', 384, 1172, 0.00666667],
 ['DMSLSTM_TPU_91', 448, 1004, 0.00571429],
 ['DMSLSTM_TPU_92', 576, 781, 0.00444444],
 ['DMSLSTM_TPU_93', 640, 703, 0.004],
 ['DMSLSTM_TPU_94', 704, 639, 0.00363636],
 ['DMSLSTM_TPU_95', 768, 586, 0.00333333],
 ['DMSLSTM_TPU_96', 832, 541, 0.00307692],
 ['DMSLSTM_TPU_97', 896, 502, 0.00285714],
 ['DMSLSTM_TPU_98', 960, 469, 0.00266667]]

In [64]:
# number of training commands generated per experiment (executions 00 to 09)
num_executions = 10

In [65]:
# gs:// location of the dataset, passed to every generated command as --data_dir
data_dir = 'gs://cbidmltsf/sldbs/CPE04015_desbI_H_2017-04-01_00:00:00_2018-02-28_23:00:00_H008001001_D008024001_W004168001'

In [66]:
# value of the --use_tpu flag; a string because it is pasted verbatim into the command
use_tpu = 'true'

In [67]:
# value of the --iterations_per_loop flag in every generated command
iterations_per_loop = 20000

In [68]:
# value of the --skip_host_call flag; a string because it is pasted verbatim into the command
skip_host_call = 'false'

In [69]:
# value of the --save_summary_steps flag in every generated command
save_summary_steps = 20000

In [70]:
# value of the --log_step_count_steps flag in every generated command
log_step_count_steps = 20000

In [71]:
# value of the --mode flag (single-quoted in the generated command)
mode = 'train'

In [72]:
# value of the --precision flag (single-quoted in the generated command)
precision = 'bfloat16'

In [None]:
# programmatically build the training commands

In [85]:
# template of one training command; adjacent string literals are concatenated into a
# single string, and the sixteen placeholders are filled in order by format() below
command_template = (
    "python3 train.py"
    " --model_dir='gs://cbidmltsf/models/{}_{:02d}'"
    " --data_dir='{}'"
    " --use_tpu={}"
    " --train_batch_size={}"
    " --train_steps={}"
    " --base_learning_rate={}"
    " --iterations_per_loop={}"
    " --skip_host_call={}"
    " --save_summary_steps={}"
    " --log_step_count_steps={}"
    " --mode='{}'"
    " --precision='{}'"
    " --persist_parameters={}"
    " 1>../logs/{}_{:02d}.txt 2>&1"
)

# print num_executions commands per experiment; the execution number (from 00)
# is appended to both the model directory and the log file name
for experiment in experiments_list:
    model_id, train_batch_size, num_train_steps, base_learning_rate = experiment
    for execution in range(num_executions):
        # turn on persist_parameters only for the first execution
        # NOTE(review): this renders as True/False, while the other switches are the
        # lowercase strings 'true'/'false' - confirm train.py accepts both spellings
        persist_parameters = execution == 0
        print(command_template.format(
            model_id, execution,
            data_dir,
            use_tpu,
            train_batch_size,
            num_train_steps,
            base_learning_rate,
            iterations_per_loop,
            skip_host_call,
            save_summary_steps,
            log_step_count_steps,
            mode,
            precision,
            persist_parameters,
            model_id, execution))

python3 train.py --model_dir='gs://cbidmltsf/models/DMSLSTM_TPU_87_00' --data_dir='gs://cbidmltsf/sldbs/CPE04015_desbI_H_2017-04-01_00:00:00_2018-02-28_23:00:00_H008001001_D008024001_W004168001' --use_tpu=true --train_batch_size=192 --train_steps=2344 --base_learning_rate=0.01333333 --iterations_per_loop=20000 --skip_host_call=false --save_summary_steps=20000 --log_step_count_steps=20000 --mode='train' --precision='bfloat16' --persist_parameters=True 1>../logs/DMSLSTM_TPU_87_00.txt 2>&1
python3 train.py --model_dir='gs://cbidmltsf/models/DMSLSTM_TPU_87_01' --data_dir='gs://cbidmltsf/sldbs/CPE04015_desbI_H_2017-04-01_00:00:00_2018-02-28_23:00:00_H008001001_D008024001_W004168001' --use_tpu=true --train_batch_size=192 --train_steps=2344 --base_learning_rate=0.01333333 --iterations_per_loop=20000 --skip_host_call=false --save_summary_steps=20000 --log_step_count_steps=20000 --mode='train' --precision='bfloat16' --persist_parameters=False 1>../logs/DMSLSTM_TPU_87_01.txt 2>&1
python3 train.p