In [1]:
import os
import json
import numpy as np
import time
from tensorboard.backend.event_processing import event_accumulator

In [2]:
# First training round: models were trained with skip_host_call set to true,
# so they have no TensorBoard summaries to read a wall time from.
# TODO: build a script that sets wall_time to None for those models.

In [3]:
# Second training round: models were trained with skip_host_call set to false,
# so their wall times were recorded as TensorBoard tensor events.
# Retrieve them as a list.

In [4]:
# alphabetically sorted names of every entry found in the models directory
models_list = sorted(os.listdir('/home/developer/gcp/cbidmltsf/models'))

In [5]:
# how many entries the models directory holds
len(models_list)

908

In [6]:
# show the entry at the row that separates the first and second training rounds;
# everything before this index is treated as the first round
models_list[379]

'ARTRFDC_CPU_000_00'

In [7]:
# The first training round is every entry that sorts before the first model of
# the second round. Looking the boundary up by name (instead of the hard-coded
# row number 379) keeps the slice correct when model directories are added or
# removed, and raises ValueError instead of silently mis-slicing if the
# boundary model is missing.
first_round_list = models_list[:models_list.index('ARTRFDC_CPU_000_00')]

In [8]:
# how many models belong to the first training round
len(first_round_list)

379

In [9]:
# build a snippet to produce json files for models in the first training round, from TPU_10 to TPU_52

In [10]:
# data structure: names of model directories and number of executions for each model
# to build the paths to all of the model directories
# each entry is [model directory name, number of executions]
# NOTE: the sequence of TPU_NN suffixes is not contiguous: 12, 28, 29 and 43-49 do not appear
# NOTE(review): `targets` is not referenced by any later cell visible here - confirm it is still needed
targets = [
    ['166404_083202_043201_TPU_10', 10],
    ['086404_043202_043201_TPU_11', 10],
    ['086404_043202_043201_TPU_13', 10],
    ['086404_086402_043201_TPU_14', 10],
    ['086404_086402_043201_TPU_15', 10],
    ['086403_083202_043201_TPU_16', 10],
    ['086401_083201_043201_TPU_17', 10],    
    ['086404_086404_043202_TPU_18', 10],
    ['086404_086404_043202_TPU_19', 10],
    ['086404_086404_043202_TPU_20', 10],
    ['086404_086404_046404_TPU_21', 10],
    ['086404_086404_043202_TPU_22', 10],
    ['086404_086404_043202_TPU_23', 10],
    ['086404_086404_043202_TPU_24', 10],
    ['089604_089604_043202_TPU_25', 10],
    ['084804_084804_043202_TPU_26', 10],
    ['0812804_0812804_046402_TPU_27', 10],
    ['0812804_0812804_046402_TPU_30', 10],
    ['0812804_0812804_046402_TPU_31', 10],
    ['0812804_0812804_046402_TPU_32', 10],
    ['0812804_0812804_046402_TPU_33', 10],
    ['0812804_0812804_046402_TPU_34', 10],
    ['0812804_0812804_046402_TPU_35', 10],
    ['0812804_0812804_046402_TPU_36', 9],  # the only entry with 9 executions
    ['0812804_0812804_046402_TPU_37', 10],
    ['0812804_0812804_046402_TPU_38', 10],
    ['0812804_0812804_046402_TPU_39', 10],
    ['0812804_0812804_046402_TPU_40', 10],
    ['0812804_0812804_046402_TPU_41', 10],
    ['0812804_0812804_046402_TPU_42', 10],
    ['0812804_0812804_046402_TPU_50', 30],
    ['0812804_0812804_046402_TPU_51', 30],
    ['0812804_0812804_046402_TPU_52', 20],
]

In [11]:
def get_wall_time(path_to_logdir, tag='loss'):
    '''
    receives a UNIX path to a TensorBoard logdir of a model
    returns the wall time, in seconds, spanned by the model's training summaries

    the value is the wall-clock timestamp of the last tensor event recorded
    under `tag` minus the timestamp of the first one, so any time elapsed
    before the first summary was written is not included

    path_to_logdir: path to the TensorBoard logdir of the model
    tag: name of the tensor summary whose first and last events delimit the
         run; defaults to 'loss'

    raises KeyError when the logdir holds no tensor events under `tag`
    (for instance, a model trained without summaries)
    '''
    # an event accumulator to the logdir
    ea = event_accumulator.EventAccumulator(
        path_to_logdir,
        size_guidance={
            # a size of 0 retrieves all of the events for that category
            event_accumulator.SCALARS: 0,
            event_accumulator.TENSORS: 0,
        },
    )
    # loads events from file
    ea.Reload()

    # query the accumulator once and reuse the result for both endpoints
    try:
        tagged_events = ea.Tensors(tag)
    except KeyError:
        # same exception type as before, but say which logdir and tag failed
        raise KeyError(
            "no tensor events tagged '{}' were found in '{}'".format(tag, path_to_logdir)
        )

    # wall time is end time - start time; each event exposes its timestamp
    # as the `wall_time` field
    wall_time = tagged_events[-1].wall_time - tagged_events[0].wall_time
    print("Wall time for model in '{}' is {} seconds.".format(path_to_logdir,
                                                            wall_time))
    return wall_time

In [15]:
# test the above function on the logdir of a single model;
# the returned wall time is shown as the cell result
get_wall_time('/home/developer/gcp/cbidmltsf/models/DMSLSTM_TPU_007_00')

Found more than one graph event per run, or there was a metagraph containing a graph_def, as well as one or more graph events.  Overwriting the graph with the newest event.
Found more than one metagraph event per run. Overwriting the metagraph with the newest event.


Wall time for model in '/home/developer/gcp/cbidmltsf/models/DMSLSTM_TPU_007_00' is 25.48059105873108 seconds.


25.48059105873108

In [43]:
# build a Python script that inputs the flag --model_dir to produce and persist the json files
# use the following snippet as a base

In [None]:
# update wall times interactively, code the Python script later...

In [16]:
# identifiers of the models to process: executions 00 to 09 of each model
# family, in the order the families are listed below
current_list = [
    '{}_{:02d}'.format(family, execution)
    for family in ('ARTRFDC_TPU_000', 'DMSLSTM_TPU_006', 'EDSLSTM_TPU_013')
    for execution in range(10)
]

In [17]:
# full path to the model directory of every identifier in current_list
model_dirs = list(map('/home/developer/gcp/cbidmltsf/models/{}'.format, current_list))

In [18]:
# display the paths that were built, as a visual check
model_dirs

['/home/developer/gcp/cbidmltsf/models/ARTRFDC_TPU_000_00',
 '/home/developer/gcp/cbidmltsf/models/ARTRFDC_TPU_000_01',
 '/home/developer/gcp/cbidmltsf/models/ARTRFDC_TPU_000_02',
 '/home/developer/gcp/cbidmltsf/models/ARTRFDC_TPU_000_03',
 '/home/developer/gcp/cbidmltsf/models/ARTRFDC_TPU_000_04',
 '/home/developer/gcp/cbidmltsf/models/ARTRFDC_TPU_000_05',
 '/home/developer/gcp/cbidmltsf/models/ARTRFDC_TPU_000_06',
 '/home/developer/gcp/cbidmltsf/models/ARTRFDC_TPU_000_07',
 '/home/developer/gcp/cbidmltsf/models/ARTRFDC_TPU_000_08',
 '/home/developer/gcp/cbidmltsf/models/ARTRFDC_TPU_000_09',
 '/home/developer/gcp/cbidmltsf/models/DMSLSTM_TPU_006_00',
 '/home/developer/gcp/cbidmltsf/models/DMSLSTM_TPU_006_01',
 '/home/developer/gcp/cbidmltsf/models/DMSLSTM_TPU_006_02',
 '/home/developer/gcp/cbidmltsf/models/DMSLSTM_TPU_006_03',
 '/home/developer/gcp/cbidmltsf/models/DMSLSTM_TPU_006_04',
 '/home/developer/gcp/cbidmltsf/models/DMSLSTM_TPU_006_05',
 '/home/developer/gcp/cbidmltsf/models/D

In [19]:
# destination for the json files: stats/training_wall_times/ under the local
# bucket path; it does not change per model, so it is built once
local_bucket_path = '/home/developer/gcp/cbidmltsf'
wall_times_dir = '{}/stats/training_wall_times'.format(local_bucket_path)
# make sure the destination exists before writing (no effect if it already does)
os.makedirs(wall_times_dir, exist_ok=True)

for model_dir in model_dirs:
    # the model identifier is the last component of the path
    # (e.g. 'ARTRFDC_TPU_000_00'); taking the basename instead of the last
    # 18 characters also works for identifiers of any other length
    model_id = os.path.basename(os.path.normpath(model_dir))

    wall_time = get_wall_time(model_dir)

    # create a Python dictionary holding the measured wall time
    wt_dictionary = {
        'wall_time': wall_time
    }
    # one json file per model, named after the model identifier
    output_file_name = '{}/{}.json'.format(wall_times_dir, model_id)
    # and persist the dictionary as json file
    with open(output_file_name, 'w') as outfile:
        json.dump(wt_dictionary, outfile, indent=4)

Found more than one graph event per run, or there was a metagraph containing a graph_def, as well as one or more graph events.  Overwriting the graph with the newest event.
Found more than one metagraph event per run. Overwriting the metagraph with the newest event.
Found more than one graph event per run, or there was a metagraph containing a graph_def, as well as one or more graph events.  Overwriting the graph with the newest event.
Found more than one metagraph event per run. Overwriting the metagraph with the newest event.


Wall time for model in '/home/developer/gcp/cbidmltsf/models/ARTRFDC_TPU_000_00' is 66.73649597167969 seconds.


Found more than one graph event per run, or there was a metagraph containing a graph_def, as well as one or more graph events.  Overwriting the graph with the newest event.
Found more than one metagraph event per run. Overwriting the metagraph with the newest event.


Wall time for model in '/home/developer/gcp/cbidmltsf/models/ARTRFDC_TPU_000_01' is 66.11948895454407 seconds.


Found more than one graph event per run, or there was a metagraph containing a graph_def, as well as one or more graph events.  Overwriting the graph with the newest event.
Found more than one metagraph event per run. Overwriting the metagraph with the newest event.


Wall time for model in '/home/developer/gcp/cbidmltsf/models/ARTRFDC_TPU_000_02' is 67.497230052948 seconds.


Found more than one graph event per run, or there was a metagraph containing a graph_def, as well as one or more graph events.  Overwriting the graph with the newest event.
Found more than one metagraph event per run. Overwriting the metagraph with the newest event.


Wall time for model in '/home/developer/gcp/cbidmltsf/models/ARTRFDC_TPU_000_03' is 66.49502611160278 seconds.


Found more than one graph event per run, or there was a metagraph containing a graph_def, as well as one or more graph events.  Overwriting the graph with the newest event.
Found more than one metagraph event per run. Overwriting the metagraph with the newest event.


Wall time for model in '/home/developer/gcp/cbidmltsf/models/ARTRFDC_TPU_000_04' is 65.9960880279541 seconds.


Found more than one graph event per run, or there was a metagraph containing a graph_def, as well as one or more graph events.  Overwriting the graph with the newest event.
Found more than one metagraph event per run. Overwriting the metagraph with the newest event.


Wall time for model in '/home/developer/gcp/cbidmltsf/models/ARTRFDC_TPU_000_05' is 69.26961302757263 seconds.


Found more than one graph event per run, or there was a metagraph containing a graph_def, as well as one or more graph events.  Overwriting the graph with the newest event.
Found more than one metagraph event per run. Overwriting the metagraph with the newest event.


Wall time for model in '/home/developer/gcp/cbidmltsf/models/ARTRFDC_TPU_000_06' is 68.42630100250244 seconds.


Found more than one graph event per run, or there was a metagraph containing a graph_def, as well as one or more graph events.  Overwriting the graph with the newest event.
Found more than one metagraph event per run. Overwriting the metagraph with the newest event.


Wall time for model in '/home/developer/gcp/cbidmltsf/models/ARTRFDC_TPU_000_07' is 65.96745014190674 seconds.


Found more than one graph event per run, or there was a metagraph containing a graph_def, as well as one or more graph events.  Overwriting the graph with the newest event.
Found more than one metagraph event per run. Overwriting the metagraph with the newest event.


Wall time for model in '/home/developer/gcp/cbidmltsf/models/ARTRFDC_TPU_000_08' is 66.14505696296692 seconds.


Found more than one graph event per run, or there was a metagraph containing a graph_def, as well as one or more graph events.  Overwriting the graph with the newest event.
Found more than one metagraph event per run. Overwriting the metagraph with the newest event.


Wall time for model in '/home/developer/gcp/cbidmltsf/models/ARTRFDC_TPU_000_09' is 65.80707907676697 seconds.
Wall time for model in '/home/developer/gcp/cbidmltsf/models/DMSLSTM_TPU_006_00' is 24.622194051742554 seconds.


Found more than one graph event per run, or there was a metagraph containing a graph_def, as well as one or more graph events.  Overwriting the graph with the newest event.
Found more than one metagraph event per run. Overwriting the metagraph with the newest event.
Found more than one graph event per run, or there was a metagraph containing a graph_def, as well as one or more graph events.  Overwriting the graph with the newest event.
Found more than one metagraph event per run. Overwriting the metagraph with the newest event.


Wall time for model in '/home/developer/gcp/cbidmltsf/models/DMSLSTM_TPU_006_01' is 25.001632928848267 seconds.
Wall time for model in '/home/developer/gcp/cbidmltsf/models/DMSLSTM_TPU_006_02' is 24.876068115234375 seconds.


Found more than one graph event per run, or there was a metagraph containing a graph_def, as well as one or more graph events.  Overwriting the graph with the newest event.
Found more than one metagraph event per run. Overwriting the metagraph with the newest event.
Found more than one graph event per run, or there was a metagraph containing a graph_def, as well as one or more graph events.  Overwriting the graph with the newest event.
Found more than one metagraph event per run. Overwriting the metagraph with the newest event.


Wall time for model in '/home/developer/gcp/cbidmltsf/models/DMSLSTM_TPU_006_03' is 24.82333493232727 seconds.
Wall time for model in '/home/developer/gcp/cbidmltsf/models/DMSLSTM_TPU_006_04' is 24.68921685218811 seconds.


Found more than one graph event per run, or there was a metagraph containing a graph_def, as well as one or more graph events.  Overwriting the graph with the newest event.
Found more than one metagraph event per run. Overwriting the metagraph with the newest event.
Found more than one graph event per run, or there was a metagraph containing a graph_def, as well as one or more graph events.  Overwriting the graph with the newest event.
Found more than one metagraph event per run. Overwriting the metagraph with the newest event.


Wall time for model in '/home/developer/gcp/cbidmltsf/models/DMSLSTM_TPU_006_05' is 24.5868980884552 seconds.
Wall time for model in '/home/developer/gcp/cbidmltsf/models/DMSLSTM_TPU_006_06' is 24.339043855667114 seconds.


Found more than one graph event per run, or there was a metagraph containing a graph_def, as well as one or more graph events.  Overwriting the graph with the newest event.
Found more than one metagraph event per run. Overwriting the metagraph with the newest event.
Found more than one graph event per run, or there was a metagraph containing a graph_def, as well as one or more graph events.  Overwriting the graph with the newest event.
Found more than one metagraph event per run. Overwriting the metagraph with the newest event.


Wall time for model in '/home/developer/gcp/cbidmltsf/models/DMSLSTM_TPU_006_07' is 24.257344007492065 seconds.


Found more than one graph event per run, or there was a metagraph containing a graph_def, as well as one or more graph events.  Overwriting the graph with the newest event.
Found more than one metagraph event per run. Overwriting the metagraph with the newest event.


Wall time for model in '/home/developer/gcp/cbidmltsf/models/DMSLSTM_TPU_006_08' is 24.316033124923706 seconds.


Found more than one graph event per run, or there was a metagraph containing a graph_def, as well as one or more graph events.  Overwriting the graph with the newest event.
Found more than one metagraph event per run. Overwriting the metagraph with the newest event.


Wall time for model in '/home/developer/gcp/cbidmltsf/models/DMSLSTM_TPU_006_09' is 24.376765966415405 seconds.
Wall time for model in '/home/developer/gcp/cbidmltsf/models/EDSLSTM_TPU_013_00' is 28.576439142227173 seconds.


Found more than one graph event per run, or there was a metagraph containing a graph_def, as well as one or more graph events.  Overwriting the graph with the newest event.
Found more than one metagraph event per run. Overwriting the metagraph with the newest event.
Found more than one graph event per run, or there was a metagraph containing a graph_def, as well as one or more graph events.  Overwriting the graph with the newest event.
Found more than one metagraph event per run. Overwriting the metagraph with the newest event.


Wall time for model in '/home/developer/gcp/cbidmltsf/models/EDSLSTM_TPU_013_01' is 28.590346097946167 seconds.
Wall time for model in '/home/developer/gcp/cbidmltsf/models/EDSLSTM_TPU_013_02' is 28.598254919052124 seconds.


Found more than one graph event per run, or there was a metagraph containing a graph_def, as well as one or more graph events.  Overwriting the graph with the newest event.
Found more than one metagraph event per run. Overwriting the metagraph with the newest event.
Found more than one graph event per run, or there was a metagraph containing a graph_def, as well as one or more graph events.  Overwriting the graph with the newest event.
Found more than one metagraph event per run. Overwriting the metagraph with the newest event.


Wall time for model in '/home/developer/gcp/cbidmltsf/models/EDSLSTM_TPU_013_03' is 28.64410400390625 seconds.
Wall time for model in '/home/developer/gcp/cbidmltsf/models/EDSLSTM_TPU_013_04' is 28.65162682533264 seconds.


Found more than one graph event per run, or there was a metagraph containing a graph_def, as well as one or more graph events.  Overwriting the graph with the newest event.
Found more than one metagraph event per run. Overwriting the metagraph with the newest event.
Found more than one graph event per run, or there was a metagraph containing a graph_def, as well as one or more graph events.  Overwriting the graph with the newest event.
Found more than one metagraph event per run. Overwriting the metagraph with the newest event.


Wall time for model in '/home/developer/gcp/cbidmltsf/models/EDSLSTM_TPU_013_05' is 28.654091119766235 seconds.
Wall time for model in '/home/developer/gcp/cbidmltsf/models/EDSLSTM_TPU_013_06' is 28.659013032913208 seconds.


Found more than one graph event per run, or there was a metagraph containing a graph_def, as well as one or more graph events.  Overwriting the graph with the newest event.
Found more than one metagraph event per run. Overwriting the metagraph with the newest event.
Found more than one graph event per run, or there was a metagraph containing a graph_def, as well as one or more graph events.  Overwriting the graph with the newest event.
Found more than one metagraph event per run. Overwriting the metagraph with the newest event.


Wall time for model in '/home/developer/gcp/cbidmltsf/models/EDSLSTM_TPU_013_07' is 28.676450967788696 seconds.
Wall time for model in '/home/developer/gcp/cbidmltsf/models/EDSLSTM_TPU_013_08' is 28.672755002975464 seconds.


Found more than one graph event per run, or there was a metagraph containing a graph_def, as well as one or more graph events.  Overwriting the graph with the newest event.
Found more than one metagraph event per run. Overwriting the metagraph with the newest event.


Wall time for model in '/home/developer/gcp/cbidmltsf/models/EDSLSTM_TPU_013_09' is 28.656620979309082 seconds.
