In [1]:
import sys
sys.path.append("../h2o-py/build/main")
import h2o
import time
import os
import statistics
import random

versionFromGradle='3.37.0',projectVersion='3.37.0.99999',branch='valenad-PUBDEV-8470-checkpointing',lastCommitHash='271a91cd10b36e841f7e96cbe26ff6155993250b',gitDescribe='jenkins-master-5866-20-g271a91cd10',compiledOn='2022-06-27 01:55:35',compiledBy='adam'


In [2]:
def get_folder_size(start_path = '.'):
    """Return the combined size, in bytes, of the files found under ``start_path``.

    The directory tree is traversed with ``os.walk``. Entries that are
    symbolic links are left out of the sum.
    """
    def _iter_file_paths():
        # Yield the joined path of every file name reported by os.walk.
        for root, _subdirs, names in os.walk(start_path):
            for name in names:
                yield os.path.join(root, name)

    return sum(
        os.path.getsize(path)
        for path in _iter_file_paths()
        if not os.path.islink(path)
    )


def to_MB(size_in_bytes):
    """Convert a byte count to megabytes, taking 1 MB as 1024 * 1024 bytes."""
    bytes_per_mb = 1024 * 1024
    return size_in_bytes / bytes_per_mb



def measure_model(ntrees, max_depth, checkpoints_path=None, n_runs=10):
    """Train the same GBM configuration several times and report time/size statistics.

    One model is trained per seed in ``range(n_runs)``. Every trained model is
    saved under ``models/`` and the size of the saved file is recorded. When
    ``checkpoints_path`` is given, each run writes its in-training checkpoints
    to its own directory, named
    ``<checkpoints_path>_<ntrees>_<max_depth>_<seed>``, and the size of that
    directory is recorded as well.

    This function reads the notebook-level names ``H2OGradientBoostingEstimator``,
    ``predictors``, ``response`` and ``train``; they must be defined before it
    is called.

    Parameters
    ----------
    ntrees : int
        Number of trees passed to the estimator.
    max_depth : int
        Maximum tree depth passed to the estimator.
    checkpoints_path : str or None
        Prefix of the per-run checkpoint directories. ``None`` disables
        in-training checkpoints.
    n_runs : int
        Number of repetitions (seeds ``0 .. n_runs - 1``). Defaults to 10.

    Returns
    -------
    tuple(list[float], list[float])
        Training durations in seconds and saved-model sizes in MB, one entry
        per run.

    Raises
    ------
    ValueError
        If ``n_runs`` is smaller than 1 (the means would be undefined).
    """
    if n_runs < 1:
        raise ValueError("n_runs must be at least 1")

    use_checkpoints = checkpoints_path is not None
    times = []
    file_sizes = []
    checkpoint_sizes = []

    for seed in range(n_runs):
        if use_checkpoints:
            checkpoints_path_unique = checkpoints_path + "_" + str(ntrees) + "_" + str(max_depth) + "_" + str(seed)
            print("Checkpoints path:", checkpoints_path_unique)
        else:
            checkpoints_path_unique = None

        # score_tree_interval is set above ntrees, so the interval is never
        # reached while training -- presumably to keep periodic scoring out of
        # the measured time (TODO confirm).
        creditcards_gbm = H2OGradientBoostingEstimator(ntrees=ntrees,
                                                       max_depth=max_depth,
                                                       min_rows=1,
                                                       score_tree_interval=ntrees + 1,
                                                       in_training_checkpoints_dir=checkpoints_path_unique,
                                                       seed=seed)

        # perf_counter is monotonic, so the measured duration cannot be skewed
        # by system clock adjustments during a long training run.
        start = time.perf_counter()
        creditcards_gbm.train(x=predictors,
                              y=response,
                              training_frame=train)
        training_time = time.perf_counter() - start
        print("Took", training_time, "s")
        times.append(training_time)

        model_path = h2o.save_model(creditcards_gbm, "models/")
        file_sizes.append(to_MB(os.path.getsize(model_path)))

        if use_checkpoints:
            # Measure every run's checkpoint directory, not only the last one,
            # so the figure below is averaged the same way as the model size.
            checkpoint_sizes.append(to_MB(get_folder_size(checkpoints_path_unique)))

    mean_model_size = statistics.mean(file_sizes)
    print("Mean training times: ", statistics.mean(times))
    print("Mean model size in MegaBytes is", mean_model_size)

    if use_checkpoints:
        # Mean size of one run's checkpoint directory, in MB.
        checkpoints_folder_size = statistics.mean(checkpoint_sizes)
        print("Size of all checkpoints is: ", checkpoints_folder_size, 'MB')
        print("Its ", checkpoints_folder_size / mean_model_size, "times more that size of final model")

    return times, file_sizes

In [3]:
from h2o.estimators import H2OGradientBoostingEstimator
# Connect to an H2O instance, starting a local server if none is running.
h2o.init()

# Credit-card fraud dataset; the path is relative to the notebook's working directory.
creditcards = h2o.import_file("../bigdata/laptop/creditcardfraud/creditcardfraud.csv")

response = "Class"
columns = creditcards.columns
# Pick predictors by name rather than by position, so the response can never
# end up among them if the column order of the file changes.
predictors = [column for column in columns if column != response]

# Convert the response to a factor (categorical) column before training.
creditcards[response] = creditcards[response].asfactor()

# The whole frame is used for training; no validation split is made here.
train = creditcards

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: java version "1.8.0_311"; Java(TM) SE Runtime Environment (build 1.8.0_311-b11); Java HotSpot(TM) 64-Bit Server VM (build 25.311-b11, mixed mode)
  Starting server from /Users/adam/h2o-3/build/h2o.jar
  Ice root: /var/folders/2z/9gdhqbns0djdj9crb242sgc00000gn/T/tmp10_iht84
  JVM stdout: /var/folders/2z/9gdhqbns0djdj9crb242sgc00000gn/T/tmp10_iht84/h2o_adam_started_from_python.out
  JVM stderr: /var/folders/2z/9gdhqbns0djdj9crb242sgc00000gn/T/tmp10_iht84/h2o_adam_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,02 secs
H2O_cluster_timezone:,Europe/Prague
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.37.0.99999
H2O_cluster_version_age:,4 minutes
H2O_cluster_name:,H2O_from_python_adam_dpmb2h
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.556 Gb
H2O_cluster_total_cores:,12
H2O_cluster_allowed_cores:,12


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [4]:
# Compare training time with and without in-training checkpoints for a range
# of model sizes.
NTREES_GRID = [100, 200, 500, 1000, 1500, 2000]
MAX_DEPTH = 30

slowdowns = []
for ntrees in NTREES_GRID:
    # Each call returns (training times in seconds, saved model sizes in MB).
    model = measure_model(ntrees, MAX_DEPTH)
    model_checkpoints = measure_model(ntrees, MAX_DEPTH, "checkpoints_gbm")
    display("Without checkpoints times", model)
    display("With checkpoints times", model_checkpoints)

    # Relative change of the mean training time; a positive value means that
    # training with checkpoints was slower.
    baseline_mean = statistics.mean(model[0])
    checkpointed_mean = statistics.mean(model_checkpoints[0])
    slowdown = checkpointed_mean / baseline_mean - 1
    slowdowns.append(slowdown)
    print("Training time is:", slowdown * 100, "% different with checkpoints")

display("Slowdowns", slowdowns)
# Note: this average is printed as a fraction (0.02 means 2 %), unlike the
# per-size lines above, which are printed in percent.
print("Training time slowdown average is:", statistics.mean(slowdowns))

gbm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
Took 33.4317569732666 s
gbm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
Took 33.27121710777283 s
gbm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
Took 31.909560203552246 s
gbm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
Took 31.636754989624023 s
gbm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
Took 31.72126603126526 s
gbm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
Took 31.708994150161743 s
gbm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
Took 34.28730392456055 s
gbm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
Took 35.95831298828125 s
gbm Model Build progress: |███████████

'Without checkpoints times'

([33.4317569732666,
  33.27121710777283,
  31.909560203552246,
  31.636754989624023,
  31.72126603126526,
  31.708994150161743,
  34.28730392456055,
  35.95831298828125,
  33.62814712524414,
  33.56273412704468],
 [1.4883661270141602,
  1.4884281158447266,
  1.4884319305419922,
  1.4884357452392578,
  1.4884471893310547,
  1.4884347915649414,
  1.4884347915649414,
  1.4884471893310547,
  1.4884510040283203,
  1.488450050354004])

'With checkpoints times'

([33.843544006347656,
  33.82450771331787,
  34.1566059589386,
  34.20288801193237,
  35.25788402557373,
  34.07809495925903,
  35.543802976608276,
  34.49162411689758,
  33.99303317070007,
  34.301186084747314],
 [1.488499641418457,
  1.4884929656982422,
  1.488485336303711,
  1.4884824752807617,
  1.4884862899780273,
  1.4884700775146484,
  1.4884977340698242,
  1.4884977340698242,
  1.4884700775146484,
  1.488490104675293])

Training time is: 3.79840345822009 % different with checkpoints
gbm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
Took 67.62316489219666 s
gbm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
Took 67.97757983207703 s
gbm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
Took 67.77299809455872 s
gbm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
Took 67.0569839477539 s
gbm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
Took 67.00074529647827 s
gbm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
Took 66.39607286453247 s
gbm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
Took 66.57682704925537 s
gbm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
To

'Without checkpoints times'

([67.62316489219666,
  67.97757983207703,
  67.77299809455872,
  67.0569839477539,
  67.00074529647827,
  66.39607286453247,
  66.57682704925537,
  67.47103214263916,
  66.03261804580688,
  66.46323800086975],
 [2.9566116333007812,
  2.956605911254883,
  2.956597328186035,
  2.956601142883301,
  2.9565792083740234,
  2.956608772277832,
  2.956599235534668,
  2.9566240310668945,
  2.9565982818603516,
  2.956575393676758])

'With checkpoints times'

([65.93310117721558,
  68.41800117492676,
  66.71239399909973,
  66.87696313858032,
  66.51057314872742,
  65.6205370426178,
  67.06651997566223,
  66.41373205184937,
  65.71719098091125,
  65.47262287139893],
 [2.9566478729248047,
  2.9566287994384766,
  2.956650733947754,
  2.9566125869750977,
  2.9566659927368164,
  2.9566688537597656,
  2.9566287994384766,
  2.956639289855957,
  2.956624984741211,
  2.9566469192504883])

Training time is: -0.839777141368403 % different with checkpoints
gbm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
Took 106.10653328895569 s
gbm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
Took 104.93350768089294 s
gbm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
Took 103.66442489624023 s
gbm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
Took 106.11266803741455 s
gbm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
Took 103.90525794029236 s
gbm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
Took 104.36680793762207 s
gbm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
Took 107.04533791542053 s
gbm Model Build progress: |██████████████████████████████████████████████████████| (don

'Without checkpoints times'

([106.10653328895569,
  104.93350768089294,
  103.66442489624023,
  106.11266803741455,
  103.90525794029236,
  104.36680793762207,
  107.04533791542053,
  104.5396499633789,
  104.64195489883423,
  105.40122699737549],
 [4.370203971862793,
  4.370180130004883,
  4.370175361633301,
  4.370210647583008,
  4.370141983032227,
  4.370172500610352,
  4.37020206451416,
  4.370199203491211,
  4.370168685913086,
  4.370205879211426])

'With checkpoints times'

([104.3037109375,
  104.41359901428223,
  106.48323392868042,
  103.90191793441772,
  104.59812617301941,
  104.8479528427124,
  103.31968832015991,
  102.27428579330444,
  104.49757814407349,
  102.8906261920929],
 [4.370213508605957,
  4.370241165161133,
  4.370232582092285,
  4.370251655578613,
  4.370242118835449,
  4.370234489440918,
  4.3702497482299805,
  4.370230674743652,
  4.370217323303223,
  4.370224952697754])

Training time is: -0.8743217293592842 % different with checkpoints
gbm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
Took 154.10553908348083 s
gbm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
Took 151.9739511013031 s
gbm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
Took 154.96620202064514 s
gbm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
Took 152.19303607940674 s
gbm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
Took 152.7576458454132 s
gbm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
Took 151.31441593170166 s
gbm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
Took 152.94519186019897 s
gbm Model Build progress: |██████████████████████████████████████████████████████| (done

'Without checkpoints times'

([154.10553908348083,
  151.9739511013031,
  154.96620202064514,
  152.19303607940674,
  152.7576458454132,
  151.31441593170166,
  152.94519186019897,
  151.56276488304138,
  152.70845985412598,
  150.1232409477234],
 [4.964383125305176,
  4.964374542236328,
  4.964394569396973,
  4.964398384094238,
  4.964356422424316,
  4.96435546875,
  4.964405059814453,
  4.964398384094238,
  4.964360237121582,
  4.964398384094238])

'With checkpoints times'

([156.2925500869751,
  153.95087718963623,
  155.4369673728943,
  154.43201112747192,
  156.85681986808777,
  154.48422408103943,
  156.45426106452942,
  154.18208003044128,
  155.9257550239563,
  153.9195852279663],
 [4.964405059814453,
  4.96440315246582,
  4.964430809020996,
  4.964460372924805,
  4.964448928833008,
  4.964426040649414,
  4.964468955993652,
  4.964385032653809,
  4.964483261108398,
  4.964459419250488])

Training time is: 1.7895697672067223 % different with checkpoints
gbm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
Took 205.30768418312073 s
gbm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
Took 205.47759318351746 s
gbm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
Took 203.3487730026245 s
gbm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
Took 204.82065105438232 s
gbm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
Took 204.58983516693115 s
gbm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
Took 202.52561116218567 s
gbm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
Took 204.97925209999084 s
gbm Model Build progress: |██████████████████████████████████████████████████████| (done

'Without checkpoints times'

([205.30768418312073,
  205.47759318351746,
  203.3487730026245,
  204.82065105438232,
  204.58983516693115,
  202.52561116218567,
  204.97925209999084,
  205.38920283317566,
  202.96853971481323,
  206.36109709739685],
 [5.581172943115234,
  5.581241607666016,
  5.5811052322387695,
  5.5811004638671875,
  5.5811309814453125,
  5.581109046936035,
  5.581120491027832,
  5.581153869628906,
  5.581157684326172,
  5.5811357498168945])

'With checkpoints times'

([213.13341689109802,
  211.46776390075684,
  213.54406094551086,
  214.3740200996399,
  210.64056301116943,
  213.6485161781311,
  212.85376119613647,
  213.08496594429016,
  212.69236707687378,
  213.2327880859375],
 [5.581132888793945,
  5.581255912780762,
  5.581235885620117,
  5.581194877624512,
  5.581178665161133,
  5.581205368041992,
  5.581183433532715,
  5.58125114440918,
  5.581118583679199,
  5.581155776977539])

Training time is: 4.052462162172565 % different with checkpoints
gbm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
Took 264.62720680236816 s
gbm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
Took 265.13933968544006 s
gbm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
Took 262.63359093666077 s
gbm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
Took 261.78533720970154 s
gbm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
Took 260.9616448879242 s
gbm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
Took 261.19680190086365 s
gbm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
Took 262.99491786956787 s
gbm Model Build progress: |██████████████████████████████████████████████████████| (done)

'Without checkpoints times'

([264.62720680236816,
  265.13933968544006,
  262.63359093666077,
  261.78533720970154,
  260.9616448879242,
  261.19680190086365,
  262.99491786956787,
  264.33857774734497,
  260.40155386924744,
  262.5372657775879],
 [6.2241668701171875,
  6.224016189575195,
  6.224138259887695,
  6.224076271057129,
  6.224038124084473,
  6.224081039428711,
  6.224004745483398,
  6.224102973937988,
  6.224065780639648,
  6.224059104919434])

'With checkpoints times'

([272.817174911499,
  271.63794898986816,
  274.88929200172424,
  277.2193431854248,
  277.44274616241455,
  280.67250895500183,
  281.95521306991577,
  281.5670781135559,
  281.25603103637695,
  282.94080686569214],
 [6.224216461181641,
  6.22420597076416,
  6.224215507507324,
  6.224140167236328,
  6.224234580993652,
  6.224040985107422,
  6.224025726318359,
  6.224079132080078,
  6.224143028259277,
  6.224170684814453])

Training time is: 5.930897115037825 % different with checkpoints


'Slowdowns'

[0.0379840345822009,
 -0.00839777141368403,
 -0.008743217293592842,
 0.017895697672067223,
 0.040524621621725654,
 0.059308971150378254]

Training time slowdown average is: 0.02309538938651586
