In [1]:
# Notebook configuration. These globals are read by the training cell below.

# Root directory of all experiments; it is joined with `experimentName` and also
# holds the shared "experiments.csv" results table.
experimentsPath = "/raid/elhamod/CIFAR_HGNN/experiments/" #"/raid/elhamod/Fish/experiments/"
# Data root handed to ConfigParser and to the dataset managers.
dataPath = "/raid/elhamod/" #"/raid/elhamod/Fish/"
# Name of the experiment to run; also used as part of the wandb group name.
experimentName = "CIFAR_phylogeny_tripletloss_SGDTest_staticlargelambda_updateArchi"
# Value passed to torch.cuda.set_device; None skips that call and prints "using cpu".
device = 1
# Forwarded unchanged to CNN.trainModel(detailed_reporting=...).
detailed_reporting = False

In [2]:
import matplotlib.pyplot as plt
import torch
import sys
import os
from sklearn.metrics import f1_score
import pandas as pd
from tqdm import tqdm
from tqdm.auto import trange
import wandb

import warnings
# warnings.filterwarnings("ignore")

# wandb is treated as an optional logging backend: every later wandb call is
# wrapped in try_running, so a failed import should only print a notice.
# Catch Exception instead of using a bare `except:` so that KeyboardInterrupt
# and SystemExit still propagate while the import stays best-effort.
# NOTE(review): wandb is also imported unconditionally earlier in this cell, so
# this guard only takes effect if that earlier import is removed.
try:
    import wandb
except Exception:
    print('wandb not found')

from myhelpers import config_plots, TrialStatistics
from myhelpers.try_warning import try_running
from HGNN.train import CNN, dataLoader
from myhelpers import cifar_dataLoader
from HGNN.train.configParser import ConfigParser, getModelName, getDatasetName
config_plots.global_settings()

# File name of the shared results table; joined with experimentsPath further down.
experimetnsFileName = "experiments.csv"
# Message handed to try_running together with every wandb call in this cell
# (presumably reported when the wrapped call fails -- confirm in myhelpers.try_warning).
WANDB_message="wandb not working"


# For logging to server
try_running(lambda : wandb.login(), WANDB_message)

# Directory of this particular experiment: experimentsPath/experimentName.
experimentPathAndName = os.path.join(experimentsPath, experimentName)
# set cuda
# A non-None `device` is passed to torch.cuda.set_device; None only prints "using cpu".
if device is not None:
    print("using cuda", device)
    torch.cuda.set_device(device)

else:
    print("using cpu")

# get experiment params
config_parser = ConfigParser(experimentsPath, dataPath, experimentName)

# init experiments file
experimentsFileNameAndPath = os.path.join(experimentsPath, experimetnsFileName)

# Walk the experiments once purely to count them for the progress bar total;
# getExperiments() is called again for the actual loop.
paramsIterator = config_parser.getExperiments()  
number_of_experiments = sum(1 for e in paramsIterator)  
# Counter incremented once per finished experiment in the loop below.
experiment_index = 0

def _upsert_experiment_record(experiments_df, row_information):
    """Return the results table with this trial's row replaced or appended.

    Parameters
    ----------
    experiments_df : pandas.DataFrame
        Current content of the experiments file (may be empty).
    row_information : dict
        Scalar values describing the trial; must contain the keys
        'modelName' and 'experimentName', which together identify a row.

    Returns
    -------
    pandas.DataFrame
        A new frame in which any earlier row with the same
        (modelName, experimentName) pair is removed and the new row is
        appended at the end. The input frame is not modified.
    """
    new_row = pd.DataFrame(row_information, index=[0])

    # Nothing recorded yet (no file, or a frame without any columns).
    if len(experiments_df.columns) == 0:
        return new_row

    # Only look for an earlier record when both key columns are present;
    # otherwise there is nothing that could match.
    if 'modelName' in experiments_df.columns and 'experimentName' in experiments_df.columns:
        # One combined mask instead of chained boolean indexing, which applies
        # a mask built on the full frame to an already filtered frame.
        same_trial = (
            (experiments_df['modelName'] == row_information['modelName'])
            & (experiments_df['experimentName'] == row_information['experimentName'])
        )
        experiments_df = experiments_df[~same_trial]

    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    return pd.concat([experiments_df, new_row], ignore_index=True)


# Loop through experiments
with tqdm(total=number_of_experiments, desc="experiment") as bar:
    for experiment_params in config_parser.getExperiments():
        print(experiment_params)
        experimentHash = TrialStatistics.getTrialName(experiment_params)

        # load images: CIFAR-100 has its own dataset manager, everything else
        # goes through the generic HGNN loader.
        if experiment_params['image_path'] == 'cifar-100-python':
            datasetManager = cifar_dataLoader.datasetManager(experimentPathAndName, dataPath)
        else:
            datasetManager = dataLoader.datasetManager(experimentPathAndName, dataPath)
        datasetManager.updateParams(config_parser.fixPaths(experiment_params))
        train_loader, validation_loader, test_loader = datasetManager.getLoaders()

        # Output sizes of the model: number of fine and coarse classes.
        csv_processor = train_loader.dataset.csv_processor
        architecture = {
            "fine": len(csv_processor.getFineList()),
            "coarse": len(csv_processor.getCoarseList())
        }

        # Loop through n trials
        for i in trange(experiment_params["numOfTrials"], desc="trial"):
            modelName = getModelName(experiment_params, i)
            trialName = os.path.join(experimentPathAndName, modelName)
            trialHash = TrialStatistics.getTrialName(experiment_params, i)

            # Identification columns first, then every experiment parameter.
            row_information = {
                'experimentName': experimentName,
                'modelName': modelName,
                'datasetName': getDatasetName(config_parser.fixPaths(experiment_params)),
                'experimentHash': experimentHash,
                'trialHash': trialHash,
                **experiment_params
            }
            print(row_information)

            run = try_running(lambda : wandb.init(project='HGNN', group=experimentName+"-"+experimentHash, name=trialHash, config=row_information), WANDB_message) #, reinit=True

            # Train/Load model
            model = CNN.create_model(architecture, experiment_params, device=device)

            if os.path.exists(CNN.getModelFile(trialName)):
                # A saved model for this trial already exists: skip training.
                print("Model {0} found!".format(trialName))
            else:
                # Start from the experiment-wide initial weights when available.
                initModelPath = CNN.getInitModelFile(experimentPathAndName)
                if os.path.exists(initModelPath):
                    model.load_state_dict(torch.load(initModelPath))
                    print("Init Model {0} found!".format(initModelPath))
                CNN.trainModel(train_loader, validation_loader, experiment_params, model, trialName, test_loader, device=device, detailed_reporting=detailed_reporting)

            # Add to experiments file (re-read every time so rows written
            # since the last trial are kept).
            if os.path.exists(experimentsFileNameAndPath):
                experiments_df = pd.read_csv(experimentsFileNameAndPath)
            else:
                experiments_df = pd.DataFrame()

            experiments_df = _upsert_experiment_record(experiments_df, row_information)
            experiments_df.to_csv(experimentsFileNameAndPath, header=True, index=False)

            try_running(lambda : run.finish(), WANDB_message)

        bar.update()

        experiment_index = experiment_index + 1



# if __name__ == "__main__":
#     torch.multiprocessing.set_start_method('spawn')
    
#     import argparse

#     parser = argparse.ArgumentParser()
#     parser.add_argument('--cuda', required=True, type=int)
#     parser.add_argument('--experiments', required=True)
#     parser.add_argument('--data', required=True)
#     parser.add_argument('--name', required=True)
#     parser.add_argument('--detailed', required=False, action='store_true')
#     args = parser.parse_args()
#     main(experimentName=args.name, experimentsPath=args.experiments, dataPath=args.data, device=args.cuda, detailed_reporting=args.detailed)


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mmndhamod[0m (use `wandb login --relogin` to force relogin)
experiment:   0%|          | 0/1 [00:00<?, ?it/s]

using cuda 1
{'image_path': 'cifar-100-python', 'suffix': None, 'img_res': 32, 'augmented': False, 'batchSize': 128, 'learning_rate': 0.01, 'numOfTrials': 1, 'fc_layers': 1, 'modelType': 'BB', 'lambda': 10, 'tl_model': 'CIFAR', 'link_layer': 'avgpool', 'adaptive_smoothing': False, 'adaptive_lambda': 0.1, 'adaptive_alpha': 0.001, 'tripletEnabled': True, 'tripletSamples': 3, 'tripletSelector': 'semihard', 'tripletMargin': 0.2, 'phylogeny_loss': False, 'displayName': 'CIFAR large lambda static updated architecture', 'noSpeciesBackprop': False, 'phylogeny_loss_epsilon': 0.03}
Creating dataset...
Loading dataset...
/raid/elhamod/cifar-100-python/
Files already downloaded and verified
CIFAR normalization
Loading dataset...
/raid/elhamod/cifar-100-python/
Files already downloaded and verified
CIFAR normalization
Creating dataset... Done.
Loading saved indices...
Creating loaders...
Creating loaders... Done.


HBox(children=(FloatProgress(value=0.0, description='trial', max=1.0, style=ProgressStyle(description_width='i…

{'experimentName': 'CIFAR_phylogeny_tripletloss_SGDTest_staticlargelambda_updateArchi', 'modelName': 'models/a9ee2e3cdf53aa802cd1b304abfae30309014d126cf3ec35118d2f50', 'datasetName': 'datasplits/dd10c35154ee995db5de0276a21ca1a15a77964ee53811d98f089e89', 'experimentHash': '3793d1aac98cb6961773428741793544abfa53544522635b53889f30', 'trialHash': 'a9ee2e3cdf53aa802cd1b304abfae30309014d126cf3ec35118d2f50', 'image_path': 'cifar-100-python', 'suffix': None, 'img_res': 32, 'augmented': False, 'batchSize': 128, 'learning_rate': 0.01, 'numOfTrials': 1, 'fc_layers': 1, 'modelType': 'BB', 'lambda': 10, 'tl_model': 'CIFAR', 'link_layer': 'avgpool', 'adaptive_smoothing': False, 'adaptive_lambda': 0.1, 'adaptive_alpha': 0.001, 'tripletEnabled': True, 'tripletSamples': 3, 'tripletSelector': 'semihard', 'tripletMargin': 0.2, 'phylogeny_loss': False, 'displayName': 'CIFAR large lambda static updated architecture', 'noSpeciesBackprop': False, 'phylogeny_loss_epsilon': 0.03}


[34m[1mwandb[0m: wandb version 0.10.30 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


layer4 is not in ['conv1', 'bn1', 'relu', 'layer1', 'layer2', 'layer3', 'avgpool']




iteration:   0%|          | 0/500 [00:00<?, ?it/s][A[A

layer4 is not in ['conv1', 'bn1', 'relu', 'layer1', 'layer2', 'layer3', 'avgpool']
Training started...




iteration:   0%|          | 0/500 [00:28<?, ?it/s, min_val_loss=inf, train=0.959, val=0.962, val_loss=3.68][A[A

iteration:   0%|          | 1/500 [00:28<3:57:42, 28.58s/it, min_val_loss=inf, train=0.959, val=0.962, val_loss=3.68][A[A

iteration:   0%|          | 1/500 [02:53<3:57:42, 28.58s/it, min_val_loss=inf, train=0.072, val=0.0693, val_loss=4.58][A[A

iteration:   0%|          | 2/500 [02:53<8:46:30, 63.44s/it, min_val_loss=inf, train=0.072, val=0.0693, val_loss=4.58][A[A

iteration:   0%|          | 2/500 [05:45<8:46:30, 63.44s/it, min_val_loss=14.4, train=0.109, val=0.1, val_loss=4.56]  [A[A

iteration:   1%|          | 3/500 [05:45<13:15:28, 96.03s/it, min_val_loss=14.4, train=0.109, val=0.1, val_loss=4.56][A[A

iteration:   1%|          | 3/500 [10:31<13:15:28, 96.03s/it, min_val_loss=9.96, train=0.187, val=0.17, val_loss=4.51][A[A

iteration:   1%|          | 4/500 [10:31<21:04:26, 152.96s/it, min_val_loss=9.96, train=0.187, val=0.17, val_loss=4.51][A[A

it

iteration:   6%|▋         | 32/500 [2:09:25<43:47:34, 336.87s/it, min_val_loss=2.68, train=0.552, val=0.373, val_loss=4.33][A[A

iteration:   6%|▋         | 32/500 [2:15:21<43:47:34, 336.87s/it, min_val_loss=2.68, train=0.553, val=0.374, val_loss=4.33][A[A

iteration:   7%|▋         | 33/500 [2:15:21<44:26:39, 342.61s/it, min_val_loss=2.68, train=0.553, val=0.374, val_loss=4.33][A[A

iteration:   7%|▋         | 33/500 [2:21:00<44:26:39, 342.61s/it, min_val_loss=2.67, train=0.553, val=0.375, val_loss=4.33][A[A

iteration:   7%|▋         | 34/500 [2:21:00<44:12:23, 341.51s/it, min_val_loss=2.67, train=0.553, val=0.375, val_loss=4.33][A[A

iteration:   7%|▋         | 34/500 [2:26:54<44:12:23, 341.51s/it, min_val_loss=2.66, train=0.557, val=0.376, val_loss=4.33][A[A

iteration:   7%|▋         | 35/500 [2:26:54<44:37:05, 345.43s/it, min_val_loss=2.66, train=0.557, val=0.376, val_loss=4.33][A[A

iteration:   7%|▋         | 35/500 [2:32:40<44:37:05, 345.43s/it, min_val_loss=2.66

Early stopping
total number of epochs:  39


iteration:   8%|▊         | 40/500 [2:56:11<33:46:09, 264.28s/it, min_val_loss=2.66, train=0.56, val=0.372, val_loss=4.33]


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,39.0
learning rate,2e-05
validation_fine_f1,0.37577
training_fine_f1,0.56011
test_fine_f1,0.36214
validation_loss,4.33169
_runtime,10565.0
_timestamp,1620698706.0
_step,12246.0
loss,3.74421


0,1
epoch,▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
learning rate,████▄▄▄▄▃▃▃▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
validation_fine_f1,█▁▁▂▂▃▂▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃
training_fine_f1,█▁▁▂▂▃▂▃▃▄▄▄▄▄▄▄▄▄▄▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅
test_fine_f1,█▁▁▂▃▃▃▄▃▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄
validation_loss,▁██▇▇▇▇▇▇▇▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆
_runtime,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
_timestamp,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
loss,██▇▅▅▄▅▃▃▃▂▂▃▂▃▃▃▂▂▁▁▂▃▁▂▁▂▂▂▁▁▂▂▂▂▁▂▁▂▂


experiment: 100%|██████████| 1/1 [2:56:22<00:00, 10582.96s/it]





