/
train_clearml_pytorch_ignite_caltech_birds.py
199 lines (163 loc) · 7.84 KB
/
train_clearml_pytorch_ignite_caltech_birds.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
#!/usr/bin/env python
# coding: utf-8
from __future__ import print_function, division
import pathlib
# Clear ML experiment
from clearml import Task, Dataset
# Local modules
from cub_tools.trainer.IgniteClearML import ClearML_Ignite_Trainer
from cub_tools.args import get_parser
from cub_tools.config import get_cfg_defaults, get_key_value_dict
# Build the command-line interface: start from the project's base parser
# (configuration file plus any overrides) and register the ClearML options.
parser = get_parser()

# Each entry is (flag, keyword arguments for ``add_argument``). The entries are
# registered in the order listed, which is also the order they were declared in.
_clearml_cli_options = [
    (
        '--clearml-project',
        {
            'dest': 'clearml_project',
            'type': str,
            'help': 'Name of the ClearML project that you want the experiment to be logged in. [Caltech Birds/Training]',
            'default': 'Caltech Birds/Training',
        },
    ),
    (
        '--clearml-dataset-project',
        {
            'dest': 'clearml_dataset_project',
            'type': str,
            'help': 'Name of the ClearML project where the dataset for training and test is located. [Caltech Birds/Datasets]',
            'default': 'Caltech Birds/Datasets',
        },
    ),
    (
        '--clearml-dataset-train',
        {
            'dest': 'clearml_dataset_train',
            'type': str,
            'help': 'Name of the ClearML training dataset. [cub200_2011_train_dataset]',
            'default': 'cub200_2011_train_dataset',
        },
    ),
    (
        '--clearml-dataset-test',
        {
            'dest': 'clearml_dataset_test',
            'type': str,
            'help': 'Name of the ClearML testing dataset. [cub200_2011_test_dataset]',
            'default': 'cub200_2011_test_dataset',
        },
    ),
    (
        '--clearml-output-url',
        {
            'dest': 'clearml_output_url',
            'type': str,
            'help': 'Location of where the output files should be stored. Default is Azure Blob Storage. Format is azure://storage_account/container [azure://clearmllibrary/artefacts]',
            'default': 'azure://clearmllibrary/artefacts',
        },
    ),
    (
        # Boolean switch: present on the command line -> True.
        '--clearml-task-clone',
        {
            'dest': 'clearml_task_clone',
            'action': 'store_true',
            'help': 'Create a clone of the task to be run on the remote resource, rather than running this experiment. [False]',
            'default': False,
        },
    ),
    (
        '--clearml-queue',
        {
            'dest': 'clearml_queue',
            'type': str,
            'help': 'Name of the ClearML-Server queue that the task will be enqueded with for remote execution. Use "none" for no queuing of job. Default [gpu]',
            'default': 'gpu',
        },
    ),
    (
        '--clearml-experiment-name',
        {
            'dest': 'clearml_experiment_name',
            'type': str,
            'help': ' Preamble of name of the ClearML experiment that you want the experiment to be logged in. []',
            'default': '',
        },
    ),
]
for _flag, _spec in _clearml_cli_options:
    parser.add_argument(_flag, **_spec)

# The usage text is printed on every run, before the arguments are parsed.
parser.print_help()
args = parser.parse_args()
# Normalise the queue option: the literal string 'none' (in any letter case)
# means "do not enqueue the job" and is stored as ``None`` from here on.
if args.clearml_queue.lower() == 'none':
    args.clearml_queue = None

# NOTE(review): ``run_local`` is not registered in this script; presumably it
# is provided by the base parser returned from ``get_parser()`` - confirm.
if args.run_local:
    print('[INFO] Running the job locally and logging to ClearML-Server')
#print('[INFO] Optional Arguments from CLI:: {}'.format(args.opts))
#if args.opts == '[]':
#    args.opts = list()
#    print('[INFO] Setting empty CLI args to an explicit empty list')
## CLEAR ML
# Load the configuration early: the model name and library are needed to
# label the experiment before the trainer object exists.
cfg = get_cfg_defaults()
cfg.merge_from_file(args.config)
# NOTE(review): this value is rebound by ``task.connect`` further down before
# anything reads it - confirm whether the call is still required.
params = get_key_value_dict(cfg)

# Register the requirements that are installed from git rather than from
# CONDA / PyPI. This is done before the task itself is initialised.
for _requirement in (
    'git+https://github.com/ecm200/caltech_birds.git#egg=cub_tools&subdirectory=cub_tools/',
    'git+https://github.com/rwightman/pytorch-image-models.git',
):
    Task.add_requirements(_requirement)

# Compose the experiment title from the optional user preamble and the model
# identity taken from the configuration.
_model_name = cfg.MODEL.MODEL_NAME
_model_library = cfg.MODEL.MODEL_LIBRARY
_task_title = ''.join([
    args.clearml_experiment_name,
    'TRAIN [Network: ',
    _model_name,
    ', Library: ',
    _model_library,
    '] Ignite Train PyTorch CNN on CUB200',
])

# Connect this script to the ClearML Server as a training experiment.
task = Task.init(
    project_name=args.clearml_project,
    task_name=_task_title,
    task_type=Task.TaskTypes.training,
    output_uri=args.clearml_output_url,
)

# Tags shown in the ClearML GUI, used for grouping experiments.
_experiment_tags = [
    'CUB200',
    _model_name,
    _model_library,
    'PyTorch',
    'Ignite',
    'Deployable',
    'Azure Blob Storage',
]
task.add_tags(_experiment_tags)

# Hand the YACS configuration object to the task so that all parameters are
# stored with the experiment and can be overridden from ClearML.
params = task.connect(cfg, name='YACS')
#print(params) # printing actual configuration (after override in remote mode)
# Flatten the connected parameters into one alternating list of the form
# [key, value, key, value, ...]; it is later passed to the trainer as its
# command-line style overrides.
params_list = []
for key in params:
    params_list.extend((key, params[key]))
# Decide where the training runs.
# Without the run-local flag the task is handed over to the ClearML Server
# (optionally cloned and/or placed on a queue) and this local process stops.
# With the run-local flag the model trains in this process, and logging and
# artefacts are captured as normal.
if not args.run_local:
    task.execute_remotely(
        queue_name=args.clearml_queue,
        clone=args.clearml_task_clone,
        exit_process=True,
    )
# Fetch the datasets registered on the ClearML server and cache them locally.
print('[INFO] Getting a local copy of the CUB200 birds datasets')

# Train
train_dataset = Dataset.get(
    dataset_project=args.clearml_dataset_project,
    dataset_name=args.clearml_dataset_train,
)
print('[INFO] Default location of training dataset:: {}'.format(train_dataset.get_default_storage()))
train_dataset_base = train_dataset.get_local_copy()
# This line reports the local copy, so it is labelled as such rather than
# repeating the "Default location" label of the storage line above.
print('[INFO] Local copy of training dataset:: {}'.format(train_dataset_base))

# Test
test_dataset = Dataset.get(
    dataset_project=args.clearml_dataset_project,
    dataset_name=args.clearml_dataset_test,
)
print('[INFO] Default location of testing dataset:: {}'.format(test_dataset.get_default_storage()))
test_dataset_base = test_dataset.get_local_copy()
print('[INFO] Local copy of testing dataset:: {}'.format(test_dataset_base))
# Point the configuration at the locally cached datasets and at the task's
# working area by appending overrides to the key/value parameter list.
_train_copy = pathlib.PurePath(train_dataset_base)
_test_copy = pathlib.PurePath(test_dataset_base)

# NOTE(review): the data root is taken from the training copy only, while the
# test directory is given by name - this presumably relies on both local
# copies sharing the same parent folder; confirm.
params_list = params_list + [
    # Experiment root dir
    'DIRS.ROOT_DIR', '',
    # Data root dir
    'DATA.DATA_DIR', str(_train_copy.parent),
    # Data train dir
    'DATA.TRAIN_DIR', str(_train_copy.name),
    # Data test dir
    'DATA.TEST_DIR', str(_test_copy.name),
    # Working dir
    'DIRS.WORKING_DIR', str(task.cache_dir),
]

print('[INFO] Task output destination:: {}'.format(task.get_output_destination()))
print('[INFO] Final parameter list passed to Trainer object:: {}'.format(params_list))
# Create the trainer object.
# NOTE: the command-line override channel (``cmd_args``) is used here to pass
# the ClearML-managed configuration rather than real CLI overrides.
trainer = ClearML_Ignite_Trainer(task=task, config=args.config, cmd_args=params_list)

# Training pipeline: each stage prints its banner and then calls the trainer
# method of the given name, in this order. The method is looked up only when
# its stage is reached.
_pipeline_stages = (
    ('[INFO] Creating data transforms...', 'create_datatransforms'),
    ('[INFO] Creating data loaders...', 'create_dataloaders'),
    ('[INFO] Creating the model...', 'create_model'),
    ('[INFO] Creating optimizer...', 'create_optimizer'),
    ('[INFO] Creating LR Scheduler...', 'create_scheduler'),
    ('[INFO] Training the model...', 'run'),
)
for _banner, _method_name in _pipeline_stages:
    print(_banner)
    getattr(trainer, _method_name)()

# Create the deployment configuration file and attach it to the experiment.
print('[INFO] Creating deployment configuration...')
_config_pbtxt = 'config.pbtxt'
trainer.create_config_pbtxt(config_pbtxt_file=_config_pbtxt)
task.connect_configuration(configuration=pathlib.Path(_config_pbtxt), name=_config_pbtxt)

# NOTE: Placeholder here, call conversion to torchscript and save to file, and upload to clearml-server / remote storage.
trainer.trace_model_for_torchscript()

## Save the best model
#trainer.save_best_model()