This repository has been archived by the owner on Mar 19, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 330
/
defaults.yaml
1274 lines (1228 loc) · 62.2 KB
/
defaults.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
# Copyright (c) Facebook, Inc. and its affiliates.
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
######################### How to use Hydra configs ###################################
# VISSL uses hydra for configuration management. The usage looks like:
# python tools/<binary-name>.py config=<config path>
# Example:
# python tools/run_distributed_engines.py config=pretrain/simclr/simclr_8node_resnet
#
#
# If you create sub-folders in config folder to override parameters, you can use the
# config files in the subfolder by adding a "+" sign to the command line input.
# For example:
# python tools/run_distributed_engines.py \
# config=pretrain/simclr/simclr_8node_resnet \
# +config/pretrain/simclr/optimization=bs32_16nodes \
# +config/pretrain/simclr/my_new_subfolder=my_file_in_subfolder \
#
#
# If you want to override single values in the config, you can achieve that with:
# For example:
# python tools/run_distributed_engines.py \
# config=pretrain/simclr/simclr_8node_resnet \
# +config/pretrain/simclr/my_sub_folder=my_file_name \
# config.MODEL.WEIGHTS_INIT.PARAMS_FILE=<weights_path.torch>
#
# If you want to add single key to a dictionary in the config, you can achieve that with:
# For example:
# python tools/run_distributed_engines.py \
# config=pretrain/simclr/simclr_8node_resnet \
# +config/pretrain/simclr/my_sub_folder=my_file_name \
# +config.MY_NEW_KEY=MY_VALUE
defaults:
# you must specify the base config you want to run
- config: ???
######################### versioning ###################################
# this config version is checked with the VISSL latest config version in
# vissl/config/__init__.py
# Users are recommended to keep a config version in their config file so vissl can
# take care of upgrades to config files as the version evolves.
VERSION: 1
################## some command line options to decide workflow ###############
# automatically inferred node_id of the current machine. In case of distributed training
# across machines, the node_id is 0, 1, .... and is automatically inferred.
node_id: 0
# we support 2 types of engines: train | extract_features.
# The engines have the following roles:
# train: performs training (and validation, if specified). Useful for evaluation or
# pre-training workflows.
# extract_features: if you want to extract features using a pre-trained model, set the
# workflow type to be feature extraction. This will set the full model in eval
# mode and extract features as specified by user.
engine_name: train
# training hyperparams setup
config:
# ----------------------------------------------------------------------------------- #
# GLOBAL DEFAULTS
# ----------------------------------------------------------------------------------- #
VERBOSE: False
# how frequently to log training stats like batch time, loss, training eta etc.
LOG_FREQUENCY: 10
# if the workflow is only test and not training
TEST_ONLY: False
# if the model should be test as well. If set to False, only training will be done.
TEST_MODEL: True
# how frequently should the validation be done.
# 1 = after every epoch and N = after every N epochs
TEST_EVERY_NUM_EPOCH: 1
SEED_VALUE: 0
# Use the forkserver or spawn
# https://github.com/pytorch/pytorch/blob/master/torch/nn/parallel/distributed.py#L142
MULTI_PROCESSING_METHOD: "forkserver"
# Debugging utilities
REPRODUCIBILITY:
CUDDN_DETERMINISTIC: False
# ----------------------------------------------------------------------------------- #
# HOOKS
# ----------------------------------------------------------------------------------- #
HOOKS:
# ----------------------------------------------------------------------------------- #
# Perf hooks for several steps of model training
# ----------------------------------------------------------------------------------- #
PERF_STATS:
# monitoring training statistics like: forward time, backward time, loss time, etc
MONITOR_PERF_STATS: False
# we print perf stats (if enabled) after every phase. If we want to print every few
# batches, set the frequency here.
PERF_STAT_FREQUENCY: -1
# if we want to print the rolling average batch time, set the value below to number of
# training iterations over which we want to print average. The average is printed for
# master gpu.
ROLLING_BTIME_FREQ: -1
# ----------------------------------------------------------------------------------- #
# torch.cuda.memory_summary()
# ----------------------------------------------------------------------------------- #
MEMORY_SUMMARY:
# set this to true if you want to print memory summary. useful for profiling
# memory consumption of model
PRINT_MEMORY_SUMMARY: True
# at what iteration number should the memory summary be printed. usually
# set to 1 for very large models
LOG_ITERATION_NUM: 0
# set this to true if you want to print the tensor residing in memory
# in event of an exception (such as out of memory exception)
DUMP_MEMORY_ON_EXCEPTION: False
# ----------------------------------------------------------------------------------- #
# nvidia-smi print
# ----------------------------------------------------------------------------------- #
# whether to log nvidia-smi or not. we make it optional in case nvidia-smi is not
# valid for some systems.
LOG_GPU_STATS: True
# ----------------------------------------------------------------------------------- #
# MODEL_COMPLEXITY (#flops, #params, #activations in your model)
# ----------------------------------------------------------------------------------- #
MODEL_COMPLEXITY:
# set this to True if you want to compute #flops, #params, #activations in your model.
COMPUTE_COMPLEXITY: False
# the dummy input shape passed to the model to compute the complexity. Only forward pass
# is done for complexity calculation.
INPUT_SHAPE: [3, 224, 224]
# ----------------------------------------------------------------------------------- #
# TENSORBOARD (visualization)
# ----------------------------------------------------------------------------------- #
TENSORBOARD_SETUP:
# whether to use tensorboard for the visualization
USE_TENSORBOARD: False
# log directory for tensorboard events
LOG_DIR: "."
EXPERIMENT_LOG_DIR: "tensorboard"
# flush logs every n minutes
FLUSH_EVERY_N_MIN: 5
# whether to log the model parameters to tensorboard
LOG_PARAMS: True
# whether to log the model parameters gradients to tensorboard
LOG_PARAMS_GRADIENTS: True
# if we want to log the model parameters every few iterations, set the iteration
# frequency. -1 means the params will be logged only at the end of epochs.
LOG_PARAMS_EVERY_N_ITERS: 310
# ----------------------------------------------------------------------------------- #
# MONITORING
# ----------------------------------------------------------------------------------- #
MONITORING:
# At which frequency do we monitor statistics on the activations:
# - 0 means that we do not monitor statistics
# - N > 0 means we monitor every N iterations
MONITOR_ACTIVATION_STATISTICS: 0
# ----------------------------------------------------------------------------------- #
# PROFILING
# ----------------------------------------------------------------------------------- #
PROFILING:
# How many iterations do we wait before starting the profiler
START_ITERATION: 0
# How many iterations does the profiler run while not collecting outputs
# Data will start to be collected after START_ITERATION + WARMUP_ITERATIONS
WARMUP_ITERATIONS: 0
# How many iterations do we run the profiler for: after this number
# of iteration is reached the profiling is disabled
NUM_ITERATIONS: 10
# Whether or not to interrupt the training after reaching the last
# profiling iteration (after the profiling is done)
STOP_TRAINING_AFTER_PROFILING: False
# Folder where the traces will be generated
OUTPUT_FOLDER: "."
# Ranks on which the profiling will be performed
# The rank is the index of the GPU in the overall distributed training
PROFILED_RANKS: [0, 1]
# The available memory profiling options
MEMORY_PROFILING:
# Track the memory usage through the forward/backward pass, and outputs
# the traces complemented by estimations of the memory usage due to
# activations and associated activation gradients
TRACK_BY_LAYER_MEMORY: False
# The available options for the runtime profiler
RUNTIME_PROFILING:
# To enable the runtime profiler
USE_PROFILER: False
# Whether or not to profile the CPU activities
PROFILE_CPU: True
# Whether or not to profile the GPU activities
PROFILE_GPU: True
# To force the use of the legacy autograd profiler even if
# the new pytorch profiler based on kineto is available
LEGACY_PROFILER: False
# ----------------------------------------------------------------------------------- #
# DATA
# ----------------------------------------------------------------------------------- #
DATA:
# Common data options
NUM_DATALOADER_WORKERS: 4 # Set this depending on the number of CPUs you have
PIN_MEMORY: true # Makes CPU->GPU copy of the data faster
# whether to overlap the data copy from host to GPU with the previous iteration.
ENABLE_ASYNC_GPU_COPY: true
# buffer size for gradient reduction. Set to 25 which is pytorch default.
DDP_BUCKET_CAP_MB: 25
# Training Data Options
TRAIN:
# A sampler that cuts the dataset in a deterministic way
USE_DEBUGGING_SAMPLER: False
# if we want to resume the data sampler as well from a previous iteration. By default
# pytorch sampler resumes from every epoch.
USE_STATEFUL_DISTRIBUTED_SAMPLER: False
# whether to drop the last incomplete batch per process
DROP_LAST: False
# if users want to replace certain prefixes from the image paths and replace them with
# some other prefix, they can do so here.
REMOVE_IMG_PATH_PREFIX: ""
# what prefix to replace the old prefix with. Could stay empty too
NEW_IMG_PATH_PREFIX: ""
# Base dataset to use to wrap the datasets defined. The default and only current oss supported
# default is the generic_ssl_dataset.
BASE_DATASET: "generic_ssl"
# name of the dataset. Meaningful and used to do lookup in the dataset_catalog.json
# it has the advantage that user needs to fill the dataset_catalog.json once
# and then simply use the dataset name without having to specify data paths every time.
DATASET_NAMES: ["imagenet1k_folder"]
# Sources for reading data.
# Currently supports: disk_folder and disk_filelist
# Parallel aligned with DATA_PATHS argument.
# can be user specified or filled in configs/dataset_catalog.json file
DATA_SOURCES: []
DATA_PATHS: []
LABEL_SOURCES: []
LABEL_PATHS: []
# either standard | sample_index | zero
# sample_index is a common practice in self-supervised learning and sample_index = id of the
# sample in the data.
# standard label type is used for supervised learning and user specifies the labels to use.
# zero sets all labels to 0, which is necessary
# when cutmixup_collator is being used for self-supervised training.
# Note that if LABEL_SOURCES (see above) is provided, it will override
# LABEL_TYPE. For example, if SSL training on a labeled dataset (e.g
# ImageNet imagefolders) and
# LABEL_SOURCES: [DISK_FOLDER]
# LABEL_TYPE: "zero"
# the label type will not be zero, but the label associated with the
# image folder.
LABEL_TYPE: "standard"
# whether to memory map the input data.
MMAP_MODE: True
# if the images are invalid for whatever reason, we return the gray image of specified size.
# we allow using a queue to capture the valid and seen images if users prefer to
# not use the gray images during training. See `ENABLE_QUEUE_DATASET` option below.
DEFAULT_GRAY_IMG_SIZE: 224
# number of unique samples in minibatch per gpu (or per device)
BATCHSIZE_PER_REPLICA: 256
# list of data transforms to apply on the data
# Example: using RandAugment (https://arxiv.org/abs/1909.13719)
# :param magnitude: integer magnitude of rand augment
# :param magnitude_std: standard deviation of magnitude. If > 0,
# introduces random variability in the augmentation magnitude.
# :param num_layers: integer number of transforms
# :param increasing_severity: boolean that indicates whether to use
# augmentations that increase severity w/ increasing magnitude. Some
# augmentations do this by default.
# :param choice_weights: Index of pre-determined probability distribution
# over augmentations. Currently only one such distribution available (i.e.
# no valid values other than 0 or None), unclear if beneficial. Default =
# None.
# TRANSFORMS:
# - name: RandAugment
# magnitude: 9
# magnitude_std: 0.5
# num_layers: 2
# increasing_severity: True
#
#
# Example: using AutoAugment (https://arxiv.org/abs/1805.09501). This
# autoaugment differs from the torchvision implementation by allowing
# variability in the augmentation intensity.
# :param policy_name: String. One of 'v0', 'v0r', 'original', 'originalr'.
# One of a set of learned augmentation sequences.
# :param magnitude_std: standard deviation of magnitude. If > 0, introduces
# random variability in the augmentation magnitude.
# TRANSFORMS:
# - name: VisslAutoAugment
# policy_name: v0
# magnitude_std: 0
TRANSFORMS: []
# collator to use: either pytorch default or user defined custom collator.
# Using the cutmixup_collator in a supervised setting requires the use
# of the cross_entropy_multiple_output_single_target loss (see LOSS
# section below in order to accommodate label-smoothing. Using the
# cutmixup_collator in a self-supervised setting requires setting
# DATA.{TRAIN/TEST}.LABEL_TYPE: zero
COLLATE_FUNCTION: "default_collate"
# parameters taken by the collator function (if any).
COLLATE_FUNCTION_PARAMS: {}
# Example: params for cutmixup_collator to implement CutMix and MixUp
# COLLATE_FUNCTION: "cutmixup_collator"
# COLLATE_FUNCTION_PARAMS: {
# # Adjust collator output to accommodate SSL method.
# # Currently supports "moco" or "simclr".
# # No argument needed if using vissl or supervised.
# "ssl_method": "moco"
# "mixup_alpha": 1.0, # mixup alpha value, mixup is active if > 0.
# "cutmix_alpha": 0.0, # cutmix alpha value, cutmix is active if > 0.
# "cutmix_minmax": None, # cutmix min/max image ratio, cutmix is active and uses this vs alpha if not None.
# "prob": 1.0, # probability of applying mixup or cutmix per batch or element
# "switch_prob": 0.5, # probability of switching to cutmix instead of mixup when both are active
# "mode": "batch", # how to apply mixup/cutmix params (per 'batch', 'pair' (pair of elements), 'elem' (element)
# "correct_lam": True, # apply lambda correction when cutmix bbox clipped by image borders
# "label_smoothing": 0.1, # apply label smoothing to the mixed target tensor
# "num_classes": 2 # number of classes for target. Labels aren't
# actually used for SSL, so set to a small number to avoid shuffling
# large vectors around unnecessarily.
# }
# Also note that using the CutMixUp collator in a supervised context
# requires using the cross_entropy_multiple_output_single_target to
# accommodate the smoothed labels. See
# LOSS.cross_entropy_multiple_output_single_target for more information.
#
# Limit the amount of data used in training. If set to -1, full dataset is used.
#
DATA_LIMIT: -1
#
# Specifies how the DATA_LIMIT samples are sampled
#
# Example: to select a range of 500 samples for validation, skipping the first 1000 samples (say these are
# already used in the training split) and sub-sampling these elements such that each class appears equally:
# DATA_LIMIT: 500
# DATA_LIMIT_SAMPLING:
# SEED: 0
# IS_BALANCED: True
# SKIP_NUM_SAMPLES: 1000
#
DATA_LIMIT_SAMPLING:
SEED: 0
IS_BALANCED: False
SKIP_NUM_SAMPLES: 0
# whether the data specified (whether file list or directory) should be copied locally
# on the machine where training is happening.
COPY_TO_LOCAL_DISK: False
# if copying the data to a local directory, the destination to use. Otherwise,
# temporary destination directory will be created and set.
COPY_DESTINATION_DIR: ""
# keys that specify what `keys' in a sample dictionary
# correspond to input and target
INPUT_KEY_NAMES: ["data"]
TARGET_KEY_NAMES: ["label"]
# set this to True if you want to handle the invalid images using QueueDataset.
# In case of an invalid image, by default a mean image is returned. But using
# QueueDataset, you can instead return a valid and previously seen image.
ENABLE_QUEUE_DATASET: False
TEST:
# A sampler that cuts the dataset in a deterministic way
USE_DEBUGGING_SAMPLER: False
# if we want to resume the data sampler as well from a previous iteration
USE_STATEFUL_DISTRIBUTED_SAMPLER: False
# if users want to replace certain prefixes from the image paths and replace them with
# some other prefix, they can do so here.
REMOVE_IMG_PATH_PREFIX: ""
# what prefix to replace the old prefix with. Could stay empty too
NEW_IMG_PATH_PREFIX: ""
DROP_LAST: False
DATA_SOURCES: []
DATA_PATHS: []
LABEL_SOURCES: []
LABEL_PATHS: []
MMAP_MODE: True
DEFAULT_GRAY_IMG_SIZE: 224
BATCHSIZE_PER_REPLICA: 256
TRANSFORMS: []
COLLATE_FUNCTION: "default_collate"
COLLATE_FUNCTION_PARAMS: {}
DATA_LIMIT: -1
DATA_LIMIT_SAMPLING:
SEED: 0
IS_BALANCED: False
SKIP_NUM_SAMPLES: 0
# Base dataset to use to wrap the datasets defined. The default and only current oss supported
# default is the generic_ssl_dataset.
BASE_DATASET: "generic_ssl"
DATASET_NAMES: ["imagenet1k_folder"]
COPY_TO_LOCAL_DISK: False
COPY_DESTINATION_DIR: ""
# either standard | sample_index
LABEL_TYPE: "standard"
# keys that specify what `keys' in a sample dictionary
# correspond to input and target
INPUT_KEY_NAMES: ["data"]
TARGET_KEY_NAMES: ["label"]
# set this to True if you want to handle the invalid images using QueueDataset.
# In case of an invalid image, by default a mean image is returned. But using
# QueueDataset, you can instead return a valid and previously seen image.
ENABLE_QUEUE_DATASET: False
# ----------------------------------------------------------------------------------- #
# METERS
# ----------------------------------------------------------------------------------- #
# what meters to attach. The mentioned meters will be calculated.
# Currently supports 2 types of meters: accuracy_list_meter | mean_ap_list_meter
# The meters operate in a multiple output and single target fashion. i.e.
# multiple meters are calculated for multiple model outputs (for example: multiple
# layers output) and metric is calculated on the same input target.
METERS:
name: ""
# whether to calculate the meter during training as well. Sometimes, if the training
# data size is too big, it could be hard to compute the meter on training set. Hence
# we might want to disable it.
enable_training_meter: True
# calculate top-k meter on single target multiple output setting
accuracy_list_meter:
# number of accuracy meters. In cases like linear evaluation of feature, we perform
# evaluation of several layers and there's a separate meter for each layer.
# num_meters basically specifies the number of meters.
num_meters: 1
# what topk values to calculate. Example topk_values = [1, 5] means top1 and top5
# both will be calculated
topk_values: [1]
# names of the meter. Useful in cases where we have several meters. For the linear
# feature evaluation workflows, meter name is automatically inferred.
meter_names: []
# calculate mean average precision meter on single target multiple output.
mean_ap_list_meter:
# number of classes over which mean AP is being calculated. 9605 corresponds to
# openimages v6 dataset.
num_classes: 9605
# number of accuracy meters. In cases like linear evaluation of feature, we perform
# evaluation of several layers and there's a separate meter for each layer.
# num_meters basically specifies the number of meters.
num_meters: 1
# maximum number of samples to have in the meter. This is a global variable. Ideally
# set it to number of examples in test set.
max_cpu_capacity: -1
# names of the meter. Useful in cases where we have several meters. For the linear
# feature evaluation workflows, meter name is automatically inferred.
meter_names: []
# ----------------------------------------------------------------------------------- #
# MACHINE (cpu, gpu)
# ----------------------------------------------------------------------------------- #
MACHINE:
DEVICE: "gpu"
# ----------------------------------------------------------------------------------- #
# MODEL
# ----------------------------------------------------------------------------------- #
MODEL:
# sometimes we can avoid CUDA OOM issues by clearing out the cache. Clearing out cache
# is slow so choose wisely.
CUDA_CACHE:
CLEAR_CUDA_CACHE: False
CLEAR_FREQ: 100
# the model parameter names that should not be trained
NON_TRAINABLE_PARAMS: []
# the model parameters that should be frozen for certain specific number of iterations.
# i.e the parameters are frozen for specified iterations and then start training.
TEMP_FROZEN_PARAMS_ITER_MAP: []
# Colorization models take lab input. Everything else takes rgb. Options:
# lab | rgb | bgr
INPUT_TYPE: "rgb"
# Multi-input model: input keys in the sample dictionary and which head
# uses them for example, input contains "images" and "patches" and there
# is one separate head applied to images and another to patches
MULTI_INPUT_HEAD_MAPPING: []
# In case of multi-resolution inputs, we combine the same resolution inputs and
# run forward pass. However, for a very large model where gpu memory is bottleneck,
# we can optimize memory a bit by running forward pass through each crop
# separately.
SINGLE_PASS_EVERY_CROP: False
# ----------------------------------------------------------------------------------- #
# Activation checkpointing from PyTorch
# ----------------------------------------------------------------------------------- #
# Use activation checkpointing in the training phase. This is very useful for training
# large models that require a lot of memory.
ACTIVATION_CHECKPOINTING:
USE_ACTIVATION_CHECKPOINTING: false
# how many times the model should be checkpointed. User should tune this parameter
# and find the number that offers best memory saving and compute tradeoff.
NUM_ACTIVATION_CHECKPOINTING_SPLITS: 2
# ----------------------------------------------------------------------------------- #
# ZeRO2 sharded DDP from Fairscale https://github.com/facebookresearch/fairscale
# ----------------------------------------------------------------------------------- #
SHARDED_DDP_SETUP:
# set this to true if you want to use SDP instead of DDP.
# VISSL will automatically set optimizer = zero and
# configure the settings required to run SDP successfully.
USE_SDP: False
reduce_buffer_size: -1
# ----------------------------------------------------------------------------------- #
# FSDP from Fairscale https://github.com/facebookresearch/fairscale
# These options should match FSDP classes init options.
# ----------------------------------------------------------------------------------- #
FSDP_CONFIG:
# set this option to True to enable FSDP and automatically determine the config
# for FSDP based on AMP true/false.
AUTO_SETUP_FSDP: False
# Set this option to a positive number to automatically wrap "big" layers with
# a dedicated FSDP wrapping: the number provided here is the number of
# parameters that serves as threshold to decide if a layer is "big"
AUTO_WRAP_THRESHOLD: 0
# Parameters of fairscale FSDP
flatten_parameters: True
mixed_precision: True
fp32_reduce_scatter: False # Only makes sense to be True when mixed_precision is True.
compute_dtype: float32 # Choose "float32" or "float16"
bucket_cap_mb: 0
clear_autocast_cache: True
verbose: True
# ----------------------------------------------------------------------------------- #
# Feature evaluation settings
# ----------------------------------------------------------------------------------- #
FEATURE_EVAL_SETTINGS:
# for evaluating the features on any evaluation task/benchmark, set this to True
EVAL_MODE_ON: False
# if you want to evaluate several feature layers of the pre-trained model on
# benchmark tasks like linear classification, set this to True. This freezes the model
# trunk for feature evaluation.
FREEZE_TRUNK_ONLY: False
# if you want to evaluate the full self-supervised model including the trunk and heads,
# and want to freeze trunk and head both, set this to True
FREEZE_TRUNK_AND_HEAD: False
# if you want to extract features of trunk only, set this to True
EXTRACT_TRUNK_FEATURES_ONLY: False
# if we want to evaluate the full model, this requires loading the head weights as well
# from model weights file. In this case, set the following to True.
EVAL_TRUNK_AND_HEAD: False
# whether features should be flattened to result in N x D feature shape
SHOULD_FLATTEN_FEATS: True
# model features that should be evaluated for linear classification and what
# pooling to apply on features. Could be any pooling operation or Identity.
#
# Example: for evaluating 5 layers of ResNet-50,
# LINEAR_EVAL_FEAT_POOL_OPS_MAP: [
# ["conv1", ["AvgPool2d", [[10, 10], 10, 4]]],
# ["res2", ["AvgPool2d", [[16, 16], 8, 0]]],
# ["res3", ["AvgPool2d", [[13, 13], 5, 0]]],
# ["res4", ["AvgPool2d", [[8, 8], 3, 0]]],
# ["res5", ["AvgPool2d", [[6, 6], 1, 0]]],
# ]
LINEAR_EVAL_FEAT_POOL_OPS_MAP: []
# ----------------------------------------------------------------------------------- #
# GRADIENT CLIPPING. Used by Dosovitskiy et al. in their Vision
# Transformer paper.
# ----------------------------------------------------------------------------------- #
GRAD_CLIP: # See TORCH.NN.UTILS.CLIP_GRAD_NORM_
USE_GRAD_CLIP: False
NORM_TYPE: 2 # Float, int, or 'inf'
MAX_NORM: 1
# ----------------------------------------------------------------------------------- #
# MODEL TRUNK
# ----------------------------------------------------------------------------------- #
TRUNK:
NAME: "resnet"
# ------------------------------------------------------------- #
# ResNe(X)t params
# ------------------------------------------------------------- #
RESNETS:
DEPTH: 50
WIDTH_MULTIPLIER: 1
NORM: BatchNorm # BatchNorm | LayerNorm | GroupNorm
# If using GroupNorm, this sets number of groups. Recommend 32 as a
# naive suggestion. GroupNorm only available for ResNe(X)t.
GROUPNORM_GROUPS: 32
# Use weight-standardized convolutions
STANDARDIZE_CONVOLUTIONS: False
GROUPS: 1
ZERO_INIT_RESIDUAL: False
WIDTH_PER_GROUP: 64
# Colorization model uses stride=1 for last layer to retain higher spatial resolution
# for the pixel-wise task. Torchvision default is stride=2 and all other models
# use this so we set the default as 2.
LAYER4_STRIDE: 2
# ------------------------------------------------------------- #
# EfficientNet params
# ------------------------------------------------------------- #
# follow classy vision for efficientNet settings
EFFICIENT_NETS: {}
# ------------------------------------------------------------- #
# RegNet params
# ------------------------------------------------------------- #
REGNET: {}
# ------------------------------------------------------------- #
# Vision Transformer/DeiT params. Using a name will
# override/ignore all other VISION_TRANSFORMERS parameters. Named
# options include vit_b_32, vit_b_16, vit_l_32, vit_l_16, vit_h_14.
# Using
# ------------------------------------------------------------- #
VISION_TRANSFORMERS:
name:
IMAGE_SIZE: 224
PATCH_SIZE: 16
NUM_LAYERS: 12
NUM_HEADS: 12
HIDDEN_DIM: 768
MLP_DIM: 3072
# MLP and projection layer dropout rate
DROPOUT_RATE: 0
# Attention dropout rate
ATTENTION_DROPOUT_RATE: 0
# Use the token for classification. Currently no alternatives
# supported
CLASSIFIER: token
# Stochastic depth dropout rate. Turning on stochastic depth and
# using aggressive augmentation is essentially the difference
# between a DeiT and a ViT.
DROP_PATH_RATE: 0
QKV_BIAS: False # Bias for QKV in attention layers.
QK_SCALE: False # Scale
# ------------------------------------------------------------- #
# Parameters unique to the ConViT and not used for standard vision
# transformers
# ------------------------------------------------------------- #
CONVIT:
# Notes on ConViT (D'Ascoli et al., https://arxiv.org/abs/2103.10697):
# Ideally, the number of heads should be a square.
# The paper used: 4, 9, or 16 heads;
# HIDDEN_DIM = 64 * NUM_HEADS, so 256, 576, or 1024, respectively;
# MLP_DIM = 4 * HIDDEN_DIM, so 1024, 2304, or 4096, respectively.
#
# ConViT Params:
# Number of gated positional self-attention
# layers. These are self-attention layers that have separate
# positional and content-based attention components, allowing them
# to function as convolutional layers if initialized to be local
# (see USE_LOCAL_INIT below). They're what make ConViTs ConViTs!
# n_gpsa_layers out of NUM_LAYERS will be gated positional self
# attention layers.
N_GPSA_LAYERS: 10
# Whether to add class token to gpsa layers
CLASS_TOKEN_IN_LOCAL_LAYERS: False
# Determines how much the positional attention is focused on the
# patch of maximal attention. "Alpha" in the paper. Equivalent to
# the temperature of positional attention softmax.
LOCALITY_STRENGTH: 1.
# Dimensionality of the relative positional embeddings
LOCALITY_DIM: 10
# Whether to initialize the positional component of the GPSAs
# locally. This is necessary for them to be convolutional.
USE_LOCAL_INIT: True
# ----------------------------------------------------------------------------------- #
# MODEL HEAD
# ----------------------------------------------------------------------------------- #
HEAD:
# PARAMS is a List of Pairs:
# Pair[0] = Name of Head.
# Pair[1] = kwargs passed to head constructor.
# Example of heads:
# Case1: Simple Head containing single module - Single Input, Single output
# PARAMS: [
# ["mlp", {"dims": [2048, 128]}]
# ]
# Case2: Complex Head containing chain of head modules - Single Input, Single output
# PARAMS: [
# ["mlp", {"dims": [2048, 1000], "use_bn": False, "use_relu": False}],
# ["siamese_concat_view", {"num_towers": 9}],
# ["mlp", {"dims": [9000, 128]}]
# ]
# Case3: Multiple Heads (example 2 heads) - Single input, multiple output
# Can be used for multi-task learning
# PARAMS: [
# # head 0
# [
# ["mlp", {"dims": [2048, 128], "use_bn": False, "use_relu": False}],
# ["siamese_concat_view", {"num_towers": 9}],
# ["mlp", {"dims": [1152, 128]}],
# ],
# # head 1
# [
# ["mlp", {"dims": [2048, 128]}]
# ],
# ]
# Case4: Multiple Heads (example 5 simple heads) - Single input, multiple output.
# PARAMS: [
# ["eval_mlp", {"in_channels": 64, "dims": [9216, 1000]}],
# ["eval_mlp", {"in_channels": 256, "dims": [9216, 1000]}],
# ["eval_mlp", {"in_channels": 512, "dims": [8192, 1000]}],
# ["eval_mlp", {"in_channels": 1024, "dims": [9216, 1000]}],
# ["eval_mlp", {"in_channels": 2048, "dims": [8192, 1000]}],
# ]
PARAMS: []
# epsilon for the batchnorm. Set to default pytorch value.
# NOTE(review): plain `1e-5` is resolved as a *string* by YAML 1.1
# parsers such as PyYAML (a float needs a dot, e.g. 1.0e-5); downstream
# code presumably coerces it to float — confirm before changing.
BATCHNORM_EPS: 1e-5
# momentum for the batchnorm. Set to default pytorch value.
BATCHNORM_MOMENTUM: 0.1
# if we want to multiply the initialization of the head parameters by a factor,
# specify the multiplier. By default, set to 1.0
# this setting is helpful for scaling the model output.
PARAMS_MULTIPLIER: 1.0
# ----------------------------------------------------------------------------------- #
# Synchronized BatchNorm Setup
# ----------------------------------------------------------------------------------- #
# if we want to convert all the batch norm layers in the model to use SyncBN.
# There are two options: APEX syncBN and PyTorch SyncBN.
SYNC_BN_CONFIG:
CONVERT_BN_TO_SYNC_BN: False
SYNC_BN_TYPE: "pytorch" # apex | pytorch ("apex" requires NVIDIA apex to be installed)
# 1) if group_size=-1 -> use the VISSL default setting. We synchronize within a
# machine and hence will set group_size=num_gpus per node. This gives the best
# speedup.
# 2) if group_size>0 -> will set group_size=value set by user.
# 3) if group_size=0 -> no groups are created and process_group=None. This means
# global sync is done.
GROUP_SIZE: -1
# ----------------------------------------------------------------------------------- #
# MIXED PRECISION SETUP
# ----------------------------------------------------------------------------------- #
AMP_PARAMS:
USE_AMP: False
# Use O1 as it is more robust and stable than O3. If you want to use O3, we recommend
# the following setting:
# {"opt_level": "O3", "keep_batchnorm_fp32": True, "master_weights": True, "loss_scale": "dynamic"}
AMP_ARGS: {"opt_level": "O1"}
# we support pytorch amp as well which is available in pytorch>=1.6.
AMP_TYPE: "apex" # apex | pytorch
# ----------------------------------------------------------------------------------- #
# MODEL WEIGHTS INIT from a weights file
# ----------------------------------------------------------------------------------- #
# parameters for initializing a model from a pre-trained model file
WEIGHTS_INIT:
# path to the .torch weights files
PARAMS_FILE: ""
# name of the state dict. checkpoint = {"classy_state_dict": {layername:value}}. Options:
# 1. classy_state_dict - if model is trained and checkpointed with VISSL.
# checkpoint = {"classy_state_dict": {layername:value}}
# 2. "" - if the model_file is not a nested dictionary for model weights i.e.
# checkpoint = {layername:value}
# 3. key name that your model checkpoint uses for state_dict key name.
# checkpoint = {"your_key_name": {layername:value}}
STATE_DICT_KEY_NAME: "classy_state_dict"
# specify what layer should not be loaded. Layer names with this key are not copied
# By default, set to BatchNorm stats "num_batches_tracked" to be skipped.
SKIP_LAYERS: ["num_batches_tracked"]
####### If loading a non-VISSL trained model, set the following two args carefully #########
# to make the checkpoint compatible with VISSL, if you need to remove some names
# from the checkpoint keys, specify the prefix to strip here
# (e.g. "module." for checkpoints saved from a DataParallel-wrapped model).
REMOVE_PREFIX: ""
# In order to load the model (if not trained with VISSL) with VISSL, there are 2 scenarios:
# 1. If you are interested in evaluating the model features and freeze the trunk.
# Set APPEND_PREFIX="trunk.base_model." This assumes that your model is compatible
# with the VISSL trunks. The VISSL trunks start with "_feature_blocks." prefix. If
# your model doesn't have these prefix you can append them. For example:
# For TorchVision ResNet trunk, set APPEND_PREFIX="trunk.base_model._feature_blocks."
# 2. where you want to load the model simply and finetune the full model.
# Set APPEND_PREFIX="trunk."
# This assumes that your model is compatible with the VISSL trunks. The VISSL
# trunks start with "_feature_blocks." prefix. If your model doesn't have these
# prefix you can append them.
# For TorchVision ResNet trunk, set APPEND_PREFIX="trunk._feature_blocks."
# NOTE: the prefix is appended to all the layers in the model
APPEND_PREFIX: ""
# ----------------------------------------------------------------------------------- #
# LOSS
# ----------------------------------------------------------------------------------- #
LOSS:
# name of the loss to use. Supports all PyTorch loss functions and custom defined
# losses in VISSL.
name: "CrossEntropyLoss"
# ----------------------------------------------------------------------------------- #
# Standard PyTorch Cross-Entropy Loss. Use the loss name exactly as in PyTorch.
# pass any variables that the loss takes.
# ----------------------------------------------------------------------------------- #
CrossEntropyLoss:
ignore_index: -1
# ----------------------------------------------------------------------------------- #
# Cross-Entropy Loss for multiple outputs and same target. For a single
# output, this is equivalent to the cross-entropy loss. For multiple
# outputs, this computes the sum of the cross-entropy losses for each
# tensor in the list against the target. Can also accommodate target
# vectors in addition to single integer targets, for example when using
# label smoothing. Note that internally, cross_entropy_multiple_output_single_target
# determines whether each sample is associated with a single target or
# whether each sample is associated with a target vector, and uses vanilla
# CrossEntropyLoss for the single-target case and a custom cross entropy
# function for the multi-target case.
# ----------------------------------------------------------------------------------- #
cross_entropy_multiple_output_single_target:
weight: null
reduction: "mean"
ignore_index: -1
# generic flag to enable L2 normalization in a loss function. Currently supported
# for cross_entropy_multiple_output_single_target loss only.
normalize_output: False
# if we want to use softmax with temperature i.e. the NLL(log_softmax(input/temperature))
# then set the desired temperature value.
temperature: 1.0
# ----------------------------------------------------------------------------------- #
# BCELogits for multiple input and same target.
# Applicable for multi-label classification problems.
# ----------------------------------------------------------------------------------- #
bce_logits_multiple_output_single_target:
reduction: "none"
world_size: 1 # automatically inferred
normalize_output: False
# ----------------------------------------------------------------------------------- #
# NCE LOSS (Noise Contrastive Estimator)
# ----------------------------------------------------------------------------------- #
nce_loss_with_memory:
# setting below to "cross_entropy" yields the InfoNCE loss
loss_type: "nce"
# whether to L2-normalize the embeddings before computing the loss
norm_embedding: True
# softmax temperature
temperature: 0.07
# if the NCE loss is computed between multiple pairs, we can set a loss weight per term
# can be used to weight different pair contributions differently.
loss_weights: [1.0]
# NCE normalization constant; -1 presumably means it is estimated
# automatically — confirm against the loss implementation.
norm_constant: -1
# index of the embedding used to update the memory bank; -100 is
# presumably a sentinel for the default behavior — confirm.
update_mem_with_emb_index: -100
negative_sampling_params:
num_negatives: 16000
type: "random"
# settings of the non-parametric memory bank holding sample embeddings
memory_params:
memory_size: -1
embedding_dim: 128
momentum: 0.5
norm_init: True
update_mem_on_forward: True
# following parameters are auto-filled before the loss is created.
num_train_samples: -1 # @auto-filled
# ----------------------------------------------------------------------------------- #
# SimCLR InfoNCE LOSS (Specific to SimCLR https://arxiv.org/abs/2002.05709)
# ----------------------------------------------------------------------------------- #
simclr_info_nce_loss:
temperature: 0.1
buffer_params:
world_size: 64 # automatically inferred
embedding_dim: 128
effective_batch_size: 4096 # automatically inferred
# ----------------------------------------------------------------------------------- #
# Multi-crop version of SimCLR InfoNCE LOSS (supports multicrop augmentation proposed
# in https://arxiv.org/abs/2006.09882)
# ----------------------------------------------------------------------------------- #
multicrop_simclr_info_nce_loss:
temperature: 0.1
num_crops: 2 # automatically inferred from data transforms
buffer_params:
world_size: 64 # automatically inferred
embedding_dim: 128
effective_batch_size: 4096 # automatically inferred
# ----------------------------------------------------------------------------------- #
# SwAV LOSS (Specific to SwAV https://arxiv.org/abs/2006.09882)
# ----------------------------------------------------------------------------------- #
swav_loss:
embedding_dim: 128 # automatically inferred from HEAD params
temperature: 0.1
use_double_precision: False
normalize_last_layer: True
# number of Sinkhorn-Knopp iterations used when computing assignments
num_iters: 3
epsilon: 0.05
num_crops: 2 # automatically inferred from data transforms
crops_for_assign: [0, 1]
num_prototypes: [3000] # automatically inferred from model HEAD settings
# number of initial iterations using hard assignments (0 = soft only)
# — presumably; confirm with the implementation.
temp_hard_assignment_iters: 0
# for dumping the debugging info in case loss becomes NaN
output_dir: "." # automatically inferred and set to checkpoint dir
queue:
queue_length: 0 # automatically adjusted to ensure queue_length % global batch size = 0
start_iter: 0
local_queue_length: 0 # automatically inferred to queue_length // world_size
# ----------------------------------------------------------------------------------- #
# SwAV MOMENTUM LOSS
# ----------------------------------------------------------------------------------- #
swav_momentum_loss:
# EMA momentum for the momentum-encoder update
momentum: 0.99
# iteration from which the momentum encoder is used in eval mode —
# presumably; confirm with the implementation.
momentum_eval_mode_iter_start: 0
embedding_dim: 128 # automatically inferred from HEAD params
temperature: 0.1
use_double_precision: False
normalize_last_layer: True
num_iters: 3
epsilon: 0.05
num_crops: 2 # automatically inferred from data transforms
crops_for_assign: [0, 1]
num_prototypes: [3000] # automatically inferred from model HEAD settings
queue:
queue_length: 0 # automatically adjusted to ensure queue_length % global batch size = 0
start_iter: 0
local_queue_length: 0 # automatically inferred to queue_length // world_size
# ----------------------------------------------------------------------------------- #
# DINO LOSS (Specific to DINO https://arxiv.org/abs/2104.14294)
# ----------------------------------------------------------------------------------- #
dino_loss:
# EMA momentum for the teacher-network update
momentum: 0.996
student_temp: 0.1
teacher_temp_min: 0.04
teacher_temp_max: 0.07
teacher_temp_warmup_iters: 37500 # 30 epochs
crops_for_teacher: [0, 1]
# EMA coefficient used for centering the teacher outputs
ema_center: 0.9
normalize_last_layer: true
output_dim: 65536 # automatically inferred from model HEAD settings
# -----------------------------------------------------------------------------------#
# DeepCluster V2 LOSS (baselines in SwAV https://arxiv.org/abs/2006.09882)
# -----------------------------------------------------------------------------------#
deepclusterv2_loss:
DROP_LAST: True # automatically inferred from DATA.TRAIN.DROP_LAST
BATCHSIZE_PER_REPLICA: 256 # automatically inferred from DATA.TRAIN.BATCHSIZE_PER_REPLICA
num_crops: 2 # automatically inferred from DATA.TRAIN.TRANSFORMS
temperature: 0.1
num_clusters: [3000, 3000, 3000]
kmeans_iters: 10
memory_params:
crops_for_mb: [0]
embedding_dim: 128
# following parameters are auto-filled before the loss is created.
num_train_samples: -1 # @auto-filled
# ----------------------------------------------------------------------------------- #
# MoCo Loss (http://arxiv.org/abs/1911.05722)
# ----------------------------------------------------------------------------------- #
moco_loss:
embedding_dim: 128
queue_size: 65536 # number of negative keys kept in the queue
momentum: 0.999 # EMA momentum for the key-encoder update
temperature: 0.2
# ----------------------------------------------------------------------------------- #
# Barlow Twins Loss (https://arxiv.org/abs/2103.03230v1)
# ----------------------------------------------------------------------------------- #
barlow_twins_loss:
lambda_: 0.0051 # weight of the off-diagonal (redundancy-reduction) term, per the paper
scale_loss: 0.024 # overall scaling factor applied to the loss
embedding_dim: 8192
# ----------------------------------------------------------------------------------- #
# OPTIMIZER
# ----------------------------------------------------------------------------------- #
OPTIMIZER:
name: "sgd"
# whether to shard optimizer state as per ZeRO https://arxiv.org/abs/1910.02054
use_zero: False
use_larc: False # supported for SGD only for now
# LARC settings (layer-wise adaptive rate scaling with clipping, a variant
# of LARS https://arxiv.org/abs/1708.03888); used only when use_larc is True.
larc_config:
clip: False
eps: 1e-08
trust_coefficient: 0.001
weight_decay: 0.0001
momentum: 0.9
nesterov: False
# for how many epochs to do training. only counts training epochs.
num_epochs: 90
betas: [.9, .999] # for Adam/AdamW
# whether to regularize batch norm. if set to False, weight decay of batch norm params is 0.
regularize_bn: False
# whether to regularize bias parameter. if set to False, weight decay of bias params is 0.
regularize_bias: True
# Parameters to omit from regularization. Any named parameter whose name
# contains any of these strings will be omitted from regularization.
# For example, we don't want to regularize the class token or position
# embeddings in the vision transformer, so we pass:
# non_regularized_parameters: ['class_token', 'pos_embedding']
non_regularized_parameters: []
# we support using a different LR and weight decay for head and trunk.
# one needs to set the flag "use_different_values: True" in order to enable
# this functionality. We use the same type of param scheduler for the trunk and head
# but allow different LR and weight decay values.
head_optimizer_params:
# if the head should use a different LR than the trunk. If yes, then specify the
# param_schedulers.lr_head settings. Otherwise if set to False, the
# param_schedulers.lr will be used automatically.
use_different_lr: False
# if the head should use a different weight decay value than the trunk.
use_different_wd: False
# if using different weight decay value for the head, set here. otherwise, the
# same value as trunk will be automatically used.
weight_decay: 0.0001
param_schedulers:
lr:
# we make it convenient to scale Learning rate automatically as per the scaling
# rule specified in https://arxiv.org/abs/1706.02677 (ImageNet in 1Hour).
auto_lr_scaling:
# if set to True, learning rate will be scaled.
auto_scale: False
# base learning rate value that will be scaled.
base_value: 0.1
# batch size for which the base learning rate is specified. The current batch size
# is used to determine how to scale the base learning rate value.
# scaled_lr = ((batchsize_per_gpu * world_size) * base_value ) / base_lr_batch_size
base_lr_batch_size: 256
# scaling_type can be set to "sqrt" to reduce the impact of scaling on the base value
scaling_type: "linear"