This repository has been archived by the owner on Mar 19, 2024. It is now read-only.

xcit integration (#187)
Summary:
Integration of XCiT into VISSL :)

Pull Request resolved: fairinternal/ssl_scaling#187

Reviewed By: iseessel

Differential Revision: D31378757

Pulled By: prigoyal

fbshipit-source-id: d383390a3ae4def585eaf7bef6d743e9c5c059e8
Mathilde Caron authored and facebook-github-bot committed Oct 6, 2021
1 parent af8e879 commit 056e695
Showing 4 changed files with 561 additions and 1 deletion.
140 changes: 140 additions & 0 deletions configs/config/pretrain/dino/dino_16gpus_xcit_small_12_p16.yaml
@@ -0,0 +1,140 @@
# @package _global_
config:
  VERBOSE: False
  LOG_FREQUENCY: 10
  TEST_ONLY: False
  TEST_MODEL: False
  SEED_VALUE: 0
  MULTI_PROCESSING_METHOD: forkserver
  HOOKS:
    PERF_STATS:
      MONITOR_PERF_STATS: True
      PERF_STAT_FREQUENCY: 40
      ROLLING_BTIME_FREQ: 5
  DATA:
    NUM_DATALOADER_WORKERS: 10
    TRAIN:
      DATA_SOURCES: [disk_folder]
      DATASET_NAMES: [imagenet1k_folder]
      BATCHSIZE_PER_REPLICA: 64
      LABEL_TYPE: sample_index # just an implementation detail. Label isn't used
      TRANSFORMS:
        - name: ImgPilToMultiCrop
          total_num_crops: 10
          size_crops: [224, 96]
          num_crops: [2, 8]
          crop_scales: [[0.3, 1], [0.05, 0.3]]
        - name: RandomHorizontalFlip
          p: 0.5
        - name: ImgPilColorDistortion
          strength: 0.5
        - name: ImgPilMultiCropRandomApply
          transforms: [{"name": "ImgPilGaussianBlur", "p": 1., "radius_min": 0.1, "radius_max": 2.0}]
          prob: [1., 0.1, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5]
        - name: ImgPilMultiCropRandomApply
          transforms: [{"name": "ImgPilRandomSolarize", "p": 1.}]
          prob: [0., 0.2, 0., 0., 0, 0, 0, 0, 0, 0]
        - name: ToTensor
        - name: Normalize
          mean: [0.485, 0.456, 0.406]
          std: [0.229, 0.224, 0.225]
      COLLATE_FUNCTION: multicrop_collator
      MMAP_MODE: True
      COPY_TO_LOCAL_DISK: False
      COPY_DESTINATION_DIR: /tmp/imagenet1k/
      DROP_LAST: True
  TRAINER:
    TRAIN_STEP_NAME: standard_train_step
  METERS:
    name: ""
  MODEL:
    TRUNK:
      NAME: xcit
      XCIT:
        IMAGE_SIZE: 224
        PATCH_SIZE: 16
        HIDDEN_DIM: 384
        NUM_LAYERS: 12
        NUM_HEADS: 8
        DROPOUT_RATE: 0
        ATTENTION_DROPOUT_RATE: 0
        DROP_PATH_RATE: 0.05
        ETA: 1
        TOKENS_NORM: True
        QKV_BIAS: True
        QK_SCALE: False
    HEAD:
      PARAMS: [
        ["swav_head", {"use_weight_norm_prototypes": True, "dims": [384, 2048, 2048, 256], "use_bn": False, "return_embeddings": False, "activation_name": "GELU", "num_clusters": [65536]}],
      ]
    TEMP_FROZEN_PARAMS_ITER_MAP: [
      ['module.heads.0.prototypes0.weight_v', 1251],
      ['module.heads.0.prototypes0.weight_g', 1251],
    ]
    AMP_PARAMS:
      AMP_TYPE: pytorch
      USE_AMP: True
  LOSS:
    name: dino_loss
    dino_loss:
      momentum: 0.996
      teacher_temp_warmup_iters: 37530 # 30 epochs
      teacher_temp_min: 0.04
      teacher_temp_max: 0.07
      ema_center: 0.9
      normalize_last_layer: false
  OPTIMIZER:
    name: adamw
    momentum: 0.9
    nesterov: False
    num_epochs: 300
    regularize_bn: False
    regularize_bias: False
    param_schedulers:
      lr_head:
        name: composite
        schedulers:
          - name: linear
            start_value: 0.00001
            end_value: 0.002
          - name: cosine
            start_value: 0.002
            end_value: 0.00001
        update_interval: epoch
        interval_scaling: [rescaled, fixed]
        lengths: [0.0333, 0.9667]
      lr:
        name: composite
        schedulers:
          - name: linear
            start_value: 0.00001
            end_value: 0.002
          - name: cosine
            start_value: 0.002
            end_value: 0.00001
        update_interval: epoch
        interval_scaling: [rescaled, fixed]
        lengths: [0.0333, 0.9667]
      weight_decay:
        name: cosine
        start_value: 0.04
        end_value: 0.4
        update_interval: epoch
      weight_decay_head:
        name: cosine
        start_value: 0.04
        end_value: 0.4
        update_interval: epoch
  DISTRIBUTED:
    BACKEND: nccl
    NUM_NODES: 2
    NUM_PROC_PER_NODE: 8
    INIT_METHOD: tcp
    RUN_ID: auto
  MACHINE:
    DEVICE: gpu
  CHECKPOINT:
    DIR: "."
    AUTO_RESUME: True
    CHECKPOINT_FREQUENCY: 5
    OVERWRITE_EXISTING: true
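
The iteration counts hard-coded above follow from the global batch size: 2 nodes x 8 GPUs x 64 images per replica gives 1024 images per step, so one ImageNet-1k epoch is roughly 1,281,167 / 1024 ≈ 1251 iterations. That appears to be why the prototype weights are frozen for 1251 iterations (about one epoch) and the teacher temperature is warmed up over 30 x 1251 ≈ 37,530 iterations. As a hypothetical illustration (not part of this commit), a single-node run would halve the global batch size and roughly double these iteration-based values; the override keys below mirror the config above:

# @package _global_
# Hypothetical single-node (8 GPU) variant -- illustrative sketch only, not in this commit.
# Global batch = 8 x 64 = 512, so ~1,281,167 / 512 ≈ 2502 iterations per epoch.
config:
  DISTRIBUTED:
    NUM_NODES: 1
  MODEL:
    TEMP_FROZEN_PARAMS_ITER_MAP: [
      ['module.heads.0.prototypes0.weight_v', 2502],  # ~1 epoch at global batch 512
      ['module.heads.0.prototypes0.weight_g', 2502],
    ]
  LOSS:
    dino_loss:
      teacher_temp_warmup_iters: 75060  # ~30 epochs at global batch 512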
19 changes: 19 additions & 0 deletions vissl/config/defaults.yaml
@@ -622,6 +622,25 @@ config:
        QKV_BIAS: False # Bias for QKV in attention layers.
        QK_SCALE: False # Scale

      # ------------------------------------------------------------- #
      # XCiT (https://arxiv.org/abs/2106.09681)
      # default config is that of xcit_small_12_p16
      # ------------------------------------------------------------- #
      XCIT:
        name:
        IMAGE_SIZE: 224
        PATCH_SIZE: 16
        HIDDEN_DIM: 384
        NUM_LAYERS: 12
        NUM_HEADS: 8
        DROPOUT_RATE: 0
        ATTENTION_DROPOUT_RATE: 0
        DROP_PATH_RATE: 0.05
        ETA: 1
        TOKENS_NORM: True
        QKV_BIAS: True # Bias for QKV in attention layers.
        QK_SCALE: False # Scale

      # ------------------------------------------------------------- #
      # Parameters unique to the ConViT and not used for standard vision
      # transformers
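
Because defaults.yaml now carries the full xcit_small_12_p16 parameter set, a user config only needs to select the trunk and override whatever deviates from these defaults. A hypothetical sketch (not part of this commit; the values only loosely follow the larger xcit_medium_24_p16 variant described in the XCiT paper):

# Hypothetical override -- illustrative only; pick the xcit trunk and change
# just the keys that deviate from the defaults above.
config:
  MODEL:
    TRUNK:
      NAME: xcit
      XCIT:
        HIDDEN_DIM: 512  # assumed medium-variant width, for illustration
        NUM_LAYERS: 24   # assumed medium-variant depth, for illustration

Note that in the pretraining config above, the first swav_head dimension (384) matches HIDDEN_DIM, so a wider trunk would also need the head's input dimension adjusted.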
