RoB distillation + JEPA evaluations (#284)

Summary: Pull Request resolved: fairinternal/ssl_scaling#284 Reviewed By: odelalleau Differential Revision: D42220017 Pulled By: QuentinDuval fbshipit-source-id: 742419aa859fdbe4bc80f1f9e9f4771fee0f41a2
facebookresearch · Dec 28, 2022 · 04788de · 04788de
1 parent 346114a
commit 04788de
Show file tree

Hide file tree

Showing 259 changed files with 13,408 additions and 791 deletions.
diff --git a/.circleci/config.yml b/.circleci/config.yml
@@ -55,7 +55,8 @@ install_classy_vision: &install_classy_vision
       working_directory: ~/
       command: |
         pip uninstall -y classy_vision
-        pip install classy-vision@https://github.com/facebookresearch/ClassyVision/tarball/main
+        pip install classy-vision@https://github.com/facebookresearch/ClassyVision/tarball/4785d5ee19d3bcedd5b28c1eb51ea1f59188b54d
+
 
 setup_venv: &setup_venv
   - run:
@@ -151,7 +152,7 @@ jobs:
       # Cache the vissl_venv directory that contains dependencies
       - restore_cache:
           keys:
-            - v8-cpu-dependencies-{{ checksum "requirements.txt" }}-{{ checksum "setup.py" }}
+            - v9-cpu-dependencies-{{ checksum "requirements.txt" }}-{{ checksum "setup.py" }}
 
       - <<: *install_vissl_dep
       - <<: *install_augly
@@ -163,7 +164,7 @@ jobs:
       - save_cache:
           paths:
             - ~/vissl_venv
-          key: v8-cpu-dependencies-{{ checksum "requirements.txt" }}-{{ checksum "setup.py" }}
+          key: v9-cpu-dependencies-{{ checksum "requirements.txt" }}-{{ checksum "setup.py" }}
 
       - <<: *install_vissl
 
@@ -195,7 +196,7 @@ jobs:
       # Download and cache dependencies
       - restore_cache:
           keys:
-            - v8-gpu-dependencies-{{ checksum "requirements.txt" }}-{{ checksum "setup.py" }}-{{ checksum "docker/common/install_apex.sh" }}
+            - v9-gpu-dependencies-{{ checksum "requirements.txt" }}-{{ checksum "setup.py" }}-{{ checksum "docker/common/install_apex.sh" }}
 
       - <<: *install_vissl_dep
       - <<: *install_classy_vision
@@ -210,7 +211,7 @@ jobs:
       - save_cache:
           paths:
             - ~/vissl_venv
-          key: v8-gpu-dependencies-{{ checksum "requirements.txt" }}-{{ checksum "setup.py" }}-{{ checksum "docker/common/install_apex.sh" }}
+          key: v9-gpu-dependencies-{{ checksum "requirements.txt" }}-{{ checksum "setup.py" }}-{{ checksum "docker/common/install_apex.sh" }}
 
       - <<: *install_vissl
 

diff --git a/configs/config/benchmark/fulltune/imagenet1k/eval_resnet_8gpu_transfer_in1k_fulltune.yaml b/configs/config/benchmark/fulltune/imagenet1k/eval_resnet_8gpu_transfer_in1k_fulltune.yaml
@@ -50,9 +50,6 @@ config:
   TRAINER:
     TRAIN_STEP_NAME: standard_train_step
   MODEL:
-    FEATURE_EVAL_SETTINGS:
-      EVAL_MODE_ON: True
-      EVAL_TRUNK_AND_HEAD: False
     TRUNK:
       NAME: resnet
       RESNETS:

diff --git a/configs/config/benchmark/fulltune/imagenet1k/models/mobilenet_v3_timm.yaml b/configs/config/benchmark/fulltune/imagenet1k/models/mobilenet_v3_timm.yaml
@@ -0,0 +1,14 @@
+# @package _global_
+config:
+  MODEL:
+    TRUNK:
+      NAME: mobilenetv3_timm
+      MOBILE_NET:
+        NAME: mobilenetv3_large_100
+        TRUNK_ONLY: True
+    HEAD:
+      PARAMS: [
+        ["mobilenet_v3_head_timm", {"num_classes": 1000}],
+      ]
+  OPTIMIZER:
+    regularize_bn: True
diff --git a/configs/config/benchmark/fulltune/imagenet1k/models/mobilenet_v3_tv.yaml b/configs/config/benchmark/fulltune/imagenet1k/models/mobilenet_v3_tv.yaml
@@ -0,0 +1,12 @@
+# @package _global_
+config:
+  MODEL:
+    TRUNK:
+      NAME: mobilenetv3_tv
+      MOBILE_NET:
+        NAME: mobilenetv3_large_100
+        TIMM_BN: False
+    HEAD:
+      PARAMS: [
+        ["mobilenet_v3_head", {"num_classes": 1000}],
+      ]
diff --git a/configs/config/benchmark/fulltune/imagenet1k/models/resnet18_eval_mlp.yaml b/configs/config/benchmark/fulltune/imagenet1k/models/resnet18_eval_mlp.yaml
@@ -0,0 +1,9 @@
+# @package _global_
+config:
+  MODEL:
+    TRUNK:
+      NAME: resnet
+      RESNETS:
+        DEPTH: 18
+    HEAD:
+      PARAMS: [['eval_mlp', {'in_channels': 512, 'dims': [512, 1000]}]]
diff --git a/configs/config/benchmark/fulltune/imagenet1k/models/resnet34_eval_mlp.yaml b/configs/config/benchmark/fulltune/imagenet1k/models/resnet34_eval_mlp.yaml
@@ -0,0 +1,9 @@
+# @package _global_
+config:
+  MODEL:
+    TRUNK:
+      NAME: resnet
+      RESNETS:
+        DEPTH: 34
+    HEAD:
+      PARAMS: [['eval_mlp', {'in_channels': 512, 'dims': [512, 1000]}]]
diff --git a/configs/config/benchmark/fulltune/imagenet1k/models/resnext50_eval_mlp.yaml b/configs/config/benchmark/fulltune/imagenet1k/models/resnext50_eval_mlp.yaml
@@ -0,0 +1,9 @@
+# @package _global_
+config:
+  MODEL:
+    TRUNK:
+      NAME: resnet
+      RESNETS:
+        DEPTH: 50
+    HEAD:
+      PARAMS: [['eval_mlp', {'in_channels': 2048, 'dims': [2048, 1000]}]]
diff --git a/configs/config/benchmark/fulltune/imagenet1k/models/vit_tiny_cls4_eval_mlp.yaml b/configs/config/benchmark/fulltune/imagenet1k/models/vit_tiny_cls4_eval_mlp.yaml
@@ -0,0 +1,27 @@
+# @package _global_
+config:
+  MODEL:
+    FEATURE_EVAL_SETTINGS:
+      EVAL_MODE_ON: True
+      FREEZE_TRUNK_AND_HEAD: True
+      LINEAR_EVAL_FEAT_POOL_OPS_MAP: [
+        ["concatCLS4", ["Identity", []] ],
+      ]
+    TRUNK: # Tiny
+      NAME: vision_transformer
+      VISION_TRANSFORMERS:
+        IMAGE_SIZE: 224
+        PATCH_SIZE: 16
+        NUM_LAYERS: 12
+        NUM_HEADS: 3
+        HIDDEN_DIM: 192
+        MLP_DIM: 768
+        CLASSIFIER: token
+        DROPOUT_RATE: 0
+        ATTENTION_DROPOUT_RATE: 0
+        QKV_BIAS: True
+        DROP_PATH_RATE: 0.0
+    HEAD:
+      PARAMS: [
+        ["eval_mlp", {"in_channels": 768, "dims": [768, 1000]}],
+      ]
diff --git a/configs/config/benchmark/linear_image_classification/cifar100/models/beit_vit_l16.yaml b/configs/config/benchmark/linear_image_classification/cifar100/models/beit_vit_l16.yaml
@@ -1,29 +1,5 @@
 # @package _global_
 config:
-  DATA:
-    TRAIN:
-      BATCHSIZE_PER_REPLICA: 32
-      TRANSFORMS:
-        - name: RandomResizedCrop
-          size: 224
-          interpolation: 3
-        - name: RandomHorizontalFlip
-        - name: ToTensor
-        - name: Normalize
-          mean: [0.485, 0.456, 0.406]
-          std: [0.229, 0.224, 0.225]
-    TEST:
-      BATCHSIZE_PER_REPLICA: 32
-      TRANSFORMS:
-        - name: Resize
-          size: 256
-          interpolation: 3
-        - name: CenterCrop
-          size: 224
-        - name: ToTensor
-        - name: Normalize
-          mean: [0.485, 0.456, 0.406]
-          std: [0.229, 0.224, 0.225]
   MODEL:
     FEATURE_EVAL_SETTINGS:
       LINEAR_EVAL_FEAT_POOL_OPS_MAP: [
@@ -54,9 +30,5 @@ config:
         ["eval_mlp", {"in_channels": 4096, "dims": [4096, 100]}],
         ["eval_mlp", {"in_channels": 1024, "dims": [1024, 100]}],
       ]
-    WEIGHTS_INIT:
-      PARAMS_FILE: "manifold://ssl_framework/tree/gfsai-bistro2-east/ai-group/users/prigoyal/vissl/oss_beit_large_patch16_224_pt22k.pth"
-      APPEND_PREFIX: trunk.base_model.
-      STATE_DICT_KEY_NAME: 'model'
   OPTIMIZER:
       regularize_bn: True
diff --git a/configs/config/benchmark/linear_image_classification/cifar100/models/mobilenet_v3_timm.yaml b/configs/config/benchmark/linear_image_classification/cifar100/models/mobilenet_v3_timm.yaml
@@ -0,0 +1,20 @@
+# @package _global_
+config:
+  MODEL:
+    FEATURE_EVAL_SETTINGS:
+      LINEAR_EVAL_FEAT_POOL_OPS_MAP: [
+        ["flatten", ["Identity", []] ],
+        ["flatten", ["Identity", []] ],
+      ]
+    TRUNK:
+      NAME: mobilenetv3_timm
+      MOBILE_NET:
+        NAME: mobilenetv3_large_100
+        PRETRAINED: False
+    HEAD:
+      PARAMS: [
+        ["eval_mlp", {"in_channels": 1280, "dims": [1280, 100]}],
+        ["mlp", {"dims": [1280, 100]}],
+      ]
+  OPTIMIZER:
+    regularize_bn: True
diff --git a/configs/config/benchmark/linear_image_classification/cifar100/models/mobilenet_v3_tv.yaml b/configs/config/benchmark/linear_image_classification/cifar100/models/mobilenet_v3_tv.yaml
@@ -0,0 +1,105 @@
+# @package _global_
+config:
+  MODEL:
+    FEATURE_EVAL_SETTINGS:
+      EVAL_MODE_ON: True
+      FREEZE_TRUNK_ONLY: True
+      SHOULD_FLATTEN_FEATS: True
+      LINEAR_EVAL_FEAT_POOL_OPS_MAP: [
+        # Linear heads on top of normalized or not representations
+        ["trunk_pool", ["Identity", []] ],
+        ["trunk_pool", ["Identity", []] ],
+        ["trunk_pool", ["Identity", []] ],
+
+        # MobileNet head on top of normalized or not representations
+        ["trunk_pool", ["Identity", []] ],
+        ["trunk_pool", ["Identity", []] ],
+        ["trunk_pool", ["Identity", []] ],
+        # ["trunk_pool", ["Identity", []] ],
+        # ["trunk_pool", ["Identity", []] ],
+
+        # Exploring a two-layer head
+        ["trunk_pool", ["Identity", []] ],
+        ["trunk_pool", ["Identity", []] ],
+        ["trunk_pool", ["Identity", []] ],
+
+        # Combining several levels of representations
+        ["trunk", ["AdaptiveAvgPool2d", [[2, 1]]]],
+        ["trunk", ["AdaptiveAvgPool2d", [[2, 1]]]],
+        ["trunk", ["AdaptiveAvgPool2d", [[2, 1]]]],
+        ["trunk", ["AdaptiveAvgPool2d", [[2, 2]]]],
+        ["trunk", ["AdaptiveAvgPool2d", [[2, 2]]]],
+        ["trunk", ["AdaptiveAvgPool2d", [[2, 2]]]],
+      ]
+    TRUNK:
+      NAME: mobilenetv3_tv
+      MOBILE_NET:
+        NAME: mobilenetv3_large_100
+        PRETRAINED: False
+    HEAD:
+      PARAMS: [
+        # Linear heads on top of normalized or not representations
+        ["eval_mlp", {"in_channels": 960, "dims": [960, 100]}],
+        ["eval_mlp", {"in_channels": 960, "dims": [960, 100]}],
+        ["eval_mlp", {"in_channels": 960, "dims": [960, 100]}],
+
+        # MobileNet head on top of normalized or not representations
+        ["mobilenet_v3_head", {"with_bn": True, "num_classes": 100}],
+        ["mobilenet_v3_head", {"with_bn": True, "num_classes": 100}],
+        ["mobilenet_v3_head", {"with_bn": True, "num_classes": 100}],
+        # ["mobilenet_v3_head", {"with_bn": True, "drop_out": 0.1, "num_classes": 100}],
+        # ["mobilenet_v3_head", {"with_bn": True, "drop_out": 0.0, "num_classes": 100}],
+
+        # Exploring a two-layer head
+        ["eval_mlp", {"in_channels": 960, "dims": [960, 1280, 100]}],
+        ["eval_mlp", {"in_channels": 960, "dims": [960, 1280, 100]}],
+        ["eval_mlp", {"in_channels": 960, "dims": [960, 1280, 100]}],
+
+        # Combining several levels of representations
+        ["eval_mlp", {"in_channels": 1920, "dims": [1920, 100]}],
+        ["eval_mlp", {"in_channels": 1920, "dims": [1920, 100]}],
+        ["eval_mlp", {"in_channels": 1920, "dims": [1920, 100]}],
+        ["eval_mlp", {"in_channels": 3840, "dims": [3840, 100]}],
+        ["eval_mlp", {"in_channels": 3840, "dims": [3840, 100]}],
+        ["eval_mlp", {"in_channels": 3840, "dims": [3840, 100]}],
+      ]
+  OPTIMIZER:
+    name: sgd
+    # In the OSS Caffe2 benchmark, RN50 models use 1e-4 and AlexNet models 5e-4
+    weight_decay: 0.0005
+    momentum: 0.9
+    num_epochs: 28
+    nesterov: True
+    regularize_bn: True
+    regularize_bias: True
+    param_schedulers:
+      lr:
+        auto_lr_scaling:
+          auto_scale: true
+          base_value: 0.01
+          base_lr_batch_size: 256
+        name: multistep
+        values: [0.01, 0.001, 0.0001, 0.00001]
+        milestones: [8, 16, 24]
+        update_interval: epoch
+    param_group_constructor: linear_eval_heads
+    linear_eval_heads:
+      # Linear heads on top of normalized or not representations
+      - {"lr": 1.0, "weight_decay": 0.0005, "regularize_bn": True}
+      - {"lr": 1.0, "weight_decay": 0.0005, "regularize_bn": False}
+      - {"lr": 1.0, "weight_decay": 0.0}
+      # MobileNet head on top of normalized or not representations
+      - {"lr": 1.0, "weight_decay": 0.0005, "regularize_bn": True}
+      - {"lr": 1.0, "weight_decay": 0.0005, "regularize_bn": False}
+      - {"lr": 1.0, "weight_decay": 0.0}
+      # Exploring a two-layer head
+      - {"lr": 1.0, "weight_decay": 0.0005}
+      - {"lr": 1.0, "weight_decay": 0.0001}
+      - {"lr": 1.0, "weight_decay": 0.0}
+      # Combining several levels of representations
+      - {"lr": 1.0, "weight_decay": 0.0005, "regularize_bn": True}
+      - {"lr": 1.0, "weight_decay": 0.0005, "regularize_bn": False}
+      - {"lr": 1.0, "weight_decay": 0.0}
+      - {"lr": 1.0, "weight_decay": 0.0005, "regularize_bn": True}
+      - {"lr": 1.0, "weight_decay": 0.0005, "regularize_bn": False}
+      - {"lr": 1.0, "weight_decay": 0.0}
diff --git a/configs/config/benchmark/linear_image_classification/cifar100/models/vit_g16_no_cls.yaml b/configs/config/benchmark/linear_image_classification/cifar100/models/vit_g16_no_cls.yaml
@@ -0,0 +1,54 @@
+# @package _global_
+config:
+  DATA:
+    TRAIN:
+      TRANSFORMS:
+        - name: RandomResizedCrop
+          size: 224
+          interpolation: 3
+        - name: RandomHorizontalFlip
+        - name: ToTensor
+        - name: Normalize
+          mean: [0.485, 0.456, 0.406]
+          std: [0.229, 0.224, 0.225]
+    TEST:
+      TRANSFORMS:
+        - name: Resize
+          size: 256
+          interpolation: 3
+        - name: CenterCrop
+          size: 224
+        - name: ToTensor
+        - name: Normalize
+          mean: [0.485, 0.456, 0.406]
+          std: [0.229, 0.224, 0.225]
+  MODEL:
+    FEATURE_EVAL_SETTINGS:
+      LINEAR_EVAL_FEAT_POOL_OPS_MAP: [
+        ["concatPOOL4", ["Identity", []] ],
+        ["lastPOOL", ["Identity", []] ],
+        ["concatPOOL4", ["Identity", []] ],
+        ["lastPOOL", ["Identity", []] ],
+      ]
+    TRUNK: # g-16 (ViT-giant)
+      NAME: vision_transformer
+      VISION_TRANSFORMERS:
+        IMAGE_SIZE: 224
+        PATCH_SIZE: 16
+        NUM_LAYERS: 40
+        NUM_HEADS: 16
+        HIDDEN_DIM: 1408
+        MLP_DIM: 6144
+        DROPOUT_RATE: 0.0
+        ATTENTION_DROPOUT_RATE: 0.0
+        CLASSIFIER: token
+        QKV_BIAS: True
+        DROP_PATH_RATE: 0.0
+        USE_CLASS_TOKEN: False
+    HEAD:
+      PARAMS: [
+        ["eval_mlp", {"in_channels": 5632, "dims": [5632, 100]}],
+        ["eval_mlp", {"in_channels": 1408, "dims": [1408, 100]}],
+        ["mlp", {"dims": [5632, 100]}],
+        ["mlp", {"dims": [1408, 100]}],
+      ]