In [1]:
! pip install deepspeed perceiver-multi-modality-pytorch==1.1.0 



In [2]:
# mpi4py is installed next to DeepSpeed — presumably for DeepSpeed's MPI-based
# environment discovery; TODO confirm it is actually required for this run.
!pip install mpi4py
# `!!` runs the shell command and returns its output as a list of lines, which
# is why this cell's output is rendered as a Python list.
# Installs the cu110 build of torch 1.7.1 from the PyTorch wheel index.
!!pip install torch==1.7.1+cu110  -f https://download.pytorch.org/whl/torch_stable.html




['Looking in links: https://download.pytorch.org/whl/torch_stable.html',

In [3]:
# Print DeepSpeed's C++/CUDA extension op report: for each op, whether it is
# pre-installed and whether this system can JIT-compile it.
!ds_report

--------------------------------------------------
DeepSpeed C++/CUDA extension op report
--------------------------------------------------
NOTE: Ops not installed will be just-in-time (JIT) compiled at
      runtime if needed. Op compatibility means that your system
      meet the required dependencies to JIT install the op.
--------------------------------------------------
JIT compiled ops requires ninja
ninja .................. [OKAY]
--------------------------------------------------
op name ................ installed .. compatible
--------------------------------------------------
cpu_adam ............... [NO] ....... [OKAY]
fused_adam ............. [NO] ....... [OKAY]
fused_lamb ............. [NO] ....... [OKAY]
sparse_attn ............ [NO] ....... [NO]
transformer ............ [NO] ....... [OKAY]
stochastic_transformer . [NO] ....... [OKAY]
utils ..............

In [2]:

# Number of iterations of the training loop at the end of this cell
# (each iteration draws one fresh random batch).
num_epochs=10

from perceiver_pytorch.multi_modality_perceiver import  InputModality
from perceiver_pytorch.multi_modality_with_text_perceiver import MultiModalityWithTextPerceiver, InputModalityWithEmbedding
import torch

import deepspeed

# Per-modality input descriptions. Each one names the modality, gives the
# number of channels per input token, the number of axes of the input, and
# the frequency-band settings (num_freq_bands / max_freq).

# Video: 3 channels, 3 axes.
video_modality = InputModalityWithEmbedding(
    name='video',
    input_channels=3,  # number of channels for each token of the input
    input_axis=3,  # number of axes, 3 for video
    num_freq_bands=6,  # number of freq bands, with original value (2 * K + 1)
    max_freq=4.,  # maximum frequency, hyperparameter depending on how fine the data is
)
# Image: 3 channels, 2 axes.
image_modality = InputModalityWithEmbedding(
    name='image',
    input_channels=3,  # number of channels for each token of the input
    input_axis=2,  # number of axes, 2 for images
    num_freq_bands=6,  # number of freq bands, with original value (2 * K + 1)
    max_freq=4.,  # maximum frequency, hyperparameter depending on how fine the data is
)
# Audio: 1 channel (mono), 1 axis.
# NOTE(review): audio_modality is defined here but is not included in the
# `modalities` tuple passed to the model in this cell — confirm whether that
# is intentional.
audio_modality = InputModalityWithEmbedding(
    name='audio',
    input_channels=1,  # number of channels for mono audio
    input_axis=1,  # number of axes, 1 for audio
    num_freq_bands=6,  # number of freq bands, with original value (2 * K + 1)
    max_freq=8.,  # maximum frequency, hyperparameter depending on how fine the data is
)
# Build the multi-modality Perceiver over the video and image modalities
# (audio_modality is not part of the `modalities` tuple).
model = MultiModalityWithTextPerceiver(
    modalities=(video_modality, image_modality),
    depth=2,  # depth of net, combined with num_latent_blocks_per_layer to produce full Perceiver
    num_latents=12,
    # number of latents, or induced set points, or centroids. different papers giving it different names
    latent_dim=64,  # latent dimension
    cross_heads=1,  # number of heads for cross attention. paper said 1
    latent_heads=2,  # number of heads for latent self attention (the original note also says "8" — presumably a reference value; TODO confirm)
    cross_dim_head=64,
    latent_dim_head=64,
    num_classes=10,  # output number of classes
    attn_dropout=0.,
    ff_dropout=0.,
    # whether to weight tie layers (optional, as indicated in the diagram)
    weight_tie_layers=True,
    num_latent_blocks_per_layer=2 # Note that this parameter is 1 in the original lucidrains implementation
)


# Switch controlling whether the ZeRO stage-3 section is added to the config.
stage_3 = True

# DeepSpeed configuration dictionary; it is handed to deepspeed.initialize()
# via `config_params` later in this cell.
ds_config = {
    "train_batch_size": 3,
    "steps_per_print": 2000,
    # Adam optimizer hyperparameters.
    "optimizer": {
        "type": "Adam",
        "params": {
            "lr": 0.001,
            "betas": [0.8, 0.999],
            "eps": 1e-8,
            "weight_decay": 3e-7,
        },
    },
    # fp16 training; per the DeepSpeed config docs a "loss_scale" of 0
    # selects dynamic loss scaling.
    "fp16": {
        "enabled": True,
        "loss_scale": 0,
        "initial_scale_power": 32,
        "loss_scale_window": 1000,
        "hysteresis": 2,
        "min_loss_scale": 1,
    },
    # Learning-rate warm-up from 0 to 0.001 over 1000 steps.
    "scheduler": {
        "type": "WarmupLR",
        "params": {
            "warmup_min_lr": 0,
            "warmup_max_lr": 0.001,
            "warmup_num_steps": 1000,
        },
    },
    "wall_clock_breakdown": False,
}

if stage_3:
    # ZeRO stage-3 settings, with both CPU offload options switched off.
    ds_config["zero_optimization"] = {
        "stage": 3,
        "cpu_offload": False,
        "cpu_offload_params": False,
        "overlap_comm": True,
        "contiguous_gradients": True,
        "stage3_max_live_parameters": 6000000,
        "stage3_max_reuse_distance": 100000000,
        "stage3_prefetch_bucket_size": 200000,
        "stage3_param_persistence_threshold": 100000,
        "reduce_bucket_size": 3000000,
        "sub_group_size": 1e6,
    }
# Put the model on the GPU before wrapping it with DeepSpeed.
model = model.to(torch.device('cuda'))

# Only parameters that require gradients are handed to DeepSpeed.
parameters = (param for param in model.parameters() if param.requires_grad)

# Wrap the model with DeepSpeed using the ds_config dictionary built above.
# The call returns a 4-tuple; the first three items are kept as the engine,
# the optimizer and the data loader, and the fourth is discarded into `__`.
# NOTE(review): no training data is passed here, so `trainloader` is
# presumably None — confirm before relying on it.
model_engine, optimizer, trainloader, __ = deepspeed.initialize(
    model=model,
    model_parameters=parameters,
    config_params=ds_config,
)

# Synthetic training run: every iteration draws one random image batch and one
# random video batch, runs a forward pass, and uses the mean of the model
# output as the loss.
# NOTE(review): the forward pass runs under torch.cuda.amp.autocast() while
# ds_config also enables DeepSpeed fp16, and the recorded output of this cell
# ends in a RuntimeError whose message was not preserved — confirm that
# combining the two is intended.
device = model_engine.local_rank  # device index reported by the DeepSpeed engine

for epoch in range(num_epochs):
    running_loss = 0.0  # initialised each iteration; not updated or read in this cell

    # Random inputs, moved to this process's device.
    image_inputs = torch.rand(size=(3, 64, 64, 3), requires_grad=True).to(device)
    video_inputs = torch.rand(size=(3, 2, 64, 64, 3), requires_grad=True).to(device)
    batch = {'image': image_inputs, 'video': video_inputs}

    with torch.cuda.amp.autocast():
        outputs = model_engine(batch)

    loss = outputs.mean()

    # Backward pass and optimizer step both go through the DeepSpeed engine.
    model_engine.backward(loss)
    model_engine.step()

print("DONE")


[2021-04-18 14:14:19,654] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed info: version=0.3.14, git-hash=unknown, git-branch=unknown
[2021-04-18 14:14:19,662] [INFO] [engine.py:80:_initialize_parameter_parallel_groups] data_parallel_size: 1, parameter_parallel_size: 1
Using /root/.cache/torch_extensions as PyTorch extensions root...
No modifications detected for re-loaded extension module fused_adam, skipping build step...
Loading extension module fused_adam...
Time to load fused_adam op: 0.0027871131896972656 seconds
[2021-04-18 14:14:19,777] [INFO] [engine.py:608:_configure_optimizer] Using DeepSpeed Optimizer param name adam as basic optimizer
[2021-04-18 14:14:19,778] [INFO] [engine.py:612:_configure_optimizer] DeepSpeed Basic Optimizer = FusedAdam
Checking ZeRO support for optimizer=FusedAdam type=<class 'deepspeed.ops.adam.fused_adam.FusedAdam'>
[2021-04-18 14:14:19,782] [INFO] [logging.py:60:log_dist] [Rank 0] Creating fp16 ZeRO stage 3 optimizer
Initializing ZeRO Stage 3
[20



[2021-04-18 14:14:20,063] [INFO] [config.py:741:print]   zero_optimization_stage ...... 3
[2021-04-18 14:14:20,064] [INFO] [config.py:747:print]   json = {
    "fp16":{
        "enabled":true,
        "hysteresis":2,
        "initial_scale_power":32,
        "loss_scale":0,
        "loss_scale_window":1000,
        "min_loss_scale":1
    },
    "optimizer":{
        "params":{
            "betas":[
                0.8,
                0.999
            ],
            "eps":1e-08,
            "lr":0.001,
            "weight_decay":3e-07
        },
        "type":"Adam"
    },
    "scheduler":{
        "params":{
            "warmup_max_lr":0.001,
            "warmup_min_lr":0,
            "warmup_num_steps":1000
        },
        "type":"WarmupLR"
    },
    "steps_per_print":2000,
    "train_batch_size":3,
    "wall_clock_breakdown":false,
    "zero_optimization":{
        "contiguous_gradients":true,
        "cpu_offload":false,
        "cpu_offload_params":false,
        "overlap_co

RuntimeError: ignored

In [None]:
ONE