In [1]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import torch.nn.functional as F

from models.linear_decoder import LinearDecoder
from transformers import AutoModelForDepthEstimation

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Candidate encoder identifiers; the model-construction cell picks one by index.
encoder_name = [
    'vit_large_patch14_dinov2',
    "eva02_base_patch14_224",
    "depth-anything/Depth-Anything-V2-Base-hf",
]
# Square input resolution (height, width).
# Original note: img size 512 works with 8 and 16 patch size, not 14
img_size = (1024,) * 2
patch_size, num_classes = 16, 19

In [None]:
# Gather the keyword settings first so they can be inspected on their own,
# then build the linear-decoder model on top of the first encoder in the list.
decoder_kwargs = dict(
    num_classes=num_classes,
    img_size=img_size,
    ckpt_path=None,
    sub_norm=False,
    patch_size=patch_size,
    pretrained=True,
)
model = LinearDecoder(encoder_name[0], **decoder_kwargs)

  warn_deprecated('vmap', 'torch.vmap')


In [4]:
# Use the first GPU when one is available; otherwise fall back to the CPU so
# this cell does not fail on a machine without CUDA.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Move the model through `device` too, so model and tensors always end up on
# the same device (a bare `model.cuda()` ignores the object defined above).
model.to(device)

# One random image and one random label map. Both take their spatial size from
# `img_size`, so changing the config cannot desynchronise input and target.
data = torch.randn(1, 3, *img_size).to(device)
target = torch.randint(0, num_classes, (1, *img_size)).to(device)
dataset = torch.utils.data.TensorDataset(data, target)
dataloader = DataLoader(dataset, batch_size=1)
# NOTE: the misspelled name `critertion` is kept on purpose; the following
# cells refer to the loss under this name.
critertion = nn.CrossEntropyLoss()

In [None]:
# One gradient step's worth of work: forward pass, resize the logits to
# `img_size`, compute the loss against `target`, back-propagate.
logits = model(data)
segmentation = F.interpolate(input=logits, size=img_size, mode="bilinear")
loss = critertion(input=segmentation, target=target)
loss.backward()

In [None]:
# Inference-only pass to inspect the logits before and after resizing.
with torch.no_grad():
    logits = model(data)
    print('hw logits:',logits.shape)
    segmentation = F.interpolate(logits, img_size, mode="bilinear")
    print('HW segmentation logits:',segmentation.shape)
    loss = critertion(segmentation, target)
    # No `loss.backward()` here: under `torch.no_grad()` no autograd graph is
    # recorded, so `loss` has no grad_fn and calling backward raises a
    # RuntimeError. The backward pass belongs outside a no_grad block.

Forwarding Linear Decoder
 Called encoder forward
 Normalized: torch.Size([1, 3, 1024, 1024])
     Forward features HF_models
     Returning last layer: torch.Size([1, 5330, 768])
 Logits: torch.Size([1, 5330, 768])
Got output from decoder: torch.Size([1, 5329, 768])
Got output from head: torch.Size([1, 5329, 19])
Transposed: torch.Size([1, 19, 5329])
hw logits: torch.Size([1, 19, 73, 73])
HW segmentation logits: torch.Size([1, 19, 1024, 1024])


In [16]:
# Load the Depth Anything V2 (Large) checkpoint, move it to the GPU and run
# the backbone in two ways: manually (embeddings -> encoder) and end-to-end.
depthv2 = AutoModelForDepthEstimation.from_pretrained(
    "depth-anything/Depth-Anything-V2-Large-hf",
    state_dict=None,
)
depthv2.cuda()
with torch.no_grad():
    # Manual path: `out` first holds the embeddings, then the encoder output.
    out = depthv2.backbone.encoder(depthv2.backbone.embeddings(data))
    # Full backbone forward on the same input.
    out_back = depthv2.backbone(data)



- DINOv2 (done)
    - python main.py fit -c configs/cityscapes_linear_semantic.yaml --root /media/data/workspace_Giuseppe/code/datasets/CityScapes --data.num_workers 8 --trainer.devices [0,1] --model.network.encoder_name vit_large_patch14_dinov2 --model.network.patch_size 8

- EVA02 
    - currently runs out of memory (OOM) on a 24 GB GPU; it may work on 48 GB
    - patch size 16 with a 1024x1024 input works; according to the benchmark, results at patch size 16 correlate with those at patch size 8,
      so it should be fine at least for a first experiment
    - python main.py fit -c configs/cityscapes_linear_semantic.yaml --root /media/data/workspace_Giuseppe/code/datasets/CityScapes --data.num_workers 8 --trainer.devices [0,1] --model.network.encoder_name eva02_large_patch14_224.mim_m38m --no_compile --model.network.patch_size 8

- Depth-v2 
    - memory consumption seems higher when using Lightning, which is strange because it runs with 16-bit mixed precision
      and the training step just calls the code present in this notebook (plus a couple of extra computations)
    - patch size 8 (even with torch.no_grad()) and patch size 14 -> OOM
    - python main.py fit -c configs/cityscapes_linear_semantic.yaml --root /media/data/workspace_Giuseppe/code/datasets/CityScapes --data.num_workers 8 --trainer.devices [0,1] --model.network.encoder_name depth-anything/Depth-Anything-V2-Base-hf --model.network.patch_size 14

----
## DepthAny-V2 SS

In [None]:
# Load the Depth Anything V2 (Large) depth-estimation checkpoint used by the
# cells in this section. `state_dict=None` is passed explicitly, as before.
depthv2 = AutoModelForDepthEstimation.from_pretrained(
    "depth-anything/Depth-Anything-V2-Large-hf",
    state_dict=None,
)



In [6]:
# Both networks have to live on the GPU for the comparisons below.
model.cuda()
depthv2.cuda()

# Fresh random input batch: sampled on the CPU, then moved to the GPU.
data = torch.randn(1, 3, 1024, 1024).cuda()

In [7]:
# Dinov2 + Linear Decoder
# Step-by-step replay of the encoder -> linear head -> upsample pipeline,
# printing the tensor shape after every stage.
with torch.no_grad():
    print('encoder:')
    # Keep the normalised tensor under its own name. Rebinding `data` would
    # normalise it again on every re-run of this cell and would silently
    # change the input that every later cell receives.
    data_norm = (data - model.pixel_mean) / model.pixel_std
    output = model.encoder.forward_features(data_norm)
    print(output.shape)
    if output.dim() == 4:
        # 4-D output: flatten the trailing two dims and swap axes 1 and 2 to
        # get a token sequence (presumably (B, C, H, W) -> (B, H*W, C)).
        output = output.flatten(2).transpose(1, 2)
    else:
        # remove the prefix tokens (e.g. the cls token)
        output = output[:, model.encoder.num_prefix_tokens :]
    print(output.shape)
    print('linear decoder:')
    output = model.head(output)
    print(output.shape)
    output = output.transpose(1, 2)
    print(output.shape)
    # Fold the token axis back into the 2-D patch grid.
    logits = output.reshape(output.shape[0], -1, *model.grid_size)
    print(logits.shape)
    print('upsample:')
    # Resize to the spatial size of the input instead of a hard-coded
    # (1024, 1024), so the cell keeps working for other input resolutions.
    logits = F.interpolate(logits, data.shape[-2:], mode="bilinear")
    print(logits.shape)

encoder:
torch.Size([1, 16385, 1024])
torch.Size([1, 16384, 1024])
linear decoder:
torch.Size([1, 16384, 19])
torch.Size([1, 19, 16384])
torch.Size([1, 19, 128, 128])
upsample:
torch.Size([1, 19, 1024, 1024])


In [8]:
# DINOv2 encoder forward features, one stage at a time.
# Stages applied in order, printing the shape after each:
#   patch_embed -> proj + flatten
#   _pos_embed  -> add positional embedding and cls token
#   patch_drop  -> dropout
#   norm_pre    -> layer norm
with torch.no_grad():
    out = data
    for stage in ("patch_embed", "_pos_embed", "patch_drop", "norm_pre"):
        out = getattr(model.encoder, stage)(out)
        print(out.shape)
    # transformer blocks; kept under a separate name so `out` still holds the
    # pre-block tokens afterwards
    out_blocks = model.encoder.blocks(out)
    print(out_blocks.shape)

torch.Size([1, 16384, 1024])
torch.Size([1, 16385, 1024])
torch.Size([1, 16385, 1024])
torch.Size([1, 16385, 1024])
torch.Size([1, 16385, 1024])


### How to get features

- paper 
    - depth: dinov2 encoder + DPT decoder 
    - SS: they say "on top of our encoders", so I guess they only use the DINOv2 encoders? (Probably yes)
        - is the forward pass modified? Normally they only extract features at 4 blocks' levels
        - should I simply leave the forward as it is, reshape the features coming out of the backbone and then apply the linear decoder? 
          (the DINOv2 setup uses only the final block's features)
- create two versions (keep linear and same fine-tuning settings)
    - use hf model backbone and discard all but last feature map (stage 24 = last block)
        - test 14x14 patch size for fewer epochs
        - if worked, then use 8x8 patch size
    - use hf model backbone and modify forward pass to stack the feature maps, reshape and adjust linear decoder 
    - try the same, but change pre-processing to depthv2 pipeline

- problems: 
    - neck outputs multi-scale (ms) features 
        - pick only the largest one
        - find way to combine
        - NOTE: "transferring our Depth Anything encoders to semantic segmentation", does it mean I only use backbone and no neck? (lose ~30M params -> basically same params of DinoV2)
            - stack backbone features and reshape to 3 dims
            - take only the last one
    - the paper uses Mask2Former, not a linear probe
        - the benchmark paper shows that ViT-g + Mask2Former does really well only on mIoU; it does poorly on ECE, FPR@95, etc. for semantics, but excels in OOD

In [122]:
# Run the Depth Anything V2 backbone and neck and print the shapes they produce.
with torch.no_grad():
    print('backbone dinov2:')
    output_backbone = depthv2.backbone(data)
    feat_maps_backbone = output_backbone.feature_maps
    print(torch.stack(feat_maps_backbone).shape)

    # Patch-grid size passed to the neck. The model's own patch size is stored
    # as `depth_patch_size` rather than `patch_size`, so the notebook-level
    # `patch_size` used to build the LinearDecoder is not overwritten.
    _, _, height, width = data.shape
    depth_patch_size = depthv2.config.patch_size
    patch_height = height // depth_patch_size
    patch_width = width // depth_patch_size

    print('neck:')
    # reassemble stage + fusion stage
    output_neck = depthv2.neck(feat_maps_backbone, patch_height, patch_width)
    for feat in output_neck:
        print(feat.shape)

backbone dinov2:
torch.Size([4, 1, 5330, 1024])
neck:
torch.Size([1, 256, 73, 73])
torch.Size([1, 256, 146, 146])
torch.Size([1, 256, 292, 292])
torch.Size([1, 256, 584, 584])


In [58]:
# Dummy tensor shaped like the four stacked backbone feature maps printed
# above; used to check that the stage axis can be folded into the token axis.
stacked_feats = torch.randn(4, 1, 5330, 1024)
# Keep the channel dim, let reshape infer the merged token count.
stacked_feats.reshape(1, -1, stacked_feats.shape[-1]).shape

torch.Size([1, 21320, 1024])

#### 1) Flatten after backbone + linear decoder

In [61]:
# Show which backbone stages have their feature maps returned
# (four stage names, see the output below).
depthv2.backbone.out_features

['stage5', 'stage12', 'stage18', 'stage24']

In [None]:
with torch.no_grad():
    # Stack the backbone feature maps along a new leading axis. `out_depth` is
    # bound once, directly to the stacked tensor.
    out_depth = torch.stack(depthv2.backbone(data).feature_maps)
    # Merge the stage and token axes into one. The merged length and the
    # channel size are taken from the tensor itself instead of the hard-coded
    # 5330*4 and 1024, which only hold for one input size / patch size.
    print(out_depth.reshape(1, -1, out_depth.shape[-1]).shape)

torch.Size([1, 21320, 1024])


---

In [129]:
# Trainable parameters of the DINOv2 encoder's patch embedding plus its norm
# layer, i.e. the two non-block modules counted here.
other_stuff_dino = sum(
    p.numel()
    for module in (model.encoder.patch_embed, model.encoder.norm)
    for p in module.parameters()
    if p.requires_grad
)
other_stuff_dino

605184

In [133]:
# Trainable encoder parameters that are neither in the transformer blocks nor
# in `other_stuff_dino`. `transformer_dino` is computed here as well, so this
# cell does not depend on a definition that only appears further down the
# notebook (which fails with NameError on Restart & Run All).
transformer_dino = sum(p.numel() for p in model.encoder.blocks.parameters() if p.requires_grad)
sum(p.numel() for p in model.encoder.parameters() if p.requires_grad) - transformer_dino - other_stuff_dino

5458944

In [205]:
# Trainable-parameter totals of the two backbones, then their difference.
dinov2_params, depthv2_params = (
    sum(p.numel() for p in net.parameters() if p.requires_grad)
    for net in (model.encoder, depthv2.backbone)
)
dinov2_params - depthv2_params

4054016

In [298]:
def _report_params(title, module):
    """Print `title`, then one line per named parameter of `module`
    (name, requires_grad, element count); return the summed element count."""
    print(title)
    total = 0
    for name, params in module.named_parameters():
        print(name, params.requires_grad, params.numel())
        total += params.numel()
    return total


dinov2_embed_params = _report_params('DinoV2 patch_embed params:', model.encoder.patch_embed)

print('-'*50)
depthv2_embeddings_params = _report_params('DepthV2 patch_embed params:', depthv2.backbone.embeddings)

print('-'*50)
print('params difference:', depthv2_embeddings_params - dinov2_embed_params)

DinoV2 patch_embed params:
proj.weight True 602112
proj.bias True 1024
--------------------------------------------------
DepthV2 patch_embed params:
cls_token True 1024
mask_token True 1024
position_embeddings True 1402880
patch_embeddings.projection.weight True 602112
patch_embeddings.projection.bias True 1024
--------------------------------------------------
params difference: 1404928


In [206]:
# Do the transformer stacks of the two backbones hold the same number of
# trainable parameters?
transformer_dino, transformer_depthv2 = (
    sum(p.numel() for p in stack.parameters() if p.requires_grad)
    for stack in (model.encoder.blocks, depthv2.backbone.encoder)
)
transformer_dino == transformer_depthv2

True