#### First get the candidate model parts (Head and Feature Extractor)

In [1]:
import mlflow
# Point the MLflow client at the tracking server used by this notebook.
mlflow.set_tracking_uri("http://127.0.0.1:5000")
# Model-registry URI of the form models:/<name>/<version>; here name is
# 'Laion Balanced' and version is '1'.
model_uri = "models:/{}/{}".format('Laion Balanced', '1')
# Load the registered model through MLflow's PyTorch flavor.
head_model = mlflow.pytorch.load_model(model_uri)
# Bare last expression: display the loaded module's repr as the cell output.
head_model



DiffFeatureDetectorParamBiDirectional(
  (cls_layer): Sequential(
    (features_dropout): Dropout(p=0.31296189949335906, inplace=False)
    (linear_0): Linear(in_features=1280, out_features=184, bias=True)
    (relu_0): ReLU()
    (hidden_dropout_0): Dropout(p=0.5838650129135917, inplace=False)
    (linear_1): Linear(in_features=184, out_features=184, bias=True)
    (relu_1): ReLU()
    (hidden_dropout_1): Dropout(p=0.5838650129135917, inplace=False)
    (linear_2): Linear(in_features=184, out_features=184, bias=True)
    (relu_2): ReLU()
    (hidden_dropout_2): Dropout(p=0.5838650129135917, inplace=False)
    (linear_out): Linear(in_features=184, out_features=1, bias=True)
  )
)

In [None]:
from drecg.training.ignite_finetune import define_model_for_tune
import mlflow
# Same tracking server as the cell that loads `head_model`; repeated here so
# this cell does not depend on that one having run first.
mlflow.set_tracking_uri("http://127.0.0.1:5000")
# NOTE(review): presumably combines the registered 'Laion Balanced' head with
# the 'ViT_LAION' feature extractor into one tunable model — confirm against
# define_model_for_tune. The recorded output shows checkpoint shards being
# downloaded and loaded during this call.
model_1 = define_model_for_tune('Laion Balanced', 'ViT_LAION')

`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [1]:
from drecg.feature_extraction.distributed import VitImageFeatureExtractor

In [2]:
# Build the project's ViT image feature extractor via its own pretrained loader.
model = VitImageFeatureExtractor.load_pretrained()

config.json not found in HuggingFace Hub.


In [3]:
# Sequential form of the extractor; its repr (shown by `seq_model.eval()`) is a
# torch Sequential of an embedding stage followed by encoder layers.
seq_model = model.to_sequential()

In [4]:
# Switch to evaluation mode. eval() returns the module itself, so this cell
# also displays the module structure as its output.
seq_model.eval()

Sequential(
  (0): Sequential(
    (0): CLIPVisionEmbeddings(
      (patch_embedding): Conv2d(3, 1664, kernel_size=(14, 14), stride=(14, 14), bias=False)
      (position_embedding): Embedding(257, 1664)
    )
    (1): LayerNorm((1664,), eps=1e-05, elementwise_affine=True)
  )
  (1): EncoderLayerSimple(
    (enc_layer): CLIPEncoderLayer(
      (self_attn): CLIPAttention(
        (k_proj): Linear(in_features=1664, out_features=1664, bias=True)
        (v_proj): Linear(in_features=1664, out_features=1664, bias=True)
        (q_proj): Linear(in_features=1664, out_features=1664, bias=True)
        (out_proj): Linear(in_features=1664, out_features=1664, bias=True)
      )
      (layer_norm1): LayerNorm((1664,), eps=1e-05, elementwise_affine=True)
      (mlp): CLIPMLP(
        (activation_fn): GELUActivation()
        (fc1): Linear(in_features=1664, out_features=8192, bias=True)
        (fc2): Linear(in_features=8192, out_features=1664, bias=True)
      )
      (layer_norm2): LayerNorm((1664,

In [5]:
import torch
# Random batch of 3 inputs of shape (3, 224, 224), values uniform in [0, 1).
# The same tensor is fed to every model variant so their outputs can be compared.
dummy_tensor_input = torch.rand(3, 3, 224, 224)


In [9]:
# Forward pass through the extractor wrapper. In this notebook the result is
# only fed to torch.allclose, so autograd is disabled to avoid keeping the
# activations and graph of a multi-GB model alive.
with torch.no_grad():
    out_0 = model(dummy_tensor_input)

In [9]:
# Forward pass through the sequential form of the extractor. In this notebook
# the result is only fed to torch.allclose, so autograd is disabled to avoid
# keeping the activations and graph of a multi-GB model alive.
with torch.no_grad():
    out_1 = seq_model(dummy_tensor_input)

In [12]:
# Equivalence check: the wrapper and its sequential form should give the same
# output for the same input, within torch.allclose's default tolerances.
torch.allclose(out_0, out_1)

True

In [6]:
from transformers import AutoModel

# Load the LAION CLIP ViT-bigG-14 checkpoint by its Hub id with transformers;
# the cells after this one compare its image features against `seq_model`.
original_laion_model = AutoModel.from_pretrained("laion/CLIP-ViT-bigG-14-laion2B-39B-b160k")

`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [7]:
# Switch the reference model to evaluation mode; the trailing semicolon
# suppresses the (very long) module repr that eval() would otherwise display.
original_laion_model.eval();

In [8]:
# Image features from the reference transformers model for the shared dummy
# batch. In this notebook the result is only fed to torch.allclose, so autograd
# is disabled to avoid keeping the activations and graph of a multi-GB model alive.
with torch.no_grad():
    out_2 = original_laion_model.get_image_features(pixel_values=dummy_tensor_input)

In [10]:
# Equivalence check: the sequential extractor should reproduce the image
# features of the reference model, within torch.allclose's default tolerances.
torch.allclose(out_1, out_2)

True

In [None]:
from transformers.models.clip.modeling_clip import CLIPModel

In [None]:
# Ad-hoc call on a fresh random batch of 3 inputs of shape (3, 224, 224); the
# returned features are displayed as the cell output and not stored.
original_laion_model.get_image_features(torch.rand(3, 3, 224, 224))

In [3]:
import torch
class Extractor(torch.nn.Module):
    """Run one shared vision model over both members of an image pair.

    Args:
        vision_model: Module (or callable) applied independently to each
            image of the pair. It is stored as the ``vision_model``
            attribute, so a module passed here is registered as a submodule.
    """

    def __init__(self, vision_model):
        super().__init__()
        self.vision_model = vision_model

    def forward(self, x):
        """Encode a pair of inputs with the shared vision model.

        Args:
            x: Two-element iterable ``(first, second)``; each element is
                passed to ``vision_model`` unchanged.

        Returns:
            Tuple ``(vision_model(first), vision_model(second))``.
        """
        first, second = x
        encode = self.vision_model
        # Same call order as the pair order: first image, then second.
        return encode(first), encode(second)

In [5]:
# Model weights alone on the GPU: 7.9 GB
# Forward pass, batch of 4 image pairs: +13.5 GB
# Backward pass, batch of 4 image pairs: +5 GB
# Forward pass time on GPU, batch of 4 pairs: 300 ms
# Forward + backward pass time on CPU, batch of 4 pairs: 13 s + 40 s

In [1]:
# Smoke test: train the sequential extractor plus a linear head with
# torch's Pipe, split across two devices, on random inputs with all-ones
# labels, printing the loss and the duration of every iteration.
import os
import time

import torch
import torch.nn as nn
from torch.distributed.pipeline.sync import Pipe

from drecg.feature_extraction.distributed import (
    VitImageFeatureExtractor,
    sequential_model_to_devices,
)

# Single source of truth for the sizes that the input batch, the labels and
# the loop must agree on.
BATCH_SIZE = 10
N_ITERS = 10

# Pipe is built on the RPC framework, so a single-process RPC group
# (rank 0, world size 1) is initialised before the pipeline is constructed.
# NOTE: init_rpc cannot be called twice in the same process; restart the
# kernel before re-running this cell.
os.environ['MASTER_ADDR'] = 'localhost'
os.environ['MASTER_PORT'] = '29500'
torch.distributed.rpc.init_rpc('worker', rank=0, world_size=1)

model = VitImageFeatureExtractor.load_pretrained()

model_seq = model.to_sequential()
# Single-logit head appended as the last pipeline stage.
# NOTE(review): 1280 is assumed to be the extractor's output width (it matches
# the in_features of the registered head printed earlier) — confirm.
model_seq.add_module('head', nn.Linear(1280, 1))

device0 = torch.device("cuda:0")
device1 = torch.device("cpu")
# NOTE(review): presumably places the stages of model_seq on device0/device1 —
# confirm the exact split against sequential_model_to_devices.
sequential_model_to_devices(model_seq, device0, device1)
# Each batch is split into 2 micro-batches that flow through the stages.
model_pipe = Pipe(model_seq, chunks=2)

loss_fn = nn.BCEWithLogitsLoss()
# All-positive dummy labels, one per sample in the batch.
y_true = torch.ones(BATCH_SIZE, 1, dtype=torch.float32)

model_pipe.train()
adam_w = torch.optim.AdamW(model_pipe.parameters(), lr=1e-3)

for i in range(N_ITERS):
    # perf_counter is monotonic, so the measured interval cannot be distorted
    # by system clock adjustments the way time.time() can.
    init_time = time.perf_counter()
    # Inputs must live on the device of the first pipeline stage.
    dummy_tensor_input = torch.rand(BATCH_SIZE, 3, 224, 224).to(device0)
    adam_w.zero_grad()
    # Pipe returns an RRef; local_value() unwraps the output tensor.
    out = model_pipe(dummy_tensor_input).local_value()
    # Compare on whatever device the last stage produced its output on
    # (.to is a no-op when the labels are already there).
    loss = loss_fn(out, y_true.to(out.device))
    loss.backward()
    adam_w.step()
    total_time_segs = time.perf_counter() - init_time
    print("iter: {}, loss: {}, time: {} segs".format(i, loss.item(), total_time_segs))



config.json not found in HuggingFace Hub.


iter: 0, loss: 1.5391587018966675, time: 32.90772318840027 segs
iter: 1, loss: 3.576278118089249e-07, time: 32.02351236343384 segs
iter: 2, loss: 1.0216133887297474e-05, time: 31.575616121292114 segs
iter: 3, loss: 9.691621016827412e-06, time: 31.562179565429688 segs
iter: 4, loss: 5.960463766996327e-08, time: 31.31337833404541 segs
iter: 5, loss: 0.002470702398568392, time: 31.104001760482788 segs
iter: 6, loss: 0.0, time: 31.410987615585327 segs
iter: 7, loss: 0.0, time: 31.036481857299805 segs
iter: 8, loss: 0.0, time: 31.05917716026306 segs
iter: 9, loss: 0.0, time: 31.293527126312256 segs
