#### Feature extraction experiments
This experiments are intended to test the performance of the model when using the feature extraction technique. The idea is to use the pre-trained model as a feature extractor and then train a new model using the extracted features. 

The features area extracted ahead of time and saved for performance reasons. The caveat is that we can not use the data augmentation techniques when extracting the features.

#### Vit Feature extraction
* Load the feature extractor model
* Load the dataloaders for training and testing 
* Iterate and extract the features (save in a list of batches)
* Check memory
* Create a new Dataset with the extracted features

In [1]:
from drecg.models.uform import get_model
import torch

In [3]:
import mlflow
def promote_model_to_registry(run_id, artifact_name, model_name):
    return mlflow.register_model(
        f"runs:/{run_id}/{artifact_name}",
        model_name
    )
mlflow.set_tracking_uri("http://127.0.0.1:5000")
run_id = '0d16bb7f58b543ae9c3f126cab327b2c'
artifact_name = 'model'
promote_model_to_registry(run_id, artifact_name, 'Laion Balanced')

Successfully registered model 'Laion Balanced'.
2023/03/05 13:45:40 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: Laion Balanced, version 1
Created version '1' of model 'Laion Balanced'.


<ModelVersion: creation_timestamp=1678041940322, current_stage='None', description='', last_updated_timestamp=1678041940322, name='Laion Balanced', run_id='0d16bb7f58b543ae9c3f126cab327b2c', run_link='', source='./mlflow_artifacts/16/0d16bb7f58b543ae9c3f126cab327b2c/artifacts/model', status='READY', status_message='', tags={}, user_id='', version='1'>

In [2]:
#get mlflow model
import mlflow
mlflow.set_tracking_uri("http://127.0.0.1:5000")
model_uri = "models:/{}/{}".format('Laion Balanced', '1')
model = mlflow.pytorch.load_model(model_uri)
model



DiffFeatureDetectorParamBiDirectional(
  (cls_layer): Sequential(
    (features_dropout): Dropout(p=0.31296189949335906, inplace=False)
    (linear_0): Linear(in_features=1280, out_features=184, bias=True)
    (relu_0): ReLU()
    (hidden_dropout_0): Dropout(p=0.5838650129135917, inplace=False)
    (linear_1): Linear(in_features=184, out_features=184, bias=True)
    (relu_1): ReLU()
    (hidden_dropout_1): Dropout(p=0.5838650129135917, inplace=False)
    (linear_2): Linear(in_features=184, out_features=184, bias=True)
    (relu_2): ReLU()
    (hidden_dropout_2): Dropout(p=0.5838650129135917, inplace=False)
    (linear_out): Linear(in_features=184, out_features=1, bias=True)
  )
)

In [2]:
model = get_model('unum-cloud/uform-vl-english')
# model = get_model('unum-cloud/uform-vl-multilingual')

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
from drecg.feature_extraction.utils import VitLaionFeatureExtractor

feat_extractor = VitLaionFeatureExtractor()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
from drecg.data.utils import create_dataloader_train

dataloader_ref = create_dataloader_train(transforms=feat_extractor.transforms)
dataloader_new = create_dataloader_train(transforms=model.preprocess_image)


In [5]:
sample_batch_ref = next(iter(dataloader_ref))
sample_batch_new = next(iter(dataloader_new))

In [6]:
x_ref, y_ref, paths_ref = sample_batch_ref
imga_ref, imgb_ref = x_ref


In [7]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
feat_extractor.eval()
feat_extractor.to(device)
imga_ref = imga_ref.to(device)

In [9]:
with torch.no_grad():
    feats = feat_extractor.vit_model.get_image_features(pixel_values=imga_ref)

In [15]:
model.eval()
model.to(device)
x_new, y_new, paths_new = sample_batch_new
imga_new, imgb_new = x_new
imga_new = imga_new.to(device)
with torch.no_grad():
    image_features, image_embedding = model.encode_image(imga_new, return_features=True)

In [20]:
image_embedding.shape, feats.shape, image_embedding.dtype, feats.dtype

(torch.Size([32, 256]), torch.Size([32, 1280]), torch.float32, torch.float32)

In [38]:

def validate_same_shape(batch_ref, batch_new):
    x_ref, y_ref, paths_ref = batch_ref
    imga_ref, imgb_ref = x_ref
    x_new, y_new, paths_new = batch_new
    imga_new, imgb_new = x_new
    assert imga_ref.shape == imga_new.shape
    assert imgb_ref.shape == imgb_new.shape
    assert y_ref.shape == y_new.shape

    ##Print shapes of both batches
    print(f'imga_ref: {imga_ref.shape}')
    print(f'imgb_ref: {imgb_ref.shape}')
    print(f'y_ref: {y_ref.shape}')
    print(f'imga_new: {imga_new.shape}')
    print(f'imgb_new: {imgb_new.shape}')
    print(f'y_new: {y_new.shape}')

validate_same_shape(sample_batch_ref, sample_batch_new)

imga_ref: torch.Size([32, 3, 224, 224])
imgb_ref: torch.Size([32, 3, 224, 224])
y_ref: torch.Size([32])
imga_new: torch.Size([32, 3, 224, 224])
imgb_new: torch.Size([32, 3, 224, 224])
y_new: torch.Size([32])


In [7]:
import torch
isinstance(feat_extractor.transforms, torch.nn.Module)

True

In [1]:
from drecg.feature_extraction.utils import extract_features_with_model


In [2]:
extract_features_with_model(model='ViT_LAION', root_dir='feat_extracted/features_ext_vit_laion')

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/122 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/122 [00:00<?, ?it/s]

  0%|          | 0/122 [00:00<?, ?it/s]

RuntimeError: Parent directory feat_extracted/features_ext_vit_laion does not exist.

In [30]:
from drecg.data.utils import FeaturesDataset
root_dir = 'features_ext_vit_2'
train_dataset_augmented = FeaturesDataset(f'{root_dir}/train_features_augmented.pt')
validation_dataset = FeaturesDataset(f'{root_dir}/validation_features.pt')
test_dataset = FeaturesDataset(f'{root_dir}/test_features.pt')

In [31]:
(img_a, img_b), y = train_dataset_augmented[0]
img_a.shape, img_b.shape, img_a.device, img_b.dtype, y.device

(torch.Size([1024]),
 torch.Size([1024]),
 device(type='cpu'),
 torch.float32,
 device(type='cuda', index=0))

In [23]:
(img_a, img_b), y = train_dataset_augmented[0]
img_a.shape, img_b.shape, img_a.dtype, img_b.dtype, y

(torch.Size([1024]),
 torch.Size([1024]),
 torch.float32,
 torch.float32,
 tensor(0))

In [1]:
from drecg.training.ignite_finetune import train
import mlflow
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment('Uform Fine Tuning')
with mlflow.start_run():
    # train(5, model_head_name='Laion Balanced', feat_ext_name='ViT_LAION')
    train(5)




Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

Starting training
seed_everything done:  42


[1/121]   1%|           [00:00<?]



[1/121]   1%|           [00:00<?]

[1/121]   1%|           [00:00<?]

[1/121]   1%|           [00:00<?]

[1/121]   1%|           [00:00<?]



'11.7'

In [3]:
head = model.head
head

DiffFeatureDetectorParamBiDirectional(
  (cls_layer): Sequential(
    (features_dropout): Dropout(p=0.20600446496280106, inplace=False)
    (linear_0): Linear(in_features=256, out_features=21, bias=True)
    (relu_0): ReLU()
    (hidden_dropout_0): Dropout(p=0.30559697271906344, inplace=False)
    (linear_1): Linear(in_features=21, out_features=21, bias=True)
    (relu_1): ReLU()
    (hidden_dropout_1): Dropout(p=0.30559697271906344, inplace=False)
    (linear_out): Linear(in_features=21, out_features=1, bias=True)
  )
)

In [4]:
vit_model = model.feature_extractor.vit_model.img_encoder

In [5]:
vit_model

VisualEncoder(
  (encoder): VisionTransformer(
    (patch_embed): PatchEmbed(
      (proj): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      (norm): Identity()
    )
    (pos_drop): Dropout(p=0.0, inplace=False)
    (norm_pre): Identity()
    (blocks): Sequential(
      (0): Block(
        (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (attn): Attention(
          (qkv): Linear(in_features=768, out_features=2304, bias=True)
          (attn_drop): Dropout(p=0.0, inplace=False)
          (proj): Linear(in_features=768, out_features=768, bias=True)
          (proj_drop): Dropout(p=0.0, inplace=False)
        )
        (ls1): LayerScale()
        (drop_path1): Identity()
        (norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (mlp): Mlp(
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (act): GELU(approximate='none')
          (drop1): Dropout(p=0.0, inplace=False)
          (fc2): Linear(in_features=

In [6]:
import torch
v3 = torch.randn((10, 3, 224, 224))
img_a_batch.shape, v3.shape

(torch.Size([4, 3, 224, 224]), torch.Size([10, 3, 224, 224]))

In [9]:
vit_model.to('cuda')
vit_model(v3.to('cuda'))

(tensor([[[-6.2998e-01,  2.5293e-01,  4.5888e-01,  ...,  9.6236e-02,
           -6.7797e-01,  2.9705e-01],
          [ 8.7818e-01,  7.4715e-01, -3.9827e-01,  ...,  2.0539e-01,
            6.1649e-01,  1.3331e+00],
          [ 7.0685e-01,  1.0762e+00, -3.3192e-01,  ..., -3.6140e-01,
            6.3357e-01,  1.0098e+00],
          ...,
          [-1.3128e-01,  6.8886e-02, -3.4928e-02,  ..., -8.7357e-02,
            9.9788e-02, -1.6247e-01],
          [ 3.8275e-02,  4.6661e-01,  3.1559e-01,  ..., -3.2795e-01,
            2.4437e-01,  6.2411e-01],
          [ 5.1062e-02, -5.0209e-01, -1.5708e-01,  ...,  3.3960e-01,
            5.6685e-01, -1.7917e-01]],
 
         [[-7.0150e-01,  2.1580e-01,  4.3162e-01,  ...,  7.7260e-02,
           -6.4393e-01,  3.2967e-01],
          [ 1.4705e-01,  2.2726e-01,  2.6083e-02,  ...,  8.0055e-02,
           -2.5484e-02,  5.0873e-02],
          [ 9.7804e-01,  6.7927e-01, -6.4200e-02,  ...,  2.0482e-01,
           -3.4817e-02,  5.4986e-01],
          ...,
    

In [8]:
import torch
v0 = torch.randn((10, 256)).to(device)
v1 = torch.randn((10, 256)).to(device)
head((v0, v1))

tensor([[ 0.6482],
        [-4.0981],
        [ 1.4429],
        [-1.3412],
        [-0.9219],
        [ 1.2507],
        [ 0.7375],
        [-0.3058],
        [-1.0294],
        [ 1.0365]], device='cuda:0', grad_fn=<DivBackward0>)

In [4]:
img_a_batch.shape, img_b_batch.shape, label_batch.shape

(torch.Size([4, 3, 224, 224]), torch.Size([4, 3, 224, 224]), torch.Size([4]))

In [2]:
from drecg.training.ignite_finetune import define_model_for_tune
import mlflow
mlflow.set_tracking_uri("http://localhost:5000")
model_head_name='uform-vl-english'
feat_ext_name='UForm_V1'
define_model_for_tune(model_head_name, feat_ext_name)



Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

CompleteModelToTune(
  (head): DiffFeatureDetectorParamBiDirectional(
    (cls_layer): Sequential(
      (features_dropout): Dropout(p=0.20600446496280106, inplace=False)
      (linear_0): Linear(in_features=256, out_features=21, bias=True)
      (relu_0): ReLU()
      (hidden_dropout_0): Dropout(p=0.30559697271906344, inplace=False)
      (linear_1): Linear(in_features=21, out_features=21, bias=True)
      (relu_1): ReLU()
      (hidden_dropout_1): Dropout(p=0.30559697271906344, inplace=False)
      (linear_out): Linear(in_features=21, out_features=1, bias=True)
    )
  )
  (feature_extractor): UFormV1FeatureExtractor(
    (vit_model): VLM(
      (img_encoder): VisualEncoder(
        (encoder): VisionTransformer(
          (patch_embed): PatchEmbed(
            (proj): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
            (norm): Identity()
          )
          (pos_drop): Dropout(p=0.0, inplace=False)
          (norm_pre): Identity()
          (blocks): Sequential(
     

In [1]:
from drecg.feature_extraction.utils import VitLaionFeatureExtractor, VitFeatureExtractorComplete
ext = VitFeatureExtractorComplete()

`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.
2023-04-08 16:11:16.565159: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-04-08 16:11:16.734112: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
from drecg.data.utils import create_dataloader_train
train_dataloader = create_dataloader_train(transforms=ext.transforms)

In [3]:
(imgs_a, imgs_b), labels, paths = next(iter(train_dataloader))

In [4]:
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
ext.to(device);

In [5]:
imgs_a = imgs_a.to(device);

In [6]:
from transformers.modeling_outputs import BaseModelOutputWithPooling


In [8]:
out = ext.forward_single(imgs_a[:1], output_attentions=True, output_hidden_states=True, use_return_dict=True)
out.shape



torch.Size([1, 257, 1664])

In [9]:
from torch.nn import MultiheadAttention
att = MultiheadAttention(embed_dim=1664, num_heads=8, dropout=0.1, batch_first=True, device=device)

torch.Size([1, 3, 224, 224])

In [11]:
out_attn = att(out, out, out)

In [15]:
out_attn[0].shape

torch.Size([1, 257, 1664])

In [17]:
last_hidden_state = out.last_hidden_state
last_hidden_state[:, 0, :].shape, last_hidden_state.shape

(torch.Size([1, 1664]), torch.Size([1, 257, 1664]))

In [10]:
import transformers
transformers.models.clip.modeling_clip.CLIPModel()

In [9]:
ext.vit_model.visual_projection, ext.vit_model.config.output_attentions, ext.vit_model.config.output_hidden_states

(Linear(in_features=1664, out_features=1280, bias=False), False, False)

In [13]:
type(ext.vit_model.vision_model)

transformers.models.clip.modeling_clip.CLIPVisionTransformer

In [None]:
transformers.models.clip.modeling_clip.CLIPVisionTransformer

In [12]:
from transformers.models.clip.modeling_clip import CLIPVisionTransformer

CLIPVisionTransformer

(1, 2, 1)

In [1]:
import gc

# Create a large list
large_list = list(range(1000000))

# Explicitly free the memory occupied by the large list
del large_list
gc.collect()


278

In [2]:
import h5py
import torch

tensors = [torch.rand(5, 5) for _ in range(10)]  # replace this with your list of tensors
file_path = "tensors.hdf5"

with h5py.File(file_path, "w") as f:
    for i, tensor in enumerate(tensors):
        f.create_dataset(f"tensor_{i}", data=tensor.numpy())


In [3]:
file_to_read = "feat_extracted/laion_last_hidden/test_features.pt"
import torch
features = torch.load(file_to_read)

In [6]:
len(features)

8

In [11]:
len(features[0])

3

In [14]:
data = features[0]

In [19]:
(feature_tensors_a, feature_tensors_b), labels, (path_str_a, path_str_b) = data

In [22]:
print(feature_tensors_a.shape, feature_tensors_b.shape, labels.shape, len(path_str_a), len(path_str_b))

torch.Size([32, 257, 1664]) torch.Size([32, 257, 1664]) torch.Size([32]) 32 32


In [23]:
import h5py
import torch


file_path = "data.hdf5"

with h5py.File(file_path, "w") as f:
    for i, ((feature_tensors_a, feature_tensors_b), labels, (path_str_a, path_str_b)) in enumerate(features):
        grp = f.create_group(f"entry_{i}")
        grp.create_dataset("features_a", data=feature_tensors_a.numpy())
        grp.create_dataset("features_b", data=feature_tensors_b.numpy())
        grp.create_dataset("labels", data=labels)
        grp.attrs["paths_a"] = path_str_a
        grp.attrs["paths_b"] = path_str_b


In [42]:
from drecg.data.utils import convert_to_hdf5
convert_to_hdf5(source_files = ["feat_extracted/laion_last_hidden/test_features.pt"], dest_path = "test_features3.hdf5")

Processing items: 8 batches [00:00, 15.35 batches/s]


In [2]:
import h5py
file_path = "feat_extracted/laion_last_hidden_hdf5/train_features_augmented.hdf5"
f = h5py.File(file_path, "r")

In [8]:
f['entry_140']['features_a'][1]

array([[ -0.6300168 ,   0.8605088 ,  -1.5693604 , ...,  -1.4104166 ,
        -10.171236  ,  -1.9453514 ],
       [ -2.2364345 ,   1.7495991 ,   0.44537437, ...,   2.1975954 ,
          0.9050273 ,  -0.6109122 ],
       [  3.2277853 ,   1.5823239 ,  -1.3100352 , ...,   1.2139174 ,
         -2.297767  ,   0.7951103 ],
       ...,
       [ -2.1315188 ,  -1.0066409 ,  -2.001802  , ...,   0.5799053 ,
         -1.596676  ,  -0.6917656 ],
       [ -2.0988412 ,  -1.1044948 ,  -2.921171  , ...,   0.84670824,
         -7.2343183 ,  -6.1011157 ],
       [ -2.7840497 ,  -3.143204  ,  -2.949893  , ...,  -1.4124207 ,
         -4.018466  ,  -0.37366414]], dtype=float32)

In [27]:
entry = f["entry_0"]

In [31]:
len(entry["features_a"])

32

In [33]:
entry.attrs["paths_b"]

array(['/home/daniel/data_dogs/testing/same/1402b.png',
       '/home/daniel/data_dogs/testing/same/1403b.png',
       '/home/daniel/data_dogs/testing/same/1404b.png',
       '/home/daniel/data_dogs/testing/same/1408b.png',
       '/home/daniel/data_dogs/testing/same/1411b.png',
       '/home/daniel/data_dogs/testing/same/1413b.png',
       '/home/daniel/data_dogs/testing/same/1414b.png',
       '/home/daniel/data_dogs/testing/same/1420b.png',
       '/home/daniel/data_dogs/testing/same/1423b.png',
       '/home/daniel/data_dogs/testing/same/1426b.png',
       '/home/daniel/data_dogs/testing/same/1429b.png',
       '/home/daniel/data_dogs/testing/same/1430b.png',
       '/home/daniel/data_dogs/testing/same/150b.png',
       '/home/daniel/data_dogs/testing/same/151b.png',
       '/home/daniel/data_dogs/testing/same/153b.png',
       '/home/daniel/data_dogs/testing/same/156b.png',
       '/home/daniel/data_dogs/testing/same/157b.png',
       '/home/daniel/data_dogs/testing/same/158b.png'

In [45]:
import h5py
def load_data(file_path="data.hdf5"):
    with h5py.File(file_path, "r") as f:
        data_features = []
        for key in f.keys():
            entry = f[key]
            feature_tensors_a = torch.tensor(entry["features_a"][()])
            feature_tensors_b = torch.tensor(entry["features_b"][()])
            labels = torch.tensor(entry["labels"][()])
            path_str_a = entry.attrs["paths_a"]
            path_str_b = entry.attrs["paths_b"]
            data_features.append(((feature_tensors_a, feature_tensors_b), labels, (path_str_a, path_str_b)))

    return data_features

#### HDF5 Dataset for loading hidden state features

In [9]:
import h5py
import torch
file_to_read = "feat_extracted/laion_last_hidden_hdf5/train_features.hdf5"

hfd5_file = h5py.File(file_to_read, "r")

In [12]:
len([k for k in hfd5_file.keys()])

121

In [18]:
first_key = "entry_0"
last_key = f"entry_{len(hfd5_file.keys()) - 1}"
hfd5_file[first_key]["features_a"], hfd5_file[last_key]["features_a"]

(<HDF5 dataset "features_a": shape (32, 257, 1664), type "<f4">,
 <HDF5 dataset "features_a": shape (26, 257, 1664), type "<f4">)

In [23]:
hfd5_file[first_key]["features_a"].shape[0], hfd5_file[last_key]["features_a"].shape[0]

(32, 26)

In [24]:
def items_in_file(hf):
    num_keys = len(hf.keys())
    batch_size = hf["entry_0"]["features_a"].shape[0]
    last_batch_size = hf[f"entry_{num_keys - 1}"]["features_a"].shape[0]
    return num_keys * batch_size - (batch_size - last_batch_size)

items_in_file(hfd5_file)

3866

In [1]:
import torch

In [7]:
features = torch.rand((32, 256, 1668))

In [8]:
features.shape

torch.Size([32, 256, 1668])

In [11]:
linear = torch.nn.Linear(1668, 256)

x = linear(features)

In [12]:
x.shape

torch.Size([32, 256, 256])