In [1]:
import os

# Move the kernel into the project directory (presumably so the `config`,
# `data`, `metric` and `util` packages imported further down resolve).
# The hop is relative, so it is only valid from the notebook's own directory:
# skip it when the kernel is already inside `vlm_toolbox`, otherwise
# re-running this cell would try to hop again and raise FileNotFoundError.
if os.path.basename(os.getcwd()) != 'vlm_toolbox':
    os.chdir('../../vlm_toolbox/')

/home/alireza/vlm_toolbox


In [2]:
# Enable IPython's autoreload extension in mode 2, so edits to imported
# modules are picked up without restarting the kernel.
# NOTE(review): `%reload_ext` directly after `%load_ext` looks redundant.
%load_ext autoreload
%reload_ext autoreload
%autoreload 2

# Imports

In [7]:
import gc
import os
import warnings

import torch
from transformers import (
    AutoImageProcessor,
    AutoModel,
    Trainer,
    TrainingArguments,
)

from config.annotations import AnnotationsConfig
from config.enums import (
    CLIPBackbones,
    ImageDatasets,
    Modalities,
    ModelType,
    PrecisionDtypes,
    Setups,
    Sources,
    Stages,
    Trainers,
)
from config.image_datasets import ImageDatasetConfig
from config.setup import Setup
from config.train import TrainingArgumentsConfig
from data.data_access.image_factory import ImageHandlerFactory
from data.data_access.label_factory import LabelHandleFactory
from metric.accuracy import AccuracyMetricEvaluator
from util.path import mkdir_if_missing
from util.torch_helper import describe_model, set_module_trainable

In [None]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"
warnings.filterwarnings('ignore')

In [4]:
def flush():
    """Run Python garbage collection, then empty PyTorch's CUDA cache.

    Takes no arguments and returns None; called for its side effects only.
    """
    # Collect first so objects that have just become unreachable are gone
    # before the CUDA cache is emptied.
    gc.collect()
    torch.cuda.empty_cache()

# Config

### Embedding output directory

In [16]:
# Target directory for the embedded validation split; `mkdir_if_missing`
# presumably creates it when absent (going by the helper's name).
# NOTE(review): absolute, user-specific path — consider a configurable root.
# NOTE(review): `embed_dset_dir` is not referenced again in the visible cells.
embed_dset_dir = '/home/alireza/io/imagenet1k/embedding/dyno_v2_giant/huggingface/validation/'
mkdir_if_missing(embed_dset_dir)

### Training

In [17]:
# Experiment descriptor shared by the cells below.
# In the visible cells it is passed whole to `ImageDatasetConfig.get_config`
# and queried for dataset name, precision dtype, eval-precision flag and
# eval batch size.
# NOTE(review): backbone/trainer/source/model_type describe a CLIP zero-shot
# run, while the model loaded later is DINOv2 — confirm these fields are
# irrelevant for embedding extraction.
setup = Setup(
    setup_type=Setups.FULL,
    model_type=ModelType.ZERO_SHOT,
    dataset_name=ImageDatasets.IMAGENET_1K,
    backbone_name=CLIPBackbones.CLIP_VIT_B_16,
    trainer_name=Trainers.CLIP,
    source=Sources.OPEN_AI,
    metric_for_best_model=AccuracyMetricEvaluator.get_main_metric_name(),
    eval_split=Stages.EVAL,
    train_split=Stages.TRAIN,
    eval_batch_size=1024,
    n_shots=20,
    # label_column_name='family',
    precision_dtype=PrecisionDtypes.FP16,
)
# Bare last expression: displays the resolved configuration as the cell output.
setup

Setup: (setup_type='full', dataset_name='imagenet1k', backbone_name='clip_vit_b_16', source='open_ai', trainer_name='clip', model_type='zero_shot', annotations_key_value_criteria={}, precision_dtype='fp16', metric_for_best_model='top_1', n_shots=20, enable_novelty=False, train_batch_size=512, eval_batch_size=1024, preprocess_batch_size=1024, train_split='train', m1_m2_id_same_granularity_level=False, eval_split='validation', top_k=5, random_state=42, is_soft=False, train_full_precision=False, eval_full_precision=False)

# Labels Loading

In [18]:
# Look up the annotation config for the dataset chosen in `setup`.
annotations_config = AnnotationsConfig.get_config(
    dataset_name=setup.get_dataset_name(),
)
# Build the label handler and configure its prompts in a single chain.
label_handler = LabelHandleFactory.create_from_config(annotations_config).config_prompts()
# Adjacency matrix between class ids and label ids, as exposed by the handler.
class_id_label_id_adj_matrix = label_handler.get_class_id_label_id_adj_matrix()

### Image Dataset

In [19]:
# Dataset config for the raw (not yet preprocessed) images.
# NOTE(review): despite the `train_` prefix, this selects the 'validation' split.
train_image_dataset_config = ImageDatasetConfig.get_config(
    setup,
    split='validation',
    data_type='raw',
)

In [12]:
# Build the image handler for modality M1 on the 'validation' stage.
# The result of `.show()` is what gets stored, so this relies on `.show()`
# returning the handler itself (`get_dataset()` is called on it later).
train_image_dataset_handler = ImageHandlerFactory.create_from_config(
    Modalities.M1,
    'validation',
    train_image_dataset_config,
).show()

Dataset({
    features: ['image', 'class_id'],
    num_rows: 50000
})

Identifier column name: class_id
Value column names: ['image']

Modality(modality_type='image', identifier='class_id', raw_keys=['image'], preprocessed_keys={'input': ['image'], 'output': ['pixel_values']}, key='m1', stage='validation', embedding_key={'input': ['pixel_values'], 'output': ['image_embeds']}, status='raw', requires_grad=true, requires_preprocess=true, perform_augmentation=false)


In [13]:
class ModelWrapper(torch.nn.Module):
    """Adapter that exposes only element 1 of the wrapped model's output.

    All positional and keyword arguments are forwarded unchanged.
    NOTE(review): for the DINOv2 model wrapped below, index 1 is presumably
    the pooled output — confirm.
    """

    def __init__(self, model):
        """Store `model` as a registered submodule under `self.model`."""
        super().__init__()
        self.model = model

    def forward(self, *args, **kwargs):
        """Call the wrapped model and return item `[1]` of what it returns."""
        return self.model(*args, **kwargs)[1]
    
# Single source of truth for the checkpoint used by processor and model.
checkpoint = 'facebook/dinov2-giant'

processor = AutoImageProcessor.from_pretrained(checkpoint)

# Load the backbone, switch it to eval mode and move it onto the GPU.
model = AutoModel.from_pretrained(checkpoint)
model = model.eval().to(torch.device('cuda'))

# Freeze every parameter, then print the trainable/frozen breakdown.
set_module_trainable(model, set_trainable=False)
describe_model(model)

# Wrap so a forward pass yields only element 1 of the model output.
wrapped_model = ModelWrapper(model)

Total model parameters: 1136480768
🔥 Trainable model parameters: 0 (0.00%)
🧊 Frozen model parameters: 1136480768 (100.00%)

Attribute Name: embeddings, Type: Dinov2Embeddings, Param Share: 0.27%:
-    Attribute Name: patch_embeddings, Type: Dinov2PatchEmbeddings, Param Share: 30.04%:
--       🔥 Trainable: 0.00%, 🧊 Frozen: 100.00%
-    Attribute Name: dropout, Type: Dropout, Param Share: 0.00%:
--       🔥 Trainable: 0.00%, 🧊 Frozen: 0.00%

Attribute Name: encoder, Type: Dinov2Encoder, Param Share: 99.73%:
-    Attribute Name: layer, Type: ModuleList, Param Share: 100.00%:
--       🔥 Trainable: 0.00%, 🧊 Frozen: 100.00%

Attribute Name: layernorm, Type: LayerNorm, Param Share: 0.00%:

Device: cuda:0
Model size: 4335.330MB


In [21]:
# Pull the underlying dataset out of the handler.
dataset = train_image_dataset_handler.get_dataset()
# Run the image processor over batches of 1024 examples and drop the raw
# 'image' column; the processor output replaces it (a 'pixel_values' column
# is read from the result later).
dataset = dataset.map(
    lambda batch: processor(images=batch['image'], return_tensors='pt'),
    batched=True, batch_size=1024, remove_columns=['image'],
)
# Have the dataset return torch tensors when indexed.
dataset.set_format(type="torch")
# Release memory left over from preprocessing.
flush()

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [22]:
# Start from the project's eval-only training-argument defaults, using the
# precision settings from `setup`.
default_args = TrainingArgumentsConfig.get_config(
    setup_type=Setups.EVAL_ONLY,
    precision_dtype=setup.get_precision_dtype(),
    eval_full_precision=setup.get_eval_full_precision(),
)
# Only prediction is run with this trainer, so evaluation is switched off.
default_args['do_eval'] = False
default_args['do_predict'] = True

# NOTE(review): this raises TypeError if `default_args` already contains
# 'per_device_eval_batch_size' — confirm the config never sets it.
trainer_args = TrainingArguments(
    per_device_eval_batch_size=setup.get_eval_batch_size(),
    **default_args,
)
# The Trainer is used purely as a batched inference loop: no metrics.
trainer = Trainer(
    model=wrapped_model,
    args=trainer_args,
    compute_metrics=None,
)

In [None]:
# Run the wrapped model over the preprocessed images only.
embeds = trainer.predict(dataset.select_columns(['pixel_values']))
# `Trainer.predict` returns a PredictionOutput namedtuple, which has no
# `.tolist()`; the stacked model outputs are in its `.predictions` field.
# Swap the bulky 'pixel_values' column for the resulting embeddings.
dataset = (
    dataset
    .remove_columns(['pixel_values'])
    .add_column(name='image_embeds', column=embeds.predictions.tolist())
)
# Format 'image_embeds' as torch tensors.
# NOTE(review): restricting `columns` presumably hides the other columns
# (e.g. 'class_id') from indexed output — confirm that is intended.
dataset.set_format(type='torch', columns=['image_embeds'])