In [1]:
import torch
from torch.utils.data import DataLoader
from torchvision import transforms

from src.models.dual_encoding.pretrained_clip import PreTrainedCLIP
from src.models.utils.data import FoodPricingDataset

Embedding a batch of images and texts with the `PreTrainedCLIP` class

In [2]:
# Identifier of the pretrained CLIP checkpoint handed to PreTrainedCLIP
# (presumably a Hugging Face Hub repo id — confirm against the wrapper).
pretrained_model_name_or_path = "clip-italian/clip-italian"

# Keyword arguments bundle passed as the first argument of PreTrainedCLIP.
model_kwargs = dict(
    pretrained_model_name_or_path=pretrained_model_name_or_path,
)

In [3]:
# Both feature-dimension arguments are explicitly passed as None.
img_feature_dim = None
txt_feature_dim = None

model = PreTrainedCLIP(
    model_kwargs,
    img_feature_dim=img_feature_dim,
    txt_feature_dim=txt_feature_dim,
)

# Print the embedding sizes exposed by the wrapped CLIP model, one per line.
for label, attr_name in (
    ("Text: ", "text_embed_dim"),
    ("Image: ", "vision_embed_dim"),
    ("Projection: ", "projection_dim"),
):
    print(label, getattr(model.clip, attr_name))

Text:  768
Image:  768
Projection:  512


In [4]:
# Read the preprocessing parameters from the model's own processor so the
# torchvision pipeline below resizes and normalises with the same values.
# NOTE(review): `size` is used as one scalar for both height and width; newer
# `transformers` releases may expose `crop_size` as a dict — confirm against
# the pinned version.
feature_extractor = model.processor.feature_extractor
size = feature_extractor.crop_size
mean = feature_extractor.image_mean
std = feature_extractor.image_std

# Image pipeline: resize to (size, size), convert to a tensor, then normalise.
img_transform = transforms.Compose(
    [
        transforms.Resize(size=(size, size)),
        transforms.ToTensor(),
        transforms.Normalize(mean=mean, std=std),
    ]
)


def txt_transform(x):
    """Identity text transform: return the input unchanged."""
    return x


training_data = FoodPricingDataset(
    img_transform=img_transform,
    txt_transform=txt_transform,
    split="train",
)

# Seed the shuffling so that `sample` is the same batch on every
# Restart & Run All instead of changing from run to run.
SEED = 42
dataloader = DataLoader(
    training_data,
    shuffle=True,
    batch_size=4,
    num_workers=8,
    generator=torch.Generator().manual_seed(SEED),
)

# Pull a single shuffled batch to feed through the model.
sample = next(iter(dataloader))

In [5]:
# Run the sampled batch through the model. The displayed result is a dict with
# an 'img' tensor and a 'txt' tensor, each with one row per item in the batch of 4.
model(sample)

{'img': tensor([[ 0.0538, -0.0227,  0.0455,  ...,  0.0760, -0.0372, -0.0016],
         [ 0.0208, -0.1054, -0.0081,  ...,  0.0906, -0.0299, -0.0068],
         [-0.0125, -0.0821,  0.0717,  ...,  0.0025, -0.0701,  0.0934],
         [ 0.0009, -0.0610, -0.0152,  ..., -0.0062, -0.0364, -0.0182]]),
 'txt': tensor([[-0.0066, -0.0041,  0.0068,  ...,  0.0665, -0.0019,  0.0276],
         [-0.0074, -0.0795, -0.0241,  ...,  0.1290,  0.0049,  0.0489],
         [-0.0330, -0.0561,  0.0298,  ...,  0.0980, -0.0224,  0.0113],
         [-0.0468, -0.0384, -0.0105,  ...,  0.0095,  0.0013,  0.0270]])}