# Calculating visual embeddings

## Group:
- Federico Natho
- Felipe Concha
- Francisco Madariaga

---
---

## Vision encoders implemented:
- Masked AutoEncoder (MAE)
- Contrastive Language-Image Pretraining (CLIP)
- Vision-Transformer (ViT)

---
---



In [1]:
# Mount Google Drive at /content/drive so files stored there are reachable
# from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Work from the image folder: later cells list this directory and open the
# images by bare file name.
%cd /content/drive/MyDrive/RecSysCuratorNet/images/

/content/drive/.shortcut-targets-by-id/1YiKr4FP7crAvtELiLpVx2qQNaFKXB2Y0/RecSysCuratorNet/images


In [5]:
import torch
import os 

# Run on the GPU when one is available; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# The working directory also receives non-image artifacts written by this
# notebook (the cloned `mae` repository, the downloaded checkpoint, the saved
# embedding files), so keep only regular files with an image extension.
# Extend IMAGE_EXTENSIONS if the dataset uses other formats.
# Sorting gives a reproducible order, which matters because the MAE embeddings
# are stored without file names and can only be matched to images by position.
IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.bmp', '.gif', '.tif', '.tiff', '.webp')
images = sorted(
    name for name in os.listdir()
    if os.path.isfile(name) and name.lower().endswith(IMAGE_EXTENSIONS)
)

# Masked AutoEncoders (MAE) Embeddings

Reference: https://github.com/facebookresearch/mae

In [21]:
import os
import pickle
import sys

import matplotlib.pyplot as plt
import numpy as np
import requests
import torch
from PIL import Image
from tqdm import tqdm

# check whether run in Colab
if 'google.colab' in sys.modules:
    print('Running in Colab.')
    # Install the pinned timm release and clone the MAE reference code into the
    # current working directory, then make the clone importable.
    !pip3 install timm==0.4.5  # 0.3.2 does not work in Colab
    !git clone https://github.com/facebookresearch/mae.git
    sys.path.append('./mae')
else:
    # Outside Colab, models_mae is expected to be importable from the parent directory.
    sys.path.append('..')
# Imported only after sys.path has been extended above.
import models_mae

Running in Colab.
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting timm==0.4.5
  Downloading timm-0.4.5-py3-none-any.whl (287 kB)
[K     |████████████████████████████████| 287 kB 7.5 MB/s 
Installing collected packages: timm
Successfully installed timm-0.4.5
Cloning into 'mae'...
remote: Enumerating objects: 39, done.[K
remote: Total 39 (delta 0), reused 0 (delta 0), pack-reused 39[K
Unpacking objects: 100% (39/39), done.


In [22]:
## Functions retrieved from MAE documentation
# Define the utils

# Per-channel mean and standard deviation: subtracted/divided when images are
# normalized before encoding, and applied in reverse by show_image for display.
imagenet_mean = np.array([0.485, 0.456, 0.406])
imagenet_std = np.array([0.229, 0.224, 0.225])

def show_image(image, title=''):
    """Draw a normalized image on the current pyplot axes.

    Args:
        image: tensor laid out as [H, W, 3] holding values normalized with
            `imagenet_mean` / `imagenet_std`.
        title: text shown above the image.
    """
    assert image.shape[2] == 3
    # Undo the normalization and rescale to the 0-255 range before clipping.
    denormalized = (image * imagenet_std + imagenet_mean) * 255
    pixels = torch.clip(denormalized, 0, 255).int()
    plt.imshow(pixels)
    plt.title(title, fontsize=16)
    plt.axis('off')

def prepare_model(chkpt_dir, arch='mae_vit_large_patch16'):
    """Instantiate an MAE architecture and load checkpoint weights into it.

    Args:
        chkpt_dir: path handed to `torch.load`; the loaded object must expose
            the weights under the 'model' key.
        arch: name of the constructor to look up on `models_mae`.

    Returns:
        The model with the checkpoint weights loaded. Tensors are mapped to the
        CPU while loading; the caller decides where to move the model.
    """
    build = getattr(models_mae, arch)
    model = build()
    state = torch.load(chkpt_dir, map_location='cpu')
    # strict=False: key mismatches are reported in the printed result instead
    # of raising.
    load_report = model.load_state_dict(state['model'], strict=False)
    print(load_report)
    return model

def run_one_image(img, model):
    """Run MAE on one image with 75% masking and plot the result.

    Shows four panels side by side: the original, the masked input, the
    reconstruction, and the reconstruction pasted over the visible patches.

    Args:
        img: array-like laid out as [H, W, C] (already normalized).
        model: MAE model exposing `unpatchify` and `patch_embed.patch_size`.
    """
    # Add a batch axis and switch to channels-first for the model.
    batch = torch.tensor(img).unsqueeze(dim=0)
    batch = torch.einsum('nhwc->nchw', batch)

    # Forward pass; the loss is not needed for visualization.
    _, prediction, mask = model(batch.float(), mask_ratio=0.75)
    reconstruction = torch.einsum('nchw->nhwc', model.unpatchify(prediction)).detach().cpu()

    # Expand the per-patch mask to one value per pixel channel: (N, H*W, p*p*3).
    values_per_patch = model.patch_embed.patch_size[0]**2 * 3
    pixel_mask = mask.detach().unsqueeze(-1).repeat(1, 1, values_per_patch)
    # After unpatchify: 1 is removing, 0 is keeping.
    pixel_mask = torch.einsum('nchw->nhwc', model.unpatchify(pixel_mask)).detach().cpu()

    original = torch.einsum('nchw->nhwc', batch)

    # Original image with the masked patches zeroed out.
    visible_only = original * (1 - pixel_mask)

    # Reconstruction filled in only where patches were masked.
    pasted = visible_only + reconstruction * pixel_mask

    # make the plt figure larger
    plt.rcParams['figure.figsize'] = [24, 24]

    panels = [
        (original, "original"),
        (visible_only, "masked"),
        (reconstruction, "reconstruction"),
        (pasted, "reconstruction + visible"),
    ]
    for position, (panel, caption) in enumerate(panels, start=1):
        plt.subplot(1, 4, position)
        show_image(panel[0], caption)

    plt.show()

In [52]:
# download checkpoint if not exist
# (wget -nc skips the download when the file is already present)
!wget -nc https://dl.fbaipublicfiles.com/mae/visualize/mae_visualize_vit_large.pth

chkpt_dir = 'mae_visualize_vit_large.pth'
# Build the large MAE architecture, load the checkpoint weights into it, and
# move the model to `device`.
model_mae = prepare_model(chkpt_dir, 'mae_vit_large_patch16').to(device)
print('Model loaded.')


File ‘mae_visualize_vit_large.pth’ already there; not retrieving.

<All keys matched successfully>
Model loaded.


In [47]:
# Collects one mean-pooled MAE embedding per image.
mae_embeddings = []

In [55]:
# Start from an empty list so that re-running this cell does not append a
# second copy of every embedding.
mae_embeddings = []

for image in tqdm(images, desc='Calculating MAE embeddings'):
  path_to_image = image

  # Convert to RGB so that every image has exactly 3 channels, resize to the
  # 224x224 input size, and release the file handle as soon as the pixels
  # have been read.
  with Image.open(path_to_image) as img_file:
    img = img_file.convert('RGB').resize((224, 224))
  img = np.array(img) / 255.

  assert img.shape == (224, 224, 3)

  # normalize by ImageNet mean and std
  img = (img - imagenet_mean) / imagenet_std

  ## Formatting the image as a batched, channels-first tensor for forward_encoder().
  img_processed = torch.tensor(img).unsqueeze(dim=0)
  img_processed = torch.einsum('nhwc->nchw', img_processed).to(device)

  ## forward_encoder -> returns x, mask, ids_restore
  ## The masking ratio is 0 so that only the encoder forward pass is performed.
  ## no_grad: this is pure inference, so no autograd graph should be kept
  ## (matches the CLIP and ViT loops).
  with torch.no_grad():
    mae_representation = model_mae.forward_encoder(img_processed.float(), mask_ratio=0)

  # Take the single batch item and average over its first axis to obtain one
  # vector per image.
  embeddings = mae_representation[0][0].cpu().numpy()
  mae_embeddings.append(np.mean(embeddings, axis=0))



Saving the embeddings as .npy file

In [None]:
# Write the collected MAE embeddings to a .npy file in the working directory.
np.save('mae_embeddings_list.npy', mae_embeddings)

Saving the embeddings as a pickle file

In [None]:
# Serialize the MAE embeddings with pickle. The context manager guarantees the
# file is closed even if pickle.dump raises.
with open('mae_embeddings_pickle', 'wb') as file:
    pickle.dump(mae_embeddings, file)


# CLIP Embeddings

References: https://github.com/openai/CLIP

In [None]:
# Install the CLIP package directly from its GitHub repository (no version pin).
! pip install git+https://github.com/openai/CLIP.git

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-we5st4w3
  Running command git clone -q https://github.com/openai/CLIP.git /tmp/pip-req-build-we5st4w3
Collecting ftfy
  Downloading ftfy-6.1.1-py3-none-any.whl (53 kB)
[K     |████████████████████████████████| 53 kB 1.7 MB/s 
Building wheels for collected packages: clip
  Building wheel for clip (setup.py) ... [?25l[?25hdone
  Created wheel for clip: filename=clip-1.0-py3-none-any.whl size=1369408 sha256=b998acfc465bc879d1ed332da8fd6e84ac7cf85a79490fe5aee10be32ee7903b
  Stored in directory: /tmp/pip-ephem-wheel-cache-1jl3pcpb/wheels/ab/4f/3a/5e51521b55997aa6f0690e095c08824219753128ce8d9969a3
Successfully built clip
Installing collected packages: ftfy, clip
Successfully installed clip-1.0 ftfy-6.1.1


In [None]:
import os
import sys
from tqdm import tqdm
import numpy as np
import torch
from PIL import Image
import torch
import clip
import pickle

In [None]:
# Load the CLIP "RN50x64" model on `device`, together with the preprocessing
# callable that is applied to each PIL image before encoding.
model, preprocess = clip.load("RN50x64", device=device)
# Collects one [file name, embedding] pair per image.
embeddings_clip = list()

100%|█████████████████████████████████████| 1.26G/1.26G [00:42<00:00, 31.7MiB/s]


In [None]:
for image in tqdm(images, desc='Encoding images'):
  ## Open the image, run CLIP's preprocessing, and add a batch axis.
  pil_image = Image.open(image)
  clip_input = preprocess(pil_image).unsqueeze(0).to(device)

  ## Encode without tracking gradients (inference only).
  with torch.no_grad():
      clip_features = model.encode_image(clip_input)

  ## Store the file name next to its feature vector as a plain Python list.
  feature_vector = clip_features.cpu().tolist()[0]
  embeddings_clip.append([image, feature_vector])

Saving the embeddings as .npy file

In [None]:
# Each entry is a [file name, feature list] pair, i.e. a ragged structure.
# Build the object array explicitly: relying on np.save's implicit conversion
# emits a ragged-sequence deprecation warning on older NumPy and raises
# ValueError on NumPy >= 1.24. The file must be read back with
# np.load(..., allow_pickle=True).
with open('clip_embeddings_list.npy', 'wb') as f:
    np.save(f, np.array(embeddings_clip, dtype=object))

Saving the embeddings as a pickle file

In [None]:
# Serialize the CLIP [file name, embedding] pairs with pickle. The context
# manager guarantees the file is closed even if pickle.dump raises.
with open('clip_embeddings_pickle', 'wb') as file:
    pickle.dump(embeddings_clip, file)


# ViT Embeddings

In [None]:
# Install Hugging Face transformers (no version pin); it provides the ViT
# model and feature extractor imported in the next cell.
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 7.9 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 61.4 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 79.1 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1


In [None]:
from transformers import ViTConfig, ViTModel, ViTFeatureExtractor
import torch
import numpy as np
from tqdm import tqdm
from PIL import Image
import os
import torch

# Load the pretrained ViT backbone and move it to `device`.
# NOTE: this rebinds `model`, which referred to the CLIP model in the cells above.
model = ViTModel.from_pretrained("google/vit-base-patch16-224-in21k").to(device)


In [None]:
# Feature extractor built with its default settings; turns a PIL image into
# model-ready tensors.
featureExtractor = ViTFeatureExtractor()
# Collects one [file name, embedding] pair per image.
embedding_vit = []

for image in tqdm(images):
    # Force 3 channels before feature extraction.
    rgb_image = Image.open(image).convert('RGB')
    model_inputs = featureExtractor(rgb_image, return_tensors="pt").to(device)

    # Inference only: no gradients needed.
    with torch.no_grad():
        vit_outputs = model(**model_inputs)

    token_states = vit_outputs.last_hidden_state.cpu().numpy()

    # Drop the batch axis, then average over the remaining leading axis to get
    # a single vector per image.
    per_token = np.squeeze(token_states, axis=0)
    mean_embedding = np.mean(per_token, axis=0)
    embedding_vit.append([image, mean_embedding])

    # Release the per-image inputs before the next iteration.
    del rgb_image, model_inputs

100%|██████████| 13297/13297 [1:03:40<00:00,  3.48it/s]
  arr = np.asanyarray(arr)


Saving the embeddings as .npy file

In [None]:
# Each entry is a [file name, embedding array] pair, i.e. a ragged structure.
# Build the object array explicitly: the implicit conversion inside np.save
# emits a ragged-sequence deprecation warning on older NumPy and raises
# ValueError on NumPy >= 1.24. Read back with np.load(..., allow_pickle=True).
np.save(
    '/content/drive/MyDrive/embedding/embeddings_vit_2.npy',
    np.array(embedding_vit, dtype=object),
    allow_pickle=True,
)

  arr = np.asanyarray(arr)


Saving the embeddings as pickle file

In [None]:
# Write the ViT [file name, embedding] pairs to disk. The original cell opened
# the file for reading and only loaded it, so the embeddings were never saved.
with open("embedding_vit_pickle", "wb") as filehandler:
    pickle.dump(embedding_vit, filehandler)

# Read the file back into `a` (the name this cell has always bound) as a
# round-trip check. Only unpickle files you trust: pickle.load can execute
# arbitrary code.
with open("embedding_vit_pickle", "rb") as filehandler:
    a = pickle.load(filehandler)