In [None]:
import clip
import torch
import os
from PIL import Image

image_path = "/home/eduardo/Downloads/rgbd_dataset_freiburg1_desk2/rgb/"

# Collect every .png/.jpg file in the folder. os.listdir() returns entries in
# arbitrary order, so sort the result to make `images[0]` reproducible.
images = sorted(
    os.path.join(image_path, f)
    for f in os.listdir(image_path)
    if f.endswith(('.png', '.jpg'))
)
print("Found images:", len(images))
if not images:
    # Fail with a descriptive message instead of an IndexError on images[0].
    raise FileNotFoundError(f"No .png or .jpg images found in {image_path}")

sample_image_path = images[0]
print("Image path:", sample_image_path)
sample_image = Image.open(sample_image_path).convert("RGB")
# Note: PIL's `.size` is (width, height), not (height, width).
print("Sample image shape:", sample_image.size)

# Prefer the GPU when torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Using device:", device)

model, preprocess = clip.load("ViT-L/14@336px", device=device)

# Run the model's preprocessing transform, then add a leading batch axis.
image = preprocess(sample_image)[None].to(device)
print("Image shape after preprocessing:", image.shape)

with torch.no_grad():
    # Default call to encode_image on the single-image batch.
    image_features = model.encode_image(image)
    print("Image features shape:", image_features.shape)
    # Dense (per-token) features.
    # NOTE(review): `dense_features` is not an argument of the upstream
    # openai/CLIP `encode_image`; presumably a patched build is installed — confirm.
    feats = model.encode_image(image, dense_features=True)
    print("Dense features shape:", feats.shape)
    # Drop index 0 along dim 1, i.e. the CLS token.
    feats = feats[:, 1:]

# ViT-L/14@336px takes a 336x336 input and splits it into 14x14-pixel patches,
# so the dense output is a (336 / 14) = 24 x 24 grid of patch tokens.
# Both numbers are derived from the tensors rather than hard-coded.
num_patches = feats.shape[1]
grid_size = int(round(num_patches ** 0.5))  # patches per side
if grid_size * grid_size != num_patches:
    raise ValueError(f"Expected a square patch grid, got {num_patches} patch tokens")
patch_size = image.shape[-1] // grid_size  # pixels per patch side
print("Patch size:", patch_size)
print("Patches per side:", grid_size)
# (N, H*W, C) -> (N, H, W, C) -> (N, C, H, W), the layout interpolate() expects
patch_embeddings = feats.reshape(feats.shape[0], grid_size, grid_size, -1).permute(0, 3, 1, 2)
print("Patch embeddings shape:", patch_embeddings.shape)
# Upsample the patch embeddings to the spatial size of the preprocessed image
upsampled_embeddings = torch.nn.functional.interpolate(
    patch_embeddings,
    size=(image.shape[2], image.shape[3]),
    mode='bilinear',
    align_corners=False
)
print("Upsampled embeddings shape:", upsampled_embeddings.shape)

# Let's look at the first 3 principal components of the upsampled embeddings.
# PCA reduces the channel dimension to 3 so each pixel can be shown as a colour.
from sklearn.decomposition import PCA
import numpy as np
upsampled_embeddings_np = upsampled_embeddings.squeeze(0).cpu().numpy()
print("Upsampled embeddings shape (numpy):", upsampled_embeddings_np.shape) # (C, H, W)
# Read the dimensions from the array instead of hard-coding 768 / 336.
num_channels, height, width = upsampled_embeddings_np.shape
# random_state pins the result when scikit-learn selects a randomized SVD solver.
pca = PCA(n_components=3, random_state=0)
# PCA expects (n_samples, n_features): one row per pixel, one column per channel.
pca_embeddings = pca.fit_transform(upsampled_embeddings_np.reshape(num_channels, -1).T).T
print("PCA embeddings shape:", pca_embeddings.shape) # (3, H * W)
pca_embeddings = pca_embeddings.reshape(3, height, width)
print("PCA embeddings shape after reshape:", pca_embeddings.shape) # (3, H, W)




In [None]:
# Let's display the PCA embeddings as RGB channels next to the input image
import matplotlib.pyplot as plt
fig, ax = plt.subplots(1, 2, figsize=(15, 5))
input_image = image.squeeze(0).cpu().permute(1, 2, 0).numpy()  # (H, W, 3)
# Undo CLIP's per-channel normalisation (x - mean) / std to recover RGB in [0, 1].
# A single global min/max rescale would shift colours, because each channel was
# normalised with a different mean and std.
clip_mean = np.array([0.48145466, 0.4578275, 0.40821073], dtype=input_image.dtype)
clip_std = np.array([0.26862954, 0.26130258, 0.27577711], dtype=input_image.dtype)
input_image = np.clip(input_image * clip_std + clip_mean, 0.0, 1.0)
input_image = (input_image * 255).astype(np.uint8)  # Scale to [0, 255] and convert to uint8
ax[0].imshow(input_image)
ax[0].set_title("Original Image")
ax[0].axis('off')
# Min-max normalise the three components jointly to [0, 1] for imshow.
pca_min = pca_embeddings.min()
pca_max = pca_embeddings.max()
pca_image = (pca_embeddings - pca_min) / (pca_max - pca_min)
pca_image = pca_image.transpose(1, 2, 0)  # (3, H, W) -> (H, W, 3)
ax[1].imshow(pca_image)
ax[1].set_title("PCA Embeddings")
ax[1].axis('off')
plt.tight_layout()
plt.show()

In [None]:
# Test torch Conv2d parameters and the output-shape arithmetic
input_dim = 3 # RGB image
output_dim = 128 # Example output dimension
kernel_size = 3 # Example kernel size - a common choice for conv layers
stride = 1 # Stride of 1 keeps the spatial resolution
padding = 1 # With kernel_size=3 and stride=1 this gives a 'same' convolution
conv_layer = torch.nn.Conv2d(input_dim, output_dim, kernel_size, stride, padding).to(device)
max_pool = torch.nn.MaxPool2d(kernel_size=2, stride=2, padding=0).to(device)
print("Conv2d layer:", conv_layer)

# Dimension maths
# Input: (N, C_in, H_in, W_in) -> Output: (N, C_out, H_out, W_out)
# Where:
# N = batch size
# C_in = input channels (e.g., 3 for RGB)
# C_out = output channels (here output_dim = 128)
# H_in = input height
# W_in = input width
# H_out = (H_in + 2*padding - kernel_size) // stride + 1
# W_out = (W_in + 2*padding - kernel_size) // stride + 1
h_out = (image.shape[2] + 2 * padding - kernel_size) // stride + 1
w_out = (image.shape[3] + 2 * padding - kernel_size) // stride + 1
print("Output dimensions: H_out =", h_out, ", W_out =", w_out)
# This is a shape check only, so skip autograd bookkeeping for the forward pass.
with torch.no_grad():
    feature_map = conv_layer(image)
    reduced = max_pool(feature_map)
print("Feature map shape after conv2d:", feature_map.shape)
print("Reduced feature map shape after max pooling:", reduced.shape)
# The hand-computed dimensions must agree with what the layer actually produced.
assert tuple(feature_map.shape[-2:]) == (h_out, w_out), "conv output size formula mismatch"
# How many parameters in the conv layer?
# Each filter has (input_dim * kernel_size * kernel_size) weights + 1 bias,
# and there are output_dim filters, so:
num_params = (input_dim * kernel_size * kernel_size + 1) * output_dim
print("Number of parameters in conv layer:", num_params)
total_params = sum(p.numel() for p in conv_layer.parameters())
print("Total parameters in conv layer:", total_params)
assert num_params == total_params, "parameter count formula mismatch"