In [None]:
import os
import clip
import torch
import torchvision
import numpy as np
import h5py
from PIL import Image

In [None]:
# Use the GPU when one is available and fall back to the CPU otherwise, so
# this cell does not crash on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the CLIP ViT-B/16 model together with its matching image preprocessing
# transform (non-JIT variant).
model, preprocess = clip.load('ViT-B/16', device, jit=False)
# Keep the model on the selected device and switch it to inference mode.
model.to(device).eval()

# Model attributes read only for the summary printed below.
input_resolution = model.visual.input_resolution
context_length = model.context_length
vocab_size = model.vocab_size

# Total number of scalar parameters, printed with thousands separators.
num_parameters = sum(p.numel() for p in model.parameters())
print("Model parameters:", f"{num_parameters:,}")
print("Input resolution:", input_resolution)
print("Context length:", context_length)
print("Vocab size:", vocab_size)
print("Preprocess:",preprocess)

In [None]:
def write_hdf5(outfile, arr_dict):
    """
    Write a group of arrays to an hdf5 file, one dataset per dict entry.

    The file is opened in 'w' mode: it is created if it does not exist and
    truncated (previous contents discarded) if it does.

    torch.Tensor values are explicitly detached, moved to the CPU and
    converted to numpy arrays before writing, so tensors that live on the GPU
    or that require grad can be passed directly. All other values are handed
    to h5py unchanged.

    Args:
      outfile: str, name of hdf5 file
      arr_dict: dict, group of data-to-write; each key is converted with
        str() and used as the dataset name, each value is the dataset content
    """
    with h5py.File(outfile, 'w') as f:
        for key, value in arr_dict.items():
            if torch.is_tensor(value):
                # Do not rely on implicit array coercion: it fails for CUDA
                # tensors and for tensors that track gradients.
                value = value.detach().cpu().numpy()
            f.create_dataset(str(key), data=value)

In [None]:
# Folder holding the FFHQ images (placeholder path; the loop below expects
# files named 00000.png, 00001.png, ... inside it).
FFHQ_dir = 'path/to/FFHQ/images1024x1024'
print(FFHQ_dir)

# The images are encoded in NUM_BATCHES batches of BATCH_SIZE images each.
NUM_BATCHES = 1000
BATCH_SIZE = 70
NUM_IMAGES = NUM_BATCHES * BATCH_SIZE  # 70000

FFHQ_features = []
for j in range(NUM_BATCHES):
    # Report progress every 100 batches.
    if j % 100 == 0:
        print(j * BATCH_SIZE, '/', NUM_IMAGES)

    images_list = []
    for k in range(BATCH_SIZE):
        # File names are the zero-padded 5-digit global image index.
        FFHQ_img_dir = os.path.join(FFHQ_dir, str(j * BATCH_SIZE + k).zfill(5) + '.png')
        # The context manager closes the image file as soon as it has been
        # converted and preprocessed.
        with Image.open(FFHQ_img_dir) as image:
            images_list.append(preprocess(image.convert("RGB")))

    # Stack the preprocessed tensors directly (no numpy round trip) and move
    # the batch to the device the model was loaded on.
    images_tensor = torch.stack(images_list).to(device)
    with torch.no_grad():
        # Cast to float32 and move each batch of features to the CPU right
        # away so they do not accumulate in GPU memory.
        FFHQ_features.append(model.encode_image(images_tensor).float().cpu())

# Concatenate the per-batch features along the batch dimension.
FFHQ_features_cat = torch.cat(FFHQ_features, dim=0)
print(FFHQ_features_cat.shape)

# Store the features under the dataset name 'image' in FFHQ.hdf5.
FFHQ_dic = {'image': FFHQ_features_cat}
write_hdf5('FFHQ' + '.hdf5', FFHQ_dic)