In [13]:
import torch
import open_clip
import pandas as pd
from pathlib import Path

In [14]:
# Folder that receives the saved tensors. It is created on demand (including
# any missing parents) and re-running the cell is harmless if it already exists.
output_dir = Path('./data/')
output_dir.mkdir(exist_ok=True, parents=True)

In [15]:
# Force to the last GPU (Adam - for now); fall back to the CPU when CUDA is
# unavailable. The ordinal is derived from the number of visible devices
# rather than hard-coded, so a host with fewer than four GPUs does not end up
# with an invalid device string.
if torch.cuda.is_available():
    DEVICE = f'cuda:{torch.cuda.device_count() - 1}'
else:
    DEVICE = 'cpu'
print(f'Using {DEVICE}')

Using cuda:3


In [16]:
# Architecture name and pretrained-weights tag, named once so the model and
# its tokenizer cannot drift apart.
MODEL_NAME = 'ViT-H-14'
PRETRAINED_TAG = 'laion2b_s32b_b79k'

# Build the CLIP model directly on DEVICE. The second return value (the
# training-time image transform) is not needed here and is discarded.
model, _, preprocess = open_clip.create_model_and_transforms(
    MODEL_NAME, pretrained=PRETRAINED_TAG, device=DEVICE
)
# This notebook only runs inference, so switch the module to eval mode; the
# open_clip usage example does the same before encoding.
model.eval()
tokenizer = open_clip.get_tokenizer(MODEL_NAME)

In [17]:
# Read just the TEXT column -- the only one this cell uses -- instead of
# materialising every column of the parquet file, then expose the captions
# as a NumPy array.
text = pd.read_parquet('mscoco.parquet', columns=['TEXT']).TEXT.to_numpy()

In [18]:
# Eyeball the first ten captions as a sanity check on the load.
head_captions = text[:10]
print(head_captions)

['A man with a red helmet on a small moped on a dirt road. '
 'Man riding a motor bike on a dirt road on the countryside.'
 'A man riding on the back of a motorcycle.'
 'A dirt path with a young person on a motor bike rests to the foreground of a verdant area with a bridge and a background of cloud-wreathed mountains. '
 'A man in a red shirt and a red hat is on a motorcycle on a hill side.'
 'A woman wearing a net on her head cutting a cake. '
 'A woman cutting a large white sheet cake.'
 'A woman wearing a hair net cutting a large sheet cake.'
 'there is a woman that is cutting a white cake'
 "A woman marking a cake with the back of a chef's knife. "]


In [19]:
# Smoke test: push a single caption through the tokenizer and the text
# encoder to confirm the token and feature dimensions before batching.
test_toks = tokenizer(text[0]).to(DEVICE)
for item in (test_toks.shape, test_toks):
    print(item)

with torch.no_grad():
    with torch.cuda.amp.autocast():
        features = model.encode_text(test_toks)
        # L2-normalise each row in place, so `features` keeps the dtype
        # returned by the encoder.
        features.div_(features.norm(dim=-1, keepdim=True))

for item in (features.shape, features):
    print(item)

torch.Size([1, 77])
tensor([[49406,   320,   786,   593,   320,   736, 11122,   525,   320,  2442,
           617,  2966,   525,   320, 11795,  1759,   269, 49407,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0]], device='cuda:3')
torch.Size([1, 1024])
tensor([[ 0.0338,  0.0446, -0.0328,  ..., -0.0190,  0.0113, -0.0351]],
       device='cuda:3', dtype=torch.float16)


In [20]:
# Encode every caption in fixed-size batches and save both the token ids and
# the L2-normalised text features.
#
# The single-caption check printed tokens of shape [1, 77] and features of
# shape [1, 1024], so the full outputs are n x 77 and n x 1024. Both buffers
# are allocated up front and each batch is written into its own row range.
import math
from tqdm import tqdm

BATCH_SIZE = 1024
CONTEXT_LENGTH = 77   # token ids per caption, per the single-caption check
EMBED_DIM = 1024      # text feature width, per the single-caption check

num_texts = text.shape[0]
num_batches = math.ceil(num_texts / BATCH_SIZE)

# Preallocate output tensors on the CPU. Token ids are integers, so they get
# an integer buffer instead of the default float32 one (which would silently
# store the ids as floats in the saved file).
out_toks = torch.zeros(num_texts, CONTEXT_LENGTH, dtype=torch.long)
out_feats = torch.zeros(num_texts, EMBED_DIM)

for bn in tqdm(range(num_batches)):
    # Row range covered by this batch; the final batch may be short.
    start = BATCH_SIZE * bn
    stop = min(start + BATCH_SIZE, num_texts)

    # Tokenize, and store the ids before moving them to DEVICE so they are
    # not copied to the accelerator and straight back again.
    cpu_tokens = tokenizer(text[start:stop])
    out_toks[start:stop] = cpu_tokens
    tokens = cpu_tokens.to(DEVICE)

    # Encode text without tracking gradients, under CUDA autocast.
    with torch.no_grad(), torch.cuda.amp.autocast():
        features = model.encode_text(tokens)
        # In-place L2 normalisation of each row.
        features /= features.norm(dim=-1, keepdim=True)

    # Slice assignment copies the batch into the float32 CPU buffer.
    out_feats[start:stop] = features

torch.save(out_toks, output_dir / 'tokens.pt')
torch.save(out_feats, output_dir / 'features.pt')

  0%|          | 0/578 [00:00<?, ?it/s]

100%|██████████| 578/578 [08:00<00:00,  1.20it/s]
