In [None]:
## VOC12
# The 20 class names used for VOC12, in label order. Every name is a single
# whitespace-free token, so splitting one string yields the list directly.
VOC12_classes = (
    'aeroplane bicycle bird boat bottle '
    'bus car cat chair cow diningtable dog '
    'horse motorbike person pottedplant sheep sofa '
    'train tvmonitor'
).split()
## CONTEXT
# The 59 class names used for the CONTEXT dataset, kept as an immutable tuple.
# Entries are in alphabetical order; each row below holds one initial letter.
CONTEXT_classes = (
    'aeroplane',
    'bag', 'bed', 'bedclothes', 'bench', 'bicycle', 'bird', 'boat', 'book',
    'bottle', 'building', 'bus',
    'cabinet', 'car', 'cat', 'ceiling', 'chair', 'cloth', 'computer', 'cow',
    'cup', 'curtain',
    'dog', 'door',
    'fence', 'floor', 'flower', 'food',
    'grass', 'ground',
    'horse',
    'keyboard',
    'light',
    'motorbike', 'mountain', 'mouse',
    'person', 'plate', 'platform', 'pottedplant',
    'road', 'rock',
    'sheep', 'shelves', 'sidewalk', 'sign', 'sky', 'snow', 'sofa',
    'table', 'track', 'train', 'tree', 'truck', 'tv monitor',
    'wall', 'water', 'window', 'wood',
)

## COCO
# The 171 class names used for COCO, in label order. The literal is split in
# two purely for readability; unpacking both tuples yields one flat list.
COCO_classes = [
    # First 80 entries.
    *(
        'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train',
        'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign',
        'parking meter', 'bench',
        'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear',
        'zebra', 'giraffe',
        'backpack', 'umbrella', 'handbag', 'tie', 'suitcase',
        'frisbee', 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat',
        'baseball glove', 'skateboard', 'surfboard', 'tennis racket',
        'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl',
        'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot',
        'hot dog', 'pizza', 'donut', 'cake',
        'chair', 'couch', 'potted plant', 'bed', 'dining table', 'toilet',
        'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone',
        'microwave', 'oven', 'toaster', 'sink', 'refrigerator',
        'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier',
        'toothbrush',
    ),
    # Remaining 91 entries (alphabetically ordered).
    *(
        'banner', 'blanket', 'branch', 'bridge', 'building-other', 'bush',
        'cabinet', 'cage', 'cardboard', 'carpet', 'ceiling-other',
        'ceiling-tile', 'cloth', 'clothes', 'clouds', 'counter', 'cupboard',
        'curtain',
        'desk-stuff', 'dirt', 'door-stuff',
        'fence', 'floor-marble', 'floor-other', 'floor-stone', 'floor-tile',
        'floor-wood', 'flower', 'fog', 'food-other', 'fruit',
        'furniture-other',
        'grass', 'gravel', 'ground-other',
        'hill', 'house',
        'leaves', 'light',
        'mat', 'metal', 'mirror-stuff', 'moss', 'mountain', 'mud',
        'napkin', 'net',
        'paper', 'pavement', 'pillow', 'plant-other', 'plastic', 'platform',
        'playingfield',
        'railing', 'railroad', 'river', 'road', 'rock', 'roof', 'rug',
        'salad', 'sand', 'sea', 'shelf', 'sky-other', 'skyscraper', 'snow',
        'solid-other', 'stairs', 'stone', 'straw', 'structural-other',
        'table', 'tent', 'textile-other', 'towel', 'tree',
        'vegetable',
        'wall-brick', 'wall-concrete', 'wall-other', 'wall-panel',
        'wall-stone', 'wall-tile', 'wall-wood', 'water-other', 'waterdrops',
        'window-blind', 'window-other', 'wood',
    ),
]

# Six prompt templates; '{}' is the slot later filled via str.format().
# Built from the two sentence patterns the list contains.
templates_6 = [
    # "a <view> of a {}." variants
    *(f'a {view} of a {{}}.' for view in ('photo', 'portrait', 'part', 'segment')),
    # "a photo of <size> {}." variants
    *(f'a photo of {size} {{}}.' for size in ('small', 'medium')),
]

# Eight prompt templates; '{}' is the slot later filled via str.format().
# Built from the two sentence patterns plus one standalone background prompt.
templates_8 = [
    # "a <view> of a {}." variants
    *(f'a {view} of a {{}}.' for view in ('photo', 'portrait', 'part', 'segment')),
    # "a photo of <size> {}." variants
    *(f'a photo of {size} {{}}.' for size in ('small', 'medium', 'large')),
    'a background of a {}.',
]

In [None]:
pip install git+https://github.com/openai/CLIP.git

In [None]:
### From CLIP https://colab.research.google.com/github/openai/clip
import torch
import numpy as np
import clip
# Use the GPU when torch reports one is available, otherwise run on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the ViT-B/16 CLIP model and its matching image preprocessing transform.
model, preprocess = clip.load('ViT-B/16', device=device)

## single template
def single_templete(save_path, class_names, model):
    """Embed every class name with the single prompt "a photo of a <name>".

    The prompts are tokenized with ``clip.tokenize``, encoded with
    ``model.encode_text``, L2-normalised along the last dimension, and the
    resulting matrix is written to ``save_path`` with ``np.save``.

    Args:
        save_path: Destination handed to ``np.save``. When it is a path, its
            parent directory is created if it does not exist yet.
        class_names: Iterable of class-name strings; one prompt per name.
        model: Text encoder exposing ``encode_text`` and ``parameters()``
            (e.g. the model returned by ``clip.load``).

    Returns:
        The normalised text embeddings as a tensor on the model's device,
        one row per class name in input order.
    """
    import os

    # Follow the model's own device instead of hard-coding CUDA, so the
    # function also works when the model was loaded on the CPU.
    target_device = next(model.parameters()).device

    with torch.no_grad():
        texts = torch.cat(
            [clip.tokenize(f"a photo of a {c}") for c in class_names]
        ).to(target_device)
        text_embeddings = model.encode_text(texts)
        text_embeddings = text_embeddings / text_embeddings.norm(dim=-1, keepdim=True)

    # np.save does not create missing directories; do it up front.
    if isinstance(save_path, (str, os.PathLike)):
        out_dir = os.path.dirname(os.fspath(save_path))
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
    np.save(save_path, text_embeddings.cpu().numpy())
    return text_embeddings

## multi templates
def multi_templete(save_path, class_names, model, templates):
    """Embed every class name as the normalised mean over several prompts.

    For each class name, every template is filled in with ``str.format``,
    tokenized with ``clip.tokenize`` and encoded with ``model.encode_text``.
    The per-template embeddings are L2-normalised, averaged, and the average
    is L2-normalised again. The stacked result is written to ``save_path``
    with ``np.save``.

    Args:
        save_path: Destination handed to ``np.save``. When it is a path, its
            parent directory is created if it does not exist yet.
        class_names: Iterable of class-name strings.
        templates: Iterable of format strings, each with one ``{}`` slot for
            the class name.
        model: Text encoder exposing ``encode_text`` and ``parameters()``
            (e.g. the model returned by ``clip.load``).

    Returns:
        The stacked text embeddings as a tensor on the model's device,
        one row per class name in input order.
    """
    import os

    # Follow the model's own device instead of hard-coding CUDA, so the
    # function also works when the model was loaded on the CPU.
    target_device = next(model.parameters()).device

    with torch.no_grad():
        text_embeddings = []
        for classname in class_names:
            texts = [template.format(classname) for template in templates]
            texts = clip.tokenize(texts).to(target_device)
            class_embeddings = model.encode_text(texts)
            # Normalise each prompt embedding before averaging so every
            # template contributes with equal weight.
            class_embeddings /= class_embeddings.norm(dim=-1, keepdim=True)
            class_embedding = class_embeddings.mean(dim=0)
            # Re-normalise: the mean of unit vectors is generally not unit length.
            class_embedding /= class_embedding.norm()
            text_embeddings.append(class_embedding)
        # The per-class vectors already live on target_device.
        text_embeddings = torch.stack(text_embeddings, dim=0)

    # np.save does not create missing directories; do it up front.
    if isinstance(save_path, (str, os.PathLike)):
        out_dir = os.path.dirname(os.fspath(save_path))
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
    np.save(save_path, text_embeddings.cpu().numpy())
    return text_embeddings

In [None]:
## COCO:
# single_templete() takes no templates argument, so passing templates_8 to it
# raised a TypeError; the multi-template encoder is the one that consumes them.
save_path = './text_embeddings/coco_otseg.npy'
text_embeddings = multi_templete(save_path, COCO_classes, model, templates_8)

In [None]:
## VOC12:
# single_templete() takes no templates argument, so passing templates_6 to it
# raised a TypeError; the multi-template encoder is the one that consumes them.
save_path = './text_embeddings/voc12_otseg.npy'
text_embeddings = multi_templete(save_path, VOC12_classes, model, templates_6)

## CONTEXT:
# Encode the CONTEXT class names with the eight-template prompt set and
# store the result under ./text_embeddings/.
save_path = './text_embeddings/context_otseg.npy'
text_embeddings = multi_templete(
    save_path=save_path,
    class_names=CONTEXT_classes,
    model=model,
    templates=templates_8,
)