In [1]:
# Mount Google Drive at /content/drive (Colab only); the dataset path used later lives under this mount point
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
! pip install ftfy regex tqdm
! pip install git+https://github.com/openai/CLIP.git

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting ftfy
  Downloading ftfy-6.1.1-py3-none-any.whl (53 kB)
[K     |████████████████████████████████| 53 kB 2.1 MB/s 
Installing collected packages: ftfy
Successfully installed ftfy-6.1.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-s4_x449p
  Running command git clone -q https://github.com/openai/CLIP.git /tmp/pip-req-build-s4_x449p
Building wheels for collected packages: clip
  Building wheel for clip (setup.py) ... [?25l[?25hdone
  Created wheel for clip: filename=clip-1.0-py3-none-any.whl size=1369409 sha256=8eb6f3b599cc9e5901d4084330e39211283721a7e5b9ec4e514bbfcc522dacf4
  Stored in directory: /tmp/pip-ephem-wheel-cache-4z_taj6r/wheels/fd/b9/c3/5b4470e35ed76e174bff77c92f91da82098d5e35fd5bc8cdac
Successfully

In [3]:
import numpy as np
import torch
import clip
from tqdm.notebook import tqdm
from pkg_resources import packaging
from PIL import Image
import matplotlib.pyplot as plt
import skimage
import torchvision
import os
from torchvision.datasets import ImageFolder

In [4]:
# Inspect the CLIP model: list the available checkpoints, load one and print its key properties

# The result used to be discarded mid-cell; print it so the list is actually visible
print("Available models:", clip.available_models())

# Choose the device first so the model is explicitly loaded onto it
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model, preprocess = clip.load("ViT-B/32", device=device)

input_resolution = model.visual.input_resolution
context_length = model.context_length
vocab_size = model.vocab_size

# numel() gives the element count of each parameter tensor directly
n_parameters = sum(p.numel() for p in model.parameters())

print("Model parameters:", f"{n_parameters:,}")
print("Input resolution:", input_resolution)
print("Context length:", context_length)
print("Vocab size:", vocab_size)

100%|████████████████████████████████████████| 338M/338M [00:01<00:00, 193MiB/s]


Model parameters: 151,277,313
Input resolution: 224
Context length: 77
Vocab size: 49408


In [5]:
# Label sets and prompt templates for the three zero-shot heads (brand, color, height).
# Each template has a single `{}` placeholder that is filled with a class name.

# --- brand ---
brand_classes = [
    "converse",
    "nike",
    "adidas",
    "vans",
    "fila",
    "newbalance",
    "reebok",
]
brand_templates = [
    "a photo of the {} shoes.",
    "a photo of my {}.",
    "shoes made by the {}.",
]

# --- color ---
color_classes = [
    "black",
    "white",
    "deep bordeaux",
    "blue",
    "green",
]
color_templates = [
    "a photo of the {} color shoes.",
    "a photo of a {} shoes.",
    "a bright photo of a {} color sneakers.",
]

# --- height (high-top vs. low-top) ---
height_classes = [
    "high",
    "low",
]
height_templates = [
    "a photo of the {} shoes.",
    "a photo of a {} sneakers.",
    "a bright photo of the {} sneakers.",
]

In [6]:
# Root folder of the dataset: one sub-folder per class, as required by ImageFolder.
PATH_IMG = '/content/drive/MyDrive/dataset/converse/'  # set this to your own dataset path
#PATH_IMG = '/content/drive/MyDrive/dataset/extract'

# file_format = ["png", "jpeg", "jpg"]

# List the directory once and reuse the result, so the printed list and the
# reported count can never disagree.
file_list = os.listdir(PATH_IMG)
print('list of files', file_list)

num = len(file_list)
print('number:', num)

# `preprocess` is the image transform returned by clip.load
images = ImageFolder(root=PATH_IMG, transform=preprocess)
loader = torch.utils.data.DataLoader(images, batch_size=64, num_workers=1)

list of files ['converse high black', 'converse high parchment', 'converse low rush blue', 'converse run star hike high white', 'converse onestar black', 'converse low parchment', 'converse low sunflower', 'converse run star hike hi black', 'converse onestar white', 'converse low white', 'converse low midnight clover', 'converse run star hike low black', 'converse run star hike low rush blue', 'converse high white', 'converse run star hike low white', 'converse high rush blue', 'converse low deep bordeaux', 'converse high sunflower', 'converse low black', 'converse high deep bordeaux', 'converse high midnight clover']
number: 21


In [7]:
# Zero-shot classifier, accuracy function

def zeroshot_classifier(classnames, templates):
    """Build zero-shot classification weights from text prompts.

    For every class name, each template is filled in, tokenized and encoded with
    the global CLIP ``model``. The L2-normalized prompt embeddings of a class are
    averaged and re-normalized, giving one embedding per class.

    Args:
        classnames: iterable of class-name strings.
        templates: iterable of format strings containing one ``{}`` placeholder.

    Returns:
        A tensor holding one column per class (the class embeddings stacked
        along ``dim=1``), located on the same device as ``model``.
    """
    # Follow the model's device instead of hard-coding CUDA, so the function
    # also works on a CPU-only runtime.
    model_device = next(model.parameters()).device

    with torch.no_grad():
        zeroshot_weights = []
        for classname in tqdm(classnames):
            texts = [template.format(classname) for template in templates]  # fill templates with the class
            tokens = clip.tokenize(texts).to(model_device)  # tokenize
            class_embeddings = model.encode_text(tokens)  # embed with the text encoder
            class_embeddings /= class_embeddings.norm(dim=-1, keepdim=True)
            class_embedding = class_embeddings.mean(dim=0)
            class_embedding /= class_embedding.norm()
            zeroshot_weights.append(class_embedding)
        # The embeddings already live on model_device, so no extra transfer is needed
        zeroshot_weights = torch.stack(zeroshot_weights, dim=1)
    return zeroshot_weights

"""
def accuracy(output, target, topk=(1, )):
    pred = output.topk(max(topk), 1, True, True)[1].t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))
    return [float(correct[:k].reshape(-1).float().sum(0, keepdim=True).cpu().numpy()) for k in topk]
"""

# Build one text-embedding matrix per attribute, in the order brand -> color -> height
brand_weights, color_weights, height_weights = [
    zeroshot_classifier(names, prompts)
    for names, prompts in (
        (brand_classes, brand_templates),
        (color_classes, color_templates),
        (height_classes, height_templates),
    )
]

print('brand weights is:', brand_weights)

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

brand weights is: tensor([[-0.0258, -0.0129, -0.0125,  ...,  0.0032, -0.0185,  0.0010],
        [-0.0047, -0.0127, -0.0294,  ...,  0.0129, -0.0185, -0.0041],
        [ 0.0299,  0.0124,  0.0197,  ...,  0.0634,  0.0238,  0.0411],
        ...,
        [ 0.0172, -0.0081, -0.0049,  ..., -0.0039,  0.0055, -0.0071],
        [-0.0101,  0.0026,  0.0200,  ...,  0.0107,  0.0055, -0.0301],
        [ 0.0150,  0.0021,  0.0250,  ...,  0.0409,  0.0084, -0.0107]],
       device='cuda:0', dtype=torch.float16)


In [8]:
def accuracy_(output, target, topk=(1,)):
 
    with torch.no_grad():
        maxk = max(topk)  # max number labels we will consider in the right choices for out model
        batch_size = target.size(0)
        
        _, y_pred = output.topk(k=maxk, dim=1)  # _, [B, n_classes] -> [B, maxk]
        y_pred = y_pred.t()  # [B, maxk] -> [maxk, B] Expects input to be <= 2-D tensor and transposes dimensions 0 and 1.

        target_reshaped = target.view(1, -1).expand_as(y_pred)  # [B] -> [B, 1] -> [maxk, B]
        correct = (y_pred == target_reshaped)  # [maxk, B] were for each example we know which topk prediction matched truth

        # -- get topk accuracy
        list_topk_accs = []  # idx is topk1, topk2, ... etc
        for k in topk:
            ind_which_topk_matched_truth = correct[:k]  # [maxk, B] -> [k, B]
            flattened_indicator_which_topk_matched_truth = ind_which_topk_matched_truth.reshape(-1).float()  # [k, B] -> [kB]
            tot_correct_topk = flattened_indicator_which_topk_matched_truth.float().sum(dim=0, keepdim=True)  # [kB] -> [1]
            topk_acc = tot_correct_topk / batch_size  # topk accuracy for entire batch
            list_topk_accs.append(topk_acc)
        return list_topk_accs  # list of topk accuracies for entire batch [topk1, topk2, ... etc]

In [9]:
def filecount(PATH, f_list):
    """Print and return the total number of entries inside the given sub-folders.

    Args:
        PATH: directory that contains the sub-folders.
        f_list: iterable of sub-folder names located under ``PATH``.

    Returns:
        The summed number of directory entries over all listed sub-folders.
    """
    total = sum(
        len(os.listdir(os.path.join(PATH, subfolder)))
        for subfolder in f_list
    )
    print('filecount', total)
    return total

# Total number of entries across all class folders; as the cell's last expression the returned value is also shown as the cell output
filecount(PATH_IMG, file_list)

filecount 1235


1235

In [None]:
# try3: zero-shot brand classification accuracy over the whole dataset

# ImageFolder labels index the *folder* names (e.g. 'converse high black'),
# while the logits index brand_classes. Translate every folder index into the
# index of the brand it belongs to before comparing predictions with targets.
folder_to_brand = []
for folder_name in loader.dataset.classes:
    normalized = folder_name.lower().replace(" ", "")
    matches = [idx for idx, brand in enumerate(brand_classes) if brand in normalized]
    if len(matches) != 1:
        raise ValueError(
            f"Cannot map folder '{folder_name}' to exactly one brand in {brand_classes}"
        )
    folder_to_brand.append(matches[0])

# Run on whatever device the text weights live on instead of hard-coding CUDA
eval_device = brand_weights.device
folder_to_brand = torch.tensor(folder_to_brand, device=eval_device)

top1, top5, n = 0., 0., 0
with torch.no_grad():
    # Distinct loop names keep the dataset variable `images` from being overwritten
    for image_batch, folder_target in tqdm(loader):
        image_batch = image_batch.to(eval_device)
        target = folder_to_brand[folder_target.to(eval_device)]

        # predict
        image_features = model.encode_image(image_batch)
        image_features /= image_features.norm(dim=-1, keepdim=True)
        logits = 100. * image_features @ brand_weights

        # measure accuracy: accuracy_ returns per-batch fractions, so weight them
        # by the batch size to accumulate the number of correct predictions
        acc1, acc5 = accuracy_(logits, target, topk=(1,5))
        batch_size = image_batch.size(0)
        top1 += acc1.item() * batch_size
        top5 += acc5.item() * batch_size
        n += batch_size

print("number of all images: ", n)
# Correct predictions / total images, expressed in percent
top1 = top1 / n * 100
top5 = top5 / n * 100
print('top1: ', top1)

print("Top-1 accuracy: ", top1)
print("Top-5 accuracy: ", top5)


"""
sum1 += sum1 + top1
sum5 += sum5 + top5

avg1 = sum1 / len(os.listdir(PATH_IMG))
avg5 = sum5 / len(os.listdir(PATH_IMG))

print(f"average Top-1 accuracy: {avg1:.2f}")
print(f"average Top-5 accuracy: {avg5:.2f}")
"""

  0%|          | 0/20 [00:00<?, ?it/s]