In [24]:
import os
import pytorch_lightning as pl
import pytorch_lightning as pl
import pandas as pd
from typing import Optional
import torch
import numpy as np
from pytorch_lightning.loggers import TensorBoardLogger
from transformers import get_cosine_schedule_with_warmup
import torch
from PIL import Image
import open_clip


In [2]:
class Model(pl.LightningModule):
    """Single linear projection from 1024-dim inputs to 384-dim targets, trained with MSE.

    Each batch is a ``(domain_from, domain_to)`` pair; the module regresses
    ``domain_to`` from ``domain_from``.
    NOTE(review): presumably CLIP embeddings -> sentence-embedding space; confirm
    against the training data.

    Args:
        lr: Learning rate handed to Adam.
        warmup_ratio: Fraction of ``total_optimization_steps`` used for LR warmup.
        total_optimization_steps: Total number of scheduler steps for the cosine schedule.
    """

    def __init__(self, lr, warmup_ratio, total_optimization_steps):
        super().__init__()
        # Stores the constructor arguments in the checkpoint so that
        # ``load_from_checkpoint`` can rebuild the module without extra arguments.
        self.save_hyperparameters()
        self.lr = lr
        self.total_optimization_steps = total_optimization_steps
        self.warmup_ratio = warmup_ratio
        # Kept as a Sequential named ``backbone`` so existing checkpoints
        # (state-dict keys ``backbone.0.*``) still load.
        self.backbone = torch.nn.Sequential(
            torch.nn.Linear(in_features=1024, out_features=384),
        )
        self.criterion = torch.nn.MSELoss()

    def forward(self, x):
        """Project ``x`` through the linear backbone and return the result."""
        return self.backbone(x)

    def training_step(self, batch, batch_idx):
        """Compute the MSE loss for one training batch and log it as ``train_loss``."""
        domain_from, domain_to = batch
        domain_to_hat = self(domain_from)
        loss = self.criterion(domain_to_hat, domain_to)

        self.log("train_loss", loss, on_step=True, on_epoch=False, prog_bar=True)
        # ``self.log`` already records the metric; the former legacy "log" entry
        # required ``loss.detach().cpu().numpy()``, i.e. a device-to-host sync on
        # every step, only to duplicate the same value.
        return {"loss": loss}

    def validation_step(self, batch, batch_idx):
        """Compute the MSE loss for one validation batch and log it as ``val_loss``."""
        domain_from, domain_to = batch
        domain_to_hat = self(domain_from)
        loss = self.criterion(domain_to_hat, domain_to)
        self.log("val_loss", loss, on_step=True, on_epoch=True, prog_bar=False)
        return {"val_loss": loss}

    def configure_optimizers(self):
        """Return Adam (no weight decay) with a per-step cosine schedule with warmup."""
        optimizer = torch.optim.Adam(self.parameters(), lr=self.lr, weight_decay=0)
        # The scheduler documents ``num_warmup_steps`` as an integer step count, so
        # the ratio-derived float is truncated explicitly.
        num_warmup_steps = int(self.warmup_ratio * self.total_optimization_steps)
        lr_scheduler = get_cosine_schedule_with_warmup(
            optimizer,
            num_warmup_steps=num_warmup_steps,
            num_training_steps=self.total_optimization_steps,
        )
        # "interval": "step" makes Lightning advance the scheduler after every
        # optimizer step instead of once per epoch.
        return [optimizer], [{"scheduler": lr_scheduler, "interval": "step"}]


In [41]:
# Load the trained projection head for inference.
# map_location='cpu' deserialises the weights on the host first, so loading does not
# depend on which GPU the checkpoint was written from; the module is then moved to
# the inference device and put into eval mode.
projection = Model.load_from_checkpoint(
    '/home/toomuch/kaggle-diffusion/clip-to-minnilm/runs/-Debug/-Debug-epoch=32-val_loss=0.0011.ckpt',
    map_location='cpu',
)
projection = projection.to('cuda:2').eval()

In [27]:
from transformers import AutoModel, AutoTokenizer

# Hugging Face checkpoint of the LAION ViT-H/14 CLIP model; the model is moved to the
# inference GPU, the tokenizer is loaded from the same checkpoint id.
hf_checkpoint = 'laion/CLIP-ViT-H-14-laion2B-s32B-b79K'
openclip_model = AutoModel.from_pretrained(hf_checkpoint)
openclip_model = openclip_model.to('cuda:2')
openclip_tokenizer = AutoTokenizer.from_pretrained(hf_checkpoint)

# Only the third value returned by open_clip is kept: the transform that is applied
# to PIL images further down in this notebook.
_, _, preprocess = open_clip.create_model_and_transforms(
    'ViT-H-14',
    pretrained='laion2b_s32b_b79k',
)

`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


In [17]:
# Smoke test: feed one random (1, 3, 224, 224) tensor through the vision tower on the
# GPU and display the raw model output.
dummy_pixels = torch.rand((1, 3, 224, 224))
openclip_model.vision_model(dummy_pixels.to('cuda:2'))

BaseModelOutputWithPooling(last_hidden_state=tensor([[[ 1.0167, -0.1938, -0.2867,  ..., -0.3315,  0.7034,  0.6887],
         [-0.2502,  0.0163,  0.0645,  ...,  0.3769,  0.0875, -0.5767],
         [ 0.0457,  0.1948,  0.3860,  ...,  0.5367, -0.0246, -0.0176],
         ...,
         [ 0.0633,  0.3087,  0.1527,  ...,  0.3461,  0.0585, -0.7915],
         [ 0.2746,  0.2856,  0.1781,  ...,  0.2916, -0.1078, -0.4006],
         [ 0.0927,  0.0650,  0.2563,  ...,  0.5321, -0.3691, -0.2808]]],
       device='cuda:2', grad_fn=<AddBackward0>), pooler_output=tensor([[ 1.4526, -0.8311, -0.6829,  ..., -0.8668,  1.0815,  1.1504]],
       device='cuda:2', grad_fn=<NativeLayerNormBackward0>), hidden_states=None, attentions=None)

In [42]:
# os.listdir('../private-cv/test-images-small/')[0]
# dir(openclip_model)

In [33]:
# Sanity check: shape of one test image after the open_clip preprocessing transform.
sample_image = Image.open('../private-cv/test-images-small/f3fb94d15811b0.png')
preprocess(sample_image).shape

torch.Size([3, 224, 224])

In [50]:
class OpenClipVisualDataset(torch.utils.data.Dataset):
    """Map-style dataset yielding ``(image_tensor, image_id)`` for a list of image files.

    Args:
        paths: Sequence of image file paths.
        transform: Optional callable applied to the opened PIL image. When omitted,
            the notebook-level ``preprocess`` transform is looked up at access time,
            which matches the previous behaviour.
    """

    def __init__(self, paths, transform=None):
        self.paths = paths
        self.transform = transform

    def __len__(self):
        """Number of image paths in the dataset."""
        return len(self.paths)

    def __getitem__(self, idx):
        """Return the transformed image at ``idx`` and its id (file name without extension)."""
        path = self.paths[idx]
        transform = self.transform if self.transform is not None else preprocess
        # The context manager releases the file handle as soon as the transform has
        # produced its output instead of leaving it to garbage collection.
        with Image.open(path) as img:
            img_tensor = transform(img)
        # os.path strips only the final extension and honours the platform's path
        # separators; the previous split('.')[0] cut the id at the first dot.
        image_id = os.path.splitext(os.path.basename(path))[0]
        return img_tensor, image_id
    
root = '../private-cv/test-images-small/'
# os.listdir returns entries in arbitrary order and may include sub-directories
# (e.g. notebook checkpoint folders) that cannot be opened as images. Keep regular
# files only and sort them so the batch order, and therefore the row order of the
# exported results, is reproducible between runs.
paths = sorted(
    full_path
    for full_path in (os.path.join(root, el) for el in os.listdir(root))
    if os.path.isfile(full_path)
)
dataloader = torch.utils.data.DataLoader(dataset=OpenClipVisualDataset(paths), batch_size=256, shuffle=False)

In [54]:
from tqdm import tqdm


# Embed every batch of images: vision tower -> CLIP visual projection -> trained
# projection head. Each embedding is then flattened into one (id, value) row per
# dimension, where the id is '<image hash>_<dimension index>'.
results = []
for images_tensors, hashes in tqdm(dataloader):
    with torch.no_grad():
        pooled = openclip_model.vision_model(images_tensors.to('cuda:2'))['pooler_output']
        clip_embedding = openclip_model.visual_projection(pooled)
        projected = projection(clip_embedding)
    embeddings = projected.detach().cpu().numpy().tolist()
    for img_hash, emb in zip(hashes, embeddings):
        results.extend((f'{img_hash}_{i}', val) for i, val in enumerate(emb))

100%|██████████| 4/4 [00:33<00:00,  8.42s/it]


In [57]:
# Write the flattened (id, value) rows to the submission file without the index column.
submission = pd.DataFrame(results, columns=['id', 'emb'])
submission.to_csv('submission.csv', index=False)

In [21]:

# laion/CLIP-ViT-H-14-laion2B-s32B-b79K
# _, _, _preprocess = open_clip.create_model_and_transforms('ViT-H-14', pretrained='laion2b_s32b_b79k')
# _tokenizer = open_clip.get_tokenizer('ViT-B-32-quickgelu')

# image = preprocess(Image.open("CLIP.png")).unsqueeze(0)
# text = tokenizer(["a diagram", "a dog", "a cat"])

# with torch.no_grad(), torch.cuda.amp.autocast():
#     image_features = model.encode_image(image)
#     text_features = model.encode_text(text)
#     image_features /= image_features.norm(dim=-1, keepdim=True)
#     text_features /= text_features.norm(dim=-1, keepdim=True)

#     text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)

# print("Label probs:", text_probs)  # prints: [[1., 0., 0.]]