In [3]:
import numpy as np
import os 
import pandas as pd
import cv2
import torch
import matplotlib.pyplot as plt
# from ipywidgets import interact
import albumentations as A
from albumentations.pytorch import ToTensorV2
import torchvision
from torch import nn
import torchsummary
from torch.utils.data import DataLoader
from collections import defaultdict
from torchvision.utils import make_grid

In [4]:
# Run on the first CUDA GPU when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
device

device(type='cuda', index=0)

## Datasets

In [5]:
class Dataset():
    """Image / description / label triples stored in parallel folders.

    Expected layout under ``root``::

        image/<phase>/        image files read with ``cv2.imread``
        description/<phase>/  text files, split on single spaces into tokens
        label/<phase>/        text files, returned as their raw contents

    The three folders are listed and sorted independently and samples are
    paired purely by position in those sorted listings, so the i-th image,
    description and label are assumed to belong together.
    NOTE(review): nothing checks that the three folders contain the same
    number of files; ``__len__`` follows the image folder only.

    Args:
        root: Base directory of the dataset. A trailing path separator is
            optional.
        phase: Sub-folder name inside each of the three folders
            (e.g. ``"train"``).
        transformer: Optional callable invoked as ``transformer(image=img)``.
            Its return value is handed back unchanged as the image element of
            a sample (for an albumentations ``Compose`` that is a dict with
            the tensor under the ``"image"`` key).
    """

    def __init__(self, root, phase, transformer=None):
        self.root = root
        self.phase = phase
        self.transformer = transformer
        self.image_list = sorted(os.listdir(self._phase_dir("image")))
        self.des_list = sorted(os.listdir(self._phase_dir("description")))
        self.label_list = sorted(os.listdir(self._phase_dir("label")))

    def __getitem__(self, index):
        """Return the ``(img, des, label)`` triple at ``index``."""
        return self.get_data(index)

    def __len__(self, ):
        """Number of samples, defined as the number of image files."""
        return len(self.image_list)

    def _phase_dir(self, kind):
        """Directory holding the files of type ``kind`` for the current phase."""
        # os.path.join tolerates ``root`` with or without a trailing separator.
        return os.path.join(self.root, kind, self.phase)

    def _read_text(self, kind, file_name):
        """Read a whole text file and close it again straight away."""
        with open(os.path.join(self._phase_dir(kind), file_name), "r") as text_file:
            return text_file.read()

    def get_data(self, index):
        """Load one sample from disk.

        Returns:
            tuple: ``(img, des, label)`` where ``img`` is the RGB image (or
            the transformer's return value when a transformer is set),
            ``des`` is the description split on ``" "`` and ``label`` is the
            unmodified content of the label file.

        Raises:
            FileNotFoundError: If OpenCV cannot read the image file.
        """
        # label: raw file contents (not stripped)
        label = self._read_text("label", self.label_list[index])

        # description: space-separated tokens
        des = self._read_text("description", self.des_list[index]).split(" ")

        # image: cv2.imread signals failure by returning None instead of
        # raising, so fail here with the offending path rather than inside
        # cvtColor with an opaque assertion error.
        img_path = os.path.join(self._phase_dir("image"), self.image_list[index])
        image = cv2.imread(img_path)
        if image is None:
            raise FileNotFoundError(f"cv2.imread could not read image: {img_path}")
        img = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        if self.transformer is not None:
            img = self.transformer(image=img)

        return img, des, label
        
    

In [6]:
# Side length, in pixels, that every image is resized to.
IMAGE_SIZE = 448

# Preprocessing pipeline: resize to a square, normalise each channel with the
# mean/std commonly used for ImageNet-pretrained torchvision backbones, then
# convert the array to a torch tensor.
transformer = A.Compose([
    A.Resize(width=IMAGE_SIZE, height=IMAGE_SIZE),
    A.Normalize(
        std=(0.229, 0.224, 0.225),
        mean=(0.485, 0.456, 0.406),
    ),
    ToTensorV2(),
])

In [12]:
# Base directory that holds the image/, description/ and label/ folders.
root = '/workspace/team2/data/nickData/'

# Training split, with the preprocessing pipeline applied to every image.
train_dataset = Dataset(root, "train", transformer)

In [13]:
# Pull the first training sample to sanity-check what the dataset returns.
img, des, label=train_dataset[0]

In [14]:
# `img` is the transformer's return value (a dict), so the image tensor is
# stored under the "image" key.
img['image'].shape

torch.Size([3, 448, 448])

## Models
 ![nickCLIP architecture diagram](../img/nickCLIP_arch.png)

### Image Encoder

In [40]:
class Image_Encoder(nn.Module):
    """Encode an image batch into one flat feature vector per image.

    A pretrained torchvision ResNet-34, with its last two child modules
    removed, serves as the convolutional backbone. A small convolutional head
    then reduces the 512 backbone channels to a single channel
    (512 -> 256 -> 128 -> 32 -> 1), and the resulting map is flattened.
    """

    def __init__(self):
        super().__init__()

        # Keep every child of the ResNet except the final two.
        resnet = torchvision.models.resnet34(pretrained = True)
        self.backbone = nn.Sequential(*list(resnet.children())[:-2])

        # One (in_channels, out_channels, kernel_size, padding) entry per
        # Conv -> BatchNorm -> ReLU stage of the head.
        stage_specs = (
            (512, 256, 1, 0),
            (256, 128, 3, 1),
            (128, 32, 3, 1),
            (32, 1, 3, 1),
        )
        head_layers = []
        for c_in, c_out, k_size, pad in stage_specs:
            head_layers += [
                nn.Conv2d(c_in, c_out, kernel_size=k_size, padding=pad, bias=False),
                nn.BatchNorm2d(c_out),
                nn.ReLU(inplace=True),
            ]
        head_layers.append(nn.Flatten())
        self.head = nn.Sequential(*head_layers)

    def forward(self, x):
        """Return the flattened single-channel feature map for ``x``.

        Per the notebook's own note, a 448x448 input yields 196 features per
        image.
        """
        features = self.backbone(x)
        return self.head(features)

In [46]:
# Build the image encoder on the selected device (Module.to returns the module
# itself) and print a layer-by-layer summary for a 3x448x448 input.
Image_Enc = Image_Encoder().to(device)
torchsummary.summary(Image_Enc, input_size=(3, 448, 448))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 64, 224, 224]           9,408
       BatchNorm2d-2         [-1, 64, 224, 224]             128
              ReLU-3         [-1, 64, 224, 224]               0
         MaxPool2d-4         [-1, 64, 112, 112]               0
            Conv2d-5         [-1, 64, 112, 112]          36,864
       BatchNorm2d-6         [-1, 64, 112, 112]             128
              ReLU-7         [-1, 64, 112, 112]               0
            Conv2d-8         [-1, 64, 112, 112]          36,864
       BatchNorm2d-9         [-1, 64, 112, 112]             128
             ReLU-10         [-1, 64, 112, 112]               0
       BasicBlock-11         [-1, 64, 112, 112]               0
           Conv2d-12         [-1, 64, 112, 112]          36,864
      BatchNorm2d-13         [-1, 64, 112, 112]             128
             ReLU-14         [-1, 64, 1

### Text Encoder

In [49]:
class Text_Encoder(nn.Module):
    """Thin wrapper around the `bert-base-uncased` model loaded via torch.hub."""

    def __init__(self):
        super().__init__()
        # Hub repository, entry point and checkpoint name passed to torch.hub.load.
        hub_repo = 'huggingface/pytorch-transformers'
        hub_entry = 'model'
        checkpoint = 'bert-base-uncased'
        self.BERT = torch.hub.load(hub_repo, hub_entry, checkpoint)

    def forward(self, x):
        """Pass ``x`` straight to the wrapped model and return its output unchanged.

        NOTE(review): ``x`` is presumably a batch of token ids - confirm
        against the caller.
        """
        return self.BERT(x)

In [50]:
# Instantiate the text encoder (loads BERT through torch.hub).
# NOTE: the recorded run of this cell failed with PackageNotFoundError because
# the 'safetensors>=0.3.1' distribution was missing; the error message suggests
# `pip install transformers -U`.
Text_Enc=Text_Encoder()


Using cache found in /root/.cache/torch/hub/huggingface_pytorch-transformers_main


PackageNotFoundError: The 'safetensors>=0.3.1' distribution was not found and is required by this application. 
Try: pip install transformers -U or pip install -e '.[dev]' if you're working with git main