In [None]:
#!pip install timm
# !pip install ftfy regex tqdm
!pip install git+https://github.com/openai/CLIP.git
!pip install albumentations

Cleaning the ImageNet and landmark data following the approach described in https://arxiv.org/pdf/2003.11211.pdf
Related code: https://github.com/lyakaap/Landmark2019-1st-and-3rd-Place-Solution

In [None]:
import os
import random

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from torch.optim import lr_scheduler
import torch.backends.cudnn as cudnn
import numpy as np
import torchvision
from torchvision import transforms # datasets, models,
cudnn.benchmark = True
import albumentations as A

import clip
from clip.clip import _download, _MODELS

import numpy as np
import pandas as pd
from PIL import Image
import cv2
import matplotlib.pyplot as plt

from tqdm import tqdm

from pathlib import Path
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from timeit import default_timer as timer
# import joblib

import gc


In [None]:
# Display the model names known to the installed clip package
# (config.BACKBONE_MODEL_NAME is looked up by name when the weights are downloaded).
clip.available_models()

In [None]:
class config:
    """Notebook-wide settings. Evaluating the class body also creates the output folders."""

    # runtime
    DEVICE = "cpu" if not torch.cuda.is_available() else "cuda"
    RANDOM_SEED = 45

    # backbone
    IMAGE_SIZE = 224
    BACKBONE_MODEL_NAME = 'ViT-L/14'
    BACKBONE_MODEL = None  # placeholder; filled in once the CLIP weights are loaded

    # projection layer
    N_CLASSES = 1000
    EMB_DIM = 256

    # training
    TRAIN = True
    RESUME = True
    RESUME_STATES_DIR = './model_state.pkl'
    BATCH_SIZE = 32
    EPOCHS = 1
    LR = .001

    # files
    TRAIN_DIR = '../input/imagenet-object-localization-challenge/ILSVRC/Data/CLS-LOC/train'
    VAL_DIR = '../input/imagenet-object-localization-challenge/ILSVRC/Data/CLS-LOC/val'
    VAL_CSV = '../input/imagenet-object-localization-challenge/LOC_val_solution.csv'
    OUTPUT_DIR = Path("./")
    LOGS_DIR = OUTPUT_DIR / "logs"
    MODEL_DIR = OUTPUT_DIR / "model"
    OOF_DIR = OUTPUT_DIR / "oof"

    # Create the output folders up front; the loop variable is removed so it
    # does not become a class attribute.
    for _folder in (LOGS_DIR, MODEL_DIR, OOF_DIR):
        _folder.mkdir(parents=True, exist_ok=True)
    del _folder

def seed_everything(seed):
    """
    Make runs repeatable by seeding every random number generator in use.

    Arguments:
        seed {int} -- value handed to each generator
    """
    os.environ["PYTHONHASHSEED"] = str(seed)

    # Python, NumPy and PyTorch (CPU and CUDA) generators, in that order.
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)

    # Ask cuDNN for deterministic kernels and turn its autotuner off.
    cudnn_backend = torch.backends.cudnn
    cudnn_backend.deterministic = True
    cudnn_backend.benchmark = False
    
# Seed every RNG once, using the seed stored in the config.
seed_everything(config.RANDOM_SEED)

# Show which device was selected ("cuda" or "cpu").
print(config.DEVICE)

In [None]:
# https://github.com/openai/CLIP

# Resolve the checkpoint for the configured backbone name and fetch it into ~/.cache/clip.
# NOTE(review): `_download` and `_MODELS` are underscore-prefixed names imported from
# clip.clip, so they may change between clip versions — confirm when upgrading.
model_path = _download(_MODELS[config.BACKBONE_MODEL_NAME], os.path.expanduser("~/.cache/clip"))
with open(model_path, 'rb') as opened_file:
    # Load the TorchScript archive onto the configured device, keep only its
    # `.visual` sub-module and switch that to eval mode.
    clip_vit = torch.jit.load(opened_file, map_location=config.DEVICE).visual.eval()
    
    #model, preprocess = clip.load(config.BACKBONE_MODEL_NAME, device=config.DEVICE)
    
    # Publish the encoder through config so the model class can pick it up.
    config.BACKBONE_MODEL = clip_vit #model.to(config.DEVICE)
    #config.BACKBONE_PREPROCESS = preprocess

# https://github.com/rwightman/pytorch-image-models

In [None]:
#config.BACKBONE_MODEL

In [None]:
#config.BACKBONE_PREPROCESS

In [None]:
# https://ryanwingate.com/intro-to-machine-learning/deep-learning-with-pytorch/loading-image-data-into-pytorch/
# train_trans = transforms.Compose([transforms.RandomRotation(30),
#                                        transforms.RandomResizedCrop(config.IMAGE_SIZE),
#                                        transforms.RandomHorizontalFlip(),
#                                        transforms.ToTensor(),
#                                        transforms.Normalize([0.48145466, 0.4578275, 0.40821073], 
#                                                             [0.26862954, 0.26130258, 0.27577711])]
#                                      )
# val_trans = transforms.Compose([transforms.Resize(config.IMAGE_SIZE),
#                                       #transforms.CenterCrop(config.IMAGE_SIZE),
#                                       transforms.ToTensor(),
#                                       transforms.Normalize([0.48145466, 0.4578275, 0.40821073], 
#                                                            [0.26862954, 0.26130258, 0.27577711])]
#                                     )

In [None]:
#https://albumentations.ai/docs/examples/migrating_from_torchvision_to_albumentations/
#https://github.com/albumentations-team/albumentations/issues/687

#https://aigeekprogrammer.com/pytorch-dataset-division-transformations-gpu-metric-visualization/
class ToTensorCustom(A.BasicTransform):
    """Albumentations step that turns a numpy image into a `torch.Tensor`.

    * Image numpy: [H, W, C] -> Image tensor: [C, H, W]

    Only the 'image' target is registered; other targets are not converted.
    """
    def __init__(self, always_apply=True, p=1.0):
        super().__init__(always_apply=always_apply, p=p)

    @property
    def targets(self):
        # Map each supported target name to the function that converts it.
        return dict(image=self.apply)

    def apply(self, img, **params):
        """Move the channel axis first ([H, W, C] -> [C, H, W]) and wrap as a tensor."""
        channels_first = np.transpose(img, (2, 0, 1))
        return torch.from_numpy(channels_first)
    
# Training pipeline: resize to 400x400, take a random IMAGE_SIZE crop, apply
# colour jitter / blur / horizontal flip (each with p=0.5), normalise with the
# listed per-channel mean/std, then convert HWC numpy -> CHW tensor.
train_trans = A.Compose([A.Resize(400,400), 
                               A.RandomCrop(config.IMAGE_SIZE,config.IMAGE_SIZE),
                               A.HueSaturationValue(hue_shift_limit=20, sat_shift_limit=30, val_shift_limit=20, p=0.5),
                               A.GaussianBlur(blur_limit=(3, 7), p=0.5),
                               A.HorizontalFlip(p=0.5),
                               A.Normalize(mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711]),
                              ToTensorCustom()])
# Validation pipeline: no random augmentation — direct resize to
# IMAGE_SIZE x IMAGE_SIZE, same normalisation, same tensor conversion.
val_trans = A.Compose([A.Resize(config.IMAGE_SIZE,config.IMAGE_SIZE),
                             #A.CenterCrop(config.IMAGE_SIZE,config.IMAGE_SIZE),
#                               A.HueSaturationValue(hue_shift_limit=20, sat_shift_limit=30, val_shift_limit=20, p=0.5),
#                               A.GaussianBlur(blur_limit=(3, 7), p=0.5),
#                               A.HorizontalFlip(p=0.5),
                              A.Normalize(mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711]),
                              ToTensorCustom()])

In [None]:
class ImagenetDataset(Dataset):
    """Dataset that reads images from disk based on a table of paths and labels.

    Arguments:
        data: table indexed with `.iloc[idx]`; each row must expose `.path`
            (image file path) and `.label` (class label).
        transformation: callable invoked as `transformation(image=img)`;
            its return value is passed through unchanged.
    """
    def __init__(self, data,transformation):
        self.data = data
        self.length = len(data)
        self.trans = transformation
        
    def __getitem__(self,idx):
        """Return `(transformed, label)` for row `idx`.

        Raises:
            OSError: if OpenCV cannot read the image file.
        """
        row = self.data.iloc[idx]
        img = cv2.imread(row.path)
        if img is None:
            # cv2.imread signals failure by returning None instead of raising;
            # fail here with the offending path rather than inside cvtColor.
            raise OSError(f"cv2.imread could not read image: {row.path}")
        # OpenCV decodes to BGR; convert to RGB before applying the transforms.
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        img = self.trans(image=img)
        return img, row.label
    
    def __len__(self):
        return self.length
    
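    # NOTE: df_from_file_hierarchy (below) reads the class from the parent folder name, so it will not work for test/val data: test images have no labels (?) and val labels are stored in a CSV (see df_from_csv).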
def df_from_file_hierarchy(data_path,img_per_class_cap=50):
    """Build a DataFrame of image paths from a `<data_path>/<class>/<image>` tree.

    Class folders and the files inside them are visited in sorted order, so
    the capped selection does not depend on the filesystem's listing order.

    Arguments:
        data_path {str} -- root folder containing one sub-folder per class
        img_per_class_cap {int} -- maximum number of images taken per class;
            None or a non-positive value means no limit

    Returns:
        DataFrame with columns 'path' (file path) and 'type' (class folder
        name); the columns are present even when no image is found.
    """
    # A cap that is None or <= 0 never triggered the early exit before, so it
    # keeps meaning "unlimited".
    if img_per_class_cap is not None and img_per_class_cap > 0:
        limit = img_per_class_cap
    else:
        limit = None

    img_and_type = []
    for img_class in tqdm(sorted(os.listdir(data_path))):
        class_dir = data_path+'/'+img_class
        if not os.path.isdir(class_dir):
            # Skip stray files sitting next to the class folders.
            continue
        for img_name in sorted(os.listdir(class_dir))[:limit]:
            img_and_type.append({'path':class_dir+'/'+img_name,'type':img_class})

    return pd.DataFrame(img_and_type, columns=['path','type'])
    
def df_from_csv(csv_path,img_path_parent=None,img_type='.JPEG'):
    """Build a DataFrame of image paths and class names from a solution CSV.

    The CSV must have the columns 'ImageId' and 'PredictionString'; the class
    is the first whitespace-separated token of 'PredictionString'.

    Arguments:
        csv_path {str} -- path of the CSV file
        img_path_parent {str} -- folder that contains the images (required)
        img_type {str} -- file extension appended to each ImageId

    Returns:
        DataFrame with columns 'path' and 'type' (present even for an empty CSV).

    Raises:
        ValueError: if `img_path_parent` is not given.
    """
    if img_path_parent is None:
        # The default used to fail later with an opaque TypeError on None + '/'.
        raise ValueError("img_path_parent is required: pass the folder that contains the images")

    df = pd.read_csv(csv_path)
    img_and_type = [
        {'path': img_path_parent+'/'+image_id+img_type, 'type': prediction.split()[0]}
        for image_id, prediction in zip(df['ImageId'], df['PredictionString'])
    ]
    return pd.DataFrame(img_and_type, columns=['path','type'])

# very slow loading
# imagenet_train = ImageFolder(root = '../input/imagenet-object-localization-challenge/ILSVRC/Data/CLS-LOC/train')

In [None]:
def prepare_datasets(out_of_train_class_count = 5, train_fraction = 0.9):
    """Build the train / validation / held-out-class DataFrames from config.TRAIN_DIR.

    Steps:
      1. List the images per class folder and integer-encode the class name
         into a 'label' column.
      2. Write the full table to "data.csv" in the working directory.
      3. Draw `out_of_train_class_count` labels at random; all their rows form
         the held-out ("out of train") validation set.
      4. For every remaining label, draw `train_fraction` of its rows
         (rounded down) for training; the rest form the validation set.

    Sampling uses the global NumPy RNG, so seed it first for repeatable splits.

    Arguments:
        out_of_train_class_count {int} -- number of classes withheld entirely
        train_fraction {float} -- share of each remaining class used for
            training, in (0, 1]

    Returns:
        (train_df, val_df, out_of_train_val_df). Each frame is produced with
        reset_index() without drop=True, so the previous index is kept as an
        extra column.

    Raises:
        ValueError: if `train_fraction` is outside (0, 1].
    """
    if not 0.0 < train_fraction <= 1.0:
        raise ValueError(f"train_fraction must be in (0, 1], got {train_fraction}")

    df1 = df_from_file_hierarchy(config.TRAIN_DIR)
    #df2 = df_from_csv(config.VAL_CSV,img_path_parent=config.VAL_DIR)
    
    #data_df = pd.concat([df1,df2])   --> I don't think I should concat. I should get a dictionary for labels?
    
    data_df = df1
    data_df['label'] = LabelEncoder().fit_transform(data_df['type'])
    
    data_df.to_csv("data.csv")
    
    out_of_train_labels = np.random.choice(data_df['label'].unique(),size=out_of_train_class_count, replace=False)
    is_held_out = data_df['label'].isin(out_of_train_labels)
    out_of_train_val_df = data_df.loc[is_held_out].reset_index()
    # After reset_index the index labels of in_train equal row positions, which
    # is what lets the same ids be used with both .iloc and .index below.
    in_train = data_df.loc[~is_held_out].reset_index()
    
    train_idxs = []
    for label in tqdm(in_train['label'].unique()):
        label_group = in_train.loc[in_train['label']==label]
        selections = np.random.choice(len(label_group),int(len(label_group)*train_fraction), replace=False)
        train_idxs.extend(label_group.index[selections])
    
    train_df = in_train.iloc[train_idxs].reset_index()
    val_df = in_train.loc[~in_train.index.isin(train_idxs)].reset_index()
    
    print('Train len:',len(train_df))
    print('Val len:',len(val_df))
    print('Oot val len:',len(out_of_train_val_df))
    
    return train_df,val_df,out_of_train_val_df

In [None]:
# Build the three splits: training rows, validation rows from the same classes,
# and rows of the classes withheld from training.
train_df,val_df,oot_val_df = prepare_datasets()

In [None]:
# Number of training rows per label.
train_df['label'].value_counts()

In [None]:
# Number of validation rows per label.
val_df['label'].value_counts()

In [None]:
# Wrap each split in a dataset; only the training split gets the augmenting
# pipeline, both validation splits use the deterministic one.
train_dataset, val_dataset, oot_val_dataset = (
    ImagenetDataset(frame, pipeline)
    for frame, pipeline in (
        (train_df, train_trans),
        (val_df, val_trans),
        (oot_val_df, val_trans),
    )
)

In [None]:
# Batches for training: reshuffled every epoch, loaded by 2 worker processes.
train_loader = DataLoader(train_dataset, 
                          batch_size=config.BATCH_SIZE,
                          shuffle=True, num_workers=2)
# Batches for validation: fixed order, same batch size and worker count.
val_loader = DataLoader(val_dataset,
                       batch_size=config.BATCH_SIZE,
                       shuffle=False, num_workers=2)
# Loader for the held-out classes is currently disabled.
# oot_val_loader = DataLoader(oot_val_dataset,
#                        batch_size=config.BATCH_SIZE,
#                        shuffle=True, num_workers=2)

In [None]:
# Phase name -> loader, so the training loop can look loaders up by phase.
data_loaders = {
    'train': train_loader,
    'val': val_loader,
}

In [None]:
# Number of training batches times the batch size: an upper bound on the
# samples per epoch (the last batch may be smaller).
len(train_loader)*config.BATCH_SIZE

In [None]:
class LinearProjection(nn.Module):
    """Image encoder from `config.BACKBONE_MODEL` followed by a two-layer head.

    The head maps the encoder output 768 -> 350 -> 1000 with ReLU in between.
    In submission mode the input is resized and normalised inside the model
    and the 1000 outputs are average-pooled down to 64 values.

    Arguments:
        image_size {int} -- side length the input is resized to in submission mode
    """
    def __init__(self,image_size: int = 224):
        super().__init__()
        self.image_size=image_size
        self.encoder = config.BACKBONE_MODEL
        # 768 must equal the encoder's output width — assumed to hold for the
        # configured backbone; TODO confirm if the backbone is changed.
        self.linear1 = nn.Linear(768,350)
        self.linear2 = nn.Linear(350,1000)
        self.pool = nn.AdaptiveAvgPool1d(64)
        
        # using cross entropy loss so not softmax
        # self.softmax = nn.Softmax()

    def forward(self, input, submission: bool=True):
        """Run the encoder and the head.

        Arguments:
            input -- image batch (assumed (B, C, H, W) — TODO confirm against
                the caller). With submission=False it must already be resized
                and normalised. With submission=True it may be an integer
                (0-255) or floating point tensor.
            submission {bool} -- preprocess inside the model and pool the output

        Returns:
            Head output; pooled to 64 values per row when submission=True.
        """
        if submission:
            input = transforms.functional.resize(input,size=[self.image_size,self.image_size])
            if not input.is_floating_point():
                # normalize() rejects integer tensors, and the mean/std below
                # are on the [0, 1] scale, so rescale 0-255 pixels first.
                # Floating point input is passed through unchanged.
                input = input.float() / 255.0
            input = transforms.functional.normalize(input, mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711])
        
        #with torch.no_grad():
        # The encoder is fed half precision; the head runs in float32.
        output = self.encoder(input.half())#.encode_image(input)
        output = F.relu(output)
        output = self.linear1(output.float())
        output = F.relu(output)
        output = self.linear2(output.float())
        
        if submission:
            output = self.pool(output)
        
        return output

In [None]:
# import time
# import copy
# # https://pytorch.org/tutorials/beginner/transfer_learning_tutorial.html
# def train_model(model, dataloaders, criterion, optimizer, scheduler, num_epochs):
#     since = time.time()

#     best_model_wts = copy.deepcopy(model.state_dict())
#     best_acc = 0.0

#     for epoch in range(num_epochs):
#         print(f'Epoch {epoch}/{num_epochs - 1}')
#         print('-' * 10)

#         # Each epoch has a training and validation phase
#         for phase in ['train', 'val']:
#             if phase == 'train':
#                 model.train()  # Set model to training mode

#             else:
#                 model.eval()   # Set model to evaluate mode
                
#             running_loss = 0.0
#             running_corrects = 0
            
#             # Iterate over data.
#             for inputs, labels in tqdm(dataloaders[phase]):
#                 inputs = torch.transpose(inputs['image'],1,3)
#                 #print(inputs.shape)
#                 inputs = inputs.to(config.DEVICE)
#                 labels = labels.to(config.DEVICE)
                
#                 # zero the parameter gradients
#                 optimizer.zero_grad()

#                 # forward
#                 # track history if only in train
#                 with torch.set_grad_enabled(phase == 'train'):
#                     outputs = model(inputs,submission=False)
#                     _, preds = torch.max(outputs, 1)
                    
#                     loss = criterion(outputs, labels)

#                     # backward + optimize only if in training phase
#                     if phase == 'train':
#                         loss.backward()
#                         optimizer.step()
#                 # statistics
#                 running_loss += loss.detach().item() * inputs.size(0)
#                 running_corrects += torch.sum(preds == labels.data)
                
#                 gc.collect()
                
#             if phase == 'train':
#                 scheduler.step()

#             epoch_loss = running_loss / (len(dataloaders[phase])*config.BATCH_SIZE)
#             epoch_acc = running_corrects.double() / (len(dataloaders[phase])*config.BATCH_SIZE)

#             print(f'{phase} Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f}')

#             # deep copy the model
#             if phase == 'val' and epoch_acc > best_acc:
#                 best_acc = epoch_acc
#                 best_model_wts = copy.deepcopy(model.state_dict())
                
#     time_elapsed = time.time() - since
#     print(f'Training complete in {time_elapsed // 60:.0f}m {time_elapsed % 60:.0f}s')
#     print(f'Best val Acc: {best_acc:4f}')

#     # load best model weights
#     model.load_state_dict(best_model_wts)
#     return model

In [None]:
model = LinearProjection().to(config.DEVICE)

criterion = nn.CrossEntropyLoss().to(config.DEVICE)

# Observe that all parameters are being optimized (encoder included).
# The learning rate comes from config.LR so there is one place to tune it.
optimizer = optim.SGD(model.parameters(), lr=config.LR, momentum=0.9)

# Decay LR by a factor of 0.1 every 7 epochs.
# This only takes effect if scheduler.step() is called once per epoch.
scheduler = lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)

gc.collect()

In [None]:
# Confirm the device the training loop will run on.
print(config.DEVICE)

In [None]:
# Per-epoch accuracy history (in percent) for the train and validation splits.
train_accuracies = []
val_accuracies =[]
start = timer()
for epoch in range(config.EPOCHS):
    model.train()
    train_accuracy = []  # per-sample correctness flags for this epoch
    for x, y in tqdm(data_loaders['train']):
        # The dataset yields the transform's output dict; the tensor is under 'image'.
        x = x['image'].to(config.DEVICE)
        y = y.to(config.DEVICE)
        optimizer.zero_grad() 
        # Call the module rather than .forward() so that module hooks run.
        yhat = model(x,submission=False)
        loss = criterion(yhat, y)
        loss.backward()
        optimizer.step()
        prediction = torch.argmax(yhat, dim=1)
        train_accuracy.extend((y == prediction).detach().cpu().numpy())
    train_accuracies.append(np.mean(train_accuracy)*100)

    # for every epoch we do a validation step to assess accuracy and overfitting
    model.eval()
    with torch.no_grad():
        val_accuracy = []  # per-sample correctness flags for the validation split
        for vx, vy in tqdm(data_loaders['val']):
            vx = vx['image'].to(config.DEVICE)
            vy = vy.to(config.DEVICE)
            yhat = model(vx,submission=False)
            prediction = torch.argmax(yhat, dim=1)
            # to numpy in order to use next the vectorized np.mean
            val_accuracy.extend((vy == prediction).detach().cpu().numpy())
        val_accuracies.append(np.mean(val_accuracy)*100)

    # Advance the LR schedule once per epoch, after the optimizer steps;
    # without this call the scheduler never changes the learning rate.
    scheduler.step()

    # simple logging during training
    print(f'Epoch #{epoch+1}. Train accuracy: {train_accuracies[-1]:.2f}. '
          f'Validation accuracy: {val_accuracies[-1]:.2f}')
end = timer()

In [None]:
#torch.save(model.state_dict(),'model_state.pkl')
# Keep the trained weights in memory. `.copy()` is a shallow copy of the
# mapping: the dict is new, the tensors are still shared with `model`.
states = model.state_dict().copy()

In [None]:
# Fresh instance for export. It is not moved with .to(...), and it wraps the
# same config.BACKBONE_MODEL encoder object as the trained `model`.
model_to_sub = LinearProjection()

In [None]:
# Copy the trained weights into the export model.
model_to_sub.load_state_dict(states)

In [None]:
# Compile the export model to TorchScript and write it to disk.
# forward() defaults to submission=True, so the scripted model runs the
# in-model preprocessing and pooling unless told otherwise.
saved_model = torch.jit.script(model_to_sub)
saved_model.save('saved_model.pt')

# saved_model = torch.jit.load('saved_model.pt').cuda()
# input = torch.randint(0, 255, test_input_size, device='cuda', dtype=torch.uint8)

# with torch.no_grad():
#     output = saved_model(input)
#     print(output.dtype, output.shape, output.mean().item())
#     assert output.shape == torch.Size([test_input_size[0], 64])
#     torch.cuda.synchronize()

In [None]:
from zipfile import ZipFile
# Package the scripted model as submission.zip; inside the archive the file is
# stored under the bare name 'saved_model.pt'.
with ZipFile('submission.zip','w') as zip_file:
    zip_file.write('./saved_model.pt', arcname='saved_model.pt')