In [1]:
import os
from tqdm.notebook import tqdm
import gc
from torch.nn import Parameter
import torch.nn.functional as F
import torch.nn as nn
import math
import timm
import pandas as pl
import torch
import numpy as np
from torch.amp import GradScaler
import cv2
import random
from tqdm.notebook import tqdm
from torch.autograd import Variable
from skimage.metrics import structural_similarity as ssim
import torch.cuda.amp as amp

In [2]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seed_everything(228)

In [3]:
pairs = pl.read_csv('data/raw/pairs_list.csv')
paths_embeds = pl.read_csv('data/raw/paths_embeds.csv')['image_path']
real_embeds = np.load('data/raw/real_embeds.npy')

In [4]:
class MCSDataset(torch.utils.data.Dataset):
    # Initialize the dataset with image paths, target labels, and an optional image size (default is 112)
    def __init__(self, image_path, target, imsize=112):
        self.image_path = image_path  # List of image file names or paths.
        self.target = target          # List of target labels or descriptors corresponding to the images.
        self.image_size = imsize      # Desired size (width and height) to which all images will be resized.

    # Return the number of samples in the dataset
    def __len__(self):
        return len(self.target)

    # A helper function to resize an image using a specified interpolation method.
    # This function is not used directly in __getitem__, but it provides an example of how to call cv2.resize with custom interpolation.
    # def resize(self, img, interp):
    #     return cv2.resize(
    #         img, (self.image_size, self.image_size), interpolation=interp)

    # Retrieve the image and target at index 'idx'
    def __getitem__(self, idx):
        # Retrieve the image file name/path and the corresponding target using the provided index.
        path = self.image_path[idx]
        target = self.target[idx]
        
        # Read the image from the specified path.
        # It assumes that images are stored in the 'imgs/train/' directory.
        img = cv2.imread(f'data/raw/train/{path}')
        
        # Resize the image to the desired dimensions using linear interpolation.
        img = cv2.resize(
            img, (self.image_size, self.image_size), interpolation=cv2.INTER_LINEAR)
        
        # Normalize the image pixels:
        # - Convert pixel values from [0, 255] to [0, 1] by dividing by 255.
        # - Then shift the range to [-0.5, 0.5] by subtracting 0.5.
        img = (img / 255.) - 0.5
        
        # Change the image array from shape (height, width, channels) to (channels, height, width)
        # This is necessary because PyTorch models expect the channels-first format.
        img = np.transpose(img, (2, 0, 1)).astype(np.float32)
        
        # Convert the numpy image array to a PyTorch tensor.
        img = torch.from_numpy(img)
        
        # Convert the target (assumed to be a numpy array) to a PyTorch tensor.
        target = torch.from_numpy(target)

        # Return the processed image and its corresponding target.
        return img, target


In [5]:
class Model(nn.Module):
    def __init__(self, model_name):
        # Initialize the parent class (nn.Module)
        super().__init__()
        
        # Save the model name (could be used for logging or further customization)
        self.model_name = model_name
        
        # Create a backbone model from the timm library.
        # - model_name: Specifies which pre-trained model to use.
        # - global_pool='': Disables automatic global pooling.
        # - num_classes=0: Removes the classification head.
        # - in_chans=3: Specifies that the model expects 3-channel (RGB) input images.
        self.timm_ = timm.create_model(model_name, global_pool='', num_classes=0, in_chans=3)
        
        # Determine the number of output features from the backbone model:
        # Pass a dummy tensor through the model to check its output shape.
        # The dummy tensor has shape (1, 3, 112, 112), matching the expected input size.
        output_features = self.timm_(torch.zeros((1, 3, 112, 112))).shape[1]
        
        # Create a 1D Batch Normalization layer to normalize the extracted features.
        # This layer will have as many features as the output of the backbone.
        self.norm = nn.BatchNorm1d(output_features)

    def forward(self, x):
        # Pass the input 'x' through the backbone model (timm_)
        # The output is assumed to be a feature map of shape (batch_size, channels, height, width)
        features = self.timm_(x)
        
        # Apply spatial average pooling by taking the mean over the height and width dimensions.
        # This converts the feature maps into a 1D feature vector per sample.
        pooled_features = features.mean(dim=(2, 3))
        
        # Normalize the pooled features using the Batch Normalization layer.
        normed_features = self.norm(pooled_features)
        
        # Finally, apply L2 normalization (using F.normalize) to ensure the feature vector has unit norm.
        # This is common in face recognition systems to facilitate distance-based comparisons.
        out_ = F.normalize(normed_features)
        
        # Return the final normalized feature vector.
        return out_


In [6]:
def make_predict(model, val_loader, val_target, loss_func, DEVICE='cuda'):
    # List to accumulate predictions for all batches
    preds = []
    
    # Set the model to evaluation mode (disables dropout, batch norm, etc.)
    model.eval()
    
    # Initialize variable to accumulate the loss over all batches
    average_loss = 0
    
    # Disable gradient calculation for inference to save memory and computations
    with torch.no_grad():
        # Iterate over the validation DataLoader
        for batch_number, (img, target) in enumerate(val_loader):
            # Move input images and targets to the specified device (e.g., GPU)
            img = img.to(DEVICE)
            target = target.to(DEVICE)
            
            # Use automatic mixed precision for efficiency during inference
            with torch.amp.autocast('cuda'):
                # Forward pass: compute model outputs for the batch of images
                outputs = model(img)
                
                # Calculate loss between the model outputs and the true targets using the given loss function
                loss = loss_func(outputs, target)
            
            # Accumulate the loss: move loss to CPU, detach from the graph, convert to numpy, then add to total
            average_loss += loss.cpu().detach().numpy()
            
            # Convert the outputs to a numpy array on the CPU and add them to the predictions list
            preds += [outputs.to('cpu').numpy()]
    
    # Concatenate predictions from all batches into a single numpy array
    preds = np.concatenate(preds)
    
    # Calculate the Mean Squared Error (MSE) between the predictions and the actual target values,
    # then print it. This is computed as the mean of squared differences.
    print('MAE: ', ((preds - np.array(val_target)) ** 2).mean())


In [7]:
# Clean up any unused objects and free up GPU memory
gc.collect()                         # Collect garbage from Python memory
torch.cuda.empty_cache()             # Empty unused cached memory on the GPU

# Hyperparameters and device configuration
batch_size = 64                      # Training batch size
valid_batch_size = 64                # Validation batch size (can be same as training)
epochs = 30                          # Number of training epochs
lr = 3e-3                            # Learning rate for the optimizer
clip_grad_norm = 15.8              # Maximum norm for gradient clipping
DEVICE = 'cuda'                      # Device to use for training (GPU)

# DataLoader parameters for training and validation datasets
params_train = {
    'batch_size': batch_size,
    'shuffle': True,                 # Shuffle the training data each epoch
    'drop_last': True,               # Drop last incomplete batch if dataset size is not divisible by batch_size
    'num_workers': 2                 # Number of subprocesses to use for data loading
}
params_val = {
    'batch_size': batch_size,
    'shuffle': False,                # Do not shuffle validation data
    'drop_last': False,              # Do not drop the last batch; useful for evaluation
    'num_workers': 2
}

# Split the dataset into training and validation sets using modulo indexing.
# Every 5th element is used for validation, the rest for training.
train_path = [x for i, x in enumerate(paths_embeds) if i % 5 != 0 ]
train_target = [x for i, x in enumerate(real_embeds) if i % 5 != 0 ]
val_path = [x for i, x in enumerate(paths_embeds) if i % 5 == 0 ]
val_target = [x for i, x in enumerate(real_embeds) if i % 5 == 0 ]

# Create PyTorch DataLoaders for training and validation datasets using the custom MCSDataset
train_loader = torch.utils.data.DataLoader(MCSDataset(train_path, train_target), **params_train)
val_loader = torch.utils.data.DataLoader(MCSDataset(val_path, val_target), **params_val)

# Number of labels (not used further in the provided snippet but might be used elsewhere)
num_lbl = 2000

# Initialize the model using a specified architecture ('resnet18') and move it to the GPU
model = Model('resnet34').cuda()

# Calculate the number of training steps (not used explicitly in this snippet)
num_train_steps = int(len(train_loader) / batch_size * epochs)

# Define the loss function (Mean Absolute Error loss) for regression-like outputs
loss_func = torch.nn.L1Loss()

# Set up a gradient scaler for mixed precision training (improves efficiency on GPU)
scaler = GradScaler('cuda')

# Create an AdamW optimizer to update model parameters with the given learning rate
optimizer = torch.optim.AdamW(model.parameters(), lr)

# Set up a cosine annealing learning rate scheduler with a minimal learning rate of 5e-7
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, len(train_loader) * epochs, 5e-9)
# Training loop for the specified number of epochs
for epoch in range(epochs):
    model.train()  # Set model to training mode (activates dropout, batch norm, etc.)
    average_loss = 0  # Initialize the loss accumulator for the epoch

    # Use tqdm to create a progress bar for the training loop over batches
    # tk0 = tqdm(enumerate(train_loader), total=len(train_loader))
    for (img, target) in train_loader:
        optimizer.zero_grad()  # Zero out gradients before each batch to avoid accumulation
        img = img.to(DEVICE)   # Move image batch to the GPU
        target = target.to(DEVICE)  # Move target batch to the GPU

        # Use automatic mixed precision context to improve performance on GPU
        with torch.amp.autocast('cuda'):
            outputs = model(img)  # Forward pass: compute model outputs for the batch
            loss = loss_func(outputs, target)  # Compute loss between outputs and target

        # Scale the loss and perform backpropagation
        scaler.scale(loss).backward()

        # Unscale gradients and perform gradient clipping to stabilize training
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip_grad_norm)

        # Step the optimizer and update the scaler for mixed precision training
        scaler.step(optimizer)
        scaler.update()

        # Update the learning rate scheduler
        scheduler.step()

        # Accumulate loss (move to CPU and detach from computation graph for safe numpy conversion)
        average_loss += loss.cpu().detach().numpy()
        # Update progress bar with current average loss, current learning rate, and epoch number
        # tk0.set_postfix(loss=average_loss / (batch_number + 1),
        #                 lr=scheduler.get_last_lr()[0],
        #                 stage="train",
        #                 epoch=epoch)
    
    # Evaluate the model on the validation dataset after each epoch
    make_predict(model, val_loader, val_target, loss_func)
    
# Save the final model state dictionary after training completes
std_m = model.state_dict()
!mkdir -p checkpoints/solution
torch.save(std_m, f'checkpoints/solution/model.pt')

MAE:  0.0027296157
MAE:  0.0022350024
MAE:  0.0019997736
MAE:  0.0018457756
MAE:  0.0017663452
MAE:  0.001702216
MAE:  0.0016507987
MAE:  0.0016446318
MAE:  0.0016257827
MAE:  0.0016183533
MAE:  0.0016124534
MAE:  0.0016108073
MAE:  0.0015928042
MAE:  0.0015997468
MAE:  0.0015924731
MAE:  0.0015874435
MAE:  0.0015853907
MAE:  0.0015822836
MAE:  0.0015801914
MAE:  0.0015789379
MAE:  0.0015758416
MAE:  0.0015791811
MAE:  0.0015773601
MAE:  0.0015774049
MAE:  0.0015787614
MAE:  0.0015801645
MAE:  0.0015793251
MAE:  0.0015803353
MAE:  0.0015810999
MAE:  0.0015818949


In [8]:
def read_img(path, image_size=112):
    # Read the image from the specified file path.
    # The file is expected to be in the 'imgs/train/' directory.
    img = cv2.imread(f'data/raw/train/{path}')
    
    # Resize the image to the desired dimensions (image_size x image_size)
    # using linear interpolation for smooth resizing.
    img_ = cv2.resize(img, (image_size, image_size), interpolation=cv2.INTER_LINEAR)
    
    # Normalize the resized image:
    # - Convert pixel values from the range [0, 255] to [0, 1] by dividing by 255.
    # - Shift the range to [-0.5, 0.5] by subtracting 0.5.
    img = (img_ / 255.) - 0.5
    
    # Change the image layout from (height, width, channels) to (channels, height, width)
    # which is the expected format for PyTorch models.
    img = np.transpose(img, (2, 0, 1)).astype(np.float32)
    
    # Convert the numpy array into a PyTorch tensor.
    img = torch.from_numpy(img)
    
    # Return the processed image tensor and the resized image (in its original numpy format)
    return img, img_


In [9]:
# Set maximum number of iterations for the adversarial attack optimization
max_iter = 10

# Define the loss function (Mean Squared Error) for comparing descriptors
loss = nn.MSELoss()

# Define the step size (epsilon) for the adversarial perturbation
eps = 1e-3

# Dictionary to store the final attacked images keyed by the source image path/name
attacked_img_dict = {}

# Iterate over each pair of source and target images using tqdm for progress visualization.
# Each element in pairs['source_imgs'] and pairs['target_imgs'] is assumed to be a string
# containing multiple image paths separated by '|'.
for sour, targ in tqdm(zip(pairs['source_imgs'], pairs['target_imgs']), total=len(pairs['source_imgs'])):
    
    # Initialize an array to store the target descriptors for the 5 target images.
    # The shape (5, 512) assumes each descriptor has 512 dimensions.
    target_descriptors = np.ones((5, 512), dtype=np.float32)
    
    # Split the target and source image strings into individual file names/paths.
    targ = targ.split('|')
    sour = sour.split('|')

    # List to optionally collect the original target images (might be used for debugging/visualization)
    list_tagt_img = []
    
    # Process each target image to compute its descriptor.
    for i, t in enumerate(targ):
        # Read the image using the read_img function.
        # img is the preprocessed tensor and orig_tgt is the resized original image.
        img, orig_tgt = read_img(t)
        
        # Append the original target image to the list.
        list_tagt_img += [orig_tgt]
        
        # Add a batch dimension and move the image tensor to GPU.
        img = img.unsqueeze(0).cuda(non_blocking=True)
        
        # Pass the image through the model (without gradients) to get its descriptor.
        # Variable() wraps the tensor, though requires_grad is False here.
        res = model(Variable(img, requires_grad=False)).data.cpu().numpy().squeeze()
        
        # Store the computed descriptor in the target_descriptors array.
        target_descriptors[i] = res

    # For each source image, perform the adversarial attack.
    for ii, s in enumerate(sour):
        # Read the source image; orig_img is the resized original image.
        img, orig_img = read_img(s)
        
        # Add batch dimension and move to GPU.
        img = img.unsqueeze(0).cuda(non_blocking=True)
        
        # Wrap the image tensor in a Variable and enable gradient computation.
        input_var = Variable(img, requires_grad=True)
        
        # Initialize attacked_img as the original image (will update if attack succeeds).
        attacked_img = orig_img
        
        # Perform iterative optimization for the adversarial attack.
        for iter_number in range(max_iter):
            # Initialize the adversarial noise accumulator as zeros.
            adv_noise = torch.zeros((3, 112, 112)).cuda(non_blocking=True)
            
            # Loop over each target descriptor to compute gradients for each.
            for tg in target_descriptors:
                # Convert the current target descriptor into a tensor Variable on GPU.
                # No gradient is required for target_out.
                target_out = Variable(torch.from_numpy(tg).unsqueeze(0).cuda(non_blocking=True), requires_grad=False)
                
                # Clear any existing gradients in input_var.
                input_var.grad = None
                
                # Forward pass: get the current descriptor of the (possibly perturbed) source image.
                out = model(input_var)
                
                # Compute the loss between the current source descriptor and the target descriptor.
                calc_loss = loss(out, target_out)
                
                # Backward pass: compute gradients with respect to the input image.
                calc_loss.backward()
                
                # Compute the noise update using the sign of the gradient,
                # scaled by the small epsilon value.
                noise = eps * torch.sign(input_var.grad.data).squeeze()
                
                # Accumulate noise from this target descriptor.
                adv_noise = adv_noise + noise

            # Update the source image by subtracting the accumulated adversarial noise.
            input_var.data = input_var.data - adv_noise

            # Convert the updated image tensor back to CPU and remove the batch dimension.
            changed_img = input_var.data.cpu().squeeze()
            
            # Denormalize the image back to the [0, 255] pixel range.
            changed_img = ((changed_img + 0.5) * 255)
            
            # Clip pixel values to ensure they remain valid.
            changed_img[changed_img < 0] = 0
            changed_img[changed_img > 255] = 255
            
            # Convert the tensor image to a numpy array with shape (height, width, channels)
            changed_img = np.transpose(changed_img.numpy(), (1, 2, 0)).astype(np.int16)
            
            # Compute the Structural Similarity Index (SSIM) between the original and changed images.
            # This helps ensure the perturbation remains visually imperceptible.
            ssim_score = ssim(orig_img, changed_img, channel_axis=2, data_range=256)
            
            # If the SSIM score drops below 0.95, the perturbation is too visible, so break out of the loop.
            if ssim_score < 0.95:
                break
            else:
                # Otherwise, update the attacked_img to the current changed image.
                attacked_img = changed_img
        
        # Save the final attacked image for the given source image in the dictionary.
        attacked_img_dict[s] = attacked_img


  0%|          | 0/1000 [00:00<?, ?it/s]

In [10]:
sample_submission = pl.read_csv('data/raw/sample_submission.csv')

In [11]:
# Create an empty DataFrame using Polars (pl) for the sample submission.
sample_submission_df = pl.DataFrame()

# Assign the 'Id' column from an existing sample_submission DataFrame to the new DataFrame.
sample_submission_df['Id'] = sample_submission['Id']

# Initialize an empty list to hold the result strings for each attacked image.
result = []

# Iterate over each image ID in the sample_submission DataFrame using tqdm for progress visualization.
for id_ in tqdm(sample_submission_df['Id']):
    # Retrieve the attacked image from the attacked_img_dict using the current ID.
    # Flatten the image array to a 1D list of pixel values.
    # Convert each pixel value to a string.
    # Join all pixel strings with the '|' character to form a single string.
    result += ['|'.join([str(i) for i in attacked_img_dict[id_].flatten().tolist()])]
    
# Add a new column 'Target' to the DataFrame, containing the processed attacked image pixel values.
sample_submission_df['Target'] = result


  0%|          | 0/5000 [00:00<?, ?it/s]

In [12]:
sample_submission_df.to_csv('data/submissions/solution.csv', index = None)

In [13]:
# METRIC FUNCTION

# from skimage.metrics import structural_similarity as ssim
# 
# class MCSDataset(torch.utils.data.Dataset):
#     def __init__(self, image_path,  imsize = 112):
#         self.image_path = image_path
#         self.image_size = imsize

#     def __len__(self):
#         return len(self.image_path)

#     def __getitem__(self, idx):
#         img = self.image_path[idx]
#         img = (img / 255.) - 0.5
#         img = np.transpose(img,(2,0,1)).astype(np.float32)
#         img = torch.from_numpy(img)

#         return img

        
# model = Model().eval()

# pairs = pd.read_csv('data/raw/pairs_list.csv')
# paths_embeds = pd.read_csv('data/raw/paths_embeds.csv')
# embeds = np.load('data/raw/real_embeds.npy')
# sample_submission = pd.read_csv('data/raw/sample_submission.csv')
# dict_embeds = {x:i for i,x in enumerate(paths_embeds['image_path'])}

# imgs_ = [np.array([int(i) for i in x.split('|')]).reshape((112, 112, 3)) for x in submission['Target']]

# dict_ss_ids = {x:i for i,x in enumerate(sample_submission['Id'])}
# for i, ids in enumerate(submission['Id']):
#     val = sample_submission['Target'][dict_ss_ids[ids]]
#     val = np.array([int(i) for i in val.split('|')]).reshape((112, 112, 3))
#     sim_ = ssim(imgs_[i], val, channel_axis=2, data_range = 256)
#     if sim_ < 0.95:
#         return -1
        
# params_val = {'batch_size': 64, 'shuffle': False, 'drop_last': False, 'num_workers': 2}
# imgs_path = os.listdir('/kaggle/input/ioai-contest-2') 
# val_loader = torch.utils.data.DataLoader(MCSDataset(imgs_), **params_val)

# embeds_sourse = []
# with torch.no_grad():
#     for batch_number,  img  in tqdm(enumerate(val_loader)):
#         outputs = model(img, None, train = False)
#         embeds_sourse += [outputs]
# embeds_sourse = np.concatenate(embeds_sourse)

# dict_sours = {x:i for i,x in enumerate(submission['Id'])}
# all_paths = set(submission['Id'])

# all_score = []
# for sour, targ in zip(pairs['source_imgs'], pairs['target_imgs']):

#     sour = sour.split('|')
#     targ = targ.split('|')

#     if sour[0] in all_paths:
#         score = []
#         for s in sour:
#             for t in targ:
#                 if t != s:
#                     score += [((embeds[dict_embeds[t]] - embeds_sourse[dict_sours[s]]) ** 2).sum() ** (1/2)]
                    
#         score = np.mean(score)
#         all_score += [score]

# score = np.mean(score)