In [22]:
# Libraries
import os
import gc
import time
import random
import math
from scipy import spatial
from tqdm import tqdm
import warnings
import cv2
import pandas as pd
import numpy as np
from numpy import dot, sqrt
import seaborn as sns
import matplotlib as mpl
import matplotlib.patches as patches
import matplotlib.pyplot as plt
from IPython.display import display_html

from sklearn.model_selection import StratifiedKFold
import torch
from torch.utils.data import DataLoader, Dataset
from torch.optim import Adam, lr_scheduler
import torch.nn as nn
import torch.nn.functional as F
from torch.cuda.amp import GradScaler, autocast

from albumentations.pytorch import transforms
import albumentations
import timm

from sklearn.preprocessing import normalize

# Environment check
warnings.filterwarnings("ignore")


In [23]:
def set_seed(seed = 1234):
    '''
    🌱src:https://www.kaggle.com/andradaolteanu/melanoma-competiton-aug-resnet-effnet-lb-0-91
    Sets the seed of the entire notebook so results are the same every time we run.
    This is for REPRODUCIBILITY.'''
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # When running on the CuDNN backend, two further options must be set
    torch.backends.cudnn.deterministic = True
    # Set a fixed value for the hash seed
    os.environ['PYTHONHASHSEED'] = str(seed)
    
set_seed()

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Device available now:', device)

Device available now: cuda


In [52]:


# --------- INITIAL PARAMETERS ---------
TRAIN_FOLDER = "./happy-whale-and-dolphin/aug_train/"
TEST_FOLDER = "./happy-whale-and-dolphin/test_images/"

# Set some parameters for sanity checks & experimenting
N_SPLITS = 5
BATCH_SIZE = 16
MODEL_NAME = 'efficientnet_b0'
NUM_CLASSES = 15587
NO_NEURONS = 250
EMBEDDING_SIZE = 128
# -------------------------------------



In [54]:


# Import the data
train = pd.read_csv("./happy-whale-and-dolphin/aug_train.csv")
test = pd.read_csv("./sample_submission.csv")
train['individual_key'] = train.groupby(['individual_id']).ngroup()

# Update path to new image folders
train["path"] = TRAIN_FOLDER + train["image"]
test["path"] = TEST_FOLDER + test["image"]

print("TRAIN:")
display_html(train.head())
print("\n", "TEST:")
display_html(test.head())



TRAIN:


Unnamed: 0,image,species,individual_id,file_path,class,count,individual_key,path
0,00021adfb725ed.jpg,melon_headed_whale,cadddb1636b9,./happy-whale-and-dolphin/aug_train/00021adfb7...,whale,5,12348,./happy-whale-and-dolphin/aug_train/00021adfb7...
1,00021adfb725ed_0.jpg,melon_headed_whale,cadddb1636b9,./happy-whale-and-dolphin/aug_train/00021adfb7...,whale,5,12348,./happy-whale-and-dolphin/aug_train/00021adfb7...
2,00021adfb725ed_1.jpg,melon_headed_whale,cadddb1636b9,./happy-whale-and-dolphin/aug_train/00021adfb7...,whale,5,12348,./happy-whale-and-dolphin/aug_train/00021adfb7...
3,00021adfb725ed_2.jpg,melon_headed_whale,cadddb1636b9,./happy-whale-and-dolphin/aug_train/00021adfb7...,whale,5,12348,./happy-whale-and-dolphin/aug_train/00021adfb7...
4,00021adfb725ed_3.jpg,melon_headed_whale,cadddb1636b9,./happy-whale-and-dolphin/aug_train/00021adfb7...,whale,5,12348,./happy-whale-and-dolphin/aug_train/00021adfb7...



 TEST:


Unnamed: 0,image,species,individual_id,path
0,00021adfb725ed.jpg,melon_headed_whale,cadddb1636b9,./happy-whale-and-dolphin/test_images/00021adf...
1,000562241d384d.jpg,humpback_whale,1a71fbb72250,./happy-whale-and-dolphin/test_images/00056224...
2,0007c33415ce37.jpg,false_killer_whale,60008f293a2b,./happy-whale-and-dolphin/test_images/0007c334...
3,0007d9bca26a99.jpg,bottlenose_dolphin,4b00fe572063,./happy-whale-and-dolphin/test_images/0007d9bc...
4,00087baf5cef7a.jpg,humpback_whale,8e5253662392,./happy-whale-and-dolphin/test_images/00087baf...


In [55]:
class HappyWhaleDataset(Dataset):
    
    def __init__(self, csv, trainFlag):
        '''Module to create the PyTorch Dataset.
        csv: full dataframe (train or test)
        trainFlag: True if csv is a training/validation dataset, False otherwise
        return: image and class target if trainFlag, otherwise only image'''
        
        self.csv = csv
        self.trainFlag = trainFlag
        if self.trainFlag:
            self.transform = albumentations.Compose([
                albumentations.Resize(128, 128),
                albumentations.HorizontalFlip(),
                albumentations.VerticalFlip(),
                albumentations.Rotate(),
                albumentations.Normalize(),
                # B&W?
            ])
        else:
            self.transform = albumentations.Compose([
                albumentations.Normalize()
            ])

            
    def __len__(self):
        return self.csv.shape[0]

    
    def __getitem__(self, index):
        # Get data
        row = self.csv.iloc[index]
        
        # Read and transform the image
        image = cv2.imread(row.path)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        
        transformed_img = self.transform(image=image)['image'].astype(np.float32)
        image = transformed_img.transpose(2, 0, 1)
        image = torch.tensor(image)            

        if self.trainFlag:
            # Retrieve the target group
            target = torch.tensor(row.individual_key)
            return image, target
        
        else:
            return image

In [27]:


# Example for the Dataset data
example_dataset = HappyWhaleDataset(train.head(12), trainFlag=True)
example_loader = DataLoader(example_dataset, batch_size=3)

for k, (image, target) in enumerate(example_loader):
    print(f"--- Batch {k} ---")
    print("Image Shape:", image.shape)
    print("Target:", target, "\n")



--- Batch 0 ---
Image Shape: torch.Size([3, 3, 128, 128])
Target: tensor([12348, 12348, 12348]) 

--- Batch 1 ---
Image Shape: torch.Size([3, 3, 128, 128])
Target: tensor([12348, 12348,  1636]) 

--- Batch 2 ---
Image Shape: torch.Size([3, 3, 128, 128])
Target: tensor([1636, 1636, 1636]) 

--- Batch 3 ---
Image Shape: torch.Size([3, 3, 128, 128])
Target: tensor([5842, 4551, 8721]) 



In [28]:


# src: https://amaarora.github.io/2020/08/30/gempool.html

class GeM(nn.Module):
    def __init__(self, p=3, eps=1e-6):
        super(GeM,self).__init__()
        self.p = nn.Parameter(torch.ones(1)*p)
        self.eps = eps

    def forward(self, x):
        return self.gem(x, p=self.p, eps=self.eps)
        
    def gem(self, x, p=3, eps=1e-6):
        # Applies 2D average-pooling operation in kH * kW regions by step size
        return F.avg_pool2d(x.clamp(min=eps).pow(p), (x.size(-2), x.size(-1))).pow(1./p)
        
    def __repr__(self):
        return self.__class__.__name__ + '(' + 'p=' + '{:.4f}'.format(self.p.data.tolist()[0]) + ', ' + 'eps=' + str(self.eps) + ')'



In [29]:
class ArcMarginProduct(nn.Module):
    def __init__(self, in_features, out_features, s=30.0, 
                 m=0.50, easy_margin=False, ls_eps=0.0):
        '''
        in_features: dimension of the input
        out_features: dimension of the last layer (in our case the classification)
        s: norm of input feature
        m: margin
        ls_eps: label smoothing'''
        
        super(ArcMarginProduct, self).__init__()
        self.in_features, self.out_features = in_features, out_features
        self.s = s
        self.m = m
        self.ls_eps = ls_eps
        self.weight = nn.Parameter(torch.FloatTensor(out_features, in_features))
        # Fills the input `Tensor` with values according to the method described in
        # `Understanding the difficulty of training deep feedforward neural networks`
        # Glorot, X. & Bengio, Y. (2010)
        # using a uniform distribution.
        nn.init.xavier_uniform_(self.weight)

        self.easy_margin = easy_margin
        self.cos_m, self.sin_m = math.cos(m), math.sin(m)
        self.th = math.cos(math.pi - m)
        self.mm = math.sin(math.pi - m) * m

    def forward(self, input, label):
        # --------------------------- cos(theta) & phi(theta) ---------------------
        cosine = F.linear(F.normalize(input), F.normalize(self.weight))
        sine = torch.sqrt(1.0 - torch.pow(cosine, 2))
        phi = cosine * self.cos_m - sine * self.sin_m
        if self.easy_margin:
            phi = torch.where(cosine > 0, phi, cosine)
        else:
            phi = torch.where(cosine > self.th, phi, cosine - self.mm)
        # --------------------------- convert label to one-hot ---------------------
        one_hot = torch.zeros(cosine.size()).to(device)
        one_hot.scatter_(1, label.view(-1, 1).long(), 1)
        if self.ls_eps > 0:
            one_hot = (1 - self.ls_eps) * one_hot + self.ls_eps / self.out_features
        # -------------torch.where(out_i = {x_i if condition_i else y_i) ------------
        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
        output *= self.s

        return output

In [30]:
class HappyWhaleModel(nn.Module):
    def __init__(self, modelName, numClasses, noNeurons, embeddingSize):
        
        super(HappyWhaleModel, self).__init__()
        # Retrieve pretrained weights
        self.backbone = timm.create_model(modelName, pretrained=True)
        # Save the number features from the backbone
        ### different models have different numbers e.g. EffnetB3 has 1536
        backbone_features = self.backbone.classifier.in_features
        self.backbone.classifier = nn.Identity() # ?????
        self.backbone.global_pool = nn.Identity() # ?????
        self.gem = GeM()
        # Embedding layer (what we actually need)
        self.embedding = nn.Sequential(nn.Linear(backbone_features, noNeurons),
                                       nn.BatchNorm1d(noNeurons),
                                       nn.ReLU(),
                                       nn.Dropout(p=0.2),
                                       
                                       nn.Linear(noNeurons, embeddingSize),
                                       nn.BatchNorm1d(embeddingSize),
                                       nn.ReLU(),
                                       nn.Dropout(p=0.2))
        self.arcface = ArcMarginProduct(in_features=embeddingSize, 
                                        out_features=numClasses,
                                        s=30.0, m=0.50, easy_margin=False, ls_eps=0.0)
        
        
    def forward(self, image, target=None, prints=False):
        '''If there is a target it means that the model is training on the dataset.
        If there is no target, that means the model is predicting on the test dataset.
        In this case we would skip the ArcFace layer and return only the image embeddings.
        '''
        
        features = self.backbone(image)
        # flatten transforms from e.g.: [3, 1536, 1, 1] to [3, 1536]
        gem_pool = self.gem(features).flatten(1)
        embedding = self.embedding(gem_pool)
        if target != None:
            out = self.arcface(embedding, target)
        
        if prints:
            print("0. IN:", "image shape:", image.shape, "target:", target)
            print("1. Backbone Output:", features.shape)
            print("2. GeM Pool Output:", gem_pool.shape)
            print("3. Embedding Output:", embedding.shape)
            if target != None:
                print("4. ArcFace Output:", out.shape)
        
        if target != None:
            return out, embedding
        else:
            return embedding

In [31]:


# Create an example model - Effnet
model_example = HappyWhaleModel(MODEL_NAME, NUM_CLASSES, NO_NEURONS, EMBEDDING_SIZE).to(device)



INFO:timm.models._builder:Loading pretrained weights from Hugging Face hub (timm/efficientnet_b0.ra_in1k)
INFO:timm.models._hub:[timm/efficientnet_b0.ra_in1k] Safe alternative available for 'pytorch_model.bin' (as 'model.safetensors'). Loading weights using safetensors.


In [32]:


# Criterion
criterion_example = nn.CrossEntropyLoss()

# We'll use previous datasets & dataloader
for k, (image, target) in enumerate(example_loader):
    print(f"=== Batch {k} ===")
    image, target = image.to(device), target.to(device)
    out, _ = model_example(image, target, prints=True)
    loss = criterion_example(out, target)
    print('--- LOSS ---', loss.item(), "\n")



=== Batch 0 ===
0. IN: image shape: torch.Size([3, 3, 128, 128]) target: tensor([12348, 12348, 12348], device='cuda:0')
1. Backbone Output: torch.Size([3, 1280, 4, 4])
2. GeM Pool Output: torch.Size([3, 1280])
3. Embedding Output: torch.Size([3, 128])
4. ArcFace Output: torch.Size([3, 15587])
--- LOSS --- 28.241270065307617 

=== Batch 1 ===
0. IN: image shape: torch.Size([3, 3, 128, 128]) target: tensor([12348, 12348,  1636], device='cuda:0')
1. Backbone Output: torch.Size([3, 1280, 4, 4])
2. GeM Pool Output: torch.Size([3, 1280])
3. Embedding Output: torch.Size([3, 128])
4. ArcFace Output: torch.Size([3, 15587])
--- LOSS --- 26.934799194335938 

=== Batch 2 ===
0. IN: image shape: torch.Size([3, 3, 128, 128]) target: tensor([1636, 1636, 1636], device='cuda:0')
1. Backbone Output: torch.Size([3, 1280, 4, 4])
2. GeM Pool Output: torch.Size([3, 1280])
3. Embedding Output: torch.Size([3, 128])
4. ArcFace Output: torch.Size([3, 15587])
--- LOSS --- 27.42755126953125 

=== Batch 3 ===
0. I

In [33]:


# --------- GLOBAL PARAMETERS ---------
NUM_CLASSES = 15587
N_SPLITS = 3
BATCH_SIZE = 32
MODEL_NAME = 'efficientnet_b0'
RUN_NAME = "B0_neurons_200_embed_200_epochs_4"
EPOCHS = 6
VALID_PERC = 0.1
NO_NEURONS = 250
EMBEDDING_SIZE = 128
# -> Optimizer
LR = 0.0001
WEIGHT_DECAY = 0.000001
# -> Scheduler
T_MAX = 500              # Maximum number of iterations
MIN_LR = 0.000001        # Minimum learning rate. Default: 0
# ------------------------------------



In [34]:


del model_example
gc.collect()



5868

In [35]:


def get_loaders(df, train_i, valid_i):
    '''
    df: the full initial dataframe
    train_i, valid_i: list of indexes for train and validation split
    VALID_PERC: percentage of how much of valid data to preserve - leave 1 for full dataset
    return: train_loader and valid_loader
    '''
    
    train_df = df.iloc[train_i, :]
    # To go quicker through validation
    valid_df = df.iloc[valid_i, :].sample(int(len(valid_i)*VALID_PERC), random_state=23)

    # Datasets & Dataloader
    train_dataset = HappyWhaleDataset(train_df, trainFlag=True)
    valid_dataset = HappyWhaleDataset(valid_df, trainFlag=True)

    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)
    
    return train_loader, valid_loader



In [36]:


def get_model_optimizer_criterion():
    
    model = HappyWhaleModel(MODEL_NAME, NUM_CLASSES, NO_NEURONS, EMBEDDING_SIZE).to(device)
    optimizer = Adam(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY, amsgrad=False)
    scheduler = lr_scheduler.CosineAnnealingLR(optimizer, T_max=T_MAX, eta_min=MIN_LR)
    criterion = nn.CrossEntropyLoss()
    
    return model, optimizer, scheduler, criterion



In [37]:
def train_pipeline(train):
    '''
    train: the full training dataframe (to be split in train data & valid data)
    '''

    s = time.time()
    
    # 🐝 W&B Initialize
    params = dict(model=MODEL_NAME, epochs=EPOCHS, split=N_SPLITS, 
                  batch=BATCH_SIZE, lr=LR, weight_decay=WEIGHT_DECAY,
                  t_max=T_MAX, min_lr=MIN_LR)

    
    # === CV Split ===
    skf = StratifiedKFold(n_splits=N_SPLITS)
    skf_splits = skf.split(X=train, y=train["individual_key"])


    for fold, (train_i, valid_i) in enumerate(skf_splits):

        print("~"*25)
        print("~"*8, f"FOLD {fold}", "~"*8)
        print("~"*25)

        # Retrieve data loaders
        train_loader, valid_loader = get_loaders(train, train_i, valid_i)

        # Model/ Optimizer/ Scheduler/ Criterion
        model, optimizer, scheduler, criterion = get_model_optimizer_criterion()
        # Hooks into the torch model to collect gradients and the topology

        # Run Training
        BEST_SCORE = 9999

        for epoch in range(EPOCHS):
            print("~"*8, f"Epoch {epoch}", "~"*8)

            # === TRAIN ===
            model.train()
            train_losses = []

            for images, targets in tqdm(train_loader, desc = 'TRAIN'):
                images, targets = images.to(device), targets.to(device)

                # Clear gradients BEFORE prediction
                optimizer.zero_grad()
                # Make predictions
                out, _ = model(images, targets)
                # Compute Loss and Optimize
                loss = criterion(out, targets)             
                loss.backward()
                optimizer.step()

                train_losses.append(loss.cpu().detach().numpy().tolist())

            # Adjust Learning Rate
            scheduler.step()

            mean_train_loss = np.mean(train_losses)
            print("Mean Train Loss:", mean_train_loss)


            # === EVAL ===
            model.eval()
            valid_losses, valid_preds, valid_targets = [], [], []
            with torch.no_grad():
                for images, targets in valid_loader:
                    valid_targets.append(targets)
                    images, targets = images.to(device), targets.to(device)

                    out, _ = model(images, targets)
                    loss = criterion(out, targets)

                    valid_preds.append(out)
                    valid_losses.append(loss.cpu().detach().numpy().tolist())

            mean_valid_loss = np.mean(valid_losses)
            print("Mean Valid Loss:", mean_valid_loss)
            gc.collect()

            # === UPDATES ===

            if mean_valid_loss < BEST_SCORE:        
                print("! Saving model in fold {} | epoch {} ...".format(fold, epoch), "\n")
                torch.save(model.state_dict(), f"EffNetB0_fold_{fold}_loss_{round(mean_valid_loss, 3)}.pt")

                BEST_SCORE = mean_valid_loss

        # Clean memory before next fold
        del model, optimizer, scheduler, criterion, images, targets, \
                    train_losses, valid_losses, valid_preds, valid_targets
        torch.cuda.empty_cache()
        gc.collect()


    print(f"Time to run: {round((time.time() - s)/60, 2)} minutes")

In [38]:


train_pipeline(train)



INFO:timm.models._builder:Loading pretrained weights from Hugging Face hub (timm/efficientnet_b0.ra_in1k)
INFO:timm.models._hub:[timm/efficientnet_b0.ra_in1k] Safe alternative available for 'pytorch_model.bin' (as 'model.safetensors'). Loading weights using safetensors.


~~~~~~~~~~~~~~~~~~~~~~~~~
~~~~~~~~ FOLD 0 ~~~~~~~~
~~~~~~~~~~~~~~~~~~~~~~~~~
~~~~~~~~ Epoch 0 ~~~~~~~~


TRAIN:   6%|████                                                             | 150/2380 [00:12<03:11, 11.63it/s]


KeyboardInterrupt: 

In [39]:
# pretrained_name = "EffNetB0_fold_0_loss_14.979"
# pretrained_name = "EffNetB0_fold_1_loss_14.91"
pretrained_name = "EffNetB0_fold_2_loss_16.367.pt"

# Path to trained model parameters (i.e. weights and biases)
classif_model_path = pretrained_name

# Load the model and append learned params
model = HappyWhaleModel(MODEL_NAME, NUM_CLASSES, NO_NEURONS, EMBEDDING_SIZE).to(device)
model.load_state_dict(torch.load(classif_model_path))

INFO:timm.models._builder:Loading pretrained weights from Hugging Face hub (timm/efficientnet_b0.ra_in1k)
INFO:timm.models._hub:[timm/efficientnet_b0.ra_in1k] Safe alternative available for 'pytorch_model.bin' (as 'model.safetensors'). Loading weights using safetensors.


<All keys matched successfully>

In [40]:


# DataLoader
dataset = HappyWhaleDataset(train, trainFlag=True)
dataloader = DataLoader(dataset, batch_size=64, shuffle=False)

# Retrieve all embeddings for each image
all_embeddings = []

model.eval()
with torch.no_grad():
    for image, target in tqdm(dataloader): 
        image, target = image.to(device), target.to(device)
        _, embedding = model(image, target)
        embedding = embedding.detach().cpu().numpy()
        all_embeddings.append(embedding)
        
# Concatenate batches together
image_embeddings = np.concatenate(all_embeddings)

# Save embeddings and corresponding image
np.save(f'{pretrained_name}.npy', image_embeddings)



100%|███████████████████████████████████████████████████████████████████████| 1785/1785 [03:07<00:00,  9.51it/s]


In [None]:
# 🐝Save embeddings to W&B
save_dataset_artifact(run_name=pretrained_name, 
                      artifact_name=pretrained_name, 
                      path="../input/happywhale-2022/EffNetB0_fold_0_loss_14.979.npy")

In [41]:
from sklearn.neighbors import NearestNeighbors

# === CLUSTERING ===
# Use the cuml function from RAPIDS suite
knn_model = NearestNeighbors(n_neighbors=5)
# Train the model
knn_model.fit(image_embeddings)

# Infer on the training data
# distances - the distance between each point in the group
# indices - the index row of each image
distances, indices = knn_model.kneighbors(image_embeddings)



In [42]:


# === PREDICTION ===
# Create the grouped predictions based on distances & indices
predictions = {"images": [], "embeddings": []}

for i in tqdm(range(len(image_embeddings))):
    index = np.where(distances[k, ] < 6.0)[0]
    split = indices[i, index]
    
    grouped_images = train.iloc[split]["image"].values
    grouped_embeddings = image_embeddings[split]

    predictions["images"].append(grouped_images)
    predictions["embeddings"].append(grouped_embeddings)



100%|█████████████████████████████████████████████████████████████████| 114213/114213 [00:14<00:00, 7691.04it/s]


In [43]:
from sklearn.metrics.pairwise import cosine_similarity
from seaborn import heatmap

# Select a clustered group
group = 0

example_paths = ["../input/whale2-cropped-dataset/cropped_train_images/cropped_train_images/"+img \
                     for img in predictions["images"][group]]
example_embeds = predictions["embeddings"][group]

# Compute similarity matrix
cos_matrix = cosine_similarity(example_embeds)
mask = np.zeros_like(cos_matrix)
mask[np.triu_indices_from(mask)] = True



In [44]:


# --------- INFERENCE PARAMETERS ---------
PRETRAINED_NAME1 = "EffNetB0_fold_0_loss_16.666"
PRETRAINED_NAME2 = "EffNetB0_fold_1_loss_16.539"
PRETRAINED_NAME3 = "EffNetB0_fold_2_loss_16.367"
MODEL_NAME = 'efficientnet_b0'
NUM_CLASSES = 15587
NO_NEURONS = 250
EMBEDDING_SIZE = 128
# ----------------------------------------



In [None]:


# ===== I. EMBEDDINGS  =====
# Here I am retrieving the 3 test embeddings and averaging them together
test_embeddings1 = retrieve_test_embeddings(PRETRAINED_NAME1, 
                                            MODEL_NAME, NUM_CLASSES, NO_NEURONS, EMBEDDING_SIZE)
test_embeddings2 = retrieve_test_embeddings(PRETRAINED_NAME2, 
                                            MODEL_NAME, NUM_CLASSES, NO_NEURONS, EMBEDDING_SIZE)
test_embeddings3 = retrieve_test_embeddings(PRETRAINED_NAME3, 
                                            MODEL_NAME, NUM_CLASSES, NO_NEURONS, EMBEDDING_SIZE)

test_embeddings = (test_embeddings1+test_embeddings2+test_embeddings3)/3



NameError: name 'retrieve_test_embeddings' is not defined

In [47]:


# ===== II. CLUSTERS  =====
# Get full train embeddings
# Here I am adding the 3 train embeddings and averaging them together as well
train_embeddings = np.load("./EffNetB0_fold_2_loss_16.367.pt.npy")

train_individual_ids = train["individual_id"].values
print("Train Embeddings:", train_embeddings.shape, "\n"+
      "Train Individual Id:", train_individual_ids.shape, "\n")


# Train a final KNN model with the train embeddings
knn_final_model = NearestNeighbors(n_neighbors=50)
knn_final_model.fit(train_embeddings)

# Get distances & indexes for test
#test_embeddings = normalize(test_embeddings, axis=1, norm='l2')

D, I = knn_final_model.kneighbors(train_embeddings)
print("Distances shape:", D.shape, "\n"+
      "Index shape:", I.shape)

# List of the test dataframe image ids (to loop through it)
test_images = test["image"].tolist()



Train Embeddings: (114213, 128) 
Train Individual Id: (114213,) 

Distances shape: (114213, 50) 
Index shape: (114213, 50)


In [48]:


test_df = []

# Loop through each observation within test data
for k, image_id in tqdm(enumerate(test_images)):
    # Get individual_id & distances for the observation
    individual_id = train_individual_ids[I[k]]
    distances = D[k]
    # Create a df subset with this info
    subset_preds = pd.DataFrame(np.stack([individual_id, distances], axis=1),
                                columns=['individual_id','distances'])
    subset_preds['image_id'] = image_id
    test_df.append(subset_preds)
    
    
# Concatenate subset dataframes into 1 dataframe
test_df = pd.concat(test_df).reset_index(drop=True)
# Choose max distance for each unique pair of individual_id & image_id
test_df = test_df.groupby(['image_id','individual_id'])['distances'].max().reset_index()



51033it [00:10, 4827.90it/s]


In [49]:
# Have a look at the predictions dataset now
test_df.sample(n=5, random_state=24)

Unnamed: 0,image_id,individual_id,distances
2309569,ebf163f8ea6876.jpg,36a2b0f9281f,0.00953
1923562,c48f1de1806053.jpg,2393ace8cd64,0.004864
168397,115b95e01978e1.jpg,8ce3b57bc545,0.004645
1756395,b359dcd50304e5.jpg,86257eaa613b,0.00946
1656220,a9466ee34493e6.jpg,c493f38f5e18,0.003214


In [50]:
# ===== III. PREDICTION  =====

# Dictionary in format: {["image_id"]: 000, ["individual_id"]: 9999}
predictions = {}
thresh = 5

for k, row in tqdm(test_df.iterrows()):
    image_id = row["image_id"]
    individual_id = row["individual_id"]
    distance = row["distances"]
    
    # If the image_id has already been added in predictions before
    if image_id in predictions:
        # If total preds for this image_id are < 5 then add, else continue
        if len(predictions[image_id]) != 5:
            predictions[image_id].append(individual_id)
        else:
            continue
    # If the distance is greater than thresh add prediction + "new_individual"
    elif distance > thresh:
        predictions[image_id] = [individual_id, "new_individual"]
    else:
        predictions[image_id] = ["new_individual", individual_id]


# Fill in all lists that have less than 5 predictions as of yet
sample_list = ['37c7aba965a5', '114207cab555', 'a6e325d8e924', '19fbb960f07d','c995c043c353']

for image_id, preds in tqdm(predictions.items()):
    if len(preds) < 5:
        remaining = [individ_id for individ_id in sample_list if individ_id not in preds]
        preds.extend(remaining)
        predictions[image_id] = preds[:5]

2510053it [01:18, 31862.51it/s]
100%|████████████████████████████████████████████████████████████████| 51033/51033 [00:00<00:00, 3904843.77it/s]


In [51]:
# Create final submission
predictions = pd.Series(predictions).reset_index()
predictions.columns = ['image','predictions']
predictions['predictions'] = predictions['predictions'].apply(lambda x: ' '.join(x))
predictions.to_csv('submission.csv',index=False)

predictions.head()

Unnamed: 0,image,predictions
0,00021adfb725ed.jpg,new_individual 07a477b6a091 0f302e71d455 1394c...
1,000562241d384d.jpg,new_individual 02f9cc951294 04687f521fdf 0857f...
2,0007c33415ce37.jpg,new_individual 01db941ec9b6 044434ed1926 101e1...
3,0007d9bca26a99.jpg,new_individual 005cab4fa315 033a294ca772 063ca...
4,00087baf5cef7a.jpg,new_individual 0175ed3e02a4 04b434e53a13 06165...
