In [1]:
# Turn off the Jedi backend of the IPython tab-completer for this session.
%config Completer.use_jedi = False

In [2]:
import sys
sys.path.append('../input/timmmaster')
import timm

In [3]:
# import sys
# !cp ../input/rapids/rapids.21.06 /opt/conda/envs/rapids.tar.gz
# !cd /opt/conda/envs/ && tar -xzvf rapids.tar.gz > /dev/null
# sys.path = ["/opt/conda/envs/rapids/lib/python3.7/site-packages"] + sys.path
# sys.path = ["/opt/conda/envs/rapids/lib/python3.7"] + sys.path
# sys.path = ["/opt/conda/envs/rapids/lib"] + sys.path 
# !cp /opt/conda/envs/rapids/lib/libxgboost.so /opt/conda/lib/

In [4]:
import math
import os
import numpy as np
import cv2
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
import timm
import torch
from torch import nn 
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F 
import albumentations
from albumentations.pytorch.transforms import ToTensorV2
from torch.optim import lr_scheduler
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn import metrics


import gc
import matplotlib.pyplot as plt
import cudf
import cuml
import cupy
from cuml.feature_extraction.text import TfidfVectorizer
from cuml import PCA
from cuml.neighbors import NearestNeighbors

import random

In [5]:
# Input locations for the competition images and the folds file.
TRAIN_DIR = '../input/shopee-product-matching/train_images/'
TEST_DIR = '../input/shopee-product-matching/test_images/'
# NOTE(review): this path ends in '/' although it names a .csv file; it is not
# referenced by any cell visible in this notebook — confirm before relying on it.
TRAIN_CSV = '../input/crossvalidationfolds/folds.csv/'
# Presumably the directory model files are written to; unused in the visible cells.
MODEL_PATH = './'

class CFG:
    """Static configuration read by the inference cells of this notebook."""

    # Seed handed to seed_torch().
    seed = 123
    # Default output width of the classifier's final softmax layer.
    classes = 11014
    # scale / margin / model_name are not referenced by the visible cells.
    scale = 30
    margin = 0.5
    model_name = 'tf_efficientnet_b4'
    # Default width of the FC embedding layer in the classifier.
    fc_dim = 512
    # Images are resized to img_size x img_size before being fed to the model.
    img_size = 512
    # DataLoader settings used when extracting embeddings.
    batch_size = 20
    num_workers = 4
    # Use the GPU when one is visible to PyTorch, otherwise fall back to CPU.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    # Checkpoint loaded by generateImageEmbeddings().
    model_path = '../input/trained-bg-softmax-512-512/softmax_512x512_tf_efficientnet_b4.pt'
    # When False the classifier's forward() returns embeddings, not logits.
    isTraining = False
    

In [6]:
# Read test dataset

def read_test_dataset():
    """Load the competition test metadata and build the per-row image paths.

    Returns:
        tuple: ``(image_paths, cu_df, test_df)`` where ``test_df`` is the
        pandas frame read from ``test.csv``, ``cu_df`` is a cuDF frame built
        from it, and ``image_paths`` is the ``image`` column with ``TEST_DIR``
        prefixed to every entry.
    """
    test_df = pd.read_csv('../input/shopee-product-matching/test.csv')
    cu_df = cudf.DataFrame(test_df)
    # Reuse the shared TEST_DIR constant instead of repeating the literal path.
    image_paths = TEST_DIR + test_df['image']
    return image_paths, cu_df, test_df

In [7]:
def seed_torch(seed=10042):
    """Seed the Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

    Args:
        seed: Integer seed applied to every generator; its string form is
            also stored in the ``PYTHONHASHSEED`` environment variable.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    # Same seed for every generator this notebook touches.
    for apply_seed in (random.seed, np.random.seed,
                       torch.manual_seed, torch.cuda.manual_seed):
        apply_seed(seed)
    torch.backends.cudnn.deterministic = True
    
# Apply the configured CFG.seed to every RNG that seed_torch() covers.
seed_torch(CFG.seed)

# Image Predictions 

## Create Custom Dataset

In [8]:
class ShopeeDataset(Dataset):
    """Dataset that reads product images from disk for embedding extraction.

    Every item is ``(image, label)``; ``label`` is a constant placeholder
    tensor because no ground-truth label is read here.
    """

    def __init__(self, image_path_lst, isTraining=False, transform=None):
        """
        Args:
            image_path_lst: Indexable, sized collection of image file paths
                (for example a list or a 1-D NumPy array).
            isTraining: Accepted for call-site compatibility; not used.
            transform: Optional callable invoked as ``transform(image=img)``
                that returns a mapping with an ``'image'`` entry.
        """
        self.image_paths = image_path_lst
        self.transform = transform

    def __len__(self):
        # len() handles plain lists as well as arrays.
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Return the (optionally transformed) RGB image at ``idx`` and a dummy label.

        Raises:
            FileNotFoundError: If OpenCV cannot read the file at that path.
        """
        image_path = self.image_paths[idx]
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread signals failure by returning None instead of raising;
            # fail here with the path rather than with an opaque cvtColor error.
            raise FileNotFoundError(
                "Could not read image: {}".format(image_path)
            )
        # OpenCV decodes to BGR; convert to RGB before augmentation.
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        if self.transform:
            image = self.transform(image=image)['image']

        return image, torch.tensor(1)
            

In [9]:
def getTestAugmentation(IMG_SIZE, isTraining=False):
    """Build the albumentations pipeline applied to each image at inference.

    Args:
        IMG_SIZE: Target height and width passed to ``Resize``.
        isTraining: Unused; the same pipeline is returned either way.

    Returns:
        ``albumentations.Compose`` chaining Resize, Normalize and ToTensorV2.
    """
    resize = albumentations.Resize(IMG_SIZE, IMG_SIZE, always_apply=True)
    normalize = albumentations.Normalize()
    to_tensor = ToTensorV2(p=1.0)
    return albumentations.Compose([resize, normalize, to_tensor])

## Build Model

In [10]:
class ShopeeLabelGroupClassfier(nn.Module):
    """timm backbone with an optional FC embedding head and a softmax layer.

    ``forward`` returns class logits only when ``loss_fn == 'softmax'`` and
    the global ``CFG.isTraining`` flag is set; otherwise it returns the
    embedding produced by ``get_features``.
    """

    def __init__(self,
                     model_name='tf_efficientnet_b0',
                     loss_fn='softmax',
                     classes = CFG.classes,
                     fc_dim = CFG.fc_dim,
                     pretrained=True,
                     use_fc=True,
                     isTraining=False
                ):
        """
        Args:
            model_name: Architecture name passed to ``timm.create_model``.
            loss_fn: Head selector; ``'softmax'`` adds the final linear layer.
            classes: Output width of the final softmax layer.
            fc_dim: Width of the FC embedding layer (used when ``use_fc``).
            pretrained: Forwarded to ``timm.create_model``.
            use_fc: Whether to add dropout -> linear -> batch-norm on top of
                the pooled backbone output.
            isTraining: Accepted for call-site compatibility; not used
                (``forward`` consults ``CFG.isTraining`` instead).
        """
        super(ShopeeLabelGroupClassfier, self).__init__()

        # Create the bottleneck backbone network from the (pretrained) model.
        self.backbone = timm.create_model(model_name, pretrained=pretrained)
        in_features = self.backbone.classifier.in_features
        # Replace the backbone's own classifier and pooling with identities;
        # pooling and the FC layers that classify images by label group are
        # applied by this module instead.
        self.backbone.classifier = nn.Identity()
        self.backbone.global_pool = nn.Identity()
        self.pooling = nn.AdaptiveAvgPool2d(1)
        self.use_fc = use_fc
        self.loss_fn = loss_fn

        # Build the top FC layers.
        if self.use_fc:
            self.dropout = nn.Dropout(0.2)
            self.fc = nn.Linear(in_features, fc_dim)
            self.bn = nn.BatchNorm1d(fc_dim)
            in_features = fc_dim

        if self.loss_fn == 'softmax':
            # Size the layer from the `classes` argument so callers can
            # override it; its default is still CFG.classes.
            self.final = nn.Linear(in_features, classes)

    def forward(self, image, label=None):
        """Return logits in softmax training mode, otherwise embeddings.

        Args:
            image: Batch of images fed to the backbone.
            label: Unused; kept so existing ``model(image, label)`` calls work.
        """
        features = self.get_features(image)

        if self.loss_fn == 'softmax' and CFG.isTraining:
            return self.final(features)
        return features

    def get_features(self, inp):
        """Run backbone -> pooling -> (optional) dropout/FC/batch-norm."""
        batch_dim = inp.shape[0]
        inp = self.backbone(inp)
        # Pool to 1x1 and flatten to (batch, features).
        inp = self.pooling(inp).view(batch_dim, -1)
        if self.use_fc:
            inp = self.dropout(inp)
            inp = self.fc(inp)
            inp = self.bn(inp)

        return inp
    
    

## Generate Image Embeddings

In [11]:
def generateImageEmbeddings(images):
    """Compute one embedding per image with the saved classifier checkpoint.

    Args:
        images: Collection of image file paths handed to ``ShopeeDataset``.

    Returns:
        numpy.ndarray: Embeddings for all images, concatenated along axis 0
        in loader order.
    """
    # NOTE(review): the model is built with the class-default backbone, not
    # CFG.model_name. The recorded run loaded the checkpoint successfully this
    # way, so it is left unchanged — confirm this is intended.
    model = ShopeeLabelGroupClassfier(pretrained=False).to(CFG.device)
    # map_location lets a checkpoint saved on a GPU load when CFG.device is 'cpu'.
    model.load_state_dict(torch.load(CFG.model_path, map_location=CFG.device))
    model.eval()

    # Create the dataset and loader.
    test_aug = getTestAugmentation(CFG.img_size, CFG.isTraining)
    test_dataset = ShopeeDataset(
        image_path_lst=images,
        isTraining=CFG.isTraining,
        transform=test_aug,
    )
    test_data_loader = torch.utils.data.DataLoader(
        test_dataset,
        batch_size=CFG.batch_size,
        num_workers=CFG.num_workers,
    )

    embeddings = []

    with torch.no_grad():
        for image, label in test_data_loader:
            # Follow CFG.device instead of assuming CUDA is present.
            image = image.to(CFG.device)
            label = label.to(CFG.device)
            features = model(image, label)
            embeddings.append(features.detach().cpu().numpy())

    del model
    image_embeddings = np.concatenate(embeddings)
    print("Final Embedding shape {}".format(image_embeddings.shape))

    del embeddings
    gc.collect()

    return image_embeddings

## Get Top-K Neighbors

In [12]:
def get_topk_neighbors(data, embeddings, KNN=50, threshold=4.5):
    """Find, for every row, the posting ids of its close nearest neighbours.

    Args:
        data: Frame with a ``posting_id`` column, row-aligned with ``embeddings``.
        embeddings: 2-D array with one embedding per row of ``data``.
        KNN: Maximum number of neighbours to retrieve per row; clamped to the
            number of rows in ``embeddings``.
        threshold: A neighbour is kept only when its distance, as returned by
            ``kneighbors``, is strictly below this value.

    Returns:
        tuple: ``(data, predictions)`` where ``predictions[k]`` is the array
        of ``posting_id`` values matched to row ``k``.
    """
    # A neighbour search cannot return more neighbours than there are rows.
    KNN = min(KNN, embeddings.shape[0])

    model = NearestNeighbors(n_neighbors=KNN)
    model.fit(embeddings)
    distances, indices = model.kneighbors(embeddings)

    # Look the column up once rather than on every iteration.
    posting_col = data['posting_id']
    predictions = []
    for k in tqdm(range(embeddings.shape[0])):
        idx = np.where(distances[k,] < threshold)[0]
        ids = indices[k, idx]
        predictions.append(posting_col.iloc[ids].values)

    del model, distances, indices
    gc.collect()
    return data, predictions

In [13]:
# Load the test metadata. cu_df is returned as well but is not used by the
# cells shown below.
image_paths, cu_df, test_df = read_test_dataset()
test_df.head()

Unnamed: 0,posting_id,image,image_phash,title
0,test_2255846744,0006c8e5462ae52167402bac1c2e916e.jpg,ecc292392dc7687a,Edufuntoys - CHARACTER PHONE ada lampu dan mus...
1,test_3588702337,0007585c4d0f932859339129f709bfdc.jpg,e9968f60d2699e2c,(Beli 1 Free Spatula) Masker Komedo | Blackhea...
2,test_4015706929,0008377d3662e83ef44e1881af38b879.jpg,ba81c17e3581cabe,READY Lemonilo Mie instant sehat kuah dan goreng


In [14]:
# Hand the paths over as a plain array (Series.values) to the embedding extractor.
image_embeddings = generateImageEmbeddings(image_paths.values)


  cpuset_checked))


Final Embedding shape (3, 512)


In [15]:
# Cap the neighbour count at the number of test rows: a kNN query cannot ask
# for more neighbours than there are samples (the run recorded here has 3).
data, image_predictions = get_topk_neighbors(test_df, image_embeddings, KNN=min(50, len(test_df)))
data.head()

100%|██████████| 3/3 [00:00<00:00, 2796.82it/s]


Unnamed: 0,posting_id,image,image_phash,title
0,test_2255846744,0006c8e5462ae52167402bac1c2e916e.jpg,ecc292392dc7687a,Edufuntoys - CHARACTER PHONE ada lampu dan mus...
1,test_3588702337,0007585c4d0f932859339129f709bfdc.jpg,e9968f60d2699e2c,(Beli 1 Free Spatula) Masker Komedo | Blackhea...
2,test_4015706929,0008377d3662e83ef44e1881af38b879.jpg,ba81c17e3581cabe,READY Lemonilo Mie instant sehat kuah dan goreng


In [16]:
# Write each row's matches as one space-separated string of posting ids;
# storing the raw arrays would put their NumPy repr into the CSV instead.
data['matches'] = [' '.join(posting_ids) for posting_ids in image_predictions]
data[['posting_id', 'matches']].to_csv('submission.csv', index=False)