# Shopee Product Matching: Image and Text Embedding Matching

- Load the data
- Load the model
- Load the model weights
- Extract features (embeddings) from the data
- Match postings by comparing their features

# Import and Setup

In [3]:
%load_ext autoreload
%autoreload 2

import gc
import os, sys
sys.path.append('../')

import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from tqdm import tqdm

In [114]:
import torch
import cv2
from torch.utils.data import Dataset,DataLoader

In [7]:
from dataset.ImageDataloader import BuildImageDataloader
from torch_utils.Config import DEFAULT_CFG

In [50]:
from model.recognition.ShopeeCurricularFaceModel import ShopeeCurricularFaceModel

In [95]:
import albumentations
from albumentations.pytorch.transforms import ToTensorV2

# Function

In [86]:
def read_matching_dataset(csv, img_folder, no_GPU=False):
    """Read a matching CSV and derive ground-truth matches and image paths.

    Args:
        csv: path of the CSV file to read with pandas.
        img_folder: prefix that is string-concatenated with each value of the
            'image' column, so it must already end with a path separator.
        no_GPU: when True no cudf copy is made and None is returned in its place.

    Returns:
        (df, df_cu, image_paths): the pandas frame, its cudf copy (or None),
        and a Series of image paths. When the CSV has a 'label_group' column,
        `df` gains a 'matches' column holding the space-joined posting ids of
        every row that shares the label group.

    Raises:
        ImportError: if no_GPU is False and cudf is not installed.
    """
    df = pd.read_csv(csv)
    if 'label_group' in df:
        group_to_ids = df.groupby(['label_group'])['posting_id'].unique().to_dict()
        df['matches'] = df['label_group'].map(group_to_ids)
        df['matches'] = df['matches'].apply(lambda x: ' '.join(x))
    if no_GPU:
        df_cu = None
    else:
        # cudf is imported only on the GPU path so the CPU path works without it
        # (the notebook never imports cudf at module level).
        import cudf
        df_cu = cudf.DataFrame(df)
    image_paths = img_folder + df['image']
    return df, df_cu, image_paths

In [93]:
DIM = (512,512)
def get_test_transforms():
    """Return the inference-time albumentations pipeline.

    The steps are, in order: resize to DIM, Normalize with its default
    arguments, and ToTensorV2.
    """
    height, width = DIM
    pipeline = [
        albumentations.Resize(height, width, always_apply=True),
        albumentations.Normalize(),
        ToTensorV2(p=1.0),
    ]
    return albumentations.Compose(pipeline)

In [97]:
def f1_score(y_true, y_pred):
    """Row-wise F1 between two Series of whitespace-separated id strings.

    Each string is split into a set of tokens; the score for a row is
    2 * |true & pred| / (|true| + |pred|). Returns a NumPy array with one
    score per row.
    """
    true_sets = y_true.apply(lambda text: set(text.split()))
    pred_sets = y_pred.apply(lambda text: set(text.split()))
    overlap = np.array([len(t & p) for t, p in zip(true_sets, pred_sets)])
    n_pred = pred_sets.apply(len).values
    n_true = true_sets.apply(len).values
    return 2 * overlap / (n_pred + n_true)

In [98]:
def combine_predictions(row):
    """Merge a row's image and text predictions into one space-joined string of unique ids."""
    merged = np.concatenate((row['image_predictions'], row['text_predictions']))
    unique_ids = np.unique(merged)
    return ' '.join(unique_ids)

In [100]:
def get_neighbors(df, embeddings, KNN = 50, image = True):
    '''
    Predict, for every row, the posting ids whose embeddings lie within a distance cutoff.

    Adapted from
    https://www.kaggle.com/ragnar123/unsupervised-baseline-arcface?scriptVersionId=57121538

    When the module-level flag GET_CV is truthy, a grid of thresholds is first
    scored against df['matches'] with f1_score and the results are printed
    (this overwrites df['pred_matches'] and df['f1']). The returned predictions
    always use the fixed cutoffs 2.7 (image) or 0.60 (otherwise), not the best
    threshold found by that search.

    Args:
        df: DataFrame with a 'posting_id' column, row-aligned with `embeddings`.
        embeddings: array with one row per row of `df`.
        KNN: maximum number of neighbours retrieved per row. It is clamped to
            the number of rows because NearestNeighbors rejects
            n_neighbors > n_samples (e.g. the 3-row public test set).
        image: True for image embeddings, False for text embeddings; selects
            the threshold grid and the fixed cutoff.

    Returns:
        (df, predictions): the input frame and a list holding one array of
        posting ids per row.
    '''
    n_samples = embeddings.shape[0]

    model = NearestNeighbors(n_neighbors = min(KNN, n_samples))
    model.fit(embeddings)
    distances, indices = model.kneighbors(embeddings)

    # Iterate through different thresholds to maximize cv, run this in interactive mode,
    # then replace the fixed cutoff below with a solid threshold
    if GET_CV:
        if image:
            thresholds = list(np.arange(2,4,0.1))
        else:
            thresholds = list(np.arange(0.1, 1, 0.1))
        scores = []
        for threshold in thresholds:
            predictions = []
            for k in range(n_samples):
                idx = np.where(distances[k,] < threshold)[0]
                ids = indices[k,idx]
                posting_ids = ' '.join(df['posting_id'].iloc[ids].values)
                predictions.append(posting_ids)
            df['pred_matches'] = predictions
            df['f1'] = f1_score(df['matches'], df['pred_matches'])
            score = df['f1'].mean()
            print(f'Our f1 score for threshold {threshold} is {score}')
            scores.append(score)
        thresholds_scores = pd.DataFrame({'thresholds': thresholds, 'scores': scores})
        max_score = thresholds_scores[thresholds_scores['scores'] == thresholds_scores['scores'].max()]
        best_threshold = max_score['thresholds'].values[0]
        best_score = max_score['scores'].values[0]
        print(f'Our best score is {best_score} and has a threshold {best_threshold}')
        # As before, the CV path iterates without a progress bar.
        rows = range(n_samples)
    else:
        rows = tqdm(range(n_samples))

    # Because we are predicting the test set that have 70K images and different label groups,
    # confidence should be smaller
    cutoff = 2.7 if image else 0.60
    predictions = []
    for k in rows:
        idx = np.where(distances[k,] < cutoff)[0]
        ids = indices[k,idx]
        predictions.append(df['posting_id'].iloc[ids].values)

    del model, distances, indices
    gc.collect()
    return df, predictions

In [116]:
class ShopeeDataset(Dataset):
    """Image-only dataset that reads files with OpenCV and applies an optional transform.

    Args:
        image_paths: collection of image file paths that supports `[index]`
            and exposes `.shape` (the notebook passes a pandas Series).
        transforms: optional callable invoked as `transforms(image=...)` whose
            result is indexed with 'image'.
    """

    def __init__(self, image_paths, transforms=None):
        self.image_paths = image_paths
        self.augmentations = transforms

    def __len__(self):
        return self.image_paths.shape[0]

    def __getitem__(self, index):
        """Return (image, dummy label) for `index`.

        Raises:
            FileNotFoundError: if OpenCV cannot read the file at that path.
        """
        image_path = self.image_paths[index]

        image = cv2.imread(image_path)
        # cv2.imread signals a missing or unreadable file by returning None instead of
        # raising; fail here with the path rather than with an opaque cvtColor error.
        if image is None:
            raise FileNotFoundError(f'Could not read image: {image_path}')
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        if self.augmentations:
            image = self.augmentations(image=image)['image']

        # The label is a constant placeholder; only the image is used for feature extraction.
        return image, torch.tensor(1)

# Main

In [65]:
# List the competition data directory.
# NOTE(review): `folder` is assigned in the next cell, so on a fresh kernel this cell
# only works after that cell has been run.
!ls "{folder}/shopee-product-matching/"

sample_submission.csv [1m[36mtest_images[m[m           [1m[36mtrain_images[m[m
test.csv              train.csv


In [66]:
# Root folder that contains the `shopee-product-matching` competition data.
folder = "../../../"
csv_train = f"{folder}/shopee-product-matching/train.csv"
image_folder = f"{folder}/shopee-product-matching/train_images/"
csv_test = f"{folder}/shopee-product-matching/test.csv"
# test_images sits next to train_images (see the directory listing above), and the
# trailing slash is required because read_matching_dataset builds paths as
# `img_folder + filename`.
image_folder_test = f"{folder}/shopee-product-matching/test_images/"

In [67]:
# Load the training metadata, print its (rows, columns) shape and preview the first rows.
df = pd.read_csv(csv_train)
print(df.shape)
df.head()

(34250, 5)


Unnamed: 0,posting_id,image,image_phash,title,label_group
0,train_129225211,0000a68812bc7e98c42888dfb1c07da0.jpg,94974f937d4c2433,Paper Bag Victoria Secret,249114794
1,train_3386243561,00039780dfc94d01db8676fe789ecd05.jpg,af3f9460c2838f0f,"Double Tape 3M VHB 12 mm x 4,5 m ORIGINAL / DO...",2937985045
2,train_2288590299,000a190fdd715a2a36faed16e2c65df7.jpg,b94cb00ed3e50f78,Maling TTS Canned Pork Luncheon Meat 397 gr,2395904891
3,train_2406599165,00117e4fc239b1b641ff08340b429633.jpg,8514fc58eafea283,Daster Batik Lengan pendek - Motif Acak / Camp...,4093212188
4,train_3369186413,00136d1cf4edede0203f32f05f660588.jpg,a6f319f924ad708c,Nescafe \xc3\x89clair Latte 220ml,3648931069


In [68]:
# Load the test metadata, print its (rows, columns) shape and preview the first rows.
df_test = pd.read_csv(csv_test)
print(df_test.shape)
df_test.head()

(3, 4)


Unnamed: 0,posting_id,image,image_phash,title
0,test_2255846744,0006c8e5462ae52167402bac1c2e916e.jpg,ecc292392dc7687a,Edufuntoys - CHARACTER PHONE ada lampu dan mus...
1,test_3588702337,0007585c4d0f932859339129f709bfdc.jpg,e9968f60d2699e2c,(Beli 1 Free Spatula) Masker Komedo | Blackhea...
2,test_4015706929,0008377d3662e83ef44e1881af38b879.jpg,ba81c17e3581cabe,READY Lemonilo Mie instant sehat kuah dan goreng


In [69]:
# Notebook-level configuration: small batches, CPU execution, no loader worker processes.
# NOTE(review): CFG is bound to the same object as DEFAULT_CFG (no copy is made), so
# these assignments presumably also change DEFAULT_CFG for any other code that reads
# it — confirm that this is intended.
CFG = DEFAULT_CFG
CFG.BATCH_SIZE = 8
CFG.DEVICE = 'cpu'
CFG.NUM_WORKERS = 0

# Number of distinct label groups in the training frame; passed to the model as n_classes.
CFG.CLASSES = df['label_group'].nunique()

In [70]:
# Build a dataloader over the training images.
# NOTE(review): `trainloader` is not referenced again in the visible part of the notebook.
trainloader = BuildImageDataloader(df, image_folder, batch_size=CFG.BATCH_SIZE, num_workers=CFG.NUM_WORKERS)

In [71]:
# Run-mode flags.
# GET_CV is read as a global by the first get_neighbors definition (it enables the
# threshold search); CHECK_SUB is not read anywhere in the visible notebook.
CHECK_SUB = False
GET_CV = True

In [102]:
# Path of the weights file that is loaded into the image model in the next cell.
IMG_MODEL_PATH = "../weights/init_weight_curriuclarFace.pt"

In [109]:
# Build the image model from the notebook configuration.
model = ShopeeCurricularFaceModel(
    n_classes = CFG.CLASSES,
    model_name = CFG.MODEL_NAME,
    fc_dim = CFG.FC_DIM,
    margin = CFG.MARGIN,
    scale = CFG.SCALE)
model.eval()
# map_location remaps tensors that were saved on another device (e.g. a GPU checkpoint)
# onto CFG.DEVICE, so the weights also load on a CPU-only machine.
state_dict = torch.load(IMG_MODEL_PATH, map_location=CFG.DEVICE)
# strict=False tolerates missing or unexpected keys in the checkpoint.
model.load_state_dict(state_dict,strict=False)
model = model.to(CFG.DEVICE)

Building Model Backbone for eca_nfnet_l0 model
Using Curricular Face


<All keys matched successfully>

In [118]:
# Re-read the training CSV (this rebinds `df`, now with the 'matches' column added by
# read_matching_dataset) and wrap the image paths in a dataset with the test transforms.
df, _, image_paths = read_matching_dataset(csv_train, image_folder, no_GPU=True)
test_transforms = get_test_transforms()
image_dataset = ShopeeDataset(image_paths=image_paths, transforms=test_transforms)

# Loader over the dataset; drop_last=False keeps every image, including the final partial batch.
image_loader = torch.utils.data.DataLoader(
    dataset=image_dataset,
    batch_size=CFG.BATCH_SIZE,
    num_workers=CFG.NUM_WORKERS,
    pin_memory=True,
    drop_last=False,
)

In [122]:
# Extract one embedding per image, batch by batch, without tracking gradients.
embeds = []

with torch.no_grad():
    # The dataset's placeholder label is not needed for feature extraction.
    for img, _label in tqdm(image_loader):
        img = img.to(CFG.DEVICE)
        feat = model.extract_feat(img)
        # Per-batch result gets its own name: reusing `image_embeddings` here left a
        # single batch bound to that name whenever the loop was interrupted.
        batch_embeddings = feat.detach().cpu().numpy()
        embeds.append(batch_embeddings)

image_embeddings = np.concatenate(embeds)
# Every image must have produced exactly one embedding row.
assert image_embeddings.shape[0] == len(image_dataset)
print(f'Our image embeddings shape is {image_embeddings.shape}')

  0%|          | 4/4282 [00:51<15:15:23, 12.84s/it]


KeyboardInterrupt: 

In [135]:
# from cuml.feature_extraction.text import TfidfVectorizer
# from cuml import PCA
# from cuml.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA

In [None]:
# Inlined body of the former get_text_embeddings(df_cu, max_features, n_components):
# binary TF-IDF over the titles followed by PCA.
df_cu = df
max_features = 15000
n_components = 5000
nlp_model = TfidfVectorizer(stop_words = 'english', binary = True, max_features = max_features)
text_embeddings = nlp_model.fit_transform(df_cu['title']).toarray()
pca = PCA(n_components = n_components)
# sklearn's PCA returns a NumPy array, which has no `.get()`; that call was a leftover
# from the commented-out cuML (GPU) imports and raised AttributeError here.
text_embeddings = pca.fit_transform(text_embeddings)
print(f'Our title text embedding shape is {text_embeddings.shape}')

In [1]:
def get_neighbors(df, embeddings, KNN = 50, image = True, thresh=None):
    '''
    Predict, for every row, the posting ids whose embeddings lie closer than a threshold.

    Suggested thresholds — image: 2.7, tfidf: 0.6
    https://www.kaggle.com/ragnar123/unsupervised-baseline-arcface?scriptVersionId=57121538

    Args:
        df: DataFrame with a 'posting_id' column, row-aligned with `embeddings`.
            When `thresh` is None it must also have a 'matches' column, and its
            'pred_matches' and 'f1' columns are overwritten by the search.
        embeddings: array with one row per row of `df`.
        KNN: maximum number of neighbours retrieved per row. It is clamped to
            the number of rows because NearestNeighbors rejects
            n_neighbors > n_samples (e.g. the 3-row public test set).
        image: selects the threshold grid searched when `thresh` is None
            (2.0–3.9 for images, 0.1–0.9 otherwise).
        thresh: distance cutoff. When None, each grid value is scored with
            f1_score against df['matches'] and the best one is used.

    Returns:
        (df, predictions): the input frame and a list holding one array of
        posting ids per row.

    Raises:
        KeyError: if `thresh` is None and `df` has no 'matches' column.
    '''
    if thresh is None and 'matches' not in df:
        # Fail before the neighbour search instead of part-way through the threshold loop.
        raise KeyError("'matches' column is required to search for a threshold; pass `thresh` explicitly")

    n_samples = embeddings.shape[0]
    model = NearestNeighbors(n_neighbors = min(KNN, n_samples))
    model.fit(embeddings)
    distances, indices = model.kneighbors(embeddings)

    # Looked up once; every loop below only indexes into this array.
    all_posting_ids = df['posting_id'].to_numpy()

    # Iterate through different thresholds to maximize cv when no threshold was given
    if thresh is None:
        if image:
            thresholds = list(np.arange(2,4,0.1))
        else:
            thresholds = list(np.arange(0.1, 1, 0.1))
        scores = []
        for threshold in thresholds:
            predictions = []
            for k in range(n_samples):
                idx = np.where(distances[k,] < threshold)[0]
                ids = indices[k,idx]
                predictions.append(' '.join(all_posting_ids[ids]))
            df['pred_matches'] = predictions
            df['f1'] = f1_score(df['matches'], df['pred_matches'])
            score = df['f1'].mean()
            print(f'Our f1 score for threshold {threshold} is {score}')
            scores.append(score)
        thresholds_scores = pd.DataFrame({'thresholds': thresholds, 'scores': scores})
        max_score = thresholds_scores[thresholds_scores['scores'] == thresholds_scores['scores'].max()]
        best_threshold = max_score['thresholds'].values[0]
        best_score = max_score['scores'].values[0]
        print(f'Our best score is {best_score} and has a threshold {best_threshold}')

        thresh = best_threshold

    # Final predictions with the chosen threshold
    predictions = []
    for k in tqdm(range(n_samples)):
        idx = np.where(distances[k,] < thresh)[0]
        ids = indices[k,idx]
        predictions.append(all_posting_ids[ids])

    del model, distances, indices
    gc.collect()
    return df, predictions

In [2]:
def combine_predictions(row):
    """Join the unique ids from a row's image and text predictions with single spaces.

    This cell re-defines `combine_predictions` from the earlier "Function"
    section without changing what it does.
    """
    sources = [row['image_predictions'], row['text_predictions']]
    all_ids = np.concatenate(sources)
    return ' '.join(np.unique(all_ids))

In [None]:
# Image-side neighbour search over the extracted image embeddings.
# With the most recent get_neighbors definition, `thresh` stays at its default (None),
# so the threshold search runs and reads df['matches'].
df,image_predictions = get_neighbors(df, image_embeddings, KNN = 50, image = True)

In [None]:

# Text-side neighbour search: `text_predictions` is used below but was never computed
# anywhere in the notebook.
df, text_predictions = get_neighbors(df, text_embeddings, KNN = 50, image = False)

df['image_predictions'] = image_predictions
df['text_predictions'] = text_predictions
df['pred_matches'] = df.apply(combine_predictions, axis = 1)

# Score against the ground truth when it is available (training data only).
if 'matches' in df:
    df['f1'] = f1_score(df['matches'], df['pred_matches'])
    score = df['f1'].mean()
    print(f'Our final f1 cv score is {score}')

# Write the predictions under the 'matches' header without overwriting the ground-truth
# column in `df`, so this cell can be re-run and still report a meaningful score.
submission = df[['posting_id', 'pred_matches']].rename(columns = {'pred_matches': 'matches'})
submission.to_csv('submission.csv', index = False)