References: 
* https://www.kaggle.com/competitions/shopee-product-matching/discussion/238022
* https://www.kaggle.com/code/lyakaap/2nd-place-solution

In [1]:
%%capture
!pip install ../input/faiss-163/faiss_gpu-1.6.3-cp37-cp37m-manylinux2010_x86_64.whl
!pip install ../input/shopee-libs/editdistance-0.5.3-cp37-cp37m-manylinux1_x86_64.whl

In [2]:
import sys
sys.path.append('../input/timm045/')

from itertools import zip_longest
import json
import math
import gc
import os
from pathlib import Path
import joblib
from tqdm import tqdm

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchvision.io import read_image
from torchvision.transforms import Resize, RandomHorizontalFlip, ColorJitter, Normalize, Compose, RandomResizedCrop, CenterCrop, ToTensor

from PIL import Image
import timm
import faiss
from transformers import BertConfig, BertModel, BertTokenizerFast
from transformers import AutoTokenizer, AutoModel, AutoConfig

import string
import cupy as cp
import lightgbm as lgb
import nltk
from nltk.tokenize.treebank import TreebankWordTokenizer
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
import editdistance
import networkx as nx

In [3]:
# DEBUG is True when test.csv contains exactly 3 rows; load_data() then switches to a
# 1000-row slice of train.csv (presumably the 3-row file is the public placeholder — confirm).
DEBUG = len(pd.read_csv('../input/shopee-product-matching/test.csv')) == 3
NUM_CLASSES = 11014  # not referenced in the visible cells
NUM_WORKERS = 2      # worker processes used by the DataLoaders below
SEED = 0             # not referenced in the visible cells
k = 50               # number of nearest neighbours requested from the faiss searches
conf_th = 0.7        # not referenced in the visible cells


def load_data():
    """Load the posting table and the directory holding its images.

    When the module-level ``DEBUG`` flag is set, the first 1000 rows of
    ``train.csv`` are read together with ``train_images/``; otherwise the whole
    ``test.csv`` is read together with ``test_images/``.

    Returns:
        tuple: ``(df, img_dir)`` where ``df`` is a DataFrame restricted to the
        columns ``posting_id``, ``image`` and ``title`` and ``img_dir`` is a
        ``pathlib.Path`` pointing at the matching image folder.
    """
    columns = ['posting_id', 'image', 'title']
    if DEBUG:
        debug_rows = 1000  # small slice so the whole pipeline runs quickly
        df = pd.read_csv('../input/shopee-product-matching/train.csv', nrows=debug_rows, usecols=columns)
        img_dir = Path('../input/shopee-product-matching/train_images/')
    else:
        df = pd.read_csv('../input/shopee-product-matching/test.csv', usecols=columns)
        img_dir = Path('../input/shopee-product-matching/test_images/')
    return df, img_dir

def gem(x, p=3, eps=1e-6):
    """Generalized-mean pooling over the last two dimensions of ``x``.

    Values are clamped from below at ``eps``, raised to the power ``p``,
    averaged over the full spatial window and finally raised to ``1 / p``.

    Args:
        x: tensor whose last two dimensions are pooled.
        p: exponent of the generalized mean.
        eps: lower clamp applied before the power.

    Returns:
        Tensor with the last two dimensions reduced to size 1.
    """
    window = (x.size(-2), x.size(-1))
    powered = x.clamp(min=eps).pow(p)
    pooled = F.avg_pool2d(powered, window)
    return pooled.pow(1. / p)

def query_expansion(feats, sims, topk_idx, alpha=0.5, k=2):
    """Replace every feature by a similarity-weighted sum of its top-``k`` neighbours.

    Args:
        feats: array of shape (n, d) holding one feature vector per item.
        sims: array of shape (n, >=k); ``sims[i, m]`` is the similarity between
            item ``i`` and its ``m``-th retrieved neighbour.
        topk_idx: integer array of shape (n, >=k) with the neighbour indices
            that correspond to ``sims``.
        alpha: exponent applied to the similarities to obtain the weights.
        k: number of leading neighbours that contribute to the sum.

    Returns:
        Array of shape (n, d) with the expanded (not re-normalised) features.
    """
    # A negative similarity raised to a fractional power yields NaN, which would
    # poison the whole expanded vector; such neighbours get weight 0 instead.
    top_sims = np.clip(sims[:, :k], 0, None)
    weights = np.expand_dims(top_sims ** alpha, axis=-1).astype(np.float32)
    return (feats[topk_idx[:, :k]] * weights).sum(axis=1)

# Image similarity, Multi-modal similarity

In [4]:
class ShopeeNet(nn.Module):
    """Image embedding network built on top of a backbone.

    Args:
        backbone: module exposing ``reset_classifier``, ``num_features`` and
            ``forward_features``; its classifier head is removed here.
        num_classes: accepted for signature compatibility; not used.
        fc_dim: dimensionality of the produced embedding.
        s, margin: accepted for signature compatibility; not used.
        p: exponent handed to ``gem`` when pooling the backbone feature map.
    """

    def __init__(self,
                 backbone,
                 num_classes,
                 fc_dim=512,
                 s=30, margin=0.5, p=3):
        super(ShopeeNet, self).__init__()

        self.backbone = backbone
        self.backbone.reset_classifier(num_classes=0)  # remove classifier

        self.fc = nn.Linear(self.backbone.num_features, fc_dim)
        self.bn = nn.BatchNorm1d(fc_dim)
        self._init_params()
        self.p = p

    def _init_params(self):
        """Xavier-initialise the projection and reset batch norm to identity."""
        nn.init.xavier_normal_(self.fc.weight)
        nn.init.constant_(self.fc.bias, 0)
        nn.init.constant_(self.bn.weight, 1)
        nn.init.constant_(self.bn.bias, 0)

    def extract_feat(self, x):
        """Return the batch-normalised embedding for a batch of images."""
        batch_size = x.shape[0]
        x = self.backbone.forward_features(x)
        if isinstance(x, tuple):
            # The backbone returned a pair of outputs: average them and skip
            # pooling and the fc projection, applying batch norm only.
            x = (x[0] + x[1]) / 2
            x = self.bn(x)
        else:
            x = gem(x, p=self.p).view(batch_size, -1)
            x = self.fc(x)
            x = self.bn(x)
        return x

    def forward(self, x, label):
        """Return ``(loss_module_output, embedding)``.

        Raises:
            AttributeError: when no ``loss_module`` has been attached. This
                class never creates one, so ``extract_feat`` is the entry
                point for inference.
        """
        loss_module = getattr(self, 'loss_module', None)
        if loss_module is None:
            raise AttributeError(
                "ShopeeNet has no 'loss_module' attached; "
                "call extract_feat() for inference"
            )
        feat = self.extract_feat(x)
        x = loss_module(feat, label)
        return x, feat


class ShopeeDataset(Dataset):
    """Dataset yielding ``(image, title, height, width, file_size)`` per posting.

    Args:
        df: DataFrame with at least the columns ``image`` and ``title``.
        img_dir: ``pathlib.Path`` of the folder containing the image files.
        transform: optional callable applied to the decoded image tensor.
    """

    def __init__(self, df, img_dir, transform=None):
        self.df = df
        self.img_dir = img_dir
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        record = self.df.iloc[index]
        img_path = self.img_dir / record['image']

        img = read_image(str(img_path))
        # Height and width are taken before any transform is applied.
        _channels, h, w = img.shape
        # Size of the image file on disk, in bytes.
        st_size = img_path.stat().st_size

        if self.transform is not None:
            img = self.transform(img)

        return img, record['title'], h, w, st_size


class MultiModalNet(nn.Module):
    """Joint image + title embedding network.

    The GeM-pooled backbone features and the mean of the text encoder's last
    hidden states are concatenated and projected to ``fc_dim`` dimensions.

    Args:
        backbone: image module exposing ``reset_classifier``, ``num_features``
            and ``forward_features``; its classifier head is removed here.
        bert_model: text encoder exposing ``config.hidden_size`` and returning
            an object with ``last_hidden_state``.
        num_classes: accepted for signature compatibility; not used.
        tokenizer: callable turning a list of titles into ``input_ids``,
            ``token_type_ids`` and ``attention_mask``.
        max_len: maximum token length passed to the tokenizer.
        fc_dim: dimensionality of the produced embedding.
        s, margin, loss: accepted for signature compatibility; not used.
        p: exponent handed to ``gem`` when pooling the backbone feature map.
    """

    def __init__(self,
                 backbone,
                 bert_model,
                 num_classes,
                 tokenizer,
                 max_len=32,
                 fc_dim=512,
                 s=30, margin=0.5, p=3, loss='ArcMarginProduct'):
        super().__init__()

        self.backbone = backbone
        self.backbone.reset_classifier(num_classes=0)  # remove classifier

        self.bert_model = bert_model
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.fc = nn.Linear(self.bert_model.config.hidden_size + self.backbone.num_features, fc_dim)
        self.bn = nn.BatchNorm1d(fc_dim)
        self._init_params()
        self.p = p

    def _init_params(self):
        """Xavier-initialise the projection and reset batch norm to identity."""
        nn.init.xavier_normal_(self.fc.weight)
        nn.init.constant_(self.fc.bias, 0)
        nn.init.constant_(self.bn.weight, 1)
        nn.init.constant_(self.bn.bias, 0)

    def extract_feat(self, img, title):
        """Return the batch-normalised joint embedding for images and titles.

        Args:
            img: batch of image tensors accepted by the backbone.
            title: list of title strings, one per image.
        """
        batch_size = img.shape[0]
        img = self.backbone.forward_features(img)
        img = gem(img, p=self.p).view(batch_size, -1)

        # Put the token tensors on the same device as the image features
        # instead of assuming a CUDA device is present.
        device = img.device
        tokenizer_output = self.tokenizer(title, truncation=True, padding=True, max_length=self.max_len)
        input_ids = torch.LongTensor(tokenizer_output['input_ids']).to(device)
        token_type_ids = torch.LongTensor(tokenizer_output['token_type_ids']).to(device)
        attention_mask = torch.LongTensor(tokenizer_output['attention_mask']).to(device)
        title = self.bert_model(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
        # Plain mean over every token position (padding positions included).
        title = title.last_hidden_state.mean(dim=1)

        x = torch.cat([img, title], dim=1)
        x = self.fc(x)
        x = self.bn(x)
        return x

In [5]:
# Posting table and image folder (1000 training rows when DEBUG, otherwise the test set).
df, img_dir = load_data()

In [6]:
# Checkpoints for the two image models (v45, v34) and the multi-modal model (v79).
# Each checkpoint stores its hyper-parameters under 'params' and, as used by the
# model-building cell below, its weights under 'model'.
checkpoint1 = torch.load('../input/shopee/v45.pth')
checkpoint2 = torch.load('../input/shopee/v34.pth')
checkpoint3 = torch.load('../input/shopee/v79.pth')
params1 = checkpoint1['params']
params2 = checkpoint2['params']
params3 = checkpoint3['params']

In [7]:
# Display the hyper-parameters stored with checkpoint v45.
params1

{'ver': 'v45',
 'size': 384,
 'test_size': 384,
 'lr': 0.001,
 'batch_size': 32,
 'optimizer': 'sam',
 'epochs': 18,
 'wd': 0.0,
 'backbone': 'vit_deit_base_distilled_patch16_384',
 'margin': 0.3,
 's': 50,
 'fc_dim': 768,
 'brightness': 0.2,
 'contrast': 0.2,
 'scale_lower': 0.2,
 'scale_upper': 1.0,
 'filter_wd': True,
 'p': 3.0,
 'p_eval': 6.0,
 'loss': 'CurricularFace'}

In [8]:
# Display the hyper-parameters stored with checkpoint v34.
params2

{'ver': 'v34',
 'size': 256,
 'test_size': 320,
 'lr': 0.001,
 'batch_size': 32,
 'optimizer': 'sam',
 'epochs': 11,
 'wd': 0.0,
 'backbone': 'dm_nfnet_f0',
 'margin': 0.3,
 's': 50,
 'fc_dim': 256,
 'brightness': 0.2,
 'contrast': 0.2,
 'scale_lower': 0.2,
 'scale_upper': 1.0,
 'filter_wd': True,
 'p': 3.0,
 'p_eval': 6.0,
 'loss': 'CurricularFace'}

In [9]:
# Display the hyper-parameters stored with checkpoint v79.
params3

{'ver': 'v79',
 'lr': 0.001,
 'batch_size': 16,
 'size': 256,
 'test_size': 320,
 'optimizer': 'sam',
 'epochs': 8,
 'loss': 'CurricularFace',
 'wd': 1e-05,
 'filter_wd': True,
 'margin': 0.3,
 's': 50,
 'fc_dim': 1024,
 'cycle': 1,
 'backbone': 'dm_nfnet_f0',
 'model_name': 'cahya/bert-base-indonesian-522M',
 'max_len': 64,
 'brightness': 0.2,
 'contrast': 0.2,
 'scale_lower': 0.2,
 'scale_upper': 1.0,
 'p': 3.0,
 'p_eval': 6.0}

In [10]:
# this transform is not directly applied to dataset
# instead, it applies to a batch generated by data loader later
# NOTE(review): the resize/crop size is taken from params1['test_size'] only, and the
# same transformed batch is later fed to all three models although params2/params3
# list a different 'test_size' — confirm this is intended.
transform = Compose([
    Resize(size=params1['test_size'] + 32, interpolation=Image.BICUBIC),
    CenterCrop((params1['test_size'], params1['test_size'])),
    Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# transform=None: the dataset returns the decoded image untouched.
dataset = ShopeeDataset(df=df, img_dir=img_dir, transform=None)
# The identity collate_fn hands every batch over as a plain list of per-sample tuples,
# so no stacking of differently sized images is attempted by the loader.
data_loader = DataLoader(dataset, batch_size=8, shuffle=False,
                         drop_last=False, pin_memory=True, num_workers=NUM_WORKERS, collate_fn=lambda x: x)

In [11]:
# Build the three feature extractors, move them to the GPU, load the checkpoint weights,
# switch to evaluation mode and replace the GeM exponent by the checkpoint's 'p_eval'.
# NOTE(review): strict=False makes load_state_dict tolerate missing/unexpected keys, so a
# mismatching checkpoint would not raise here — verify the returned key lists if in doubt.

# ShopeeNet
backbone = timm.create_model(model_name=params1['backbone'], pretrained=False)
model1 = ShopeeNet(backbone, num_classes=0, fc_dim=params1['fc_dim'])
model1 = model1.to('cuda')
model1.load_state_dict(checkpoint1['model'], strict=False)
model1.train(False)
model1.p = params1['p_eval']

# ShopeeNet
backbone = timm.create_model(model_name=params2['backbone'], pretrained=False)
model2 = ShopeeNet(backbone, num_classes=0, fc_dim=params2['fc_dim'])
model2 = model2.to('cuda')
model2.load_state_dict(checkpoint2['model'], strict=False)
model2.train(False)
model2.p = params2['p_eval']

# MultiModalNet
# The text encoder is created from a local config/vocabulary; its weights come from
# checkpoint3 through load_state_dict below.
backbone = timm.create_model(model_name=params3['backbone'], pretrained=False)
tokenizer = BertTokenizerFast(vocab_file='../input/bert-indo/vocab.txt')
bert_config = BertConfig.from_json_file('../input/bert-indo/config.json')
bert_model = BertModel(bert_config)
model3 = MultiModalNet(backbone, bert_model, num_classes=0, tokenizer=tokenizer, max_len=params3['max_len'],
                       fc_dim=params3['fc_dim'], s=params3['s'], margin=params3['margin'], loss=params3['loss'])
model3 = model3.to('cuda')
model3.load_state_dict(checkpoint3['model'], strict=False)
model3.train(False)
model3.p = params3['p_eval']

In [12]:
# Accumulators: per-model embeddings plus per-image metadata (height, width, file size).
img_feats1 = []
img_feats2 = []
mm_feats = []
img_hs = []
img_ws = []
st_sizes = []
for batch in tqdm(data_loader, total=len(data_loader), miniters=None, ncols=55):
    # batch is a list of per-sample tuples (identity collate_fn); transpose it into columns.
    img, title, h, w, st_size = list(zip(*batch))
    # Move each image to the GPU, divide by 255, apply the shared transform and stack
    # the results into one batch tensor.
    img = torch.cat([transform(x.to('cuda').float() / 255)[None] for x in img], axis=0)
    title = list(title)
    with torch.no_grad():
        feats_minibatch1 = model1.extract_feat(img)
        img_feats1.append(feats_minibatch1.cpu().numpy())
        feats_minibatch2 = model2.extract_feat(img)
        img_feats2.append(feats_minibatch2.cpu().numpy())
        feats_minibatch3 = model3.extract_feat(img, title)
        mm_feats.append(feats_minibatch3.cpu().numpy())
    img_hs.extend(list(h))
    img_ws.extend(list(w))
    st_sizes.extend(list(st_size))

# Stack the mini-batches and L2-normalise every row.
img_feats1 = np.concatenate(img_feats1)
img_feats1 /= np.linalg.norm(img_feats1, 2, axis=1, keepdims=True)
img_feats2 = np.concatenate(img_feats2)
img_feats2 /= np.linalg.norm(img_feats2, 2, axis=1, keepdims=True)
mm_feats = np.concatenate(mm_feats)
mm_feats /= np.linalg.norm(mm_feats, 2, axis=1, keepdims=True)

# np.save appends '.npy' to these paths.
np.save('/tmp/img_feats1', img_feats1)
np.save('/tmp/img_feats2', img_feats2)

# Combined image descriptor: both image embeddings side by side with equal weight (1.0).
img_feats = np.concatenate([
    img_feats1 * 1.0,
    img_feats2 * 1.0,
], axis=1)

img_feats /= np.linalg.norm(img_feats, 2, axis=1, keepdims=True)

np.save('/tmp/img_feats', img_feats)

100%|████████████████| 125/125 [00:48<00:00,  2.59it/s]


In [13]:
# img similarities
# Inner-product search of every item against all items; the features were L2-normalised
# in the previous cell, so the returned scores are cosine similarities.
res = faiss.StandardGpuResources()
index_img = faiss.IndexFlatIP(params1['fc_dim'] + params2['fc_dim'])
index_img = faiss.index_cpu_to_gpu(res, 0, index_img)
index_img.add(img_feats)
similarities_img, indexes_img = index_img.search(img_feats, k)

# Persist the top-k results and the image metadata; both files are read again by the LGB section.
joblib.dump([similarities_img, indexes_img], '/tmp/lyk_img_data.pkl')
joblib.dump([st_sizes, img_hs, img_ws], '/tmp/lyk_img_meta_data.pkl')

# multi-modal similarities
res = faiss.StandardGpuResources()
index_mm = faiss.IndexFlatIP(params3['fc_dim'])
index_mm = faiss.index_cpu_to_gpu(res, 0, index_mm)
index_mm.add(mm_feats)
similarities_mm, indexes_mm = index_mm.search(mm_feats, k)

joblib.dump([similarities_mm, indexes_mm], '/tmp/lyk_mm_data.pkl')

# Read back by the multi-modal query-expansion cell.
np.save('/tmp/mm_feats', mm_feats)

### Image QE

In [14]:
# Query expansion for the image features:
#   1. search the top-60 neighbours on the plain features,
#   2. build expanded features with query_expansion (defaults: alpha=0.5, k=2),
#   3. concatenate plain and expanded features, re-normalise and search again.
img_feats = np.load('/tmp/img_feats.npy')

res = faiss.StandardGpuResources()
index_img = faiss.IndexFlatIP(img_feats.shape[1])
index_img = faiss.index_cpu_to_gpu(res, 0, index_img)
index_img.add(img_feats)
img_D, img_I = index_img.search(img_feats, 60)

# Results before query expansion.
np.save('/tmp/img_D', img_D)
np.save('/tmp/img_I', img_I)

img_feats_qe = query_expansion(img_feats, img_D, img_I)
img_feats_qe /= np.linalg.norm(img_feats_qe, 2, axis=1, keepdims=True)

img_feats = np.hstack([img_feats, img_feats_qe])
img_feats /= np.linalg.norm(img_feats, axis=1).reshape((-1, 1))

res = faiss.StandardGpuResources()
index = faiss.IndexFlatIP(img_feats.shape[1])
index = faiss.index_cpu_to_gpu(res, 0, index)
index.add(img_feats)
img_D, img_I = index.search(img_feats, 60)

# Results after query expansion; these are the files loaded by the GAT section.
np.save('/tmp/img_D_qe', img_D)
np.save('/tmp/img_I_qe', img_I)

### Multi-modal QE

In [15]:
# Query expansion for the multi-modal features; same three steps as for the image
# features (search top-60, expand with query_expansion, concatenate and search again).
mm_feats = np.load('/tmp/mm_feats.npy')

res = faiss.StandardGpuResources()
index_mm = faiss.IndexFlatIP(mm_feats.shape[1])
index_mm = faiss.index_cpu_to_gpu(res, 0, index_mm)
index_mm.add(mm_feats)
mm_D, mm_I = index_mm.search(mm_feats, 60)

# Results before query expansion (note the 'mut_' file prefix used for multi-modal data).
np.save('/tmp/mut_D', mm_D)
np.save('/tmp/mut_I', mm_I)

mm_feats_qe = query_expansion(mm_feats, mm_D, mm_I)
mm_feats_qe /= np.linalg.norm(mm_feats_qe, 2, axis=1, keepdims=True)

mm_feats = np.hstack([mm_feats, mm_feats_qe])
mm_feats /= np.linalg.norm(mm_feats, axis=1).reshape((-1, 1))

res = faiss.StandardGpuResources()
index = faiss.IndexFlatIP(mm_feats.shape[1])
index = faiss.index_cpu_to_gpu(res, 0, index)
index.add(mm_feats)
mm_D, mm_I = index.search(mm_feats, 60)

# Results after query expansion; these are the files loaded by the GAT section.
np.save('/tmp/mut_D_qe', mm_D)
np.save('/tmp/mut_I_qe', mm_I)

# BERT similarity

In [16]:
class BertNet(nn.Module):
    """Title embedding network: text encoder -> pooling -> fc -> batch norm.

    Args:
        bert_model: text encoder exposing ``config.hidden_size`` and returning
            an object with ``last_hidden_state``.
        num_classes: accepted for signature compatibility; not used.
        tokenizer: callable turning a list of titles into ``input_ids``,
            ``attention_mask`` and optionally ``token_type_ids``.
        max_len: maximum token length passed to the tokenizer.
        fc_dim: dimensionality of the produced embedding.
        simple_mean: if True, average the hidden states over every position;
            otherwise average over the positions selected by the attention mask.
        s, margin, loss: accepted for signature compatibility; not used.
        p: stored on the instance; not used by ``extract_feat``.
    """

    def __init__(self,
                 bert_model,
                 num_classes,
                 tokenizer,
                 max_len=32,
                 fc_dim=512,
                 simple_mean=True,
                 s=30, margin=0.5, p=3, loss='ArcMarginProduct'):
        super().__init__()

        self.bert_model = bert_model
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.fc = nn.Linear(self.bert_model.config.hidden_size, fc_dim)
        self.bn = nn.BatchNorm1d(fc_dim)
        self._init_params()
        self.p = p
        self.simple_mean = simple_mean

    def _init_params(self):
        """Xavier-initialise the projection and reset batch norm to identity."""
        nn.init.xavier_normal_(self.fc.weight)
        nn.init.constant_(self.fc.bias, 0)
        nn.init.constant_(self.bn.weight, 1)
        nn.init.constant_(self.bn.bias, 0)

    def extract_feat(self, x):
        """Return the batch-normalised embedding for a list of title strings."""
        # Follow the device the model lives on instead of assuming CUDA.
        device = self.fc.weight.device
        tokenizer_output = self.tokenizer(x, truncation=True, padding=True, max_length=self.max_len)

        input_ids = torch.LongTensor(tokenizer_output['input_ids']).to(device)
        attention_mask = torch.LongTensor(tokenizer_output['attention_mask']).to(device)
        model_inputs = {'input_ids': input_ids, 'attention_mask': attention_mask}
        # Some tokenizers do not produce token_type_ids; pass them only when present.
        if 'token_type_ids' in tokenizer_output:
            model_inputs['token_type_ids'] = torch.LongTensor(tokenizer_output['token_type_ids']).to(device)
        x = self.bert_model(**model_inputs)

        if self.simple_mean:
            x = x.last_hidden_state.mean(dim=1)
        else:
            # Mean over the positions where attention_mask is 1.
            x = torch.sum(x.last_hidden_state * attention_mask.unsqueeze(-1), dim=1) / attention_mask.sum(dim=1, keepdims=True)
        x = self.fc(x)
        x = self.bn(x)
        return x


class BertDataset(Dataset):
    """Dataset over a DataFrame of titles.

    Each item is the ``title`` string; when the frame also has a ``y`` column the
    item is ``(title, target)`` with ``target`` a ``torch.long`` tensor.
    """

    def __init__(self, df):
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        record = self.df.iloc[index]
        title = record['title']

        # Without a label column only the title is returned.
        if 'y' not in record.keys():
            return title

        return title, torch.tensor(record['y'], dtype=torch.long)

In [17]:
# Reload the posting table for the text models (img_dir is not needed by this section).
df, img_dir = load_data()

In [18]:
# Checkpoints of the three text models.
# NOTE(review): checkpoint2 / checkpoint3 rebind the names used for the image-model
# checkpoints earlier in the notebook, so re-running earlier cells after this one
# would pick up the wrong weights.
checkpoint = torch.load('../input/shopee/v75.pth')
checkpoint2 = torch.load('../input/shopee/v102.pth')
checkpoint3 = torch.load('../input/shopee/v103.pth')

params_bert = checkpoint['params']
params_bert2 = checkpoint2['params']
params_bert3 = checkpoint3['params']

In [19]:
# Title-only dataset/loader shared by all three text models; the batch size is twice
# the value stored with the first text checkpoint, and order is kept (shuffle=False).
datasets = {
    'valid': BertDataset(df=df)
}
data_loaders = {
    'valid': DataLoader(datasets['valid'], 
                        batch_size=params_bert['batch_size'] * 2, 
                        shuffle=False, drop_last=False, 
                        pin_memory=True, 
                        num_workers=NUM_WORKERS)
}

In [20]:
# Build the three text encoders. Every encoder is created from a local config only;
# its weights come from the matching checkpoint via load_state_dict.
# NOTE(review): strict=False makes load_state_dict tolerate missing/unexpected keys.

# bert Indonesian
tokenizer = BertTokenizerFast(vocab_file='../input/bert-indo/vocab.txt')
bert_config = BertConfig.from_json_file('../input/bert-indo/config.json')
bert_model = BertModel(bert_config)
model = BertNet(bert_model, num_classes=0, tokenizer=tokenizer, max_len=params_bert['max_len'], simple_mean=True,
                fc_dim=params_bert['fc_dim'], s=params_bert['s'], margin=params_bert['margin'], loss=params_bert['loss'])
model = model.to('cuda')
model.load_state_dict(checkpoint['model'], strict=False)
model.train(False)

# bert multilingual
# Configured from its own checkpoint's hyper-parameters (params_bert2), in line with
# the other two models; previously params_bert was used here and params_bert2 was
# loaded but never read.
tokenizer = AutoTokenizer.from_pretrained('../input/bertmultilingual/')
bert_config = AutoConfig.from_pretrained('../input/bertmultilingual/')
bert_model = AutoModel.from_config(bert_config)
model2 = BertNet(bert_model, num_classes=0, tokenizer=tokenizer, max_len=params_bert2['max_len'], simple_mean=False,
                 fc_dim=params_bert2['fc_dim'], s=params_bert2['s'], margin=params_bert2['margin'], loss=params_bert2['loss'])
model2 = model2.to('cuda')
model2.load_state_dict(checkpoint2['model'], strict=False)
model2.train(False)

# bert xlm
tokenizer = AutoTokenizer.from_pretrained('../input/bertxlm/')
bert_config = AutoConfig.from_pretrained('../input/bertxlm/')
bert_model = AutoModel.from_config(bert_config)
model3 = BertNet(bert_model, num_classes=0, tokenizer=tokenizer, max_len=params_bert3['max_len'], simple_mean=False,
                 fc_dim=params_bert3['fc_dim'], s=params_bert3['s'], margin=params_bert3['margin'], loss=params_bert3['loss'])
model3 = model3.to('cuda')
model3.load_state_dict(checkpoint3['model'], strict=False)
model3.train(False)

BertNet(
  (bert_model): XLMRobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((7

In [21]:
# Run every batch of titles through the three text models and collect the embeddings.
bert_feats1 = []
bert_feats2 = []
bert_feats3 = []
# The enumerate index i is not used inside the loop.
for i, title in tqdm(enumerate(data_loaders['valid']),
                     total=len(data_loaders['valid']), miniters=None, ncols=55):
    with torch.no_grad():
        bert_feats_minibatch = model.extract_feat(title)
        bert_feats1.append(bert_feats_minibatch.cpu().numpy())
        bert_feats_minibatch = model2.extract_feat(title)
        bert_feats2.append(bert_feats_minibatch.cpu().numpy())
        bert_feats_minibatch = model3.extract_feat(title)
        bert_feats3.append(bert_feats_minibatch.cpu().numpy())

# Stack the mini-batches and L2-normalise every row.
bert_feats1 = np.concatenate(bert_feats1)
bert_feats1 /= np.linalg.norm(bert_feats1, 2, axis=1, keepdims=True)
bert_feats2 = np.concatenate(bert_feats2)
bert_feats2 /= np.linalg.norm(bert_feats2, 2, axis=1, keepdims=True)
bert_feats3 = np.concatenate(bert_feats3)
bert_feats3 /= np.linalg.norm(bert_feats3, 2, axis=1, keepdims=True)

# np.save appends '.npy' to these paths.
np.save('/tmp/bert_feats1', bert_feats1)
np.save('/tmp/bert_feats2', bert_feats2)
np.save('/tmp/bert_feats3', bert_feats3)

100%|████████████████████| 4/4 [00:05<00:00,  1.29s/it]


In [22]:
# Similarity search on the features of the first text model only.
# (A feats1 + feats2 concatenation used to be built here, but it was overwritten below
# before anything read it, so that dead computation has been dropped; the search
# itself is unchanged.)
res = faiss.StandardGpuResources()
index_bert = faiss.IndexFlatIP(params_bert['fc_dim'])
index_bert = faiss.index_cpu_to_gpu(res, 0, index_bert)
index_bert.add(bert_feats1)
similarities_bert, indexes_bert = index_bert.search(bert_feats1, k)

# Concatenate feats1, feats2 and feats3 and re-normalise; this combined descriptor is
# what the query-expansion cells load from /tmp/bert_feats.npy.
bert_feats = np.concatenate([bert_feats1, bert_feats2, bert_feats3], axis=1)
bert_feats /= np.linalg.norm(bert_feats, 2, axis=1, keepdims=True)

np.save('/tmp/bert_feats', bert_feats)

# Top-k results of the single-model search, read again by the LGB section.
joblib.dump([similarities_bert, indexes_bert], '/tmp/lyk_bert_data.pkl')

['/tmp/lyk_bert_data.pkl']

### BERT QE

In [23]:
# Query expansion for the concatenated text features; same three steps as for the image
# features (search top-60, expand with query_expansion, concatenate and search again).
brt_feats = np.load('/tmp/bert_feats.npy')

res = faiss.StandardGpuResources()
index_brt = faiss.IndexFlatIP(brt_feats.shape[1])
index_brt = faiss.index_cpu_to_gpu(res, 0, index_brt)
index_brt.add(brt_feats)
brt_D, brt_I = index_brt.search(brt_feats, 60)

# Results before query expansion.
np.save('/tmp/brt_D', brt_D)
np.save('/tmp/brt_I', brt_I)

# Drop the first index before the second one is built.
del index_brt
gc.collect()

brt_feats_qe = query_expansion(brt_feats, brt_D, brt_I)
brt_feats_qe /= np.linalg.norm(brt_feats_qe, 2, axis=1, keepdims=True)

brt_feats = np.hstack([brt_feats, brt_feats_qe])
brt_feats /= np.linalg.norm(brt_feats, axis=1).reshape((-1, 1))

res = faiss.StandardGpuResources()
index = faiss.IndexFlatIP(brt_feats.shape[1])
index = faiss.index_cpu_to_gpu(res, 0, index)
index.add(brt_feats)
brt_D, brt_I = index.search(brt_feats, 60)

# Results after query expansion; these are the files loaded by the GAT section.
np.save('/tmp/brt_D_qe', brt_D)
np.save('/tmp/brt_I_qe', brt_I)

# Image & BERT similarity

In [24]:
# Joint text + image descriptor: the saved text and image features are placed side by
# side, re-normalised, and run through the same search / query-expansion / search steps.
feats_bert = np.load('/tmp/bert_feats.npy')
feats_img = np.load('/tmp/img_feats.npy')

bth_feats = np.hstack([feats_bert, feats_img])
bth_feats /= np.linalg.norm(bth_feats, 2, axis=1, keepdims=True)
print(bth_feats.shape)

res = faiss.StandardGpuResources()
index = faiss.IndexFlatIP(bth_feats.shape[1])
index = faiss.index_cpu_to_gpu(res, 0, index)
index.add(bth_feats)
bth_D, bth_I = index.search(bth_feats, 60)

# Results before query expansion.
np.save('/tmp/bth_D', bth_D)
np.save('/tmp/bth_I', bth_I)

# Drop the first index before the second one is built.
del index
gc.collect()

bth_feats_qe = query_expansion(bth_feats, bth_D, bth_I)
bth_feats_qe /= np.linalg.norm(bth_feats_qe, 2, axis=1, keepdims=True)

bth_feats = np.hstack([bth_feats, bth_feats_qe])
bth_feats /= np.linalg.norm(bth_feats, axis=1).reshape((-1, 1))

res = faiss.StandardGpuResources()
index = faiss.IndexFlatIP(bth_feats.shape[1])
index = faiss.index_cpu_to_gpu(res, 0, index)
index.add(bth_feats)
bth_D, bth_I = index.search(bth_feats, 60)

# Results after query expansion (not read by any of the visible cells).
np.save('/tmp/bth_D_qe', bth_D)
np.save('/tmp/bth_I_qe', bth_I)

(1000, 2304)


# GAT

In [25]:
class GraphDataset(Dataset):
    """Dataset of item pairs together with the edge features of their neighbourhoods.

    Args:
        feats: nested mapping; ``feats[a][b]`` is the feature list of edge (a, b).
        labels: optional per-item labels; when given, ``labels[i] == labels[j]``
            is appended to every sample.
        weights: optional per-item weights; when given, ``weights[i]`` is
            appended to every sample.
        pair_tuples: sequence of ``(i, j)`` pairs; one sample per pair.
        k: number of neighbour rows per item (shorter lists are zero-padded).
        top_neighbors: mapping from an item to its ordered neighbour list.
    """

    def __init__(self, feats=None, labels=None, weights=None, pair_tuples=None, k=50, top_neighbors=None):
        self.feats = feats
        self.labels = labels
        self.weights = weights
        self.pair_tuples = pair_tuples
        self.k = k
        self.top_neighbors = top_neighbors

    def __len__(self):
        return len(self.pair_tuples)

    def _neighbor_rows(self, node, feat_dim):
        """Edge features from ``node`` to its neighbours, zero-padded to ``k`` rows."""
        neighbors = self.top_neighbors[node]
        rows = [self.feats[node][other] for other in neighbors]
        rows += [[0] * feat_dim] * (self.k - len(neighbors))
        return torch.FloatTensor(rows)

    def __getitem__(self, index):
        i, j = self.pair_tuples[index]
        feat = torch.FloatTensor(self.feats[i][j])
        feat_dim = feat.shape[0]

        # Row 0 is the pair's own edge, followed by i's and then j's neighbourhood.
        neighbor_feats = torch.cat(
            [
                feat.unsqueeze(0),
                self._neighbor_rows(i, feat_dim),
                self._neighbor_rows(j, feat_dim),
            ],
            dim=0,
        )

        sample = [feat, neighbor_feats]
        if self.labels is not None:
            sample.append(self.labels[i] == self.labels[j])
        if self.weights is not None:
            sample.append(self.weights[i])

        return tuple(sample)


class GraphAttentionLayer(nn.Module):
    """Single batched graph-attention head over a fully connected set of N nodes.

    Args:
        in_features: size of every input node vector.
        out_features: size of every output node vector.
        dropout: dropout probability applied to the attention weights.
        alpha: negative slope of the LeakyReLU applied to the raw scores.
        concat: if True an ELU is applied to the output, otherwise it is returned raw.
    """

    def __init__(self, in_features, out_features, dropout=0.6, alpha=0.2, concat=True):
        super().__init__()
        self.dropout = dropout
        self.in_features = in_features
        self.out_features = out_features
        self.alpha = alpha
        self.concat = concat

        # Node projection, then the attention vector scoring concatenated node pairs.
        self.W = nn.Parameter(torch.empty(size=(in_features, out_features)))
        nn.init.xavier_uniform_(self.W.data, gain=1.414)
        self.a = nn.Parameter(torch.empty(size=(2 * out_features, 1)))
        nn.init.xavier_uniform_(self.a.data, gain=1.414)

        self.leakyrelu = nn.LeakyReLU(self.alpha)

    def forward(self, h):
        # h: (B, N, in_features) -> projected: (B, N, out_features)
        projected = torch.matmul(h, self.W)
        pair_inputs = self._prepare_attentional_mechanism_input(projected)
        scores = torch.matmul(pair_inputs, self.a).squeeze(3)
        scores = self.leakyrelu(scores)

        # NOTE(review): the softmax runs over dim=1 of the (B, N, N) score tensor;
        # it is left exactly as-is so that weights loaded with load_state_dict
        # produce the same outputs.
        weights = F.softmax(scores, dim=1)
        weights = F.dropout(weights, self.dropout, training=self.training)
        aggregated = torch.bmm(weights, projected)

        return F.elu(aggregated) if self.concat else aggregated

    def _prepare_attentional_mechanism_input(self, Wh):
        """Return a (B, N, N, 2 * D) tensor whose entry [b, i, j] is ``cat(Wh[b, i], Wh[b, j])``."""
        B, N, D = Wh.shape

        first_of_pair = Wh.repeat_interleave(N, dim=1)   # every node repeated N times in a row
        second_of_pair = Wh.repeat(1, N, 1)              # the whole node list repeated N times

        pairs = torch.cat([first_of_pair, second_of_pair], dim=2)
        return pairs.view(-1, N, N, 2 * D)

    def __repr__(self):
        return '{} ({} -> {})'.format(self.__class__.__name__, self.in_features, self.out_features)


class GATPairClassifier(nn.Module):
    """Pair classifier combining raw pair features with a multi-head GAT summary.

    Args:
        nfeat: number of features per edge.
        nhid: hidden size of every attention head and of the classifier.
        nclass: number of output logits.
        dropout: dropout probability used inside the GAT part.
        alpha: LeakyReLU slope handed to every attention layer.
        nheads: number of parallel attention heads in the first layer.
        pooling: ``'first'`` keeps node 0 of the GAT output, ``'mean'`` averages
            over all nodes.
    """

    def __init__(self, nfeat, nhid=8, nclass=1, dropout=0.6, alpha=0.2, nheads=8, pooling='first'):
        super().__init__()
        self.dropout = dropout
        self.pooling = pooling

        # Heads are registered as 'attention_<idx>' sub-modules (this fixes the
        # parameter names) and additionally kept in a plain list for iteration.
        heads = []
        for head_idx in range(nheads):
            head = GraphAttentionLayer(nfeat, nhid, dropout=dropout, alpha=alpha, concat=True)
            heads.append(head)
        self.attentions = heads
        for head_idx, head in enumerate(heads):
            self.add_module('attention_{}'.format(head_idx), head)

        self.out_att = GraphAttentionLayer(nhid * nheads, nhid, dropout=dropout, alpha=alpha, concat=False)

        self.classifier = nn.Sequential(
            nn.Linear(nfeat + nhid, nhid),
            nn.PReLU(),
            nn.BatchNorm1d(nhid),
            nn.Linear(nhid, nclass),
        )

    def forward_gat(self, x):
        """Run the two attention layers on ``x`` and pool over the node dimension."""
        x = F.dropout(x, self.dropout, training=self.training)
        head_outputs = [head(x) for head in self.attentions]
        x = torch.cat(head_outputs, dim=2)
        x = F.dropout(x, self.dropout, training=self.training)
        x = F.elu(self.out_att(x))

        if self.pooling == 'first':
            return x[:, 0]
        if self.pooling == 'mean':
            return x.mean(dim=1)
        # Any other pooling value yields None, exactly as before.
        return None

    def forward(self, feats, neighbor_feats):
        """Return one logit per pair (the size-1 class dimension is squeezed away)."""
        pooled = self.forward_gat(neighbor_feats)
        combined = torch.cat([feats, pooled], dim=1)
        logits = self.classifier(combined)
        return logits.squeeze(1)

In [26]:
import time
from contextlib import contextmanager
from collections import defaultdict

# Accumulated wall-clock seconds per timer title.
map_used_time = defaultdict(float)

@contextmanager
def timer(title):
    """Time the enclosed block, add the duration to ``map_used_time[title]`` and print it.

    The duration is recorded and printed even when the block raises; the exception
    then propagates unchanged.
    """
    t0 = time.time()
    try:
        yield
    finally:
        tt = time.time() - t0
        map_used_time[title] += tt
        print("  {} - done in {:.5f}s".format(title, tt))

In [27]:
# Reload the posting table for the GAT section (titles and posting ids are used below).
df, img_dir = load_data()

In [28]:
# Marketing filler words that are ignored when vectorising the titles.
stop_words = set([
    'promo','diskon','baik','terbaik', 'murah',
    'termurah', 'harga', 'price', 'best', 'seller',
    'bestseller', 'ready', 'stock', 'stok', 'limited',
    'bagus', 'kualitas', 'berkualitas', 'hari', 'ini',
    'jadi', 'gratis',
])

# Translation table mapping every punctuation character to a space; built once
# instead of once per title.
punctuation_to_space = str.maketrans({_: ' ' for _ in string.punctuation})

# Lower-cased titles with punctuation replaced by spaces.
titles = [
    title.translate(punctuation_to_space)
    for title in df['title'].str.lower().values
]

# NOTE: this rebinds the global name `tokenizer`, which earlier cells used for the
# BERT tokenizers.
tokenizer = TweetTokenizer()
# stop_words is handed over as a list: newer scikit-learn releases validate the
# parameter type and reject a set, while older releases accept both.
tfidf_vectorizer = TfidfVectorizer(stop_words=list(stop_words),
                                   binary=True,
                                   min_df=2,
                                   token_pattern='(?u)\\b\\w+\\b',
                                   tokenizer=tokenizer.tokenize,
                                   dtype=np.float32,
                                   norm='l2')

tfidf_feats = tfidf_vectorizer.fit_transform(titles)

# Rows are L2-normalised (norm='l2'), so this product holds pairwise cosine similarities.
simmat_tfidf = tfidf_feats @ tfidf_feats.T

In [29]:
# Load the query-expanded neighbour lists written by the QE cells, keep the first k
# columns, and turn each modality into a {(query, neighbour): similarity} dict.
with timer('load'):
    st_sizes, img_hs, img_ws = joblib.load('/tmp/lyk_img_meta_data.pkl')
    similarities_img = np.load('/tmp/img_D_qe.npy')[:, :k]
    indexes_img = np.load('/tmp/img_I_qe.npy')[:, :k]

    similarities_bert = np.load('/tmp/brt_D_qe.npy')[:, :k]
    indexes_bert = np.load('/tmp/brt_I_qe.npy')[:, :k]

    similarities_mm = np.load('/tmp/mut_D_qe.npy')[:, :k]
    indexes_mm = np.load('/tmp/mut_I_qe.npy')[:, :k]
    
    # `col` repeats every query index k times so that it lines up with the flattened
    # neighbour indices in `row`; this assumes each index array has exactly k columns.
    row = indexes_img.ravel()
    col = np.arange(len(indexes_img)).repeat(k)
    data = similarities_img.ravel()
    simmat_img = {(i, j): d for i, j, d in zip(col, row, data)}
    
    row = indexes_bert.ravel()
    col = np.arange(len(indexes_bert)).repeat(k)
    data = similarities_bert.ravel()
    simmat_bert = {(i, j): d for i, j, d in zip(col, row, data)}

    row = indexes_mm.ravel()
    col = np.arange(len(indexes_mm)).repeat(k)
    data = similarities_mm.ravel()
    simmat_mm = {(i, j): d for i, j, d in zip(col, row, data)}

# The flattened helper arrays are no longer needed once the dicts exist.
del row, col, data
gc.collect()

  load - done in 0.07559s


0

In [30]:
# Checkpoint of the GAT pair classifier: hyper-parameters under 'params', weights
# under 'model' (loaded further below).
ckpt = torch.load('../input/shopee-meta-models/v135.pth')
params = ckpt['params']

In [31]:
# Display the hyper-parameters stored with the GAT checkpoint.
params

{'ver': 'v135',
 'lr': 0.003,
 'batch_size': 1024,
 'optimizer': 'adamw',
 'epochs': 12,
 'wd': 0.01,
 'filter_wd': True,
 'nhid': 16,
 'nheads': 16,
 'pooling': 'first',
 'dropout': 0.0,
 'k': 10}

In [32]:
# a dict storing top similars for each item
top_neighbors = defaultdict(list)
# a dict of dict, acting like a matrix, storing target edge/pair_tuple features
feats = defaultdict(lambda: defaultdict())

pair_tuples = []
for i in tqdm(range(len(df))):
    # obtain the top k img_similar and top k bert_similar indices
    right_indexes = set(indexes_img[i, :k].tolist() + indexes_bert[i, :k].tolist())
    # remove self; discard() does not raise when the item is missing from its own
    # neighbour list (possible when more than k items share identical features)
    right_indexes.discard(i)
    # faiss pads result lists with -1 when fewer than k neighbours exist
    right_indexes.discard(-1)
    right_indexes = list(right_indexes)

    scores = {}
    for j in right_indexes:
        sim_img = simmat_img.get((i, j), 0)
        sim_bert = simmat_bert.get((i, j), 0)
        sim_mm = simmat_mm.get((i, j), 0)
        sim_tfidf = simmat_tfidf[i, j]
        if sim_img == 0 and sim_bert == 0:
            continue

        # Register the pair only once its features exist: GraphDataset looks up
        # feats[i][j] for every entry of pair_tuples and would raise KeyError for a
        # pair that was skipped above.
        pair_tuples.append((i, j))

        feats[i][j] = [
            sim_img,
            sim_tfidf,
            sim_bert,
            sim_mm,
        ]
        scores[j] = sim_img + sim_bert + sim_mm + sim_tfidf

    # Rank only the scored candidates (same order as before whenever nothing was
    # skipped: dict order follows right_indexes and the sort is stable).
    top_neighbors[i] = sorted(scores, key=scores.get, reverse=True)[:params['k']]

100%|██████████| 1000/1000 [00:04<00:00, 226.06it/s]


In [33]:
# Pair dataset without labels/weights: every sample is (pair features, neighbourhood features).
dataset = GraphDataset(
    feats=feats,
    pair_tuples=pair_tuples,
    k=params['k'],
    top_neighbors=top_neighbors,
)

loader = DataLoader(dataset, batch_size=2 ** 12, shuffle=False, drop_last=False, num_workers=2, pin_memory=True)

# Number of features per edge, read from the first stored feature vector rather than
# from the loop variables i, j left over by the previous cell.
nfeat = len(next(vec for per_item in feats.values() for vec in per_item.values()))

gat = GATPairClassifier(nfeat=nfeat, nhid=params['nhid'],
                        dropout=params['dropout'], nheads=params['nheads'], pooling=params['pooling'])
gat.to('cuda').eval()
gat.load_state_dict(ckpt['model'])

# The sparse TF-IDF matrix is no longer needed once simmat_tfidf has been consumed.
del tfidf_feats
gc.collect()

26

In [34]:
# Score every candidate pair with the GAT classifier.
preds = []
# NOTE(review): the loop variable `feats` rebinds the global feature dict built earlier;
# the dataset keeps its own reference, but `feats` is a tensor after this cell.
for feats, neighbor_feats in tqdm(loader, desc='predict', leave=False):
    feats = feats.to('cuda', non_blocking=True)
    neighbor_feats = neighbor_feats.to('cuda', non_blocking=True)
    with torch.no_grad():
        # sigmoid turns the logits into probabilities in [0, 1]
        pred = gat(feats, neighbor_feats).sigmoid().detach().cpu().numpy().tolist()
        preds.extend(pred)

# Threshold subtracted from the probabilities below, so pairs at or above it end up with pred >= 0.
conf_th_gcn = 0.3
df_pair = pd.DataFrame()
col, row = list(zip(*pair_tuples))
df_pair['i'] = col
df_pair['j'] = row

# Translate positional indices into posting ids.
df_pair['posting_id'] = df['posting_id'].values[df_pair['i'].values]
df_pair['posting_id_target'] = df['posting_id'].values[df_pair['j'].values]

df_pair = df_pair[['posting_id', 'posting_id_target']]
# preds follows the order of pair_tuples because the loader does not shuffle.
df_pair['pred'] = preds
df_pair['pred'] -= conf_th_gcn

df_pair.to_pickle('submission_gcn.pkl')
df_pair

                                                        

Unnamed: 0,posting_id,posting_id_target,pred
0,train_129225211,train_496484454,-0.289052
1,train_129225211,train_3009013664,-0.296500
2,train_129225211,train_1180155871,-0.289716
3,train_129225211,train_4204187863,-0.205241
4,train_129225211,train_916038872,-0.297061
...,...,...,...
93227,train_4148177698,train_3978236021,-0.299419
93228,train_4148177698,train_3755240481,-0.299024
93229,train_4148177698,train_2592728479,-0.299977
93230,train_4148177698,train_1045170109,-0.298511


# LGB

In [35]:
# Accumulated wall-clock seconds per timer title (reset for the LGB section; this cell
# redefines the helper that was already declared for the GAT section).
map_used_time = defaultdict(float)

@contextmanager
def timer(title):
    """Time the enclosed block, add the duration to ``map_used_time[title]`` and print it.

    The duration is recorded and printed even when the block raises; the exception
    then propagates unchanged.
    """
    t0 = time.time()
    try:
        yield
    finally:
        tt = time.time() - t0
        map_used_time[title] += tt
        print("  {} - done in {:.5f}s".format(title, tt))

In [36]:
df, img_dir = load_data()

# Words handed to the TF-IDF vectorizer as stop words.
stop_words = {
    'promo', 'diskon', 'baik', 'terbaik', 'murah',
    'termurah', 'harga', 'price', 'best', 'seller',
    'bestseller', 'ready', 'stock', 'stok', 'limited',
    'bagus', 'kualitas', 'berkualitas', 'hari', 'ini',
    'jadi', 'gratis',
}

# Build the punctuation -> space table once instead of once per title.
punct_to_space = str.maketrans({ch: ' ' for ch in string.punctuation})

# Lower-cased titles with every ASCII punctuation character replaced by a space.
titles = [title.translate(punct_to_space) for title in df['title'].str.lower().values]

In [37]:
def _build_simmat(similarities, indexes):
    """Turn parallel neighbour arrays into a ``{(query, neighbour): similarity}`` dict.

    Args:
        similarities: 2-D array; ``similarities[q, r]`` is the score of the r-th neighbour of q.
        indexes: 2-D array of the same shape holding the neighbour row ids.

    Returns:
        dict mapping ``(q, indexes[q, r])`` to ``similarities[q, r]``.
    """
    # Use the stored neighbour count rather than the global `k`: if the two ever differ,
    # repeating by `k` would silently pair scores with the wrong query rows.
    n_queries, n_neighbors = indexes.shape
    query_ids = np.arange(n_queries).repeat(n_neighbors)
    return {(i, j): d for i, j, d in zip(query_ids, indexes.ravel(), similarities.ravel())}


tokenizer = TweetTokenizer()
# stop_words is passed as a list: recent scikit-learn releases validate the parameter
# and reject a set, while older releases accept either.
tfidf_vectorizer = TfidfVectorizer(stop_words=list(stop_words),
                                   binary=True,
                                   min_df=2,
                                   token_pattern='(?u)\\b\\w+\\b',
                                   tokenizer=tokenizer.tokenize,
                                   dtype=np.float32,
                                   norm='l2')
tfidf_feats = tfidf_vectorizer.fit_transform(titles)

with timer('load'):
    # Neighbour search results and image metadata cached under /tmp by earlier cells.
    similarities_bert, indexes_bert = joblib.load('/tmp/lyk_bert_data.pkl')
    similarities_img, indexes_img = joblib.load('/tmp/lyk_img_data.pkl')
    st_sizes, img_hs, img_ws = joblib.load('/tmp/lyk_img_meta_data.pkl')
    similarities_mm, indexes_mm = joblib.load('/tmp/lyk_mm_data.pkl')

    # Pair -> similarity lookups, one per embedding source.
    simmat_bert = _build_simmat(similarities_bert, indexes_bert)
    simmat_img = _build_simmat(similarities_img, indexes_img)
    simmat_mm = _build_simmat(similarities_mm, indexes_mm)

gc.collect()

  load - done in 0.29678s


0

In [38]:
def _standardize(values):
    """Shift ``values`` to zero mean and scale them to unit standard deviation."""
    return (values - values.mean()) / values.std()


def _topn_mean(similarities, topn):
    """Standardised per-row mean of the first ``topn`` similarity columns."""
    return _standardize(similarities[:, :topn].mean(1))


def _neighbor_mean(per_item_stat, indexes, topn):
    """Standardised mean of ``per_item_stat`` over each row's first ``topn`` neighbour ids."""
    return _standardize(per_item_stat[indexes[:, :topn]].mean(1))


# Image embeddings.
mean_sim_img_top5 = _topn_mean(similarities_img, 5)
mean_sim_img_top15 = _topn_mean(similarities_img, 15)
mean_sim_img_top30 = _topn_mean(similarities_img, 30)
mean_mean_sim_img_top5 = _neighbor_mean(mean_sim_img_top5, indexes_img, 5)

# BERT (title) embeddings.
mean_sim_bert_top5 = _topn_mean(similarities_bert, 5)
mean_sim_bert_top15 = _topn_mean(similarities_bert, 15)
mean_sim_bert_top30 = _topn_mean(similarities_bert, 30)
mean_mean_sim_bert_top5 = _neighbor_mean(mean_sim_bert_top5, indexes_bert, 5)

# Multi-modal embeddings.
mean_sim_mm_top5 = _topn_mean(similarities_mm, 5)
mean_sim_mm_top15 = _topn_mean(similarities_mm, 15)
mean_sim_mm_top30 = _topn_mean(similarities_mm, 30)
mean_mean_sim_mm_top5 = _neighbor_mean(mean_sim_mm_top5, indexes_mm, 5)

In [39]:
row_titles = df['title'].values
posting_ids = df['posting_id'].values

# Feature rows are flushed to disk in chunks to bound peak memory.
tmp_dir = Path('/tmp/rows')
tmp_dir.mkdir(exist_ok=True, parents=True)

rows = []
# Remember exactly which chunk files this run wrote; globbing the directory would also
# pick up stale chunks left behind by an earlier run with a different dataset size.
chunk_paths = []
for i in tqdm(range(len(df))):
    # Candidate targets: union of the image and BERT neighbour lists of item i.
    right_indexes = set(indexes_img[i].tolist() + indexes_bert[i].tolist())

    for j in right_indexes:
        if i == j:
            continue
        # Pairs missing from a lookup get similarity 0 for that source.
        sim_img = simmat_img.get((i, j), 0)
        sim_bert = simmat_bert.get((i, j), 0)
        sim_mm = simmat_mm.get((i, j), 0)
        if sim_img == 0 and sim_bert == 0:
            continue

        rows.append({
            'i': i,
            'j': j,
            'posting_id': posting_ids[i],
            'posting_id_target': posting_ids[j],
            'sim_img': sim_img,
            'sim_bert': sim_bert,
            'sim_mm': sim_mm,
            # Edit distance uses the cleaned titles; the length features use the raw titles.
            'edit_distance': editdistance.eval(titles[i], titles[j]),
            'title_len': len(row_titles[i]),
            'title_len_target': len(row_titles[j]),
            'title_num_words': len(row_titles[i].split()),
            'title_num_words_target': len(row_titles[j].split()),
            'mean_sim_img_top5': mean_sim_img_top5[i],
            'mean_sim_img_target_top5': mean_sim_img_top5[j],
            'mean_sim_bert_top5': mean_sim_bert_top5[i],
            'mean_sim_bert_target_top5': mean_sim_bert_top5[j],
            'mean_sim_img_top15': mean_sim_img_top15[i],
            'mean_sim_img_target_top15': mean_sim_img_top15[j],
            'mean_sim_bert_top15': mean_sim_bert_top15[i],
            'mean_sim_bert_target_top15': mean_sim_bert_top15[j],
            'mean_sim_img_top30': mean_sim_img_top30[i],
            'mean_sim_img_target_top30': mean_sim_img_top30[j],
            'mean_sim_bert_top30': mean_sim_bert_top30[i],
            'mean_sim_bert_target_top30': mean_sim_bert_top30[j],
            'st_size': st_sizes[i],
            'st_size_target': st_sizes[j],
            'wxh/st_size': img_ws[i] * img_hs[i] / st_sizes[i],
            'wxh/st_size_target': img_ws[j] * img_hs[j] / st_sizes[j],
            'mean_mean_sim_img_top5': mean_mean_sim_img_top5[i],
            'mean_mean_sim_img_target_top5': mean_mean_sim_img_top5[j],
            'mean_mean_sim_bert_top5': mean_mean_sim_bert_top5[i],
            'mean_mean_sim_bert_target_top5': mean_mean_sim_bert_top5[j],
            'mean_sim_mm_top5': mean_sim_mm_top5[i],
            'mean_sim_mm_target_top5': mean_sim_mm_top5[j],
            'mean_sim_mm_top15': mean_sim_mm_top15[i],
            'mean_sim_mm_target_top15': mean_sim_mm_top15[j],
            'mean_sim_mm_top30': mean_sim_mm_top30[i],
            'mean_sim_mm_target_top30': mean_sim_mm_top30[j],
            'mean_mean_sim_mm_top5': mean_mean_sim_mm_top5[i],
            'mean_mean_sim_mm_target_top5': mean_mean_sim_mm_top5[j],
        })

    # Flush every 10000 source items and once more after the final item.
    if i % 10000 == 9999 or i == len(df) - 1:
        tmp_df = pd.DataFrame(rows)
        # Halve the footprint of numeric columns before writing.
        for col in tmp_df.columns:
            if tmp_df[col].dtype == 'float64':
                tmp_df[col] = tmp_df[col].astype('float32')
            elif tmp_df[col].dtype == 'int64':
                tmp_df[col] = tmp_df[col].astype('int32')
        chunk_path = tmp_dir / f'{i}.feather'
        tmp_df.to_feather(chunk_path)
        chunk_paths.append(chunk_path)
        rows = []

df.drop(['image', 'title'], axis=1, inplace=True)
del (
    mean_sim_img_top5, mean_sim_img_top15, mean_sim_img_top30, mean_mean_sim_img_top5,
    mean_sim_bert_top5, mean_sim_bert_top15, mean_sim_bert_top30, mean_mean_sim_bert_top5,
    mean_sim_mm_top5, mean_sim_mm_top15, mean_sim_mm_top30, mean_mean_sim_mm_top5,
    simmat_img, simmat_bert, simmat_mm,
    similarities_img, indexes_img,
    similarities_bert, indexes_bert,
    similarities_mm, indexes_mm,
)
gc.collect()

with timer('to_frame'):
    # Read back only the chunks written above, in the order they were written.
    df_pair = pd.concat([pd.read_feather(path) for path in chunk_paths], axis=0).reset_index(drop=True)

del rows
gc.collect()

with timer('sim_tfidf'):
    # Row-wise dot product of the TF-IDF vectors of both titles. The sparse sum comes back
    # as an (n, 1) matrix, so flatten it to a plain 1-D array before storing it as a column.
    pair_products = tfidf_feats[df_pair['i'].values].multiply(tfidf_feats[df_pair['j'].values])
    df_pair['sim_tfidf'] = np.asarray(pair_products.sum(axis=1)).ravel()
df_pair['title_len_diff'] = np.abs(df_pair['title_len'] - df_pair['title_len_target'])
df_pair['title_num_words_diff'] = np.abs(df_pair['title_num_words'] - df_pair['title_num_words_target'])

del pair_products
del tfidf_feats
gc.collect()

100%|██████████| 1000/1000 [00:03<00:00, 301.83it/s]


  to_frame - done in 0.04775s
  sim_tfidf - done in 0.01812s


13

In [40]:
from cuml import ForestInference
import treelite

# NOTE: joblib.load unpickles arbitrary objects; only load booster files from a trusted source.
boosters = list(joblib.load('../input/shopee/boosters_v34_v45_mm.pickle'))

# Convert every booster to a ForestInference model by round-tripping through a
# LightGBM model file that treelite can parse.
list_clf = []
for booster in boosters:
    booster.save_model('/tmp/tmp.lgb')
    fi = ForestInference()
    fi.load_from_treelite_model(treelite.Model.load('/tmp/tmp.lgb', model_format='lightgbm'))
    list_clf.append(fi)

# Feature order expected by the models. It is read explicitly from the last booster,
# which is the same object the leaked loop variable used to refer to.
feature_names = boosters[-1].feature_name()

X = df_pair[[
    'sim_img', 'sim_tfidf', 'sim_bert', 'sim_mm', 'edit_distance',
    'title_len', 'title_len_target', 'title_len_diff',
    'title_num_words', 'title_num_words_target', 'title_num_words_diff',
    'mean_sim_img_top5', 'mean_sim_img_target_top5',
    'mean_sim_bert_top5', 'mean_sim_bert_target_top5',
    'mean_sim_mm_top5', 'mean_sim_mm_target_top5',
    'mean_sim_img_top15', 'mean_sim_img_target_top15',
    'mean_sim_bert_top15', 'mean_sim_bert_target_top15',
    'mean_sim_mm_top15', 'mean_sim_mm_target_top15',
    'mean_sim_img_top30', 'mean_sim_img_target_top30',
    'mean_sim_bert_top30', 'mean_sim_bert_target_top30',
    'mean_sim_mm_top30', 'mean_sim_mm_target_top30',
    'st_size', 'st_size_target',
    'wxh/st_size', 'wxh/st_size_target',
    'mean_mean_sim_img_top5', 'mean_mean_sim_img_target_top5',
    'mean_mean_sim_bert_top5', 'mean_mean_sim_bert_target_top5',
    'mean_mean_sim_mm_top5', 'mean_mean_sim_mm_target_top5',
]]

# Passing a cupy array might avoid multiple copies to the GPU (one per model).
X = cp.asarray(X[feature_names].values.astype(np.float32))
# .copy() makes the two-column frame independent so adding 'pred' is not a write to a slice.
df_pair = df_pair[['posting_id', 'posting_id_target']].copy()

gc.collect()

with timer('predict'):
    # Average the models and shift by conf_th so that positive values mean "predicted match".
    df_pair['pred'] = np.mean([model.predict(X).get() for model in list_clf], axis=0) - conf_th

df_pair.to_pickle('submission_lgb.pkl')

[11:40:27] ../src/frontend/lightgbm.cc:544: model.num_tree = 1630
[11:40:28] ../src/frontend/lightgbm.cc:544: model.num_tree = 1630
[11:40:28] ../src/frontend/lightgbm.cc:544: model.num_tree = 1630
[11:40:29] ../src/frontend/lightgbm.cc:544: model.num_tree = 1630
[11:40:29] ../src/frontend/lightgbm.cc:544: model.num_tree = 1630
  predict - done in 0.56358s


# Postprocess

In [41]:
df_gcn = pd.read_pickle('submission_gcn.pkl')
df_lgb = pd.read_pickle('submission_lgb.pkl')

# Ensemble weights: the GCN score counts twice, the LGB score once.
# The sum of the contributions is divided by 3 when the frames are combined.
df_gcn['pred'] *= 2
df_lgb['pred'] *= 1

# for adding self-to-self rows back
if DEBUG:
    nrows = 1000
    df_self = pd.read_csv('../input/shopee-product-matching/train.csv', nrows=nrows, usecols=['posting_id'])
else:
    df_self = pd.read_csv('../input/shopee-product-matching/test.csv', usecols=['posting_id'])
# Build the self pairs entirely from df_self. The neutral score belongs on df_self
# (previously it was written to the unrelated frame `df`, leaving df_self without a
# 'pred' column); a score of 0 adds nothing to the summed ensemble score.
df_self['posting_id_target'] = df_self['posting_id']
df_self['pred'] = 0

In [42]:
# Sum the weighted scores of every (posting_id, posting_id_target) pair and normalise by 3.
df_pred = (
    pd.concat([df_gcn, df_lgb, df_self], axis=0, ignore_index=True)
    .groupby(['posting_id', 'posting_id_target'])[['pred']]
    .sum()
    / 3
)

# Every item always matches itself: pin self pairs to a fixed positive score.
pair_index = df_pred.index
is_self_pair = pair_index.get_level_values('posting_id') == pair_index.get_level_values('posting_id_target')
df_pred.loc[is_self_pair, 'pred'] = 0.5

# Keep positively scored pairs only ...
df_pred = df_pred.query('pred > 0')

# ... and of those, only pairs whose reverse (target, source) also survived. The membership
# test is vectorised over the MultiIndex instead of a Python-level lookup per row.
is_mutual = df_pred.index.swaplevel(0, 1).isin(df_pred.index)
df_pred = df_pred[is_mutual].reset_index()

df_pred

Unnamed: 0,posting_id,posting_id_target,pred
0,train_1002628427,train_1002628427,0.500000
1,train_1010868925,train_1010868925,0.500000
2,train_1010868925,train_4184037897,0.562619
3,train_1011324296,train_1011324296,0.500000
4,train_1011324296,train_1343380721,0.229234
...,...,...,...
1847,train_993003820,train_993003820,0.500000
1848,train_993079226,train_993079226,0.500000
1849,train_998568945,train_2413283241,0.226483
1850,train_998568945,train_998568945,0.500000


Remove edges by betweenness centrality:

In [43]:
from networkx import edge_betweenness_centrality

# Thresholds used when pruning the match graph.
# NOTE(review): the values look like the result of a parameter search -- confirm their origin.
WEAK_EDGE_MAX_WEIGHT = 0.15780210284453428   # only edges lighter than this may be cut
MIN_BETWEENNESS_TO_CUT = 0.11766651703447985  # cut only if betweenness exceeds this
MIN_COMPONENT_SIZE = 6                        # components must be larger than this to be split
MAX_MATCHES = 51                              # cap on the number of ids written per posting

# Undirected match graph: one weighted edge per predicted pair (self pairs become self-loops).
G = nx.Graph()
for i, j, w in df_pred[['posting_id', 'posting_id_target', 'pred']].values:
    G.add_edge(i, j, weight=w)

list_remove_edges = []
list_add_edges = []

def split_graph(G):
    """Recursively record weak, high-betweenness edges of ``G`` in ``list_remove_edges``.

    For a connected graph, the low-weight edge with the highest edge betweenness is
    removed (if it exceeds the betweenness threshold) and the graph is examined again.
    For a disconnected graph, every component with more than ``MIN_COMPONENT_SIZE``
    nodes is processed on its own copy.

    Args:
        G: networkx graph with a ``weight`` attribute on every edge. When ``G`` is
            connected, edges are removed from it in place.
    """
    list_comp = list(nx.connected_components(G))
    if len(list_comp) == 1:
        map_bet = edge_betweenness_centrality(G, normalized=True)
        # Only weak edges are candidates for removal.
        map_bet = {(i, j): w for (i, j), w in map_bet.items()
                   if G[i][j]['weight'] < WEAK_EDGE_MAX_WEIGHT}
        if len(map_bet) == 0:
            return
        edge, val = max(map_bet.items(), key=lambda x: x[1])
        if val > MIN_BETWEENNESS_TO_CUT:
            G.remove_edge(*edge)
            list_remove_edges.append(edge)
            return split_graph(G)
    else:
        for comp in list_comp:
            if len(comp) > MIN_COMPONENT_SIZE:
                # nx.Graph(...) copies the subgraph view so removals do not touch the parent.
                split_graph(nx.Graph(G.subgraph(comp)))

split_graph(G)

# Replay the recorded removals on the full graph. When the full graph was a single
# component, split_graph already removed those edges from G itself, so skip edges that
# are gone instead of letting remove_edge raise.
for edge in list_remove_edges:
    if G.has_edge(*edge):
        G.remove_edge(*edge)

def get_score(i, j):
    """Return the weight of edge ``(i, j)`` in ``G``, or -1 when there is no such edge."""
    try:
        return G[i][j]['weight']
    except KeyError:
        return -1

posting_ids = df_pred['posting_id'].unique()

matches = []
for i in posting_ids:
    if i in G:
        m = list(set([i] + list(G.neighbors(i))))
    else:
        m = [i]
    # Keep only the strongest matches when there are too many.
    if len(m) > MAX_MATCHES:
        m = sorted(m, key=lambda x: get_score(i, x), reverse=True)[:MAX_MATCHES]
    matches.append(' '.join(m))

matched = pd.DataFrame(dict(posting_id=posting_ids, matches=matches))

matched.to_csv('submission.csv', index=False)
matched

Unnamed: 0,posting_id,matches
0,train_1002628427,train_1002628427
1,train_1010868925,train_4184037897 train_1010868925
2,train_1011324296,train_1011324296 train_1343380721
3,train_1012209986,train_1012209986 train_1969570411
4,train_1014754278,train_1014754278
...,...,...
995,train_990204630,train_990204630 train_1724760722
996,train_993003820,train_993003820
997,train_993079226,train_993079226
998,train_998568945,train_2413283241 train_998568945
