Reference: https://www.kaggle.com/code/finlay/unsupervised-image-text-baseline-in-20min

In [1]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import cudf, cuml, cupy

# Root folder of the competition data.
DATA_PATH = '../input/shopee-product-matching/'

# Run local cross-validation only when test.csv holds exactly three rows.
# NOTE(review): presumably the public sample test file has 3 rows while the
# file used at submission time is larger — confirm.
test = pd.read_csv(DATA_PATH + 'test.csv')
COMPUTE_CV = (test.shape[0] == 3)

In [2]:
def getMetric(col):
    """Build a row-wise F1 scorer for the predictions stored in column ``col``.

    The returned function takes one row (something supporting both ``row.target``
    and ``row[col]``) and returns ``2 * |target ∩ pred| / (|target| + |pred|)``.
    """
    def f1score(row):
        actual = row.target
        predicted = row[col]
        n_common = len(np.intersect1d(actual, predicted))
        return 2 * n_common / (len(actual) + len(predicted))
    return f1score

In [3]:
# Choose which split to work on: train.csv for training / local CV,
# test.csv for submitting.
if COMPUTE_CV:
    csv_name, image_dir = 'train.csv', 'train_images/'
else:
    csv_name, image_dir = 'test.csv', 'test_images/'

# pandas copy for bookkeeping (image paths point at the matching image folder),
# cudf copy of the same CSV for the GPU text pipeline.
train = pd.read_csv(DATA_PATH + csv_name)
train['image'] = DATA_PATH + image_dir + train['image']
train_gf = cudf.read_csv(DATA_PATH + csv_name)

if COMPUTE_CV:
    # Ground truth: every posting_id that shares the row's label_group.
    target_mapping = train.groupby('label_group').posting_id.agg('unique').to_dict()
    train['target'] = train.label_group.map(target_mapping)

print('train shape is', train.shape )
train.head()

train shape is (34250, 6)


Unnamed: 0,posting_id,image,image_phash,title,label_group,target
0,train_129225211,../input/shopee-product-matching/train_images/...,94974f937d4c2433,Paper Bag Victoria Secret,249114794,"[train_129225211, train_2278313361]"
1,train_3386243561,../input/shopee-product-matching/train_images/...,af3f9460c2838f0f,"Double Tape 3M VHB 12 mm x 4,5 m ORIGINAL / DO...",2937985045,"[train_3386243561, train_3423213080]"
2,train_2288590299,../input/shopee-product-matching/train_images/...,b94cb00ed3e50f78,Maling TTS Canned Pork Luncheon Meat 397 gr,2395904891,"[train_2288590299, train_3803689425]"
3,train_2406599165,../input/shopee-product-matching/train_images/...,8514fc58eafea283,Daster Batik Lengan pendek - Motif Acak / Camp...,4093212188,"[train_2406599165, train_3342059966]"
4,train_3369186413,../input/shopee-product-matching/train_images/...,a6f319f924ad708c,Nescafe \xc3\x89clair Latte 220ml,3648931069,"[train_3369186413, train_921438619]"


## Image Hash

In [4]:
# Baseline prediction: postings whose images have an identical perceptual hash
# are treated as matches of each other.
phash_mapping = (
    train
    .groupby('image_phash')['posting_id']
    .agg('unique')
    .to_dict()
)
train['oof_hash'] = train['image_phash'].map(phash_mapping)
train.head()

Unnamed: 0,posting_id,image,image_phash,title,label_group,target,oof_hash
0,train_129225211,../input/shopee-product-matching/train_images/...,94974f937d4c2433,Paper Bag Victoria Secret,249114794,"[train_129225211, train_2278313361]",[train_129225211]
1,train_3386243561,../input/shopee-product-matching/train_images/...,af3f9460c2838f0f,"Double Tape 3M VHB 12 mm x 4,5 m ORIGINAL / DO...",2937985045,"[train_3386243561, train_3423213080]",[train_3386243561]
2,train_2288590299,../input/shopee-product-matching/train_images/...,b94cb00ed3e50f78,Maling TTS Canned Pork Luncheon Meat 397 gr,2395904891,"[train_2288590299, train_3803689425]",[train_2288590299]
3,train_2406599165,../input/shopee-product-matching/train_images/...,8514fc58eafea283,Daster Batik Lengan pendek - Motif Acak / Camp...,4093212188,"[train_2406599165, train_3342059966]",[train_2406599165]
4,train_3369186413,../input/shopee-product-matching/train_images/...,a6f319f924ad708c,Nescafe \xc3\x89clair Latte 220ml,3648931069,"[train_3369186413, train_921438619]",[train_3369186413]


In [5]:
# Score the phash-only predictions against the ground truth (CV runs only).
if COMPUTE_CV:
    hash_scorer = getMetric('oof_hash')
    train['f1'] = train.apply(hash_scorer, axis=1)
    print('CV score for baseline =', train['f1'].mean())

CV score for baseline = 0.5530933399168149


## Image CNN

In [6]:
import torch
# Seed PyTorch's random number generator.
torch.manual_seed(0)
# cuDNN settings: do not force deterministic kernels, and let cuDNN benchmark
# convolution algorithms to pick a fast one for the input sizes it sees.
torch.backends.cudnn.deterministic = False
torch.backends.cudnn.benchmark = True

import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

import torchvision.models as models
import torchvision.transforms as transforms

from PIL import Image

from sklearn.preprocessing import normalize

In [7]:
# MODEL_PATH = "../input/pretrained-pytorch-models/resnet18-5c106cde.pth"

In [8]:
class ShopeeImageDataset(Dataset):
    """Dataset that reads images from disk and returns them transformed.

    Args:
        img_path: indexable, sized collection of image file paths.
        transform: callable applied to the RGB image; its return value is
            what ``__getitem__`` yields.
    """

    def __init__(self, img_path, transform):
        self.img_path = img_path
        self.transform = transform

    def __getitem__(self, index):
        """Return the transformed RGB image stored at position ``index``."""
        # Open through a context manager so the underlying file handle is
        # released as soon as the pixels have been converted, instead of
        # waiting for garbage collection. ``convert`` returns a new image,
        # so the result stays usable after the source is closed.
        with Image.open(self.img_path[index]) as raw:
            img = raw.convert('RGB')
        return self.transform(img)

    def __len__(self):
        """Number of images in the dataset."""
        return len(self.img_path)

In [9]:
# Every image is resized to 512x512, converted to a tensor and normalised
# with the per-channel mean / std given below.
image_dataset = ShopeeImageDataset(
    img_path=train['image'].values,
    transform=transforms.Compose([
        transforms.Resize((512, 512)),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ]),
)

# shuffle=False keeps the batches in the same order as the rows of `train`.
image_loader = DataLoader(
    dataset=image_dataset,
    batch_size=10,
    shuffle=False,
    num_workers=2,
)

In [10]:
class ShopeeImageEmbeddingNet(nn.Module):
    """Pretrained ResNet-18 used as an image feature extractor.

    The network's ``avgpool`` layer is replaced by a global max pool and the
    last child module is dropped, so ``forward`` returns the pooled feature
    map instead of classification logits.
    """

    def __init__(self):
        super().__init__()

        backbone = models.resnet18(pretrained=True)
        # Alternative kept for reference: load the weights from a local file
        #     backbone = models.resnet18(pretrained=False)
        #     backbone.load_state_dict(torch.load(MODEL_PATH), strict=True)
        backbone.avgpool = nn.AdaptiveMaxPool2d(output_size=(1, 1))

        # Keep every child module except the last one.
        layers = list(backbone.children())
        trunk = nn.Sequential(*layers[:-1])
        trunk.eval()
        self.model = trunk

    def forward(self, img):
        """Run ``img`` through the truncated backbone and return its output."""
        return self.model(img)

In [11]:
device = 'cuda'
img_model = ShopeeImageEmbeddingNet().to(device)

# Collect one numpy array of embeddings per batch; gradients are not needed.
image_feat = []
with torch.no_grad():
    for batch in tqdm(image_loader):
        pooled = img_model(batch.to(device))
        # Keep only the first two dimensions: (batch, channels).
        pooled = pooled.reshape(pooled.shape[0], pooled.shape[1])
        image_feat.append(pooled.data.cpu().numpy())

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth


  0%|          | 0.00/44.7M [00:00<?, ?B/s]

  0%|          | 0/3425 [00:00<?, ?it/s]

In [12]:
# Stack the per-batch arrays into one matrix and L2-normalise each row to unit
# length, so that the dot products taken in the similarity search below are
# cosine similarities.
image_feat = normalize(np.vstack(image_feat))

In [13]:
preds = []
CHUNK = 1024 * 4

image_feat = cupy.array(image_feat)
n_images = len(image_feat)
posting_ids = train.posting_id.values

print('Finding similar images...')

# Walk over the rows in blocks of CHUNK so that only a (chunk x all) slice of
# the similarity matrix is materialised on the GPU at any time.
for a in range(0, n_images, CHUNK):
    b = min(a + CHUNK, n_images)
    print('chunk', a, 'to', b)

    # Row k holds the dot products of embedding a+k with every embedding.
    sims = cupy.matmul(image_feat, image_feat[a:b].T).T

    for k in range(b - a):
        hits = cupy.where(sims[k] > 0.95)[0]
        preds.append(posting_ids[cupy.asnumpy(hits)])

del image_feat, img_model

Finding similar images...
chunk 0 to 4096
chunk 4096 to 8192
chunk 8192 to 12288
chunk 12288 to 16384
chunk 16384 to 20480
chunk 20480 to 24576
chunk 24576 to 28672
chunk 28672 to 32768
chunk 32768 to 34250


In [14]:
# One array of matching posting_ids per row, from the CNN embedding search.
train['oof_cnn'] = preds

if COMPUTE_CV:
    cnn_scorer = getMetric('oof_cnn')
    train['f1'] = train.apply(cnn_scorer, axis=1)
    print('CV score for baseline =', train['f1'].mean())

CV score for baseline = 0.6527899883423682


## TF-IDF

In [15]:
from cuml.feature_extraction.text import TfidfVectorizer

In [16]:
# Binary TF-IDF over the titles, limited to 25000 features and converted to a
# dense array for the matrix products used in the title search.
model = TfidfVectorizer(
    stop_words=None,
    binary=True,
    max_features=25000,
)
text_embeddings = (
    model
    .fit_transform(train_gf.title)
    .toarray()
)
print('text embeddings shape', text_embeddings.shape)

text embeddings shape (34250, 25000)


In [17]:
preds = []
CHUNK = 1024 * 4

n_titles = len(train)
posting_ids = train.posting_id.values

print('Finding similar titles...')

# Same block-wise search as for the images, with a 0.7 threshold on the dot
# product of the title vectors.
for a in range(0, n_titles, CHUNK):
    b = min(a + CHUNK, n_titles)
    print('chunk', a, 'to', b)

    # Row k holds the dot products of title a+k with every title.
    sims = cupy.matmul(text_embeddings, text_embeddings[a:b].T).T

    for k in range(b - a):
        hits = cupy.where(sims[k] > 0.7)[0]
        preds.append(posting_ids[cupy.asnumpy(hits)])

del model, text_embeddings

Finding similar titles...
chunk 0 to 4096
chunk 4096 to 8192
chunk 8192 to 12288
chunk 12288 to 16384
chunk 16384 to 20480
chunk 20480 to 24576
chunk 24576 to 28672
chunk 28672 to 32768
chunk 32768 to 34250


In [18]:
# One array of matching posting_ids per row, from the TF-IDF title search.
train['oof_tfidf'] = preds

if COMPUTE_CV:
    tfidf_scorer = getMetric('oof_tfidf')
    train['f1'] = train.apply(tfidf_scorer, axis=1)
    print('CV score for baseline =', train['f1'].mean())

CV score for baseline = 0.6137154152579002


## Combination

In [19]:
def combine_for_cv(row):
    """Return the sorted, de-duplicated union of the row's three prediction lists.

    Reads ``row.oof_tfidf``, ``row.oof_cnn`` and ``row.oof_hash``.
    """
    sources = ('oof_tfidf', 'oof_cnn', 'oof_hash')
    merged = np.concatenate([getattr(row, name) for name in sources])
    return np.unique(merged)

def combine_for_sub(row):
    """Return the row's combined matches as one space-separated string.

    The union of ``row.oof_tfidf``, ``row.oof_cnn`` and ``row.oof_hash`` is
    de-duplicated and sorted before being joined.
    """
    sources = ('oof_tfidf', 'oof_cnn', 'oof_hash')
    merged = np.concatenate([getattr(row, name) for name in sources])
    unique_ids = np.unique(merged)
    return ' '.join(unique_ids)

In [20]:
# Score the union of the three prediction sources (CV runs only).
if COMPUTE_CV:
    # Rebuild the ground truth: all posting_ids sharing the row's label_group.
    target_mapping = (
        train
        .groupby('label_group')['posting_id']
        .agg('unique')
        .to_dict()
    )
    train['target'] = train['label_group'].map(target_mapping)

    train['oof'] = train.apply(combine_for_cv, axis=1)
    combined_scorer = getMetric('oof')
    train['f1'] = train.apply(combined_scorer, axis=1)
    print('CV Score =', train['f1'].mean() )

CV Score = 0.7342468156992659


In [21]:
# Preview the first rows with the prediction columns added so far.
train.head()

Unnamed: 0,posting_id,image,image_phash,title,label_group,target,oof_hash,f1,oof_cnn,oof_tfidf,oof
0,train_129225211,../input/shopee-product-matching/train_images/...,94974f937d4c2433,Paper Bag Victoria Secret,249114794,"[train_129225211, train_2278313361]",[train_129225211],1.0,[train_129225211],"[train_129225211, train_2278313361]","[train_129225211, train_2278313361]"
1,train_3386243561,../input/shopee-product-matching/train_images/...,af3f9460c2838f0f,"Double Tape 3M VHB 12 mm x 4,5 m ORIGINAL / DO...",2937985045,"[train_3386243561, train_3423213080]",[train_3386243561],0.666667,[train_3386243561],[train_3386243561],[train_3386243561]
2,train_2288590299,../input/shopee-product-matching/train_images/...,b94cb00ed3e50f78,Maling TTS Canned Pork Luncheon Meat 397 gr,2395904891,"[train_2288590299, train_3803689425]",[train_2288590299],0.666667,[train_2288590299],[train_2288590299],[train_2288590299]
3,train_2406599165,../input/shopee-product-matching/train_images/...,8514fc58eafea283,Daster Batik Lengan pendek - Motif Acak / Camp...,4093212188,"[train_2406599165, train_3342059966]",[train_2406599165],0.285714,[train_2406599165],"[train_2406599165, train_3576714541, train_150...","[train_1508100548, train_1744956981, train_240..."
4,train_3369186413,../input/shopee-product-matching/train_images/...,a6f319f924ad708c,Nescafe \xc3\x89clair Latte 220ml,3648931069,"[train_3369186413, train_921438619]",[train_3369186413],0.666667,[train_3369186413],[train_3369186413],[train_3369186413]


## Submission

In [22]:
# Final answer per posting: space-separated union of all predicted matches.
train['matches'] = train.apply(combine_for_sub, axis=1)

# Write the two submission columns, then read the file back as a sanity check.
train.loc[:, ['posting_id', 'matches']].to_csv('submission.csv', index=False)
sub = pd.read_csv('submission.csv')
sub.head()

Unnamed: 0,posting_id,matches
0,train_129225211,train_129225211 train_2278313361
1,train_3386243561,train_3386243561
2,train_2288590299,train_2288590299
3,train_2406599165,train_1508100548 train_1744956981 train_240659...
4,train_3369186413,train_3369186413
