# Initialization

In [1]:
# Show the GPU attached to this runtime (model, driver / CUDA version, memory usage).
!nvidia-smi

Fri Mar 26 16:20:19 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.56       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   37C    P0    28W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
# Install RAPIDS
!git clone https://github.com/rapidsai/rapidsai-csp-utils.git
!bash rapidsai-csp-utils/colab/rapids-colab.sh stable

import sys, os

dist_package_index = sys.path.index('/usr/local/lib/python3.7/dist-packages')
sys.path = sys.path[:dist_package_index] + ['/usr/local/lib/python3.7/site-packages'] + sys.path[dist_package_index:]
sys.path
exec(open('rapidsai-csp-utils/colab/update_modules.py').read(), globals())

Cloning into 'rapidsai-csp-utils'...
remote: Enumerating objects: 34, done.[K
remote: Counting objects: 100% (34/34), done.[K
remote: Compressing objects: 100% (34/34), done.[K
remote: Total 205 (delta 16), reused 3 (delta 0), pack-reused 171[K
Receiving objects: 100% (205/205), 62.72 KiB | 892.00 KiB/s, done.
Resolving deltas: 100% (78/78), done.
PLEASE READ
********************************************************************************************************
Changes:
1. IMPORTANT SCRIPT CHANGES: Colab has updated to Python 3.7, and now runs our STABLE and NIGHTLY versions (0.18 and 0.19)!  PLEASE update your older install script code as follows:
	!bash rapidsai-csp-utils/colab/rapids-colab.sh 0.18

	import sys, os

	dist_package_index = sys.path.index('/usr/local/lib/python3.7/dist-packages')
	sys.path = sys.path[:dist_package_index] + ['/usr/local/lib/python3.7/site-packages'] + sys.path[dist_package_index:]
	sys.path
	exec(open('rapidsai-csp-utils/colab/update_modules.py').re

In [3]:
# Mount Google Drive at /content/drive; the dataset archive is stored under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Download Kaggle Data

In [None]:
# Download data and move to Google Drive
!pip install -q kaggle
from google.colab import files
files.upload()

!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle competitions download -c shopee-product-matching
!mv shopee-product-matching.zip ./drive/MyDrive/data/

In [4]:
# Copy the competition archive from Google Drive into the working directory,
# extract it quietly (-q), then delete the local copy of the archive.
!cp ./drive/MyDrive/data/shopee-product-matching.zip ./
!unzip -q shopee-product-matching.zip
!rm shopee-product-matching.zip

## Import Packages

In [27]:
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras import layers, Model
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from cuml.neighbors import NearestNeighbors
from cuml.feature_extraction.text import TfidfVectorizer
from cuml.cluster import DBSCAN
import gc
import pandas as pd
import numpy as np
import cudf, cuml, cupy
import pickle
import matplotlib.pyplot as plt
import random
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
# Notebook-wide configuration.
AUTOTUNE = tf.data.experimental.AUTOTUNE
SEED = 100           # seed applied to the Python, NumPy and TensorFlow RNGs below
BATCH_SIZE = 32      # images per batch in the tf.data pipelines
CHUNK_SIZE = 4096    # rows handled per step in the neighbour / similarity searches
IMAGE_HEIGHT = 256
IMAGE_WIDTH = 256
LIMIT = 2.0          # GPU memory cap for TensorFlow, in GB

physical_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs:", len(physical_devices))

if physical_devices:
    # Cap TensorFlow's share of the first GPU. memory_limit is expressed in MB,
    # so it is converted to an integer instead of being passed as a float.
    # NOTE(review): presumably the cap leaves the remaining GPU memory to the
    # cuDF / cuML / CuPy steps - confirm.
    tf.config.experimental.set_virtual_device_configuration(
        physical_devices[0], [
            tf.config.experimental.VirtualDeviceConfiguration(
                memory_limit=int(1024 * LIMIT))
        ])
    print('TensorFlow usage is restricted to max %iGB GPU RAM' % LIMIT)

# Seed every random number generator in use so that any stochastic step
# (e.g. the optional image augmentation) is repeatable. This is done after the
# device configuration, which has to happen before TensorFlow touches the GPU.
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

Num GPUs: 1
TensorFlow usage is restricted to max 2GB GPU RAM


# Process Data

In [7]:
def preprocess_image(image):
    """Decode raw JPEG bytes into a 3-channel image of the configured size.

    Args:
        image: Scalar string tensor holding the encoded JPEG file contents.

    Returns:
        Image tensor with IMAGE_HEIGHT rows, IMAGE_WIDTH columns and 3 channels.
    """
    image = tf.image.decode_jpeg(image, channels=3)
    # tf.image.resize takes its size argument as [new_height, new_width].
    image = tf.image.resize(image, [IMAGE_HEIGHT, IMAGE_WIDTH])
    # Pixel values are deliberately not divided by 255 here.
    # NOTE(review): presumably because the Keras EfficientNet models expect
    # inputs in the [0, 255] range and rescale internally - confirm.
    return image

def load_and_preprocess_image(path):
    """Read the image file at `path` and return it decoded and resized.

    Args:
        path: File path of the image to load.

    Returns:
        Whatever preprocess_image returns for the file's raw bytes.
    """
    return preprocess_image(tf.io.read_file(path))

def augmentation(ds):
    """Batch an image dataset and apply random augmentation to every batch.

    The augmentation pipeline is: random horizontal flip, random rotation
    (factor 0.3), random translation (up to 20% in height and width) and
    random zoom (factor 0.2 in both directions).

    Args:
        ds: tf.data.Dataset yielding single images.

    Returns:
        Dataset of augmented image batches (BATCH_SIZE images per batch, the
        last batch may be smaller) with a prefetch buffer of one batch.
    """
    data_augmentation = tf.keras.Sequential([
        layers.experimental.preprocessing.RandomFlip("horizontal"),
        layers.experimental.preprocessing.RandomRotation(0.3),
        layers.experimental.preprocessing.RandomTranslation(
            height_factor=(-0.2, 0.2), width_factor=(-0.2, 0.2)),
        layers.experimental.preprocessing.RandomZoom(0.2, 0.2),
    ])

    # Batch first so the augmentation layers process whole batches.
    ds = ds.batch(BATCH_SIZE)

    # training=True keeps the random layers active. Called without it inside
    # Dataset.map they can run in inference mode and pass images through
    # unchanged.
    ds = ds.map(lambda x: data_augmentation(x, training=True))

    # Prefetch one batch so loading overlaps with consumption.
    return ds.prefetch(1)

def prepare_data(df, augment=False):
    """Build a batched image dataset from the paths stored in a DataFrame.

    Args:
        df: DataFrame with an 'image_paths' column of image file paths.
        augment: When True the images go through augmentation(); otherwise
            they are only batched and prefetched.

    Returns:
        tf.data.Dataset yielding batches of decoded, resized images.
    """
    paths = tf.data.Dataset.from_tensor_slices(df['image_paths'])
    images = paths.map(load_and_preprocess_image, num_parallel_calls=AUTOTUNE)

    if not augment:
        return images.batch(BATCH_SIZE).prefetch(1)

    return augmentation(images)

In [9]:
# The dataset was extracted into the current working directory.
load_dir = os.getcwd()

# Read the training metadata and derive the full path of every image file.
df_train = pd.read_csv(load_dir + '/train.csv')
df_train['image_paths'] = load_dir + '/train_images/' + df_train['image']

# Ground truth: every posting is paired with the posting ids that share its
# label_group.
tmp = df_train.groupby('label_group')['posting_id'].unique().to_dict()
df_train['target'] = df_train['label_group'].map(tmp)

# Image pipeline over the whole training set, without augmentation.
train_ds = prepare_data(df_train, augment=False)

# Embeddings

In [14]:
def find_similar_items(embeddings, threshold, fine_tune=False):
    """
    Using Nearest Neighbors to figure out similar items

    For every row of `embeddings` the 50 nearest neighbours are looked up and
    the leading neighbours whose distance is below `threshold` are kept.

    Args:
        embeddings: 2-D array with one embedding per row.
        threshold: Distance below which a neighbour counts as a match.
        fine_tune: When True, the per-chunk progress lines are not printed.

    Returns:
        List with one entry per row: the list of matching neighbour indices.
    """
    knn = NearestNeighbors(n_neighbors=50)
    knn.fit(embeddings)

    num_rows = embeddings.shape[0]
    item_index = []
    # Walk the rows in CHUNK_SIZE strides. range() stops after the last
    # non-empty chunk, so no empty (or inverted) slice is ever queried when
    # num_rows is an exact multiple of CHUNK_SIZE or the remainder is large.
    for start_idx in range(0, num_rows, CHUNK_SIZE):
        end_idx = min(start_idx + CHUNK_SIZE, num_rows)
        if not fine_tune:
            print('Chunk', start_idx, 'to', end_idx)

        dist, idx = knn.kneighbors(embeddings[start_idx:end_idx, :])
        # Number of neighbours under the threshold per row; used as a prefix
        # length below.
        # NOTE(review): this assumes kneighbors returns each row's distances in
        # ascending order - confirm.
        counts = (dist < threshold).sum(axis=1)
        item_index += [
            idx[row, :counts[row]].tolist()
            for row in range(end_idx - start_idx)
        ]

    return item_index


def find_similar_dbscan(embeddings, min_dist, fine_tune=False):
    """
    Using Nearest Neighbors and DBSCAN to figure out similar items

    For every row of `embeddings` the 50 nearest neighbours are looked up.
    DBSCAN (eps=min_dist, min_samples=1) is then fit on that row's 50
    neighbour distances, treated as one-dimensional points, and the resulting
    cluster labels decide how many leading neighbours are kept.

    Args:
        embeddings: 2-D array with one embedding per row.
        min_dist: Passed to DBSCAN as `eps`.
        fine_tune: When True, the per-chunk progress lines are not printed.

    Returns:
        List with one entry per row: the list of kept neighbour indices.
    """
    knn = NearestNeighbors(n_neighbors=50)
    knn.fit(embeddings)
    dbscan = DBSCAN(eps=min_dist, min_samples=1)

    num_rows = embeddings.shape[0]
    item_index = []
    # Walk the rows in CHUNK_SIZE strides. range() stops after the last
    # non-empty chunk, so no empty (or inverted) slice is ever queried when
    # num_rows is an exact multiple of CHUNK_SIZE or the remainder is large.
    for start_idx in range(0, num_rows, CHUNK_SIZE):
        end_idx = min(start_idx + CHUNK_SIZE, num_rows)
        if not fine_tune:
            print('Chunk', start_idx, 'to', end_idx)

        dist, idx = knn.kneighbors(embeddings[start_idx:end_idx, :])
        # A separate name for the row counter avoids shadowing the chunk loop.
        for row in range(end_idx - start_idx):
            dbscan.fit(dist[row, :].reshape(-1, 1))
            labels = dbscan.labels_
            # Keep the neighbours that precede the first occurrence of the
            # largest cluster label; if the largest label is 0 (one cluster
            # only) keep all of them.
            # NOTE(review): with three or more clusters this also keeps the
            # intermediate clusters, not just the one containing the query
            # item - confirm that this is intended.
            counts = len(
                labels[:labels.argmax()]) if labels.max() != 0 else len(labels)
            item_index.append(idx[row, :counts].tolist())

    return item_index

## Image Embedding

In [10]:
# Image embedding
# ImageNet-pretrained EfficientNetB0 without its classification head;
# pooling='avg' applies global average pooling to the final feature map.
effnet = EfficientNetB0(weights='imagenet', include_top=False, pooling='avg', input_shape=None)
embeddings_image = effnet.predict(train_ds, verbose=1)

# The model is no longer needed once the embeddings exist; drop it and run a
# garbage-collection pass.
del effnet
_ = gc.collect()

Downloading data from https://storage.googleapis.com/keras-applications/efficientnetb0_notop.h5


In [15]:
# Similar images: neighbour indices per posting, found with kNN + DBSCAN.
# 1.2 is the min_dist argument, which find_similar_dbscan passes to DBSCAN as eps.
image_index = find_similar_dbscan(embeddings_image, 1.2)

Chunk 0 to 4096
Chunk 4096 to 8192
Chunk 8192 to 12288
Chunk 12288 to 16384
Chunk 16384 to 20480
Chunk 20480 to 24576
Chunk 24576 to 28672
Chunk 28672 to 32768
Chunk 32768 to 34250


## Text Embedding

In [28]:
# List the languages for which the NLTK stopwords corpus provides a word list.
stopwords.fileids()

['arabic',
 'azerbaijani',
 'danish',
 'dutch',
 'english',
 'finnish',
 'french',
 'german',
 'greek',
 'hungarian',
 'indonesian',
 'italian',
 'kazakh',
 'nepali',
 'norwegian',
 'portuguese',
 'romanian',
 'russian',
 'slovene',
 'spanish',
 'swedish',
 'tajik',
 'turkish']

In [56]:
# Strip English, Indonesian and German stop words from the lower-cased titles.
# NOTE(review): `sentences` is reassigned by later cells (the Tokenizer
# round-trip and then `cudf_train.title`), so this filtered version is not the
# input of the TF-IDF step.
cache = set(stopwords.words(['english', 'indonesian', 'german']))
title = df_train.title.tolist()
sentences = [' '.join(word for word in sentence.lower().split() if word not in cache) for sentence in title]

In [76]:
# Round-trip the titles through the Keras Tokenizer (texts -> integer
# sequences -> texts) to obtain a normalised form of every title.
# NOTE(review): the tokenizer is fit on the raw `title` list, not on the
# stop-word-filtered `sentences`, and this cell overwrites `sentences`.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(title)
sequences = tokenizer.texts_to_sequences(title)
sentences = tokenizer.sequences_to_texts(sequences)

In [80]:
# Inspect the first 100 raw titles.
title[:100]

['Paper Bag Victoria Secret',
 'Double Tape 3M VHB 12 mm x 4,5 m ORIGINAL / DOUBLE FOAM TAPE',
 'Maling TTS Canned Pork Luncheon Meat 397 gr',
 'Daster Batik Lengan pendek - Motif Acak / Campur - Leher Kancing (DPT001-00) Batik karakter Alhadi',
 'Nescafe \\xc3\\x89clair Latte 220ml',
 'CELANA WANITA  (BB 45-84 KG)Harem wanita (bisa cod)',
 'Jubah anak size 1-12 thn',
 'KULOT PLISKET SALUR /CANDY PLISKET /WISH KULOT PREMIUM /KULOT PELANGI PREMIUM/HIEKA KULOT',
 '[LOGU] Tempelan kulkas magnet angka, tempelan angka magnet',
 'BIG SALE SEPATU PANTOFEL KULIT KEREN KERJA KANTOR LAKI PRIA COWOK DINAS RESMI FORMAL PESTA KICKERS',
 'Atasan Rajut Wanita LISDIA SWEATER',
 'PASHMINA KUSUT RAWIS POLOS CRINKLE SHAWL MURAH BANGET',
 'PASHMINA KUSUT RAWIS POLOS CRINKLE SHAWL MURAH BANGET PART 2',
 'Lampu led t5 Speedometer Dashboard Motor Mobil 5050 Speedo Bright',
 'Charger VIZZ VZ-TC11 / batok charger vizz 1A ORIGINAL REAL KAPASITAS',
 'Korek Kuping LED untuk balita CherryBabyKidsShop SP LC',
 'MAR

In [79]:
# Inspect the first 100 entries of `sentences` for comparison with the raw titles.
sentences[:100]

['paper bag victoria secret',
 'double tape 3m vhb 12 mm x 4 5 m original double foam tape',
 'maling tts canned pork luncheon meat 397 gr',
 'daster batik lengan pendek motif acak campur leher kancing dpt001 00 batik karakter alhadi',
 'nescafe xc3 x89clair latte 220ml',
 'celana wanita bb 45 84 kg harem wanita bisa cod',
 'jubah anak size 1 12 thn',
 'kulot plisket salur candy plisket wish kulot premium kulot pelangi premium hieka kulot',
 'logu tempelan kulkas magnet angka tempelan angka magnet',
 'big sale sepatu pantofel kulit keren kerja kantor laki pria cowok dinas resmi formal pesta kickers',
 'atasan rajut wanita lisdia sweater',
 'pashmina kusut rawis polos crinkle shawl murah banget',
 'pashmina kusut rawis polos crinkle shawl murah banget part 2',
 'lampu led t5 speedometer dashboard motor mobil 5050 speedo bright',
 'charger vizz vz tc11 batok charger vizz 1a original real kapasitas',
 'korek kuping led untuk balita cherrybabykidsshop sp lc',
 'marks spencer rose hand body

In [16]:
# Text embedding
# Convert the training frame to a cuDF DataFrame and vectorise the raw titles
# with cuML's TF-IDF (binary term counts, vocabulary capped at 25000 features).
cudf_train = cudf.DataFrame(df_train)
sentences = cudf_train.title
vectorizer = TfidfVectorizer(binary=True, max_features=25000)
# toarray() turns the vectoriser output into a dense array.
embeddings_text = vectorizer.fit_transform(sentences).toarray()

# Free the inputs that are no longer needed.
del sentences, vectorizer
_ = gc.collect()

In [17]:
# Similar texts
# Minimum dot product between two TF-IDF rows for the titles to count as a match.
# NOTE(review): this equals cosine similarity only if the rows are
# L2-normalised - confirm against the vectoriser's defaults.
TEXT_SIM_THRESHOLD = 0.72

text_index = []
num_rows = embeddings_text.shape[0]
# Ceiling division: exactly as many chunks as are needed to cover every row,
# so no empty trailing chunk is processed.
num_chunk = (num_rows + CHUNK_SIZE - 1) // CHUNK_SIZE
for i in range(num_chunk):
    start_idx = i * CHUNK_SIZE
    end_idx = min((i + 1) * CHUNK_SIZE, num_rows)
    print('Chunk', start_idx, 'to', end_idx)

    # Row k of `cts` holds the dot products between title (start_idx + k)
    # and every title in the data set.
    cts = cupy.matmul(embeddings_text, embeddings_text[start_idx:end_idx].T).T
    for k in range(end_idx - start_idx):
        idx = cupy.where(cts[k] > TEXT_SIM_THRESHOLD)[0]
        # Copy the matching indices back to host memory.
        text_index.append(cupy.asnumpy(idx))

# The dense TF-IDF matrix is no longer needed; release it.
del embeddings_text
_ = gc.collect()

Chunk 0 to 4096
Chunk 4096 to 8192
Chunk 8192 to 12288
Chunk 12288 to 16384
Chunk 16384 to 20480
Chunk 20480 to 24576
Chunk 24576 to 28672
Chunk 28672 to 32768
Chunk 32768 to 34250


# Evaluation

In [18]:
def row_wise_f1_score(y_true, y_pred):
    """Compute the F1 score of every (true, predicted) pair of id collections.

    F1 is evaluated as 2 * TP / (|true| + |pred|), which is algebraically the
    same as the harmonic mean of precision and recall but stays defined when
    there is no overlap: such rows score 0.0 instead of NaN. NaN rows would be
    skipped by a later `.mean()` and inflate the average.

    Args:
        y_true: Iterable (e.g. pandas Series) of collections of true ids.
        y_pred: Iterable of collections of predicted ids, aligned with y_true.

    Returns:
        numpy float array with one F1 score per row. Rows where both
        collections are empty score 0.0.
    """
    true_sets = [set(ids) for ids in y_true]
    pred_sets = [set(ids) for ids in y_pred]

    tp = np.array(
        [len(t & p) for t, p in zip(true_sets, pred_sets)], dtype=float)
    # |true| + |pred| equals 2*TP + FP + FN, the denominator of F1.
    total = np.array(
        [len(t) + len(p) for t, p in zip(true_sets, pred_sets)], dtype=float)

    f1 = np.zeros_like(tp)
    # Divide only where the denominator is non-zero; other rows keep 0.0.
    np.divide(2 * tp, total, out=f1, where=total > 0)
    return f1


def evaluation(df_data, image_index, text_index):
    """Score the union of text and image matches against the ground truth.

    Side effect: adds (or overwrites) the 'matches' and 'f1' columns of
    `df_data`.

    Args:
        df_data: DataFrame with 'posting_id' and 'target' columns.
        image_index: Per-row collections of row positions matched by image.
        text_index: Per-row collections of row positions matched by text.

    Returns:
        Mean of the row-wise F1 scores.
    """
    # The match indices are row positions produced by the similarity searches,
    # so look the posting ids up by position rather than by index label. The
    # two coincide only for a default 0..n-1 index.
    posting_ids = df_data['posting_id'].to_numpy()
    df_data['matches'] = [
        set(posting_ids[text].tolist()) | set(posting_ids[image].tolist())
        for text, image in zip(text_index, image_index)
    ]

    df_data['f1'] = row_wise_f1_score(df_data['target'], df_data['matches'])
    return df_data['f1'].mean()


# Mean row-wise F1 of the combined text + image matches on the training set.
print('f1-score:', evaluation(df_train, image_index, text_index))

f1-score: 0.7463547261512918
