# Initialization

In [1]:
# Print the GPU model, driver/CUDA versions and current memory usage of this runtime.
!nvidia-smi

Wed Mar 31 06:54:53 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.67       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   35C    P0    26W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Install RAPIDS
# Clone the RAPIDS Colab helper repository and run its installer script with the "stable" argument.
!git clone https://github.com/rapidsai/rapidsai-csp-utils.git
!bash rapidsai-csp-utils/colab/rapids-colab.sh stable

import sys, os

# Insert the site-packages directory into sys.path immediately before the dist-packages entry.
# NOTE(review): both paths hard-code python3.7; sys.path.index raises ValueError when the
# dist-packages entry is not present (e.g. on a runtime with another Python version).
dist_package_index = sys.path.index('/usr/local/lib/python3.7/dist-packages')
sys.path = sys.path[:dist_package_index] + ['/usr/local/lib/python3.7/site-packages'] + sys.path[dist_package_index:]
sys.path  # bare expression that is not the cell's last line, so nothing is displayed
# Run the helper's update_modules.py inside this notebook's global namespace.
exec(open('rapidsai-csp-utils/colab/update_modules.py').read(), globals())

Cloning into 'rapidsai-csp-utils'...
remote: Enumerating objects: 34, done.[K
remote: Counting objects: 100% (34/34), done.[K
remote: Compressing objects: 100% (34/34), done.[K
remote: Total 205 (delta 16), reused 3 (delta 0), pack-reused 171[K
Receiving objects: 100% (205/205), 62.72 KiB | 2.73 MiB/s, done.
Resolving deltas: 100% (78/78), done.
PLEASE READ
********************************************************************************************************
Changes:
1. IMPORTANT SCRIPT CHANGES: Colab has updated to Python 3.7, and now runs our STABLE and NIGHTLY versions (0.18 and 0.19)!  PLEASE update your older install script code as follows:
	!bash rapidsai-csp-utils/colab/rapids-colab.sh 0.18

	import sys, os

	dist_package_index = sys.path.index('/usr/local/lib/python3.7/dist-packages')
	sys.path = sys.path[:dist_package_index] + ['/usr/local/lib/python3.7/site-packages'] + sys.path[dist_package_index:]
	sys.path
	exec(open('rapidsai-csp-utils/colab/update_modules.py').read

In [3]:
# Mount Google Drive at /content/drive; later cells read and write ./drive/MyDrive/data/.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Download Kaggle Data

In [None]:
# Download data and move to Google Drive
!pip install -q kaggle
from google.colab import files
files.upload()

!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle competitions download -c shopee-product-matching
!mv shopee-product-matching.zip ./drive/MyDrive/data/

In [4]:
# Copy from Google Drive and unzip
# The archive is copied into the working directory, extracted quietly (-q), and the local copy removed.
!cp ./drive/MyDrive/data/shopee-product-matching.zip ./
!unzip -q shopee-product-matching.zip
!rm shopee-product-matching.zip

## Import Packages

In [5]:
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras import layers, Model
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from cuml.neighbors import NearestNeighbors
from cuml.feature_extraction.text import TfidfVectorizer
from cuml.cluster import DBSCAN
import gc
import pandas as pd
import numpy as np
import cudf, cuml, cupy
import pickle
import matplotlib.pyplot as plt
import random
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [7]:
# Notebook-wide configuration
AUTOTUNE = tf.data.experimental.AUTOTUNE
SEED = 100
BATCH_SIZE = 32
CHUNK_SIZE = 4096
IMAGE_HEIGHT = 256
IMAGE_WIDTH = 256
LIMIT = 2.0  # cap on TensorFlow's GPU memory, in GB
physical_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs:", len(physical_devices))

if physical_devices:
    # memory_limit is expressed in MB, hence 1024 * LIMIT
    tf.config.experimental.set_virtual_device_configuration(
        physical_devices[0], [
            tf.config.experimental.VirtualDeviceConfiguration(
                memory_limit=1024 * LIMIT)
        ])
    print('TensorFlow usage is restricted to max %iGB GPU RAM' % LIMIT)

# Apply SEED to every random number generator the notebook uses, so that the random
# example selection and the augmentation layers are reproducible across re-runs.
# Done after the GPU configuration so the device setup above is the first TensorFlow call.
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

Num GPUs: 1
TensorFlow usage is restricted to max 2GB GPU RAM


# Process Data

In [8]:
def preprocess_image(image):
    """
    Decode JPEG bytes into a 3-channel image and resize it to the configured size.

    Parameters:
        image: string tensor holding the encoded JPEG bytes
    Returns:
        image: resized image tensor of shape (IMAGE_HEIGHT, IMAGE_WIDTH, 3);
               pixel values are NOT rescaled to [0, 1]
    """
    image = tf.image.decode_jpeg(image, channels=3)
    # tf.image.resize takes its target size as [new_height, new_width]
    image = tf.image.resize(image, [IMAGE_HEIGHT, IMAGE_WIDTH])
    return image

def load_and_preprocess_image(path):
    """
    Read an image file and return it decoded and resized.

    Parameters:
        path: path of the image file to read
    Returns:
        the tensor produced by preprocess_image for the file's contents
    """
    raw_bytes = tf.io.read_file(path)
    processed = preprocess_image(raw_bytes)
    return processed

def augmentation(ds):
    """
    Batch an image dataset and apply random augmentation to every batch.

    The augmentation pipeline is a horizontal flip, a rotation (factor 0.3),
    a translation (up to 20% in each direction) and a zoom (factor 0.2).

    Parameters:
        ds: unbatched tf.data.Dataset of image tensors
    Returns:
        ds: batched (BATCH_SIZE), augmented and prefetched dataset
    """
    data_augmentation = tf.keras.Sequential([
        layers.experimental.preprocessing.RandomFlip("horizontal"),
        layers.experimental.preprocessing.RandomRotation(0.3),
        layers.experimental.preprocessing.RandomTranslation(
            height_factor=(-0.2, 0.2), width_factor=(-0.2, 0.2)),
        layers.experimental.preprocessing.RandomZoom(0.2, 0.2),
    ])

    # Batch first so the augmentation layers operate on whole batches
    ds = ds.batch(BATCH_SIZE)

    # The random preprocessing layers only transform their input in training mode.
    # Inside Dataset.map there is no training context, so it must be requested
    # explicitly; otherwise the images pass through unchanged.
    ds = ds.map(lambda x: data_augmentation(x, training=True))

    # Prefetch one batch ahead of the consumer
    return ds.prefetch(1)

def prepare_data(df, augment=False):
    """
    Build the batched image dataset for a dataframe of image paths.

    Parameters:
        df: dataframe with an 'image_paths' column
        augment: when truthy, route the images through augmentation();
                 otherwise only batch and prefetch them
    Returns:
        batched tf.data.Dataset of image tensors
    """
    paths = tf.data.Dataset.from_tensor_slices(df['image_paths'])
    images = paths.map(load_and_preprocess_image, num_parallel_calls=AUTOTUNE)

    if not augment:
        # Plain pipeline: batches of BATCH_SIZE, one batch prefetched
        return images.batch(BATCH_SIZE).prefetch(1)

    return augmentation(images)

In [170]:
# NOTE(review): `os` is imported only in the RAPIDS install cell, not in the main import cell,
# so this line depends on that cell having been run.
load_dir = os.getcwd()
# Load and process images
df_train = pd.read_csv(load_dir + '/train.csv')
# Full path of every image: <cwd>/train_images/<image file name>
df_train['image_paths'] = load_dir + '/train_images/' + df_train['image'] 

# Ground truth
# tmp maps each label_group to the unique posting_ids in that group; 'target' then holds,
# for every row, all posting_ids sharing the row's label_group (the row itself included).
tmp = df_train.groupby('label_group').posting_id.agg('unique').to_dict()
df_train['target'] = df_train.label_group.map(tmp)

# Duplicate train set for runtime testing
# df_train = pd.concat([df_train, df_train], ignore_index=True)
# Un-augmented, batched image dataset in df_train row order.
train_ds = prepare_data(df_train, augment=False)

# Embeddings

In [163]:
class SimilarSearch:
    """
    Finding the similar embeddings with NearestNeighbors or Cosine similarity.
    Dynamically learning the threshold for each item with DBSCAN.
    RAPIDS.AI accelerated.
    """
    def __init__(self, k=50, chunk_size=4096):
        """
        Parameters:
            k: number of neighbours requested per item by nearest_neighbors
            chunk_size: number of query rows processed per chunk
        Raises:
            ValueError: if chunk_size is smaller than 1
        """
        if chunk_size < 1:
            raise ValueError('chunk_size must be a positive integer')
        self.CHUNK_SIZE = chunk_size
        self.k = k
        self.MIN_SAMPLES = 1

    def _chunk_bounds(self, num_rows):
        """
        Yield (start, end) row bounds covering range(num_rows) in CHUNK_SIZE steps.

        Every yielded chunk is non-empty; the last one may be shorter than CHUNK_SIZE.
        """
        for start_idx in range(0, num_rows, self.CHUNK_SIZE):
            yield start_idx, min(start_idx + self.CHUNK_SIZE, num_rows)

    @staticmethod
    def dbscan_cluster(dbscan, idx, dist, strict=False):
        """
        Figure out number of similar items by excluding the last cluster.

        The distances of one item to its neighbours are clustered in 1-D; the
        resulting labels decide how many leading neighbours are kept.

        Parameters:
            dbscan: DBSCAN model
            idx: index of nearest items
            dist: distance between items
            strict: if True, keep as many leading items as there are samples
                    labelled 0; otherwise keep the items that precede the first
                    occurrence of the largest label (all items when the largest
                    label is 0)
        Returns:
            index: index of similar items
        """
        dbscan.fit(dist.reshape(-1, 1))
        labels = dbscan.labels_

        if strict:
            counts = len(labels[labels==0])
        else:
            counts = len(
                labels[:labels.argmax()]) if labels.max() != 0 else len(labels)

        similar_item_index = idx[:counts].tolist()

        return similar_item_index

    def nearest_neighbors(self, embeddings, min_dist, verbose=True):
        """
        Get the k nearest items.

        Parameters:
            embeddings: embeddings of images
            min_dist: the maximum distance between two samples (passed to DBSCAN as eps)
            verbose: whether to print the bounds of each processed chunk
        Returns:
            item_index: indexes of similar items, one list per embedding row
        """
        knn = NearestNeighbors(n_neighbors=self.k)
        knn.fit(embeddings)
        dbscan = DBSCAN(eps=min_dist, min_samples=self.MIN_SAMPLES)

        item_index = []
        # Only non-empty chunks are produced, so kneighbors never receives an
        # empty query (which happened when the row count was a multiple of
        # CHUNK_SIZE or rounded up).
        for start_idx, end_idx in self._chunk_bounds(embeddings.shape[0]):
            if verbose:
                print('Chunk', start_idx, 'to', end_idx)

            dist, idx = knn.kneighbors(embeddings[start_idx:end_idx, :])
            for j in range(end_idx - start_idx):
                index = self.dbscan_cluster(dbscan, idx[j], dist[j])
                item_index.append(index)

        return item_index

    def cosine_similarity(self, embeddings, threshold, verbose=True):
        """
        Get similar items with cosine similarity, applicable for unit vectors.

        Parameters:
            embeddings: 2-D array of row vectors; the dot product is used as
                        the similarity, so rows are expected to be unit length
            threshold: a row j is similar to row i when their dot product is
                       strictly greater than this value
            verbose: whether to print the bounds of each processed chunk
        Returns:
            item_index: one array of matching row positions per embedding row
        """
        item_index = []
        for start_idx, end_idx in self._chunk_bounds(embeddings.shape[0]):
            if verbose:
                print('Chunk', start_idx, 'to', end_idx)

            # Row j of cos_sim holds the similarities of item start_idx + j to all items
            cos_sim = cupy.matmul(embeddings, embeddings[start_idx:end_idx].T).T
            for j in range(end_idx - start_idx):
                idx = cupy.where(cos_sim[j] > threshold)[0]
                item_index.append(cupy.asnumpy(idx))

        return item_index

# Shared searcher instance (default k=50) used by the similarity-search and tuning cells.
sim = SimilarSearch()

## Image Embedding

In [46]:
# Image embedding
effnet = EfficientNetB0(weights='imagenet', include_top=False, pooling='avg', input_shape=None)
embeddings_image = effnet.predict(train_ds, verbose=1)

del effnet
_ = gc.collect()



In [100]:
print(f"Image Similarity Search:")
# The second argument (1.2) is min_dist, which nearest_neighbors hands to DBSCAN as eps.
# image_index holds one list of neighbour row positions per training row.
image_index = sim.nearest_neighbors(embeddings_image, 1.2)

# Drop the embeddings reference and run a garbage-collection pass.
del embeddings_image
_ = gc.collect()

Image Similarity Search:
Chunk 0 to 4096
Chunk 4096 to 8192
Chunk 8192 to 12288
Chunk 12288 to 16384
Chunk 16384 to 20480
Chunk 20480 to 24576
Chunk 24576 to 28672
Chunk 28672 to 32768
Chunk 32768 to 34250


## Text Embedding

In [171]:
# Text embedding
cudf_train = cudf.DataFrame(df_train)
sentences = cudf_train.title
vectorizer = TfidfVectorizer(binary=True, max_features=25000)
embeddings_text = vectorizer.fit_transform(sentences).toarray()

del sentences, vectorizer
_ = gc.collect()

In [172]:
print("Text Similarity Search:")
# 0.72 is the cosine-similarity threshold; text_index holds one array of matching
# row positions per training row.
text_index = sim.cosine_similarity(embeddings_text, 0.72)

# embeddings_text is deliberately NOT deleted here: the parameter-tuning cell further
# down passes it to parameter_tuning(), and deleting it made a top-to-bottom run fail
# with a NameError.
_ = gc.collect()

Text Similarity Search:
Chunk 0 to 4096
Chunk 4096 to 8192
Chunk 8192 to 12288
Chunk 12288 to 16384
Chunk 16384 to 20480
Chunk 20480 to 24576
Chunk 24576 to 28672
Chunk 28672 to 32768
Chunk 32768 to 34250


# Evaluation

In [173]:
def row_wise_f1_score(y_true, y_pred):
    """
    Compute the F1 score of every row between true and predicted item collections.

    F1 is evaluated as 2 * |true & pred| / (|true| + |pred|), which equals
    2 * precision * recall / (precision + recall) wherever that expression is
    defined, and yields 0 instead of NaN when a row has no true positives.

    Parameters:
        y_true: iterable (e.g. pandas Series) of collections of true items
        y_pred: iterable of collections of predicted items, aligned with y_true
    Returns:
        f1: numpy float array with one score per row; rows where both
            collections are empty score 0
    """
    true_sets = [set(items) for items in y_true]
    pred_sets = [set(items) for items in y_pred]

    tp = np.array([len(t & p) for t, p in zip(true_sets, pred_sets)], dtype=float)
    # |true| + |pred| == 2*tp + fp + fn
    sizes = np.array([len(t) + len(p) for t, p in zip(true_sets, pred_sets)], dtype=float)

    f1 = np.zeros_like(tp)
    # Divide only where the denominator is non-zero; other rows keep the 0 default
    np.divide(2 * tp, sizes, out=f1, where=sizes > 0)
    return f1


def evaluation(df_data, image_index, text_index):
    """
    Merge image and text matches per row and score them against the ground truth.

    Side effects: adds/overwrites the 'matches' (set of posting ids) and 'f1'
    columns of df_data.

    Parameters:
        df_data: dataframe with 'posting_id' and 'target' columns
        image_index: per-row sequences of matched row positions (image search)
        text_index: per-row sequences of matched row positions (text search)
    Returns:
        mean of the row-wise F1 scores
    """
    # The index lists hold row POSITIONS, so look them up positionally on the raw
    # values; label-based Series indexing only works with the default RangeIndex.
    posting_ids = df_data['posting_id'].values
    df_data['matches'] = [
        set(posting_ids[np.asarray(text, dtype=np.intp)].tolist() +
            posting_ids[np.asarray(image, dtype=np.intp)].tolist())
        for text, image in zip(text_index, image_index)
    ]

    df_data['f1'] = row_wise_f1_score(df_data['target'], df_data['matches'])
    return df_data['f1'].mean()


# Mean row-wise F1 on the training set; evaluation() also writes the 'matches' and 'f1' columns of df_train.
print('f1-score:', evaluation(df_train, image_index, text_index))

f1-score: 0.7463547261512918


# Parameter Tuning

In [29]:
def parameter_tuning(df_data, embeddings_text, image_index,
                     threshold):
    """
    Score one candidate threshold for the text cosine-similarity search.

    Parameters:
        df_data: dataframe handed to evaluation()
        embeddings_text: text embeddings searched with the module-level `sim`
        image_index: per-row image matches, reused unchanged
        threshold: cosine-similarity threshold to try
    Returns:
        mean F1 score obtained with this threshold
    """
    candidate_index = sim.cosine_similarity(
        embeddings_text, threshold, verbose=False)
    score = evaluation(df_data, image_index, candidate_index)

    report = 'threshold_text: {}, f1-score: {}'.format(
        round(threshold, 2), round(score, 4))
    print(report)

    return score

In [41]:
# Sweep the text threshold over np.arange(.06, 0.15, .01) and record the mean F1 of each value.
# Requires embeddings_text and image_index to still be defined when this cell runs.
results = {}
for i in np.arange(.06, 0.15, .01):
    f1 = parameter_tuning(df_train, embeddings_text, image_index, i)
    # Key by the rounded threshold so float noise from arange does not end up in the report
    results[round(i,2)] = f1

# Report the threshold with the highest recorded F1 together with that score.
print('The best thresholds is {}, yield a f1-score of {}'.format(max(results, key=results.get), max(results.values())))

threshold_text: 0.06, f1-score: 0.7114
threshold_text: 0.07, f1-score: 0.7254
threshold_text: 0.08, f1-score: 0.7352
threshold_text: 0.09, f1-score: 0.7412
threshold_text: 0.1, f1-score: 0.7449
threshold_text: 0.11, f1-score: 0.747
threshold_text: 0.12, f1-score: 0.7471
threshold_text: 0.13, f1-score: 0.7465
threshold_text: 0.14, f1-score: 0.7437
The best thresholds is 0.12, yield a f1-score of 0.7470603407640282


# Results Inspection

In [None]:
def results_inspection(df_train, df_example, col_name):
    """
    Print and plot the items whose posting_id is listed in df_example[col_name].

    Parameters:
        df_train: dataframe with 'posting_id', 'title' and 'image_paths' columns
        df_example: row (or mapping) whose `col_name` entry holds posting ids
        col_name: key of df_example to read the posting ids from
    Returns:
        None; prints the item count and titles and shows the images in a grid
        with at most 5 columns
    """
    df_target = df_train[df_train.posting_id.isin(df_example[col_name])]
    n_img = len(df_target)
    print('Number of items:', n_img)
    print(df_target.title.values)

    if n_img == 0:
        # Nothing to draw; a grid with zero columns would divide by zero
        return

    n_cols = min(n_img, 5)
    # Ceiling division: just enough rows to hold every image
    n_rows = -(-n_img // n_cols)

    plt.figure(figsize=(10, 10))
    for i, image_path in enumerate(df_target['image_paths']):
        image = tf.io.read_file(image_path)
        # preprocess_image leaves pixel values unscaled; imshow needs floats in [0, 1]
        image = preprocess_image(image) / 255
        plt.subplot(n_rows, n_cols, i + 1)
        plt.imshow(image)
    plt.show()

# Translate the matched row indices of each search into posting ids, one list per row.
# NOTE(review): df_train['posting_id'][...] looks the indices up by label, which matches row
# position only while df_train keeps its default index - confirm.
df_train['image_matches'] = [df_train['posting_id'][image].tolist() for image in image_index]
df_train['text_matches'] = [df_train['posting_id'][text].tolist() for text in text_index]

# Rows with a perfect score vs. rows scoring below 0.7; the 'f1' column is written by evaluation().
df_perfect = df_train[df_train.f1==1]
print('Number of perfect matches:', len(df_perfect))
df_bad = df_train[df_train.f1 < 0.7]
print('Number of bad matches:', len(df_bad))

## Good Match Example

In [None]:
# Pick one random perfectly-matched row. random.randrange excludes its upper bound, so the
# position is always valid for iloc (random.randint(0, n) can return n -> IndexError).
df_example = df_perfect.iloc[random.randrange(len(df_perfect))] # index: 33119
print('Ground Truth:')
results_inspection(df_train, df_example, 'target')
print('Image Embeddings:')
results_inspection(df_train, df_example, 'image_matches')
print('Text Embeddings:')
results_inspection(df_train, df_example, 'text_matches')

## Bad Match Example

In [None]:
# Pick one random poorly-matched row. random.randrange excludes its upper bound, so the
# position is always valid for iloc (random.randint(0, n) can return n -> IndexError).
df_example = df_bad.iloc[random.randrange(len(df_bad))]
print('Ground Truth:')
results_inspection(df_train, df_example, 'target')
print('Image Embeddings:')
results_inspection(df_train, df_example, 'image_matches')
print('Text Embeddings:')
results_inspection(df_train, df_example, 'text_matches')

# Test Preprocessing

In [69]:
def remove_bad_chars(sentences):
    """
    Replace escaped byte sequences left in the text with a single space each.

    Parameters:
        sentences: iterable of strings
    Returns:
        list of cleaned strings, in input order
    """
    # Matches a literal backslash, an 'x', then two characters from [a-z0-9],
    # e.g. \\xc3\\x89
    cleaned = []
    for sentence in sentences:
        cleaned.append(re.sub(r'\\x[a-z0-9]{2}', ' ', sentence))
    return cleaned

def remove_parenthesized_content(sentences):
    """
    Replace every (...) or [...] group, delimiters included, with a single space.

    Parameters:
        sentences: iterable of strings
    Returns:
        list of cleaned strings, in input order
    """
    # Each branch stops at its OWN closing delimiter: the bracket branch must exclude
    # ']' (not ')'), otherwise '[a] keep [b]' is matched - and removed - as one span.
    pattern = r'\[[^\]]*\]|\([^)]*\)'
    return [re.sub(pattern, ' ', sentence) for sentence in sentences]

In [None]:
# Read the titles straight from df_train: `title` is only assigned in a later cell, so
# referring to it here fails with a NameError on a fresh top-to-bottom run.
sentences = remove_bad_chars(df_train.title.tolist())
sentences = remove_parenthesized_content(sentences)
# Overwrites the title column with the cleaned text
df_train['title'] = sentences

In [None]:
# Set of English, Indonesian and German stop words for O(1) membership tests.
cache = set(stopwords.words(['english', 'indonesian', 'german']))
title = df_train.title.tolist()
# Drop stop words from every title. The comparison is case-sensitive as written, and the
# result goes to `sentences` only - df_train and `title` are left unchanged.
sentences = [' '.join(word for word in sentence.split() if word not in cache) for sentence in title]

In [None]:
# Round-trip the titles through a Keras Tokenizer: fit the vocabulary on `title`, encode the
# titles as integer sequences, then decode those sequences back to text.
# NOTE(review): presumably used to normalise the text via the Tokenizer defaults - confirm.
# This overwrites `sentences` from the stop-word cell, since the input is `title` again.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(title)
sequences = tokenizer.texts_to_sequences(title)
sentences = tokenizer.sequences_to_texts(sequences)