# Initialization

In [1]:
# Print the NVIDIA driver / CUDA versions and the status of the GPU attached to this runtime.
!nvidia-smi

Fri Mar 19 15:58:26 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.56       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   67C    P8    10W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Install RAPIDS
!git clone https://github.com/rapidsai/rapidsai-csp-utils.git
!bash rapidsai-csp-utils/colab/rapids-colab.sh stable

import sys, os

dist_package_index = sys.path.index('/usr/local/lib/python3.7/dist-packages')
sys.path = sys.path[:dist_package_index] + ['/usr/local/lib/python3.7/site-packages'] + sys.path[dist_package_index:]
sys.path
exec(open('rapidsai-csp-utils/colab/update_modules.py').read(), globals())

In [2]:
from google.colab import drive
# Mount Google Drive at /content/drive; the data directory used by later cells
# lives under ./drive/MyDrive.
drive.mount('/content/drive')

Mounted at /content/drive


## Download Kaggle Data

In [None]:
!pip install -q kaggle
from google.colab import files
files.upload()

!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle competitions download -c shopee-product-matching
!unzip -q shopee-product-matching.zip -d ./drive/MyDrive/data/shopee
!rm shopee-product-matching.zip

## Import Packages

In [3]:
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras import layers, Model
from tensorflow.keras.applications import EfficientNetB3
from cuml.neighbors import NearestNeighbors
from cuml.feature_extraction.text import TfidfVectorizer
import gc
import pandas as pd
import numpy as np
import cudf, cuml, cupy

ModuleNotFoundError: ignored

In [None]:
# --- Notebook-wide settings ---
AUTOTUNE = tf.data.experimental.AUTOTUNE
SEED = 100
BATCH_SIZE = 32                        # images per batch in the tf.data pipeline
CHUNK_SIZE = 2048                      # rows per nearest-neighbour query
IMAGE_HEIGHT, IMAGE_WIDTH = 224, 224   # size images are resized to
LIMIT = 2.0                            # reported below as the TensorFlow GPU memory cap in GB

physical_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs:", len(physical_devices))

# When a GPU is present, configure the first one with a single virtual device
# whose memory_limit is 1024*LIMIT.
if len(physical_devices) > 0:
  gpu_memory_cap = tf.config.experimental.VirtualDeviceConfiguration(
      memory_limit=1024*LIMIT)
  tf.config.experimental.set_virtual_device_configuration(
      physical_devices[0], [gpu_memory_cap])
  print('TensorFlow usage is restricted to max %iGB GPU RAM'%LIMIT)

# Process Data

In [None]:
def preprocess_image(image):
    """Decode JPEG bytes into a 3-channel image and resize it.

    Args:
        image: Encoded JPEG contents, e.g. the result of `tf.io.read_file`.

    Returns:
        Image tensor of shape (IMAGE_HEIGHT, IMAGE_WIDTH, 3). Pixel values are
        not rescaled.
    """
    image = tf.image.decode_jpeg(image, channels=3)
    # tf.image.resize expects the target size as [new_height, new_width].
    image = tf.image.resize(image, [IMAGE_HEIGHT, IMAGE_WIDTH])
    # Deliberately no `image /= 255`: the Keras EfficientNet models take pixel
    # values in the 0-255 range because rescaling is part of the model.
    return image

def load_and_preprocess_image(path):
    """Read the file at `path` and return it decoded and resized by `preprocess_image`."""
    raw_bytes = tf.io.read_file(path)
    processed = preprocess_image(raw_bytes)
    return processed

def augmentation(ds):
    """Batch `ds` and apply random image augmentation to every batch.

    The augmentation pipeline is a random horizontal flip, rotation,
    translation and zoom.

    Args:
        ds: tf.data.Dataset yielding single (unbatched) images.

    Returns:
        Dataset of augmented batches of up to BATCH_SIZE images, prefetching
        one batch ahead.
    """
    data_augmentation = tf.keras.Sequential([
        layers.experimental.preprocessing.RandomFlip("horizontal"),
        layers.experimental.preprocessing.RandomRotation(0.3),
        layers.experimental.preprocessing.RandomTranslation(
            height_factor=(-0.2, 0.2), width_factor=(-0.2, 0.2)),
        layers.experimental.preprocessing.RandomZoom(0.2, 0.2),
    ])

    # Batch first so the augmentation layers operate on whole batches.
    ds = ds.batch(BATCH_SIZE)

    # Pass training=True explicitly so the random layers are active no matter
    # how the framework would otherwise resolve the training flag inside map().
    ds = ds.map(lambda x: data_augmentation(x, training=True))

    # Prefetch one batch ahead.
    return ds.prefetch(1)

def prepare_data(df, augment=False):
    """Build a batched image dataset from the file paths in `df['image_paths']`.

    Args:
        df: DataFrame with an 'image_paths' column.
        augment: When True, batches go through `augmentation`; otherwise the
            images are only batched and prefetched.

    Returns:
        tf.data.Dataset of image batches.
    """
    paths = tf.data.Dataset.from_tensor_slices(df['image_paths'])
    images = paths.map(load_and_preprocess_image, num_parallel_calls=AUTOTUNE)

    if not augment:
        return images.batch(BATCH_SIZE).prefetch(1)

    return augmentation(images)

In [None]:
load_dir = './drive/MyDrive/data/shopee'

# Read the training table and derive the path of every image file
df_train = pd.read_csv(f'{load_dir}/train.csv')
df_train['image_paths'] = f'{load_dir}/train_images/' + df_train['image']

# Optional, for runtime testing only: double the train set
# df_train = pd.concat([df_train, df_train], ignore_index=True)

# Batched, non-augmented image dataset
train_ds = prepare_data(df_train, augment=False)

# Embeddings

In [None]:
def find_similar_items(embeddings, threshold, fine_tune=False, n_neighbors=50):
  """
  Using Nearest Neighbors to figure out similar items.

  The neighbour model is fitted on all rows of `embeddings` and then queried
  in slices of CHUNK_SIZE rows.

  Args:
    embeddings: 2-D array with one embedding per row.
    threshold: a neighbour is kept only if its distance is strictly below
      this value.
    fine_tune: when True, the per-chunk progress message is not printed.
    n_neighbors: maximum number of neighbours looked up per row; capped at
      the number of rows.

  Returns:
    A list with one entry per row of `embeddings` (in row order); each entry
    is the list of row indices of the neighbours kept for that row.
  """
  num_items = embeddings.shape[0]
  if num_items == 0:
    return []

  # Asking for more neighbours than there are samples is an error, so cap it.
  knn = NearestNeighbors(n_neighbors=min(n_neighbors, num_items))
  knn.fit(embeddings)

  item_index = []
  # Step by CHUNK_SIZE so every row is covered exactly once and no empty
  # trailing chunk (start index past the end) is ever queried.
  for start_idx in range(0, num_items, CHUNK_SIZE):
    end_idx = min(start_idx + CHUNK_SIZE, num_items)
    if not fine_tune:
      print('Chunk', start_idx, 'to', end_idx)

    dist, idx = knn.kneighbors(embeddings[start_idx:end_idx, :])
    # Relies on kneighbors returning each row's neighbours ordered by
    # increasing distance: the first `count` entries are those below threshold.
    counts = (dist < threshold).sum(axis=1)
    # Convert once per chunk instead of indexing the arrays row by row.
    item_index += [
        row[:count] for row, count in zip(idx.tolist(), counts.tolist())
    ]

  # Drop the fitted model before the collection pass.
  del knn
  _ = gc.collect()
  return item_index

In [None]:
# Image embedding
# EfficientNetB3 with ImageNet weights, no classification head, average pooling
effnet = EfficientNetB3(
    include_top=False,
    weights='imagenet',
    pooling='avg',
    input_shape=None,
)
embeddings_image = effnet.predict(train_ds, verbose=1)

# Drop the model reference and run a garbage-collection pass
del effnet
_ = gc.collect()

In [None]:
# Similar images: for every image, the neighbours whose distance between
# image embeddings is below 6.8
image_index = find_similar_items(embeddings_image, 6.8)

In [None]:
# Text embedding
# Binary TF-IDF vectors of the product titles (vocabulary capped at 15000 terms)
cudf_train = cudf.DataFrame(df_train)
titles = cudf_train['title']
tfidf = TfidfVectorizer(max_features=15000, binary=True)
title_matrix = tfidf.fit_transform(titles)
embeddings_text = title_matrix.toarray()

# Drop the intermediates and run a garbage-collection pass
del titles, tfidf, title_matrix
_ = gc.collect()

In [None]:
# Similar texts: for every title, the neighbours whose distance between
# text embeddings is below 0.8
text_index = find_similar_items(embeddings_text, 0.8)

# Evaluation

In [None]:
def row_wise_f1_score(y_true, y_pred):
    """Compute the F1 score of each row's predicted set against its true set.

    Uses F1 = 2*TP / (|true| + |pred|), which equals the harmonic mean of
    precision and recall. A row with no overlap scores 0.0 instead of NaN
    (0/0), so that averaging the result does not silently skip it.

    Args:
        y_true: Series whose items are iterables of ground-truth ids.
        y_pred: Series whose items are whitespace-separated strings of
            predicted ids.

    Returns:
        NumPy float array with one F1 score per row.
    """
    true_sets = [set(ids) for ids in y_true]
    pred_sets = [set(ids.split()) for ids in y_pred]

    scores = []
    for truth, pred in zip(true_sets, pred_sets):
        total = len(truth) + len(pred)
        # total == 0 means both sets are empty; score it 0.0 rather than 0/0.
        scores.append(2 * len(truth & pred) / total if total else 0.0)

    return np.array(scores, dtype=float)

In [None]:
# Ground truth: each row's target is the array of unique posting_ids that
# share its label_group
tmp = df_train.groupby('label_group').posting_id.agg('unique').to_dict()
df_train['target'] = df_train.label_group.map(tmp)

# Predicted matches: union of the text and image neighbours of each row.
# text_index / image_index hold row positions, so the ids are looked up
# positionally on the underlying array rather than through the Series' index
# labels. Sorting makes the joined string reproducible across runs.
posting_ids = df_train['posting_id'].values
df_train['matches'] = [
    ' '.join(
        sorted(
            set(posting_ids[text].tolist() +
                posting_ids[image].tolist())))
    for text, image in zip(text_index, image_index)
]

# Per-row F1, then the mean over all rows as the cell's displayed result
df_train['f1'] = row_wise_f1_score(df_train['target'], df_train['matches'])
df_train['f1'].mean()