In [110]:
! nvidia-smi

Tue Mar 16 14:33:41 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.56       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   75C    P0    34W /  70W |  13500MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import gc
import pickle
import tensorflow as tf
from tensorflow.keras import layers, Model
from tensorflow.keras.applications import EfficientNetB0
from cuml.neighbors import NearestNeighbors
from cuml.feature_extraction.text import TfidfVectorizer
from scipy import sparse
import sys, os

dist_package_index = sys.path.index('/usr/local/lib/python3.7/dist-packages')
sys.path = sys.path[:dist_package_index] + ['/usr/local/lib/python3.7/site-packages'] + sys.path[dist_package_index:]
sys.path
exec(open('rapidsai-csp-utils/colab/update_modules.py').read(), globals())
import cudf, cuml, cupy, cupyx

In [102]:
AUTOTUNE = tf.data.experimental.AUTOTUNE
SEED = 100
BATCH_SIZE =128
CHUNK_SIZE = 2048
IMAGE_HEIGHT = 224
IMAGE_WIDTH = 224

physical_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs:", len(physical_devices))

Num GPUs: 1


# Load Data

In [103]:
def preprocess_image(image):
    image = tf.image.decode_jpeg(image, channels=3)
    image = tf.image.resize(image, [IMAGE_WIDTH, IMAGE_HEIGHT])
#     image /= 255  # normalize to [0,1] rangeI'm not a 
    return image

def load_and_preprocess_image(path):
    image = tf.io.read_file(path)
    return preprocess_image(image)

def augmentation(ds):
    data_augmentation = tf.keras.Sequential([
        tf.keras.layers.experimental.preprocessing.RandomFlip("horizontal"),
        layers.experimental.preprocessing.RandomRotation(0.3),
#         layers.experimental.preprocessing.RandomTranslation(
#             height_factor=(-0.2, 0.2), width_factor=(-0.2, 0.2)),
#         layers.experimental.preprocessing.RandomZoom(0.2, 0.2),
    ])
    
    # Batch all datasets
    ds = ds.batch(BATCH_SIZE)

    # Use data augmentation only on the training set
    ds = ds.map(lambda x: data_augmentation(x))

    # Prefecting on all datasets
    return ds.prefetch(1)

def prepare_data(df, augment=False):
    # Load images
    path_ds = tf.data.Dataset.from_tensor_slices(df['image_paths'])
    image_ds = path_ds.map(load_and_preprocess_image, num_parallel_calls=AUTOTUNE)

    if augment:
        ds = augmentation(image_ds)
    else:
        ds = image_ds.batch(BATCH_SIZE).prefetch(1)
    
    return ds

In [104]:
load_dir = os.getcwd() + '/data_shopee' #'/kaggle/input/ranzcr-clip-catheter-line-classification'

# Load and process images
df_train = pd.read_csv(load_dir + '/train.csv')
df_train['image_paths'] = load_dir + '/train_images/' + df_train['image'] 

df_test = pd.read_csv(load_dir + '/test.csv')
df_test['image_paths'] = load_dir + '/test_images/' + df_test['image'] 

train_ds = prepare_data(df_train, augment=False)
test_ds = prepare_data(df_test, augment=False)

# Ground truth
tmp = df_train.groupby('label_group').posting_id.agg('unique').to_dict()
df_train['target'] = df_train.label_group.map(tmp)

# load as cudf
cudf_train = cudf.read_csv(load_dir + '/train.csv')
cudf_test = cudf.read_csv(load_dir + '/test.csv')

# Image embedding

In [105]:
model = EfficientNetB0(weights='imagenet', include_top=False, pooling='avg', input_shape=None)
train_embeddings_image = model.predict(train_ds, verbose=1)

del model
_ = gc.collect()



## K-Nearest Neighbors

In [106]:
knn = NearestNeighbors(n_neighbors=50)
knn.fit(train_embeddings_image)

NearestNeighbors(n_neighbors=50, verbose=4, handle=<cuml.raft.common.handle.Handle object at 0x7fb9b01cd950>, algorithm='brute', metric='euclidean', p=2, algo_params=None, metric_params=None, output_type='input')

In [107]:
image_index = []

for i in range(int(np.ceil(len(df_train) / CHUNK_SIZE))):
    start_idx = i * CHUNK_SIZE
    end_idx = min((i + 1) * CHUNK_SIZE, len(df_train))
    print('Chunk', start_idx, 'to', end_idx)

    dist, idx = knn.kneighbors(train_embeddings_image[start_idx:end_idx, :])
    counts = (dist < 7).sum(axis=1)
    chunk_index = [idx[i, :counts[i]].tolist() for i in range(end_idx - start_idx)]
    image_index += chunk_index
  
del train_embeddings_image, dist, idx, counts, chunk_index, knn
gc.collect()

Chunk 0 to 2048
Chunk 2048 to 4096
Chunk 4096 to 6144
Chunk 6144 to 8192
Chunk 8192 to 10240
Chunk 10240 to 12288
Chunk 12288 to 14336
Chunk 14336 to 16384
Chunk 16384 to 18432
Chunk 18432 to 20480
Chunk 20480 to 22528
Chunk 22528 to 24576
Chunk 24576 to 26624
Chunk 26624 to 28672
Chunk 28672 to 30720
Chunk 30720 to 32768
Chunk 32768 to 34250


19

# Text Embedding

In [108]:
model_text = TfidfVectorizer(stop_words='english', binary=True, max_features=25000)
text_embeddings = model_text.fit_transform(cudf_train.title).toarray()

MemoryError: ignored

In [None]:
cudf_train.title

In [None]:
def cos_sim(A, B):
    num = np.dot(A, B.T)
    p1 = np.sqrt(np.sum(A.power(2), axis=1))
    p2 = np.sqrt(np.sum(B.power(2), axis=1)).T

    return num / (p1 * p2)

In [None]:
%memit
text_index = []

for i in range(np.ceil(len(df_train) / CHUNK_SIZE)):
    start_idx = i * CHUNK_SIZE
    end_idx = min((i + 1) * CHUNK_SIZE, len(df_train))
    print('Chunk', start_idx, 'to', end_idx)

    sim_chunk = cos_sim(text_embeddings, text_embeddings[start_idx:end_idx, :]).T
    r, c = np.where(sim_chunk > 0.7)
    text_index += np.split(c, np.flatnonzero(r[1:] != r[:-1])+1)

del sim_chunk, r, c, text_embeddings
_= gc.collect()
%memit

peak memory: 2505.50 MiB, increment: 0.00 MiB
Chunk 0 to 4096
Chunk 4096 to 8192
Chunk 8192 to 12288
Chunk 12288 to 16384
Chunk 16384 to 20480
Chunk 20480 to 24576
Chunk 24576 to 28672
Chunk 28672 to 32768
Chunk 32768 to 34250
peak memory: 2505.95 MiB, increment: -0.08 MiB


# Evaluation

In [None]:
%memit
df_train['matches'] = [
    ' '.join(set(df_train['posting_id'][text].tolist() + df_train['posting_id'][image].tolist()))
    for text, image in zip(text_index, image_index)
]
%memit

peak memory: 2506.65 MiB, increment: 0.00 MiB
peak memory: 2506.68 MiB, increment: 0.00 MiB


In [None]:
def row_wise_f1_score(y_true, y_pred):

    y_true = y_true.apply(lambda x: set(x))
    y_pred = y_pred.apply(lambda x: set(x.split()))

    tp = np.array([len(x[0] & x[1]) for x in zip(y_true, y_pred)])
    fp = y_pred.apply(lambda x: len(x)).values - tp
    fn = y_true.apply(lambda x: len(x)).values - tp

    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f1 = 2 * ((precision * recall) / (precision + recall))
    return f1

df_train['f1'] = row_wise_f1_score(df_train['target'], df_train['matches'])
df_train['f1'].mean()

0.72503026259278

# Submission

In [None]:
df_submit = pd.read_csv('./data/sample_submission.csv')
df_submit

Unnamed: 0,posting_id,matches
0,test_2255846744,test_2255846744
1,test_3588702337,test_3588702337
2,test_4015706929,test_4015706929


In [None]:
df_submit.to_csv('submission.csv', index=False)

Unnamed: 0,posting_id,matches
0,test_2255846744,test_2255846744 test_4015706929 test_3588702337
1,test_3588702337,test_3588702337 test_4015706929 test_2255846744
2,test_4015706929,test_4015706929 test_2255846744 test_3588702337
