In [1]:
%load_ext memory_profiler
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import gc
import pickle
import tensorflow as tf
from tensorflow.keras import layers, Model
from tensorflow.keras.applications import EfficientNetB0
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse
import sys, os

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

[nltk_data] Downloading package stopwords to /Users/ygong/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
AUTOTUNE = tf.data.experimental.AUTOTUNE
SEED = 100
BATCH_SIZE =128
CHUNK_SIZE = 2048
IMAGE_HEIGHT = 224
IMAGE_WIDTH = 224

physical_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs:", len(physical_devices))

Num GPUs: 0


# Load Data

In [3]:
def preprocess_image(image):
    image = tf.image.decode_jpeg(image, channels=3)
    image = tf.image.resize(image, [IMAGE_WIDTH, IMAGE_HEIGHT])
#     image /= 255  # normalize to [0,1] rangeI'm not a 
    return image

def load_and_preprocess_image(path):
    image = tf.io.read_file(path)
    return preprocess_image(image)

def augmentation(ds):
    data_augmentation = tf.keras.Sequential([
        tf.keras.layers.experimental.preprocessing.RandomFlip("horizontal"),
        layers.experimental.preprocessing.RandomRotation(0.3),
#         layers.experimental.preprocessing.RandomTranslation(
#             height_factor=(-0.2, 0.2), width_factor=(-0.2, 0.2)),
#         layers.experimental.preprocessing.RandomZoom(0.2, 0.2),
    ])
    
    # Batch all datasets
    ds = ds.batch(BATCH_SIZE)

    # Use data augmentation only on the training set
    ds = ds.map(lambda x: data_augmentation(x))

    # Prefecting on all datasets
    return ds.prefetch(1)

def prepare_data(df, augment=False):
    # Load images
    path_ds = tf.data.Dataset.from_tensor_slices(df['image_paths'])
    image_ds = path_ds.map(load_and_preprocess_image, num_parallel_calls=AUTOTUNE)

    if augment:
        ds = augmentation(image_ds)
    else:
        ds = image_ds.batch(BATCH_SIZE).prefetch(1)
    
    return ds

In [4]:
load_dir = os.getcwd() + '/data' #'/kaggle/input/ranzcr-clip-catheter-line-classification'

# Load and process images
df_train = pd.read_csv(load_dir + '/train.csv')
df_train['image_paths'] = load_dir + '/train_images/' + df_train['image'] 

df_test = pd.read_csv(load_dir + '/test.csv')
df_test['image_paths'] = load_dir + '/test_images/' + df_test['image'] 

train_ds = prepare_data(df_train, augment=False)
test_ds = prepare_data(df_test, augment=False)

# Ground truth
tmp = df_train.groupby('label_group').posting_id.agg('unique').to_dict()
df_train['target'] = df_train.label_group.map(tmp)

# load as cudf
# cudf_train = cudf.read_csv(load_dir + '/train.csv')
# cudf_test = cudf.read_csv(load_dir + '/test.csv')

# Image embedding

In [105]:
model = EfficientNetB0(weights='imagenet', include_top=False, pooling='avg', input_shape=None)
train_embeddings_image = model.predict(train_ds, verbose=1)

del model
_ = gc.collect()



In [5]:
with open('./data/image_embeddings.pkl', 'rb') as f:
    train_embeddings_image = pickle.load(f)

## K-Nearest Neighbors

In [6]:
knn = NearestNeighbors(n_neighbors=50)
knn.fit(train_embeddings_image)

NearestNeighbors(n_neighbors=50)

In [7]:
image_index = []

for i in range(int(np.ceil(len(df_train) / CHUNK_SIZE))):
    start_idx = i * CHUNK_SIZE
    end_idx = min((i + 1) * CHUNK_SIZE, len(df_train))
    print('Chunk', start_idx, 'to', end_idx)

    dist, idx = knn.kneighbors(train_embeddings_image[start_idx:end_idx, :])
    counts = (dist < 7).sum(axis=1)
    chunk_index = [idx[i, :counts[i]].tolist() for i in range(end_idx - start_idx)]
    image_index += chunk_index

del train_embeddings_image, dist, idx, counts, chunk_index, knn
_ = gc.collect()

Chunk 0 to 2048
Chunk 2048 to 4096
Chunk 4096 to 6144
Chunk 6144 to 8192
Chunk 8192 to 10240
Chunk 10240 to 12288
Chunk 12288 to 14336
Chunk 14336 to 16384
Chunk 16384 to 18432
Chunk 18432 to 20480
Chunk 20480 to 22528
Chunk 22528 to 24576
Chunk 24576 to 26624
Chunk 26624 to 28672
Chunk 28672 to 30720
Chunk 30720 to 32768
Chunk 32768 to 34250


# Text Embedding

In [8]:
def remove_stopwords(sentence):
    
    languages = ['english', 'indonesian', 'portuguese']
    cache = set(stopwords.words(languages))
    sentence = ' '.join([word for word in sentence.split() if word not in cache])
              
    return sentence

In [9]:
sentences = df_train.title.tolist()
sentences = [remove_stopwords(sentence) for sentence in sentences]

In [10]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)
sequences = tokenizer.texts_to_sequences(sentences)
sentences = tokenizer.sequences_to_texts(sequences)

In [None]:
curl -L "https://tfhub.dev/google/universal-sentence-encoder-multilingual/3" | tar -zxvC /data/moduleA

In [32]:
import tensorflow_hub as hub
embed = hub.KerasLayer('./data/nnlm-id-dim128-with-normalization_2/')
# "https://tfhub.dev/google/nnlm-en-dim128/2"
# "https://tfhub.dev/google/universal-sentence-encoder-multilingual/3"
# "https://tfhub.dev/google/nnlm-id-dim128-with-normalization/2"









In [33]:
embeddings = embed(sentences)





In [34]:
embeddings.shape

TensorShape([34250, 128])

In [40]:
text_embeddings = sparse.csr_matrix(embeddings)

def cos_sim(A, B):
    num = np.dot(A, B.T)
    p1 = np.sqrt(np.sum(A.power(2), axis=1))
    p2 = np.sqrt(np.sum(B.power(2), axis=1)).T

    return num / (p1 * p2)

In [22]:
from sklearn.metrics.pairwise import cosine_similarity
cosine_similarity(embeddings, embeddings)

array([[0.9999999 , 0.5101355 , 0.5266119 , ..., 0.5022289 , 0.29337603,
        0.37285593],
       [0.5101355 , 1.0000001 , 0.42481694, ..., 0.43734324, 0.26885647,
        0.38233668],
       [0.5266119 , 0.42481694, 0.99999976, ..., 0.27655095, 0.5073701 ,
        0.32950372],
       ...,
       [0.5022289 , 0.43734324, 0.27655095, ..., 1.0000001 , 0.21162239,
        0.43034253],
       [0.29337603, 0.26885647, 0.5073701 , ..., 0.21162239, 1.0000004 ,
        0.23478188],
       [0.37285593, 0.38233668, 0.32950372, ..., 0.43034253, 0.23478188,
        0.99999994]], dtype=float32)

In [27]:
%memit
text_index = []

for i in range(int(np.ceil(len(df_train) / CHUNK_SIZE))):
    start_idx = i * CHUNK_SIZE
    end_idx = min((i + 1) * CHUNK_SIZE, len(df_train))
    print('Chunk', start_idx, 'to', end_idx)

    sim_chunk = cosine_similarity(embeddings, embeddings[start_idx:end_idx, :]).T
    r, c = np.where(sim_chunk > 0.9)
    text_index += np.split(c, np.flatnonzero(r[1:] != r[:-1])+1)

del sim_chunk, r, c
_= gc.collect()
%memit

peak memory: 7746.26 MiB, increment: 0.00 MiB
Chunk 0 to 2048
Chunk 2048 to 4096
Chunk 4096 to 6144
Chunk 6144 to 8192
Chunk 8192 to 10240
Chunk 10240 to 12288
Chunk 12288 to 14336
Chunk 14336 to 16384
Chunk 16384 to 18432
Chunk 18432 to 20480
Chunk 20480 to 22528
Chunk 22528 to 24576
Chunk 24576 to 26624
Chunk 26624 to 28672
Chunk 28672 to 30720
Chunk 30720 to 32768
Chunk 32768 to 34250
peak memory: 7746.41 MiB, increment: -0.08 MiB


In [28]:
text_index

[array([    0, 33161]),
 array([1]),
 array([2]),
 array([    3,  2522, 20105, 28878, 32290]),
 array([4]),
 array([5]),
 array([6]),
 array([7]),
 array([8]),
 array([9]),
 array([   10, 14135, 14136]),
 array([11, 12]),
 array([11, 12]),
 array([13]),
 array([14]),
 array([15]),
 array([   16, 22789]),
 array([   17, 20243]),
 array([18]),
 array([19]),
 array([20]),
 array([21]),
 array([22]),
 array([23]),
 array([   24, 25635]),
 array([25]),
 array([26]),
 array([27]),
 array([28, 29]),
 array([   28,    29, 31410]),
 array([30]),
 array([   31,   739, 31422]),
 array([   32, 27816]),
 array([   33, 18770, 20169, 24202]),
 array([34]),
 array([35]),
 array([36]),
 array([   37, 25287]),
 array([38]),
 array([   39, 15125, 18182]),
 array([40]),
 array([41]),
 array([   42, 12715, 26662]),
 array([   43, 13315]),
 array([   44, 15617, 30270]),
 array([45]),
 array([   46,  3835, 15994, 23621]),
 array([47]),
 array([   48, 19570]),
 array([49]),
 array([50]),
 array([51]),
 array(

# Evaluation

In [29]:
%memit
df_train['matches'] = [
    ' '.join(set(df_train['posting_id'][text].tolist() + df_train['posting_id'][image].tolist()))
    for text, image in zip(text_index, image_index)
]
%memit

peak memory: 7749.11 MiB, increment: 0.00 MiB
peak memory: 7749.11 MiB, increment: 0.00 MiB


In [30]:
def row_wise_f1_score(y_true, y_pred):

    y_true = y_true.apply(lambda x: set(x))
    y_pred = y_pred.apply(lambda x: set(x.split()))

    tp = np.array([len(x[0] & x[1]) for x in zip(y_true, y_pred)])
    fp = y_pred.apply(lambda x: len(x)).values - tp
    fn = y_true.apply(lambda x: len(x)).values - tp

    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f1 = 2 * ((precision * recall) / (precision + recall))
    return f1

df_train['f1'] = row_wise_f1_score(df_train['target'], df_train['matches'])
df_train['f1'].mean()

0.6985514113093104

# Submission

In [None]:
df_submit = pd.read_csv('./data/sample_submission.csv')
df_submit

Unnamed: 0,posting_id,matches
0,test_2255846744,test_2255846744
1,test_3588702337,test_3588702337
2,test_4015706929,test_4015706929


In [None]:
df_submit.to_csv('submission.csv', index=False)

Unnamed: 0,posting_id,matches
0,test_2255846744,test_2255846744 test_4015706929 test_3588702337
1,test_3588702337,test_3588702337 test_4015706929 test_2255846744
2,test_4015706929,test_4015706929 test_2255846744 test_3588702337
