In [1]:
%load_ext memory_profiler
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import gc
import pickle
import tensorflow as tf
from tensorflow.keras import layers, Model
from tensorflow.keras.applications import EfficientNetB0
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse
import sys, os

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [2]:
# Notebook-wide configuration read by the cells below.
AUTOTUNE = tf.data.experimental.AUTOTUNE  # passed as num_parallel_calls in prepare_data
SEED = 100  # random seed constant
BATCH_SIZE =128  # images per batch in the tf.data pipelines
CHUNK_SIZE = 4096  # rows sent to each nearest-neighbour query
IMAGE_HEIGHT = 224  # target image size used by preprocess_image
IMAGE_WIDTH = 224

# Report how many GPUs TensorFlow can see
physical_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs:", len(physical_devices))

Num GPUs: 0


# Load Data

In [3]:
def preprocess_image(image):
    """Decode raw JPEG bytes and resize them to the configured image size.

    Args:
        image: scalar string tensor with the encoded contents of a JPEG file.

    Returns:
        A 3-channel image tensor resized to IMAGE_HEIGHT x IMAGE_WIDTH.
    """
    image = tf.image.decode_jpeg(image, channels=3)
    # tf.image.resize expects size as [new_height, new_width]; the previous
    # [width, height] order only worked because both constants are equal.
    image = tf.image.resize(image, [IMAGE_HEIGHT, IMAGE_WIDTH])
    # Pixel values are intentionally NOT divided by 255 here.
    # NOTE(review): Keras EfficientNet models are documented to rescale
    # inputs internally - confirm before re-enabling normalisation.
    return image

def load_and_preprocess_image(path):
    """Read the file at `path` and hand its raw bytes to preprocess_image."""
    return preprocess_image(tf.io.read_file(path))

def augmentation(ds):
    """Batch a dataset of images and apply random flip/rotation to each batch.

    The random layers are seeded with SEED so repeated runs draw the same
    augmentation parameters, and `training=True` is passed explicitly because
    Keras random preprocessing layers are only active in training mode.

    Args:
        ds: unbatched tf.data.Dataset yielding image tensors.

    Returns:
        A batched, augmented and prefetched tf.data.Dataset.
    """
    data_augmentation = tf.keras.Sequential([
        layers.experimental.preprocessing.RandomFlip("horizontal", seed=SEED),
        layers.experimental.preprocessing.RandomRotation(0.3, seed=SEED),
#         layers.experimental.preprocessing.RandomTranslation(
#             height_factor=(-0.2, 0.2), width_factor=(-0.2, 0.2)),
#         layers.experimental.preprocessing.RandomZoom(0.2, 0.2),
    ])

    # Batch first so the augmentation layers run on whole batches
    ds = ds.batch(BATCH_SIZE)

    # Augment every batch of whatever dataset the caller passed in
    ds = ds.map(lambda x: data_augmentation(x, training=True))

    # Prefetch one batch so input preparation overlaps with consumption
    return ds.prefetch(1)

def prepare_data(df, augment=False):
    """Build a batched image dataset from the `image_paths` column of `df`.

    Args:
        df: DataFrame with an 'image_paths' column of file paths.
        augment: when truthy, route the images through `augmentation`;
            otherwise just batch and prefetch them.

    Returns:
        A tf.data.Dataset of image batches.
    """
    paths = tf.data.Dataset.from_tensor_slices(df['image_paths'])
    images = paths.map(load_and_preprocess_image, num_parallel_calls=AUTOTUNE)

    if not augment:
        return images.batch(BATCH_SIZE).prefetch(1)

    return augmentation(images)

In [4]:
# Dataset root, relative to the current working directory
load_dir = os.getcwd() + '/data' #'/kaggle/input/ranzcr-clip-catheter-line-classification'

# Training metadata, plus the on-disk path of every training image
df_train = pd.read_csv(f'{load_dir}/train.csv')
df_train['image_paths'] = f'{load_dir}/train_images/' + df_train['image']

# Same for the test split
df_test = pd.read_csv(f'{load_dir}/test.csv')
df_test['image_paths'] = f'{load_dir}/test_images/' + df_test['image']

# Un-augmented image pipelines for both splits
train_ds = prepare_data(df_train, augment=False)
test_ds = prepare_data(df_test, augment=False)

# Ground truth: every row's target is the array of posting_ids that share
# its label_group (the row's own posting_id included)
tmp = df_train.groupby('label_group')['posting_id'].unique().to_dict()
df_train['target'] = df_train['label_group'].map(tmp)

# Image embedding

In [12]:
# Computing the embeddings is the expensive step of this notebook, and the
# following cell reads them back from this pickle. Guard the work behind that
# same file so a fresh "Restart & Run All" either reuses the cache or creates it.
image_embeddings_path = './data/image_embeddings.pkl'

if os.path.exists(image_embeddings_path):
    # NOTE(security): pickle.load can execute arbitrary code; only load a
    # cache file that this notebook produced.
    with open(image_embeddings_path, 'rb') as f:
        embeddings_image = pickle.load(f)
else:
    # Headless EfficientNetB0 with global average pooling -> one vector per image
    model = EfficientNetB0(weights='imagenet', include_top=False, pooling='avg', input_shape=None)
    embeddings_image = model.predict(train_ds, verbose=1)

    # Persist the result so later runs (and the next cell) can load it
    with open(image_embeddings_path, 'wb') as f:
        pickle.dump(embeddings_image, f)

    # The network is no longer needed once the embeddings exist
    del model
    _ = gc.collect()



In [4]:
# Load the cached image embeddings.
# NOTE(security): pickle.load can execute arbitrary code; only load a file
# that you produced yourself.
with open('./data/image_embeddings.pkl', 'rb') as f:
    embeddings_image = pickle.load(f)

# Neighbour indices computed from these embeddings are later used as row
# positions into df_train, and zip() would silently truncate on a mismatch,
# so fail loudly if the cache does not line up with the loaded metadata.
if len(embeddings_image) != len(df_train):
    raise ValueError(
        'image_embeddings.pkl has %d rows but df_train has %d'
        % (len(embeddings_image), len(df_train)))

## K-Nearest Neighbors

In [5]:
# Index the image embeddings; every query returns the 50 nearest rows.
# The fitted estimator is the cell's last expression, so its repr is displayed.
knn = NearestNeighbors(n_neighbors=50)
knn.fit(embeddings_image)

NearestNeighbors(n_neighbors=50)

In [7]:
# For every training image, keep the neighbour positions whose distance is
# below 6.8. Queries are issued CHUNK_SIZE rows at a time.
image_index = []

for start_idx in range(0, len(df_train), CHUNK_SIZE):
    end_idx = min(start_idx + CHUNK_SIZE, len(df_train))
    print('Chunk', start_idx, 'to', end_idx)

    dist, idx = knn.kneighbors(embeddings_image[start_idx:end_idx, :])
    # Number of neighbours per row that fall under the distance cut-off
    counts = (dist < 6.8).sum(axis=1)
    chunk_index = [row[:k].tolist() for row, k in zip(idx, counts)]
    image_index.extend(chunk_index)

# Free the large intermediates. Note that `knn` and `embeddings_image` are
# removed too, so any later cell that needs them must recreate them first.
del embeddings_image, dist, idx, counts, chunk_index, knn
_ = gc.collect()

Chunk 0 to 4096
Chunk 4096 to 8192
Chunk 8192 to 12288
Chunk 12288 to 16384
Chunk 16384 to 20480
Chunk 20480 to 24576
Chunk 24576 to 28672
Chunk 28672 to 32768
Chunk 32768 to 34250


# Text Embedding

In [68]:
# def remove_stopwords(sentence):
    
#     languages = ['english', 'indonesian']
#     cache = set(stopwords.words(languages))
#     sentence = ' '.join([word for word in sentence.split() if word not in cache])
              
#     return sentence

# sentences = [remove_stopwords(sentence) for sentence in sentences]

In [8]:
# Product titles, in row order, as the text corpus
sentences = df_train['title'].tolist()

In [159]:
# tokenizer = Tokenizer()
# tokenizer.fit_on_texts(sentences)
# sequences = tokenizer.texts_to_sequences(sentences)
# sentences = tokenizer.sequences_to_texts(sequences)

In [25]:
# TF-IDF representation of the titles, limited to 20000 vocabulary terms.
# binary=True makes the term-frequency part 0/1 (per scikit-learn docs the
# output is still IDF-weighted, not strictly binary).
vectorizer = TfidfVectorizer(binary=True, max_features=20000)
# Fit on the titles and transform them in one pass
embeddings_text = vectorizer.fit_transform(sentences)

In [157]:
# import tensorflow_hub as hub
# embed2 = hub.KerasLayer('./data/nnlm-id-dim128-with-normalization_2/')
# "https://tfhub.dev/google/nnlm-en-dim128/2"
# "https://tfhub.dev/google/universal-sentence-encoder-multilingual/3"
# "https://tfhub.dev/google/nnlm-id-dim128-with-normalization/2"

In [10]:
def _neighbors_within_threshold(model, embeddings, threshold):
    """Query `model` in chunks and keep neighbours closer than `threshold`.

    Args:
        model: fitted estimator exposing `kneighbors(X) -> (dist, idx)`.
        embeddings: 2-D array or sparse matrix, one row per query item.
        threshold: strict upper bound on the neighbour distance.

    Returns:
        A list with one entry per row of `embeddings`; each entry is the list
        of neighbour indices whose distance is below `threshold`.
    """
    n_rows = embeddings.shape[0]
    matches = []

    # Query CHUNK_SIZE rows at a time to bound the size of the distance matrix
    for start_idx in range(0, n_rows, CHUNK_SIZE):
        end_idx = min(start_idx + CHUNK_SIZE, n_rows)

        dist, idx = model.kneighbors(embeddings[start_idx:end_idx, :])
        # Counting is enough to select a prefix: this relies on kneighbors
        # returning each row's neighbours ordered by increasing distance.
        counts = (dist < threshold).sum(axis=1)
        matches.extend(row[:k].tolist() for row, k in zip(idx, counts))

    return matches


def search_similar_image(embeddings, threshold):
    """Return, per row, the image-neighbour indices closer than `threshold`.

    Uses the module-level `knn` estimator, which must be fitted beforehand.
    """
    return _neighbors_within_threshold(knn, embeddings, threshold)


def search_similar_text(embeddings, threshold):
    """Return, per row, the text-neighbour indices closer than `threshold`.

    Uses the module-level `knn_text` estimator, which must be fitted beforehand.
    """
    return _neighbors_within_threshold(knn_text, embeddings, threshold)


def parameter_tuning(df_train, text_index, embeddings, threshold):
    """Evaluate one image-distance threshold and report the mean row-wise F1.

    Image neighbours are recomputed for `threshold`, merged with the given
    text neighbours, and scored against `df_train['target']`.

    Side effects: overwrites the 'matches' and 'f1' columns of `df_train`
    and prints the threshold and score.

    Args:
        df_train: DataFrame with 'posting_id' and 'target' columns.
        text_index: per-row lists of neighbour row positions from the text model.
        embeddings: image embeddings passed to `search_similar_image`.
        threshold: distance cut-off for image neighbours.

    Returns:
        The mean F1 score over all rows.
    """
    print('threshold:', threshold)
    image_index = search_similar_image(embeddings, threshold)

    # Neighbour indices are row positions, so look them up positionally.
    # Label-based Series indexing only works with a default RangeIndex.
    posting_ids = df_train['posting_id'].to_numpy()
    df_train['matches'] = [
        ' '.join(
            set(posting_ids[text].tolist() +
                posting_ids[image].tolist()))
        for text, image in zip(text_index, image_index)
    ]
    df_train['f1'] = row_wise_f1_score(df_train['target'], df_train['matches'])
    f1_score = df_train['f1'].mean()

    print('f1-score:', f1_score)
    print('\n')

    # Returned so that callers can compare thresholds programmatically
    return f1_score
    print('\n')

In [176]:
# Sweep the image-distance threshold over [6, 7) in steps of 0.1.
# This cell needs `text_index`, `embeddings_image`, the fitted `knn` and
# `row_wise_f1_score` to exist in the kernel already.
for threshold in np.arange(6, 7, 0.1):
    parameter_tuning(df_train, text_index, embeddings_image, threshold)

threshold: 6.0
f1-score: 0.722843683896605


threshold: 6.1
f1-score: 0.7232640924332517


threshold: 6.199999999999999
f1-score: 0.7236380503705018


threshold: 6.299999999999999
f1-score: 0.7242337449288548


threshold: 6.399999999999999
f1-score: 0.7243238397305913


threshold: 6.499999999999998
f1-score: 0.7246730488185004


threshold: 6.599999999999998
f1-score: 0.7247354028855194


threshold: 6.6999999999999975
f1-score: 0.7247448445564315


threshold: 6.799999999999997
f1-score: 0.7248163077903594


threshold: 6.899999999999997
f1-score: 0.7244262841167836




In [26]:
# Index the TF-IDF title vectors; every query returns the 50 nearest rows.
# The fitted estimator is the cell's last expression, so its repr is displayed.
knn_text = NearestNeighbors(n_neighbors=50)
knn_text.fit(embeddings_text)

NearestNeighbors(n_neighbors=50)

In [27]:
# For every title, keep the neighbour positions whose distance is below 0.8.
# Queries are issued CHUNK_SIZE rows at a time.
text_index = []

for start_idx in range(0, len(df_train), CHUNK_SIZE):
    end_idx = min(start_idx + CHUNK_SIZE, len(df_train))
    print('Chunk', start_idx, 'to', end_idx)

    dist, idx = knn_text.kneighbors(embeddings_text[start_idx:end_idx, :])
    # Number of neighbours per row that fall under the distance cut-off
    counts = (dist < .8).sum(axis=1)
    chunk_index = [row[:k].tolist() for row, k in zip(idx, counts)]
    text_index.extend(chunk_index)

# Free the large intermediates (this also removes `knn_text` and `embeddings_text`)
del embeddings_text, dist, idx, counts, chunk_index, knn_text
_ = gc.collect()

Chunk 0 to 4096
Chunk 4096 to 8192
Chunk 8192 to 12288
Chunk 12288 to 16384
Chunk 16384 to 20480
Chunk 20480 to 24576
Chunk 24576 to 28672
Chunk 28672 to 32768
Chunk 32768 to 34250


In [163]:
# %memit
# text_index = []

# for i in range(int(np.ceil(len(embeddings) / CHUNK_SIZE))):
#     start_idx = i * CHUNK_SIZE
#     end_idx = min((i + 1) * CHUNK_SIZE, len(df_train))
#     print('Chunk', start_idx, 'to', end_idx)

#     sim_chunk = cosine_similarity(embeddings, embeddings[start_idx:end_idx, :]).T
#     r, c = np.where(sim_chunk > 0.95)
#     text_index += np.split(c, np.flatnonzero(r[1:] != r[:-1])+1)

# del sim_chunk, r, c
# _= gc.collect()
# %memit

# Evaluation

In [28]:
%memit
# Union of text and image neighbours per row, stored as a space-separated
# string of posting_ids. Neighbour indices are row positions, so they are
# resolved positionally through a NumPy array. Label-based Series indexing
# would only be correct with a default RangeIndex.
posting_ids = df_train['posting_id'].to_numpy()
df_train['matches'] = [
    ' '.join(
        set(posting_ids[text].tolist() +
            posting_ids[image].tolist()))
    for text, image in zip(text_index, image_index)
]
%memit

peak memory: 2097.66 MiB, increment: 0.00 MiB
peak memory: 2097.66 MiB, increment: 0.00 MiB


In [29]:
def row_wise_f1_score(y_true, y_pred):
    """Compute the F1 score of each row's predicted id set against its truth.

    Args:
        y_true: iterable (e.g. pandas Series) of collections of ids.
        y_pred: iterable (e.g. pandas Series) of whitespace-separated id strings.

    Returns:
        NumPy float array with one F1 value per row. Rows with no overlap,
        including rows where both sets are empty, score 0.0 instead of NaN.
    """
    true_sets = [set(ids) for ids in y_true]
    pred_sets = [set(ids.split()) for ids in y_pred]

    tp = np.array(
        [len(t & p) for t, p in zip(true_sets, pred_sets)], dtype=float)
    sizes = np.array(
        [len(t) + len(p) for t, p in zip(true_sets, pred_sets)], dtype=float)

    # F1 = 2PR / (P + R) simplifies to 2*TP / (|true| + |pred|). This form has
    # no 0/0 when TP is zero, which previously produced NaN and poisoned the
    # mean. The only remaining zero denominator is both sets empty.
    f1 = np.zeros_like(tp)
    np.divide(2 * tp, sizes, out=f1, where=sizes > 0)
    return f1

# Score every row's predicted matches against its ground-truth group and
# display the mean F1 as the cell output
df_train['f1'] = row_wise_f1_score(df_train['target'], df_train['matches'])
df_train['f1'].mean()

0.7324739189966178

# Submission

In [None]:
# Load the sample submission and display it
df_submit = pd.read_csv('./data/sample_submission.csv')
df_submit

Unnamed: 0,posting_id,matches
0,test_2255846744,test_2255846744
1,test_3588702337,test_3588702337
2,test_4015706929,test_4015706929


In [None]:
# Write the submission file without the index column.
# NOTE(review): no cell visible here assigns predicted matches to df_submit
# before this write - confirm it is not just the sample submission.
df_submit.to_csv('submission.csv', index=False)

Unnamed: 0,posting_id,matches
0,test_2255846744,test_2255846744 test_4015706929 test_3588702337
1,test_3588702337,test_3588702337 test_4015706929 test_2255846744
2,test_4015706929,test_4015706929 test_2255846744 test_3588702337
