<a href="https://colab.research.google.com/github/fjpa121197/ImageCLEF2021/blob/main/ImageCLEF2021_Submissions_Notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
from zipfile import ZipFile
os.environ['KAGGLE_USERNAME'] = "####" # username from the json file
os.environ['KAGGLE_KEY'] = "####" # key from the json file


# File containing features for training, validation and testing images.
# Also a merged csv file containing the actual concepts for the training and validation images
!kaggle datasets download -d fjpa121197/imageclef-2021-final-features-and-concepts
!kaggle datasets download -d fjpa121197/imageclef-2021-test-images

imageclef-2021-final-features-and-concepts.zip: Skipping, found more recently modified local copy (use --force to force download)
Downloading imageclef-2021-test-images.zip to /content
 44% 9.00M/20.5M [00:00<00:00, 33.8MB/s]
100% 20.5M/20.5M [00:00<00:00, 51.8MB/s]


In [None]:
# Extract the downloaded features/concepts archive into the current working directory.
clef2021_final_features = "/content/imageclef-2021-final-features-and-concepts.zip"
# The handle is deliberately not called `zip`: notebook variables are global, so
# that name would keep shadowing the builtin for every later cell.
with ZipFile(clef2021_final_features, 'r') as features_archive:
  features_archive.extractall()
  print('done with final features file')

done with final features file


In [None]:
# Unzip the 2021 test images into the current working directory.
clef2021_test_images = "/content/imageclef-2021-test-images.zip"
# The handle is deliberately not called `zip`: notebook variables are global, so
# that name would keep shadowing the builtin for every later cell.
with ZipFile(clef2021_test_images, 'r') as test_images_archive:
  test_images_archive.extractall()
  print('done with 2021 image test dataset')

done with 2021 image test dataset


In [None]:
import scipy
from sklearn.neighbors import NearestNeighbors
import numpy as np
import pandas as pd
from tqdm import tqdm


import tensorflow as tf
import csv
from sklearn.preprocessing import MultiLabelBinarizer

import json
import pickle

# Submission 1 (Information Retrieval Approach):
The features for the training, validation and testing images were extracted with a DenseNet-121 model fine-tuned on the training and validation images. The average-pooling layer (dimension 1024) is used as the feature extractor. A k-nearest-neighbours search (k = 1, cosine metric) then finds the closest training/validation image for each test image, and the tags of that image are assigned to the test image.

**Aicrowd submission id: 132945**

**F1 score: 0.469**

In [None]:
# Extracted features of the training + validation images: these are the images
# that get indexed and searched.
train_features_path = '/content/imageclef-2021-final-features-and-concepts/train-val-images-features.npy'
# Extracted features of the test images: these are the query images.
test_features_path = '/content/imageclef-2021-final-features-and-concepts/test-images-features.npy'

train_data_1 = np.load(train_features_path)
test_data_1 = np.load(test_features_path)

# The first column of each array holds the image id, so the feature vectors
# themselves start at column 1.
train_data_2 = train_data_1[:, 1:]
test_data_2 = test_data_1[:, 1:]

# Actual tags of the training/validation images (tab-separated: image id, tags)
db_images_tags = pd.read_csv(
    '/content/imageclef-2021-final-features-and-concepts/merged-train-val-concepts.csv',
    sep='\t',
    names=['ImageId', 'Tags'],
)

In [None]:
# Index the training/validation feature vectors for a 1-nearest-neighbour search.
# The built-in 'cosine' metric is used instead of passing scipy's cosine function:
# a Python callable is evaluated once per pair of vectors, while the named metric
# is computed by scikit-learn directly, and it does not depend on `scipy.spatial`
# having been imported as a side effect of another import.
neigh = NearestNeighbors(n_neighbors=1, metric='cosine')
neigh.fit(train_data_2)

# Query with the test feature vectors. With return_distance=True the result is a
# (distances, indices) pair: results[0][i] holds the distances and results[1][i]
# the row indices (into train_data_2) of the neighbours of query image i.
results = neigh.kneighbors(test_data_2, return_distance=True)

In [None]:
# Tag string of every indexed image, keyed by image id. Built once instead of
# creating a DataFrame and running a merge for every query image. If an id occurs
# more than once, its first row is kept (the row the per-image inner join used).
tags_by_image_id = db_images_tags.drop_duplicates(subset='ImageId').set_index('ImageId')['Tags']

final_list_predictions = []

for idx, test_image in enumerate(test_data_1):

  # Column 0 of the feature array holds the numeric part of the image id
  test_image_id = 'synpic' + str(int(test_image[0]))

  # Image ids of the retrieved neighbours, in the order returned by kneighbors
  # (a single neighbour with the current n_neighbors=1 setting).
  neighbour_ids = ['synpic' + str(int(train_data_1[result][0])) for result in results[1][idx]]

  # Use the first neighbour that has an entry in the tags table
  tagged_neighbour_ids = [image_id for image_id in neighbour_ids if image_id in tags_by_image_id.index]
  if not tagged_neighbour_ids:
    raise KeyError('No tags found for the neighbours of ' + test_image_id + ': ' + ', '.join(neighbour_ids))
  closest_image_tags = tags_by_image_id.loc[tagged_neighbour_ids[0]]

  # Drop repeated tags but keep their first-seen order. A set would give an
  # arbitrary order that can differ between runs of the notebook.
  candidate_tags_str = ';'.join(dict.fromkeys(closest_image_tags.split(";")))
  final_list_predictions.append([test_image_id, candidate_tags_str])

# Save one line per query image: the image id and its predicted tags, separated by '|'.
np.savetxt("submission-1.csv",final_list_predictions, delimiter='|',fmt = '% s')

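# Submission 2 (Multi-label classification approach):

Two multi-label classifiers are applied to every test image: one predicts diagnostic procedure (DP) concepts and the other predicts BPO concepts. The predicted probabilities are binarised with a per-model threshold (0.4 for DP, 0.1 for BPO), and the concepts from both models are merged into a single `;`-separated list per image.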



**Aicrowd submission id: 133912**

**F1 score: 0.412**

In [None]:
def transform_images(path_to_image):
  """Load one image file and prepare it as input for a DenseNet model.

  The image is read with Keras at a target size of 224x224, converted to an
  array and passed through the DenseNet `preprocess_input` function.

  Args:
    path_to_image: Path of the image file to load.

  Returns:
    The preprocessed image array.
  """
  keras_image = tf.keras.preprocessing.image
  loaded_image = keras_image.load_img(path=path_to_image, target_size=(224, 224))
  pixel_array = keras_image.img_to_array(loaded_image)
  return tf.keras.applications.densenet.preprocess_input(pixel_array)

In [None]:
# Load and preprocess every test image once; the same images are passed to both
# models (diagnostic procedure and bpo).
test_images_ids = []
test_images = []
test_images_directory = '/content/ImageCLEF2021_CaptionConceptsTasks_TestSet_444_Images'
# os.listdir returns entries in arbitrary order, so the file names are sorted to
# give the ids/images the same order on every run.
for image in sorted(os.listdir(test_images_directory)):
  # The image id is the file name without its extension
  test_images_ids.append(os.path.splitext(image)[0])
  test_images.append(transform_images(os.path.join(test_images_directory, image)))

## Diagnostic Procedure Predictions

In [None]:
# Diagnostic procedure (DP) classifier. compile=False loads the model without
# compiling it; in this notebook it is only used through predict().
dp_model = tf.keras.models.load_model(
    '/content/dp-classifier-partial-unfreeze-threshold40-use-for-predictions.h5',
    compile=False,
)

# Pickled object whose inverse_transform() maps the binary predictions back to
# concept labels (presumably the fitted MultiLabelBinarizer - confirm).
# NOTE: unpickling can execute arbitrary code, so only load a file you trust.
with open("/content/mlb_dp_classifier.pkl", 'rb') as dp_pickle_file:
    mlb = pickle.load(dp_pickle_file)

In [None]:
# Stack the preprocessed test images into one array and score them with the DP model
dp_input_batch = np.array(test_images)
dp_predictions = dp_model.predict(dp_input_batch)

In [None]:
# Binarise the DP scores in place with the 0.4 threshold (the threshold that
# previously gave the better f1-score): scores >= 0.4 become 1, the rest 0.
dp_at_or_above_threshold = dp_predictions >= 0.4
dp_below_threshold = dp_predictions < 0.4
dp_predictions[dp_at_or_above_threshold] = 1
dp_predictions[dp_below_threshold] = 0

# Map every binary row back to its tuple of concept labels
test_images_labels_predicted_dp = mlb.inverse_transform(dp_predictions)

In [None]:
# The concept(s) are needed as one string per image, separated by ';' if applicable.
# An image without any predicted DP concept gets NaN instead of an empty string,
# exactly like the BPO predictions: an empty string is not removed by the
# `row.dropna()` used when the DP and BPO tags are joined, which would leave a
# dangling ';' in front of the BPO tags.
val_labels_united_dp = []
for idx, prediction in enumerate(test_images_labels_predicted_dp):
  str_concepts = ';'.join(prediction)
  val_labels_united_dp.append([test_images_ids[idx], str_concepts if str_concepts else np.nan])

In [None]:
# One row per test image: its id and the ';'-separated DP concepts
dp_predictions_df = pd.DataFrame(
    data=val_labels_united_dp,
    columns=['ImageId', 'dp_predictions'],
)

## BPO Predictions

In [None]:
# BPO classifier. compile=False loads the model without compiling it; in this
# notebook it is only used through predict().
bpo_model = tf.keras.models.load_model(
    '/content/bpo-classifier-partial-unfreeze-threshold1-use-for-predictions.h5',
    compile=False,
)

# Pickled object whose inverse_transform() maps the binary predictions back to
# concept labels (presumably the fitted MultiLabelBinarizer - confirm).
# NOTE: unpickling can execute arbitrary code, so only load a file you trust.
with open("/content/mlb_bpo_classifier.pkl", 'rb') as bpo_pickle_file:
    mlb_bpo = pickle.load(bpo_pickle_file)

In [None]:
# Stack the preprocessed test images into one array and score them with the BPO model
bpo_input_batch = np.array(test_images)
bpo_predictions = bpo_model.predict(bpo_input_batch)

In [None]:
# Binarise the BPO scores in place with the 0.1 threshold (the threshold that
# previously gave the better f1-score): scores >= 0.1 become 1, the rest 0.
bpo_at_or_above_threshold = bpo_predictions >= 0.1
bpo_below_threshold = bpo_predictions < 0.1
bpo_predictions[bpo_at_or_above_threshold] = 1
bpo_predictions[bpo_below_threshold] = 0

# Map every binary row back to its tuple of concept labels
test_images_labels_predicted_bpo = mlb_bpo.inverse_transform(bpo_predictions)

In [None]:
# Build [image id, concepts] pairs where the concepts are joined with ';'.
# When the joined string is empty, NaN is stored instead so the entry can be
# dropped later on.
val_labels_united_bpo = [
    [test_images_ids[position], ';'.join(concepts) or np.nan]
    for position, concepts in enumerate(test_images_labels_predicted_bpo)
]

In [None]:
# One row per test image: its id and the ';'-separated BPO concepts (NaN if none)
bpo_predictions_df = pd.DataFrame(
    data=val_labels_united_bpo,
    columns=['ImageId', 'bpo_predictions'],
)

## Merge predictions from both DataFrames and create the submission file

In [None]:
# Outer join on the image id, so an image present in only one of the two
# prediction tables is still kept.
final_prediction = dp_predictions_df.merge(
    bpo_predictions_df,
    how='outer',
    on='ImageId',
)

In [None]:
# Join the DP and BPO concepts of every image into one ';'-separated string.
# The two prediction columns are selected by name: selecting "every column after
# the first" would also pick up 'dp_bpo_tags' once it exists, so re-running this
# cell would append the tags to themselves.
# Missing values (NaN) and empty strings are both skipped so that no leading,
# trailing or doubled ';' ends up in the result.
final_prediction['dp_bpo_tags'] = final_prediction[['dp_predictions', 'bpo_predictions']].apply(
    lambda row: ';'.join(tags for tags in row.dropna() if tags),
    axis=1,
)

In [None]:
# Write the submission file: image id and merged tags, '|'-separated, without
# a header row or index column.
submission_2_columns = final_prediction[['ImageId', 'dp_bpo_tags']]
submission_2_columns.to_csv('/content/submission-2.csv', sep='|', header=False, index=False)