In [1]:
# import faiss
import numpy as np
import pandas as pd
import json
import os
import sys
from tqdm import tqdm
from PIL import Image
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import euclidean_distances

# Load all data

In [2]:
# Database side: article metadata, image-id index and per-model image embeddings.

# Article metadata: a JSON object whose entries each carry an 'images' list.
txt_json_path = '/kaggle/input/eventa-json-cieldt/database.json'
with open(txt_json_path, 'r') as txt_handle:
    database_txt_json = json.load(txt_handle)

# Image-id index shipped alongside the SigLIP embeddings.
# NOTE(review): presumably entry i names the image behind row i of every
# embedding matrix below (SigLIP, CLIP and DINOv2 alike) - confirm.
img_json_path_siglip = '/kaggle/input/eventa-database-img-siglip/database_index.json'
with open(img_json_path_siglip, 'r') as index_handle:
    database_img_ids = json.load(index_handle)

# Database embeddings, one matrix per vision model, all cast to float32.
# SigLIP
database_img_np_siglip = np.load(
    '/kaggle/input/eventa-database-img-siglip/database_SigLIP_embeddings.npy'
).astype(np.float32)

# CLIP (the 'databse' spelling is part of the file name being loaded)
database_img_np_clip = np.load(
    '/kaggle/input/eventa-database-clip-vit-l14/databse_clip.npy'
).astype(np.float32)

# DINOv2
database_img_np_dino = np.load(
    '/kaggle/input/dinov2/database_img.npy'
).astype(np.float32)

In [3]:
# Reverse lookup: image id -> id of the article that lists it under 'images'.
# Articles are visited in the JSON's order, so if an image id appears in more
# than one article the last article wins.
getArticle_database = {
    img_id: art_id
    for art_id, metadata in database_txt_json.items()
    for img_id in metadata['images']
}

# Retrieving

In [4]:
# Query side: query-id index and per-model query embeddings.

# Query-id index, taken from the DINOv2 dataset.
# NOTE(review): this single index is used for all three models, which assumes
# the SigLIP and CLIP query embeddings are stored in the same order - confirm.
query_json_path_dino = '/kaggle/input/dinov2/public_test_img_index.json'
with open(query_json_path_dino, 'r') as query_index_handle:
    query_json = json.load(query_index_handle)

# Query embeddings, one matrix per vision model, all cast to float32.
# SigLIP
query_np_siglip = np.load(
    '/kaggle/input/eventa-public-test-siglip/public_test_img.npy'
).astype(np.float32)

# CLIP (the 'puclic' spelling is part of the dataset path being loaded)
query_np_clip = np.load(
    '/kaggle/input/eventa-puclic-test-embedding/public_test_embedding.npy'
).astype(np.float32)

# DINOv2
query_np_dino = np.load(
    '/kaggle/input/dinov2/public_test_img.npy'
).astype(np.float32)

In [5]:
def find_all_dist(vector_lists, database_lists):
    """Compute query-to-database Euclidean distances for each embedding type.

    Args:
        vector_lists: sequence of query embedding matrices, one per model.
        database_lists: sequence of database embedding matrices; the entry at
            position ``i`` is compared against ``vector_lists[i]``.

    Returns:
        A list holding one ``euclidean_distances`` result per entry of
        ``vector_lists``, in the same order.

    Raises:
        IndexError: if ``database_lists`` has fewer entries than ``vector_lists``.
    """
    per_model_distances = []
    for ordinal, queries in enumerate(vector_lists, start=1):
        print(f'Calculating {ordinal}(th) type distances')
        # Index explicitly (rather than zip) so a missing database matrix fails loudly.
        database = database_lists[ordinal - 1]
        per_model_distances.append(euclidean_distances(queries, database))
    return per_model_distances
    
def retrieve(all_dist, query_json, scores_list, Top_K=50, Top_N_Art=10):
    """Fuse per-model distances, rank articles per query and write ``submission.csv``.

    For every query the per-model distance rows are combined as a weighted sum,
    the ``Top_K`` closest database images are selected, and their articles are
    de-duplicated (keeping rank order) until ``Top_N_Art`` articles are found.
    Rows with fewer articles are padded with ``"#"``.

    Relies on two module-level names defined elsewhere in the notebook:
    ``database_img_ids`` (database row index -> image id) and
    ``getArticle_database`` (image id -> article id).

    Args:
        all_dist: sequence of distance matrices, one per model, each indexed
            as ``all_dist[model][query_index]`` to get one row of distances.
        query_json: sequence of query ids; position ``i`` matches row ``i``
            of every distance matrix.
        scores_list: one fusion weight per model (array or plain sequence).
            Weights are normalised to sum to 1 when their sum is positive;
            only their relative size affects the ranking. Models with a
            weight of 0 are skipped.
        Top_K: number of nearest database images considered per query.
            Clamped to the number of database images.
        Top_N_Art: number of article columns written per query.

    Returns:
        None. The result is the file ``submission.csv`` written to the current
        working directory. A query that raises while being processed is
        reported on stdout and written as an all-``"#"`` row captioned ``ERROR``.
    """
    # Accept plain lists as well as arrays.
    weights = np.asarray(scores_list, dtype=np.float64)
    weight_total = weights.sum()
    # Actually apply the normalisation; skipped for a non-positive sum, where
    # dividing would either blow up or flip the ranking.
    if weight_total > 0:
        weights = weights / weight_total
    # Hoisted out of the query loop: (model index, weight) for non-zero weights.
    active_weights = [(i, float(w)) for i, w in enumerate(weights) if w != 0]

    num_queries = len(query_json)
    output_filename = "submission.csv"

    print(f"Starting to generate {output_filename} for {num_queries} queries...")

    with open(output_filename, 'w') as f_out:
        header_cols = [f"article_id_{i+1}" for i in range(Top_N_Art)]
        f_out.write(f"query_id,{','.join(header_cols)},generated_caption\n")

        for q_idx in range(num_queries):
            try:
                combined_dist = np.zeros_like(all_dist[0][q_idx])
                for i, weight in active_weights:
                    combined_dist += weight * all_dist[i][q_idx]

                # Take the Top_K smallest distances. np.argpartition requires
                # kth < array size, so fall back to all indices when the
                # database has no more than Top_K images.
                num_candidates = combined_dist.shape[0]
                k = max(0, min(Top_K, num_candidates))
                if k < num_candidates:
                    top_k_indices = np.argpartition(combined_dist, k)[:k]
                else:
                    top_k_indices = np.arange(num_candidates)
                top_k_indices_sorted = top_k_indices[np.argsort(combined_dist[top_k_indices])]

                # Collect unique article ids in rank order.
                retrieved_article_ids = []
                seen_article_ids = set()
                for idx in top_k_indices_sorted:
                    # Checked before appending so Top_N_Art=0 yields no articles.
                    if len(retrieved_article_ids) >= Top_N_Art:
                        break
                    img_id = database_img_ids[idx]
                    article_id = getArticle_database[img_id]
                    if article_id not in seen_article_ids:
                        seen_article_ids.add(article_id)
                        retrieved_article_ids.append(article_id)

                # Write the result row, padding missing articles with "#".
                query_id = query_json[q_idx]
                output_article_ids = [str(aid) for aid in retrieved_article_ids]
                output_article_ids.extend(["#"] * (Top_N_Art - len(output_article_ids)))
                caption = "Nothing"
                f_out.write(f"{query_id},{','.join(output_article_ids)},\"{caption}\"\n")

            except Exception as e:
                # Best effort: keep the file complete with a placeholder row
                # instead of aborting the whole run on one bad query.
                print(f"\nError processing query index {q_idx} (query_id: {query_json[q_idx]}): {e}")
                output_article_ids = ["#" for _ in range(Top_N_Art)]
                caption = "ERROR"
                f_out.write(f"{query_json[q_idx]},{','.join(output_article_ids)},\"{caption}\"\n")

    print(f"Completed! Submission file created: {output_filename}")


In [None]:
# Pair each model's query embeddings with the same model's database embeddings.
# find_all_dist matches the two lists by position, so both must use the same
# model order (SigLIP, DINOv2, CLIP); the fusion weights passed to retrieve()
# follow this order as well.
vector_lists = [query_np_siglip, query_np_dino, query_np_clip]
database_lists = [database_img_np_siglip, database_img_np_dino, database_img_np_clip]
# One distance result per model, in the order above.
all_dist = find_all_dist(vector_lists, database_lists)

In [7]:
# Free memory: the raw embeddings are no longer needed once all_dist exists.
# Note: after this cell runs, the distance cell cannot be re-run without first
# re-running the loading cells, because these names no longer exist.
del query_np_siglip, query_np_dino, query_np_clip
del database_img_np_siglip, database_img_np_dino, database_img_np_clip
del vector_lists, database_lists

In [None]:
# Fusion weights live in their own cell so they are convenient to tune.
# Order must match the distance matrices in all_dist: SigLIP, DINOv2, CLIP.
# Only the relative size of the weights affects the ranking.
scores = np.array([0.3, 0.5, 0.3])  # siglip, dino, clip
retrieve(all_dist, query_json, scores)
# retrieve() writes "submission.csv" relative to the current working directory,
# so read it back from the same relative path rather than a hard-coded
# absolute one that only matches when the cwd is /kaggle/working.
res_df = pd.read_csv('submission.csv')