In [1]:
import pandas as pd
import csv
from matching import search_hybrid
import joblib
from gensim.models import Word2Vec
import numpy as np

[nltk_data] Downloading package punkt to C:\Users\Firas
[nltk_data]     ka\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Firas
[nltk_data]     ka\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Firas
[nltk_data]     ka\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
hybrid_matrix = np.load("cl_hybrid_svd_representation.npy")
svd_model = joblib.load("cl_hybrid_svd_model.joblib")
hybrid_vectorizer = joblib.load("D:/dataset_downloader/cl_hybrid_vectorizer.joblib")
word2vec_model = Word2Vec.load("cl_word2vec.model")
dataset = pd.read_csv("D:/IR_data/data/clinicaltrials/docs.csv", usecols=[0, 1])
qrels_file = "D:/IR_Data/data/clinicaltrials/qrels.csv"
queries_file = "D:/IR_data/data/clinicaltrials/queries.csv"

In [3]:
def get_relevant_id_from_qrel(min_rel_val, query_id, csv_file):
    relevant_ids = []
    relevance_scores = {}

    with open(csv_file, "r", encoding="utf-8") as file:
        reader = csv.reader(file)
        for row in reader:
            if len(row) < 3:
                continue
            qid, doc_id, rel = row[0], row[1], row[2]
            if qid == query_id:
                rel_val = int(rel)
                if rel_val >= min_rel_val:
                    relevant_ids.append(doc_id)
                    relevance_scores[doc_id] = 1
                else:
                    relevance_scores[doc_id] = 0
    return relevant_ids, relevance_scores

In [4]:
def precision_at_k(retrieved_docs, relevant_docs, relevance_scores, k):
    relevant_in_top_k = sum(1 for doc in retrieved_docs[:k] if doc in relevant_docs)
    return relevant_in_top_k / k

def calculate_recall(min_rel_val, retrieved_docs, relevant_docs, relevance_scores):
    relevant_retrieved = sum(1 for doc in retrieved_docs if doc in relevant_docs and relevance_scores[doc] > 0)
    total_relevant = sum(1 for rel in relevance_scores.values() if rel > 0)
    return relevant_retrieved / total_relevant if total_relevant > 0 else 0

def average_precision_at_k(retrieved_docs, relevant_docs, relevance_scores, k):
    precision_sum = 0.0
    relevant_count = 0
    for i in range(min(k, len(retrieved_docs))):
        if retrieved_docs[i] in relevant_docs:
            relevant_count += 1
            precision_sum += precision_at_k(retrieved_docs, relevant_docs, relevance_scores, i + 1)
    return precision_sum / relevant_count if relevant_count > 0 else 0

def reciprocal_rank_at_k(min_rel_val, retrieved_docs, relevant_docs, relevance_scores, k=10):
    for i, doc in enumerate(retrieved_docs[:k]):
        if doc in relevant_docs and relevance_scores.get(doc, 0) >= min_rel_val:
            return 1 / (i + 1)
    return 0

In [None]:
df_queries = pd.read_csv(queries_file, encoding='utf-8', usecols=[0, 1], names=["query_id", "text"], header=0)

results = []
ap_list = []
mrr_list = []
k = 10
min_rel_val = 0

for idx, row in df_queries.iterrows():
    query_id = str(row["query_id"]).strip()
    query = str(row["text"]).strip()

    if not query:
        continue

    top_ids, top_docs = search_hybrid(hybrid_matrix, dataset, query, hybrid_vectorizer, word2vec_model, svd_model, 10)
    if not top_ids:
        continue

    relevant_docs, relevance_scores = get_relevant_id_from_qrel(min_rel_val, query_id, qrels_file)

    p = precision_at_k(top_ids, relevant_docs, relevance_scores, k)
    r = calculate_recall(min_rel_val, top_ids, relevant_docs, relevance_scores)
    ap = average_precision_at_k(top_ids, relevant_docs, relevance_scores, k)
    rr = reciprocal_rank_at_k(min_rel_val, top_ids, relevant_docs, relevance_scores, k)

    results.append({
        "Query ID": query_id,
        "Precision@10": p * 100,
        "Recall@10": r * 100,
        "AP@10": ap * 100,
        "RR@10": rr
    })
    ap_list.append(ap)
    mrr_list.append(rr)

In [None]:
df_results = pd.DataFrame(results)
df_results.head(20) 

Unnamed: 0,Query ID,Precision@10,Recall@10,AP@10,RR@10
0,1,10.0,0.30303,10.0,0.1
1,2,10.0,0.217391,100.0,1.0
2,3,0.0,0.0,0.0,0.0
3,4,30.0,0.520833,43.333333,0.5
4,5,40.0,0.973236,62.202381,0.5
5,6,0.0,0.0,0.0,0.0
6,7,50.0,0.929368,81.0,1.0
7,8,20.0,0.417537,18.333333,0.166667
8,9,20.0,0.881057,27.777778,0.333333
9,10,0.0,0.0,0.0,0.0


In [7]:
map_score = sum(ap_list) / len(ap_list) * 100
mrr_score = sum(mrr_list) / len(mrr_list) * 100

print(f"📊 Final MAP: {map_score:.2f}%")
print(f"📊 Final MRR@10: {mrr_score:.2f}%")

📊 Final MAP: 34.45%
📊 Final MRR@10: 36.28%
