In [1]:
import pandas as pd
import csv
from matching import search, search_word2vec  # أو import من مسارك الحالي
from scipy import sparse
import joblib
from dataset_cleaner import clean_process_text
from gensim.models import Word2Vec
import numpy as np

[nltk_data] Downloading package punkt to C:\Users\Firas
[nltk_data]     ka\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Firas
[nltk_data]     ka\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Firas
[nltk_data]     ka\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# ClinicalTrials TF-IDF artefacts and evaluation files.
# The two directories are named once here so the notebook can be pointed at
# another machine or dataset by editing two lines instead of five paths.
MODEL_DIR = "D:/dataset_downloader"
DATA_DIR = "D:/IR_data/data/clinicaltrials"

# NOTE(review): joblib.load is pickle-based and can execute arbitrary code
# while loading -- only load vectorizer files that come from a trusted source.
vectorizer = joblib.load(f"{MODEL_DIR}/cl_vectorizer.joblib")
tfidf_matrix = sparse.load_npz(f"{MODEL_DIR}/cl_tfidf_matrix.npz")
# Only the first two columns of docs.csv are loaded.
dataset = pd.read_csv(f"{DATA_DIR}/docs.csv", usecols=[0, 1])
queries_file = f"{DATA_DIR}/queries.csv"
qrels_file = f"{DATA_DIR}/qrels.csv"
print(type(vectorizer))

<class 'sklearn.feature_extraction.text.TfidfVectorizer'>


In [4]:
def get_relevant_id_from_qrel_v2(min_rel_val, query_id, csv_file):
    """Collect the relevance judgements of one query from a qrels CSV file.

    The file is read without a header row, using the column names
    ``query_id``, ``doc_id`` and ``relevance``. A header line, if present, is
    therefore treated as an ordinary row; its non-numeric relevance is coerced
    to NaN and ignored like any other unparsable grade.

    Args:
        min_rel_val: Minimum relevance grade for a document to count as
            relevant (compared with ``>=``).
        query_id: Id of the query to look up. It is converted to a stripped
            string before matching, so ``1``, ``"1"`` and ``" 1 "`` all select
            the same rows.
        csv_file: Path (or anything ``pandas.read_csv`` accepts) of the qrels
            file.

    Returns:
        A ``(relevant_ids, relevance_scores)`` tuple. ``relevant_ids`` is a
        sorted list of the document ids that have at least one judgement
        ``>= min_rel_val``. ``relevance_scores`` maps every judged document id
        to its integer grade; when a document is listed more than once the
        last row wins. Both are empty when the query has no judgements or the
        file cannot be processed (the problem is printed, not raised).
    """
    try:
        df_qrels = pd.read_csv(csv_file, names=['query_id', 'doc_id', 'relevance'])

        # Normalise ids to stripped strings and grades to numbers (bad -> NaN).
        df_qrels['query_id'] = df_qrels['query_id'].astype(str).str.strip()
        df_qrels['doc_id'] = df_qrels['doc_id'].astype(str).str.strip()
        df_qrels['relevance'] = pd.to_numeric(df_qrels['relevance'], errors='coerce')

        # The column now holds stripped strings, so the requested id must be
        # normalised the same way or an int / padded id would never match.
        wanted_query = str(query_id).strip()
        query_qrels = df_qrels[df_qrels['query_id'] == wanted_query]

        if query_qrels.empty:
            print(f"Warning: No qrels found for query {query_id}")
            return [], {}

        # Rows whose grade could not be parsed are skipped entirely.
        judged = query_qrels.dropna(subset=['relevance'])
        doc_ids = judged['doc_id'].tolist()
        grades = [int(value) for value in judged['relevance'].tolist()]

        # zip keeps file order, so a later duplicate overwrites an earlier one.
        relevance_scores = dict(zip(doc_ids, grades))
        relevant_ids = {
            doc_id for doc_id, grade in zip(doc_ids, grades) if grade >= min_rel_val
        }

        print(f"Debug - Query {query_id}: Found {len(relevant_ids)} relevant docs, {len(relevance_scores)} total scores")
        print(f"Debug - Relevance values: {sorted(set(relevance_scores.values()))}")

        # Sorted so the result is reproducible between runs (set order is not).
        return sorted(relevant_ids), relevance_scores

    except Exception as e:
        # Deliberate best effort: report the problem and behave like "no qrels".
        print(f"Error reading qrels file: {e}")
        return [], {}

In [5]:
def precision_at_k(retrieved_docs, relevant_docs, relevance_scores, k):
    """Return the fraction of the first ``k`` retrieved documents that are relevant.

    Args:
        retrieved_docs: Ranked sequence of retrieved document ids.
        relevant_docs: Container of relevant document ids (tested with ``in``).
        relevance_scores: Accepted for a signature parallel to the other
            metrics; not used here.
        k: Cut-off rank. The denominator is always ``k``, even when fewer than
            ``k`` documents were retrieved.

    Returns:
        ``hits / k`` as a float, or ``0.0`` when ``k`` is not positive (instead
        of dividing by zero or returning a negative value).
    """
    if k <= 0:
        return 0.0
    hits = sum(1 for doc in retrieved_docs[:k] if doc in relevant_docs)
    return hits / k

def calculate_recall(min_rel_val, retrieved_docs, relevant_docs, relevance_scores):
    """Return the share of a query's relevant documents that were retrieved.

    A document counts as relevant when its grade in ``relevance_scores`` is
    ``>= min_rel_val``. The same threshold is applied to the numerator and the
    denominator so the two always describe the same set of documents.

    Args:
        min_rel_val: Minimum relevance grade that counts as relevant.
        retrieved_docs: Sequence of retrieved document ids.
        relevant_docs: Container of relevant document ids (tested with ``in``).
        relevance_scores: Mapping of judged document id to relevance grade.
            Documents missing from it are treated as grade 0.

    Returns:
        ``retrieved relevant / total relevant``, or ``0`` when no judged
        document reaches ``min_rel_val``.
    """
    total_relevant = sum(
        1 for grade in relevance_scores.values() if grade >= min_rel_val
    )
    if total_relevant == 0:
        return 0

    relevant_retrieved = sum(
        1
        for doc in retrieved_docs
        # .get: an id listed as relevant but missing from the scores must not
        # raise KeyError.
        if doc in relevant_docs and relevance_scores.get(doc, 0) >= min_rel_val
    )
    return relevant_retrieved / total_relevant

def average_precision_at_k(retrieved_docs, relevant_docs, relevance_scores, k):
    """Average the precision values observed at each relevant hit in the top ``k``.

    NOTE: the sum is divided by the number of relevant documents *found* in
    the top ``k``, not by the total number of relevant documents, so a single
    hit at rank 1 yields 1.0 regardless of how many relevant documents exist.

    Args:
        retrieved_docs: Ranked sequence of retrieved document ids.
        relevant_docs: Container of relevant document ids (tested with ``in``).
        relevance_scores: Accepted for a signature parallel to the other
            metrics; not used here.
        k: Cut-off rank; only the first ``k`` retrieved documents are scanned.

    Returns:
        The mean of ``hits_so_far / rank`` over the ranks that hold a relevant
        document, or ``0`` when the top ``k`` contains none.
    """
    hits = 0
    precision_sum = 0.0
    # max(k, 0): a negative cut-off scans nothing rather than slicing from the end.
    for rank, doc in enumerate(retrieved_docs[:max(k, 0)], start=1):
        if doc in relevant_docs:
            hits += 1
            # Precision at this rank is the running hit count over the rank,
            # so the prefix does not need to be rescanned for every hit.
            precision_sum += hits / rank
    return precision_sum / hits if hits > 0 else 0

def reciprocal_rank_at_k(min_rel_val, retrieved_docs, relevant_docs, relevance_scores, k=10):
    """Return ``1 / rank`` of the first relevant document in the top ``k``.

    A retrieved document counts as a hit when it is in ``relevant_docs`` and
    its grade in ``relevance_scores`` (0 when the id is missing) is
    ``>= min_rel_val``. Ranks start at 1. Returns ``0`` when the first ``k``
    retrieved documents contain no hit.
    """
    hit_ranks = (
        position
        for position, candidate in enumerate(retrieved_docs[:k], start=1)
        if candidate in relevant_docs
        and relevance_scores.get(candidate, 0) >= min_rel_val
    )
    first_hit = next(hit_ranks, None)
    if first_hit is None:
        return 0
    return 1 / first_hit

In [None]:
# Evaluate the TF-IDF ranker on every query and collect per-query metrics.
# The file's own header row is replaced by the names given here.
df_queries = pd.read_csv(queries_file, encoding='utf-8', usecols=[0, 1], names=["query_id", "text"], header=0)

results = []
ap_list = []
mrr_list = []
k = 10
min_rel_val = 1

for row in df_queries.itertuples(index=False):
    # A missing text cell is NaN; str(NaN) would turn it into the literal
    # query "nan", so it has to be skipped before the string conversion.
    if pd.isna(row.text):
        continue

    query_id = str(row.query_id).strip()
    query = str(row.text).strip()

    if not query:
        continue

    top_ids, top_docs = search(vectorizer, tfidf_matrix, dataset, query, k)
    # len() instead of truthiness so an array-like result does not raise
    # "truth value is ambiguous".
    if top_ids is None or len(top_ids) == 0:
        continue

    # The qrels loader returns document ids as stripped strings; normalise the
    # retrieved ids the same way so membership tests compare like with like.
    top_ids = [str(doc_id).strip() for doc_id in top_ids]

    relevant_docs, relevance_scores = get_relevant_id_from_qrel_v2(min_rel_val, query_id, qrels_file)

    p = precision_at_k(top_ids, relevant_docs, relevance_scores, k)
    r = calculate_recall(min_rel_val, top_ids, relevant_docs, relevance_scores)
    ap = average_precision_at_k(top_ids, relevant_docs, relevance_scores, k)
    rr = reciprocal_rank_at_k(min_rel_val, top_ids, relevant_docs, relevance_scores, k)

    # Precision, recall and AP are stored as percentages; RR stays in [0, 1].
    results.append({
        "Query ID": query_id,
        "Precision@10": p * 100,
        "Recall@10": r * 100,
        "AP@10": ap * 100,
        "RR@10": rr
    })

    ap_list.append(ap)
    mrr_list.append(rr)

Debug - Query 1: Found 17 relevant docs, 330 total scores
Debug - Relevance values: [0, 1, 2]
Debug - Query 2: Found 137 relevant docs, 460 total scores
Debug - Relevance values: [0, 1, 2]
Debug - Query 3: Found 24 relevant docs, 245 total scores
Debug - Relevance values: [0, 1, 2]
Debug - Query 4: Found 57 relevant docs, 576 total scores
Debug - Relevance values: [0, 1, 2]
Debug - Query 5: Found 36 relevant docs, 411 total scores
Debug - Relevance values: [0, 1, 2]
Debug - Query 6: Found 27 relevant docs, 372 total scores
Debug - Relevance values: [0, 1]
Debug - Query 7: Found 205 relevant docs, 538 total scores
Debug - Relevance values: [0, 1, 2]
Debug - Query 8: Found 61 relevant docs, 479 total scores
Debug - Relevance values: [0, 1, 2]
Debug - Query 9: Found 62 relevant docs, 227 total scores
Debug - Relevance values: [0, 1, 2]
Debug - Query 11: Found 19 relevant docs, 508 total scores
Debug - Relevance values: [0, 1, 2]
Debug - Query 12: Found 39 relevant docs, 421 total scores
D

In [7]:
# Turn the per-query metric dicts collected in `results` into a table.
df_results = pd.DataFrame(results)
df_results.head(20)  # show the first 20 rows

Unnamed: 0,Query ID,Precision@10,Recall@10,AP@10,RR@10
0,1,30.0,17.647059,50.0,0.5
1,2,50.0,3.649635,47.555556,0.333333
2,3,10.0,4.166667,100.0,1.0
3,4,20.0,3.508772,75.0,1.0
4,5,10.0,2.777778,14.285714,0.142857
5,6,30.0,11.111111,68.055556,1.0
6,7,30.0,1.463415,38.690476,0.5
7,8,0.0,0.0,0.0,0.0
8,9,50.0,8.064516,80.0,1.0
9,10,0.0,0.0,0.0,0.0


In [8]:
# Average the per-query AP and RR values and express them as percentages.
# An empty list (every query was skipped) yields 0.0 instead of raising
# ZeroDivisionError.
map_score = (sum(ap_list) / len(ap_list) * 100) if ap_list else 0.0
mrr_score = (sum(mrr_list) / len(mrr_list) * 100) if mrr_list else 0.0

print(f"📊 Final MAP: {map_score:.2f}%")
print(f"📊 Final MRR@10: {mrr_score:.2f}%")


📊 Final MAP: 33.75%
📊 Final MRR@10: 39.24%
