In [None]:
import pandas as pd
from matching import search, search_word2vec 
from dataset_cleaner import clean_process_text
from gensim.models import Word2Vec
import numpy as np

[nltk_data] Downloading package punkt to C:\Users\Firas
[nltk_data]     ka\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Firas
[nltk_data]     ka\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Firas
[nltk_data]     ka\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Quora collection: load the saved Word2Vec model, the saved document-vector
# array, and the corpus CSV, and record where the qrels / queries CSVs live.
# NOTE(review): these paths are absolute and machine-specific, and the qrels
# path spells the folder "IR_Data" while the other two use "IR_data" — confirm
# the casing before running on a case-sensitive filesystem.
word2vec_model = Word2Vec.load("qu_word2vec.model")
# Presumably TF-IDF-weighted averaged word vectors, one row per document
# (inferred from the file name only) — TODO confirm against the builder script.
doc_vectors = np.load("qu_doc_vectors_tfidf_weighted.npy")
# Only the first two columns of the corpus file are read.
dataset = pd.read_csv("D:/IR_data/data/quora/docs.csv", usecols=[0, 1])
qrels_file = "D:/IR_Data/data/quora/qrels.csv"
queries_file = "D:/IR_data/data/quora/queries.csv"

In [3]:
def get_relevant_id_from_qrel_v2(min_rel_val, query_id, csv_file):
    """Collect the relevance judgements stored for one query in a qrels CSV.

    The file is read as three columns (query id, document id, relevance).
    Query and document ids are compared as stripped strings; relevance is
    parsed as a number and rows whose relevance is not numeric are skipped
    (this also discards a header line, if the file has one).

    Args:
        min_rel_val: Minimum relevance a document needs to count as relevant.
        query_id: Query identifier. It is converted to a stripped string before
            the lookup, so ``7``, ``"7"`` and ``" 7 "`` all select the same rows.
        csv_file: Path (or buffer) accepted by ``pandas.read_csv``.

    Returns:
        A tuple ``(relevant_ids, relevance_scores)``:
        ``relevant_ids`` is a list of document-id strings whose relevance is
        ``>= min_rel_val``, in first-appearance order; ``relevance_scores``
        maps every judged document-id string to its integer relevance (the
        last row wins when a document is listed more than once).
        If anything goes wrong while reading or parsing, the error is printed
        and ``([], {})`` is returned.
    """
    # Normalise the key exactly like the file column is normalised below;
    # otherwise an int query id would silently match nothing.
    query_id = str(query_id).strip()

    # A dict is used as an insertion-ordered set so the returned list has a
    # stable order (a plain set of strings iterates in arbitrary order).
    relevant_ids = {}
    relevance_scores = {}

    try:
        df_qrels = pd.read_csv(csv_file, names=['query_id', 'doc_id', 'relevance'])

        df_qrels['query_id'] = df_qrels['query_id'].astype(str).str.strip()
        df_qrels['doc_id'] = df_qrels['doc_id'].astype(str).str.strip()
        # Unparseable relevance values become NaN and are skipped in the loop.
        df_qrels['relevance'] = pd.to_numeric(df_qrels['relevance'], errors='coerce')

        query_qrels = df_qrels[df_qrels['query_id'] == query_id]

        if query_qrels.empty:
            print(f"Warning: No qrels found for query {query_id}")
            return list(relevant_ids), relevance_scores

        # Walk the two needed columns directly instead of materialising a
        # Series per row with iterrows().
        for doc_id, rel_val in zip(query_qrels['doc_id'], query_qrels['relevance']):
            if pd.isna(rel_val):
                continue

            rel_val = int(rel_val)
            relevance_scores[doc_id] = rel_val

            if rel_val >= min_rel_val:
                relevant_ids[doc_id] = None

        print(f"Debug - Query {query_id}: Found {len(relevant_ids)} relevant docs, {len(relevance_scores)} total scores")
        print(f"Debug - Relevance values: {sorted(set(relevance_scores.values()))}")

        return list(relevant_ids), relevance_scores

    except Exception as e:
        # Best-effort by design: report the problem and return an empty result
        # so a single bad lookup does not abort the whole evaluation run.
        print(f"Error reading qrels file: {e}")
        return [], {}

In [4]:
def precision_at_k(retrieved_docs, relevant_docs, relevance_scores, k):
    """Return the fraction of the top-``k`` retrieved documents that are relevant.

    Args:
        retrieved_docs: Ranked sequence of retrieved document ids.
        relevant_docs: Container of relevant document ids (membership-tested).
        relevance_scores: Unused; kept so all metric functions share a signature.
        k: Rank cutoff. The count of hits is always divided by ``k`` itself,
            even when fewer than ``k`` documents were retrieved.

    Returns:
        ``hits / k`` as a float, or ``0.0`` when ``k`` is not positive.
    """
    # A non-positive cutoff has no ranks to score; return 0 like the sibling
    # metrics do for degenerate input instead of dividing by zero (k == 0) or
    # producing a negative value from a negative slice bound (k < 0).
    if k <= 0:
        return 0.0

    relevant_in_top_k = sum(1 for doc in retrieved_docs[:k] if doc in relevant_docs)
    return relevant_in_top_k / k

def calculate_recall(min_rel_val, retrieved_docs, relevant_docs, relevance_scores):
    """Return the share of a query's relevant documents that were retrieved.

    A document counts as relevant when its judged relevance is
    ``>= min_rel_val``. The same threshold is applied to both the numerator
    and the denominator, so the result stays within the same relevance
    definition used to build ``relevant_docs``.

    Args:
        min_rel_val: Minimum relevance for a document to count as relevant.
        retrieved_docs: Sequence of retrieved document ids (the whole sequence
            is scored; no rank cutoff is applied here).
        relevant_docs: Container of relevant document ids (membership-tested).
        relevance_scores: Mapping of document id to judged relevance.

    Returns:
        ``retrieved relevant / total relevant`` as a float, or ``0`` when the
        query has no document at or above the threshold.
    """
    total_relevant = sum(1 for rel in relevance_scores.values() if rel >= min_rel_val)
    if total_relevant == 0:
        return 0

    relevant_retrieved = sum(
        1
        for doc in retrieved_docs
        if doc in relevant_docs and relevance_scores.get(doc, 0) >= min_rel_val
    )
    return relevant_retrieved / total_relevant

def average_precision_at_k(retrieved_docs, relevant_docs, relevance_scores, k):
    """Return the average of precision values taken at each relevant hit in the top ``k``.

    For every rank ``r <= k`` holding a relevant document, precision at ``r``
    (hits so far divided by ``r``) is accumulated. The sum is normalised by the
    number of relevant documents found within the cutoff — not by the total
    number of relevant documents for the query.

    Args:
        retrieved_docs: Ranked sequence of retrieved document ids.
        relevant_docs: Container of relevant document ids (membership-tested).
        relevance_scores: Unused; kept so all metric functions share a signature.
        k: Rank cutoff; only the first ``min(k, len(retrieved_docs))`` ranks
            are inspected.

    Returns:
        The averaged precision as a float, or ``0`` when no relevant document
        appears within the cutoff.
    """
    precision_sum = 0.0
    relevant_count = 0
    cutoff = min(k, len(retrieved_docs))

    for rank in range(1, cutoff + 1):
        if retrieved_docs[rank - 1] in relevant_docs:
            relevant_count += 1
            # Hits so far over the current rank is precision@rank, so the
            # prefix does not need to be rescanned for every hit.
            precision_sum += relevant_count / rank

    return precision_sum / relevant_count if relevant_count > 0 else 0

def reciprocal_rank_at_k(min_rel_val, retrieved_docs, relevant_docs, relevance_scores, k=10):
    """Return ``1 / rank`` of the first relevant document within the top ``k``.

    A document is a hit when it is in ``relevant_docs`` and its entry in
    ``relevance_scores`` (0 when absent) is at least ``min_rel_val``.
    Returns ``0`` when the top ``k`` contains no hit.
    """
    hit_ranks = (
        position
        for position, candidate in enumerate(retrieved_docs[:k], start=1)
        if candidate in relevant_docs
        and relevance_scores.get(candidate, 0) >= min_rel_val
    )
    first_hit = next(hit_ranks, None)
    if first_hit is None:
        return 0
    return 1 / first_hit


In [5]:
# Load the evaluation queries: header=0 discards the file's own header row and
# the first two columns are exposed as query_id / text.
df_queries = pd.read_csv(queries_file, encoding='utf-8', usecols=[0, 1], names=["query_id", "text"], header=0)

results = []   # one dict of metrics per evaluated query
ap_list = []   # per-query AP@k, averaged into MAP later
mrr_list = []  # per-query RR@k, averaged into MRR later
k = 10
min_rel_val = 1

for idx, row in df_queries.iterrows():
    # An empty CSV field is read as NaN, and str(NaN) is the non-empty string
    # "nan"; test for missing text before converting, otherwise the blank-query
    # guard below never fires and the literal query "nan" gets evaluated.
    if pd.isna(row["text"]):
        continue

    query_id = str(row["query_id"]).strip()
    query = str(row["text"]).strip()

    if not query:
        continue

    # pr_query = clean_process_text(query)

    # NOTE(review): the trailing positional literals (10, 100) are defined by
    # matching.search_word2vec, which is not visible here — confirm whether the
    # first one is the result count and should follow `k`.
    top_ids, top_docs = search_word2vec(query, word2vec_model, doc_vectors, dataset, 10, 100)

    relevant_docs, relevance_scores = get_relevant_id_from_qrel_v2(min_rel_val, query_id, qrels_file)
    # The qrels helper returns string ids; they are cast to int for the
    # membership tests against top_ids.
    # NOTE(review): presumably search_word2vec returns int ids — confirm.
    relevant_docs = [int(doc_id) for doc_id in relevant_docs]
    relevance_scores = {int(doc_id): score for doc_id, score in relevance_scores.items()}

    p = precision_at_k(top_ids, relevant_docs, relevance_scores, k)
    r = calculate_recall(min_rel_val, top_ids, relevant_docs, relevance_scores)
    ap = average_precision_at_k(top_ids, relevant_docs, relevance_scores, k)
    rr = reciprocal_rank_at_k(min_rel_val, top_ids, relevant_docs, relevance_scores, k)

    # Precision / recall / AP are stored as percentages; RR is stored as a raw
    # fraction and only scaled when the final MRR is reported.
    results.append({
        "Query ID": query_id,
        "Precision@10": p * 100,
        "Recall@10": r * 100,
        "AP@10": ap * 100,
        "RR@10": rr
    })

    ap_list.append(ap)
    mrr_list.append(rr)

Debug - Query 318: Found 1 relevant docs, 1 total scores
Debug - Relevance values: [1]
Debug - Query 378: Found 1 relevant docs, 1 total scores
Debug - Relevance values: [1]
Debug - Query 379: Found 5 relevant docs, 5 total scores
Debug - Relevance values: [1]
Debug - Query 399: Found 28 relevant docs, 28 total scores
Debug - Relevance values: [1]
Debug - Query 420: Found 1 relevant docs, 1 total scores
Debug - Relevance values: [1]
Debug - Query 540: Found 2 relevant docs, 2 total scores
Debug - Relevance values: [1]
Debug - Query 548: Found 1 relevant docs, 1 total scores
Debug - Relevance values: [1]
Debug - Query 609: Found 2 relevant docs, 2 total scores
Debug - Relevance values: [1]
Debug - Query 744: Found 10 relevant docs, 10 total scores
Debug - Relevance values: [1]
Debug - Query 784: Found 2 relevant docs, 2 total scores
Debug - Relevance values: [1]
Debug - Query 858: Found 2 relevant docs, 2 total scores
Debug - Relevance values: [1]
Debug - Query 975: Found 1 relevant doc

In [6]:
# Tabulate the per-query metric rows and display the first 20 of them.
df_results = pd.DataFrame(data=results)
df_results.head(n=20)

Unnamed: 0,Query ID,Precision@10,Recall@10,AP@10,RR@10
0,318,0.0,0.0,0.0,0.0
1,378,0.0,0.0,0.0,0.0
2,379,40.0,80.0,87.5,1.0
3,399,90.0,32.142857,92.826279,1.0
4,420,10.0,100.0,33.333333,0.333333
5,540,10.0,50.0,100.0,1.0
6,548,10.0,100.0,100.0,1.0
7,609,20.0,100.0,58.333333,0.5
8,744,40.0,40.0,80.416667,1.0
9,784,10.0,50.0,100.0,1.0


In [7]:
# Mean of the per-query AP and RR values, reported as percentages.
# An empty list (no query was evaluated) yields 0.0 instead of a
# ZeroDivisionError.
map_score = sum(ap_list) / len(ap_list) * 100 if ap_list else 0.0
mrr_score = sum(mrr_list) / len(mrr_list) * 100 if mrr_list else 0.0

print(f"📊 Final MAP: {map_score:.2f}%")
print(f"📊 Final MRR@10: {mrr_score:.2f}%")

📊 Final MAP: 26.62%
📊 Final MRR@10: 27.10%
