In [None]:
import numpy as np
import requests

# Endpoint prefix of the locally running service and the on-disk artefacts that
# every TF-IDF request in this cell refers to. Defined once so the three calls
# below cannot drift apart.
TFIDF_API = "http://localhost:8000/tfidf"
TFIDF_MODEL_PATH = "../models/tfidf/beir_quora_tfidf.joblib"
TFIDF_VECTOR_PATH = "../models/tfidf/beir_quora_tfidf_vectors.joblib"
TFIDF_SVD_PATH = "../models/tfidf/beir_quora_svd.joblib"

# Step 1: build the TF-IDF model for the collection.
response = requests.post(
    f"{TFIDF_API}/build_tfidf_model",
    json={
        "collection_name": "beir_quora",
        "model_path": TFIDF_MODEL_PATH,
        "vector_path": TFIDF_VECTOR_PATH,
        "svd_path": TFIDF_SVD_PATH,
    },
)
response.raise_for_status()
print(response.json()["status"])

# Step 2: load the model back; the "vectorizerQ" field is used below as the
# list of feature names, indexed by vector position.
response2 = requests.post(
    f"{TFIDF_API}/load_tfidf_model",
    json={
        "model_path": TFIDF_MODEL_PATH,
        "vector_path": TFIDF_VECTOR_PATH,
    },
)
response2.raise_for_status()
feature_names = response2.json()["vectorizerQ"]
print(f"Loaded {len(feature_names)} features")

queries = [
    "what is machine learning",
    "how to learn python programming",
    "difference between ai and ml",
]

# Step 3: vectorise each query and list the terms that received a weight.
for query in queries:
    print(f"\nProcessing query: '{query}'")
    response3 = requests.post(
        f"{TFIDF_API}/transform_query",
        json={
            "model_path": TFIDF_MODEL_PATH,
            "vector_path": TFIDF_VECTOR_PATH,
            "query_text": query,
        },
    )
    response3.raise_for_status()

    # Decode the body once: it carries the full query vector, so re-parsing it
    # for every field is wasted work.
    transformed = response3.json()
    print(f"Cleaned query: '{transformed['cleaned_text']}'")
    print(f"Query vector shape: {transformed['vector_shape']}")

    # Flatten so the positions below are feature indices whether the service
    # returns a flat list or a nested (1, n_features) list.
    query_array = np.asarray(transformed["query_vec"]).ravel()
    non_zero_indices = np.flatnonzero(query_array)
    non_zero_features = [(feature_names[i], query_array[i]) for i in non_zero_indices]
    print("Non-zero features and their weights:")
    for feature, weight in non_zero_features:
        print(f"  {feature}: {weight:.4f}")


Model built successfully
Loaded 50000 features

Processing query: 'what is machine learning'
Cleaned query: 'what machine learning'
Query vector shape: (1, 50000)
Non-zero features and their weights:
  learning: 0.6561
  machine: 0.7271
  what: 0.2023

Processing query: 'how to learn python programming'
Cleaned query: 'how learn python programming'
Query vector shape: (1, 50000)
Non-zero features and their weights:
  how: 0.2128
  learn: 0.4736
  programming: 0.5609
  python: 0.6448

Processing query: 'difference between ai and ml'
Cleaned query: 'difference ai ml'
Query vector shape: (1, 50000)
Non-zero features and their weights:
  ai: 0.6382
  difference: 0.3526
  ml: 0.6844


In [None]:
from itertools import combinations

import numpy as np
import requests
from sklearn.metrics.pairwise import cosine_similarity

# Endpoint prefix of the locally running service and the artefacts shared by
# every BERT request in this cell.
BERT_API = "http://localhost:8000/bert"
BERT_MODEL_PATH = "../models/embeddings/beir_quora_bert.joblib"
BERT_VECTOR_PATH = "../models/embeddings/beir_quora_vectors.joblib"

# Step 1: build the document embeddings for the collection.
response = requests.post(
    f"{BERT_API}/build_bert_embeddings",
    json={
        "collection_name": "beir_quora",
        "model_path": BERT_MODEL_PATH,
        "vector_path": BERT_VECTOR_PATH,
        "batch_size": 16,
    },
)
response.raise_for_status()
print(response.json()["status"])

# Step 2: load the embeddings and report their size.
response2 = requests.post(
    f"{BERT_API}/load_bert_embeddings",
    json={
        "model_path": BERT_MODEL_PATH,
        "vector_path": BERT_VECTOR_PATH,
    },
)
response2.raise_for_status()
load_info = response2.json()  # decode once, read both fields from it
print(f"Loaded embeddings with shape: {load_info['embeddings_shape']}")
print(f"Number of document IDs: {load_info['doc_ids_count']}")

queries = [
    "what is machine learning",
    "how to learn python programming",
    "difference between ai and ml",
]

# Step 3: embed every query, keeping the vectors for the comparison below.
query_embeddings = []
for query in queries:
    print(f"\nProcessing query: '{query}'")
    response3 = requests.post(
        f"{BERT_API}/transform_bert_query",
        json={
            "model_path": BERT_MODEL_PATH,
            "vector_path": BERT_VECTOR_PATH,
            "query_text": query,
        },
    )
    response3.raise_for_status()
    query_embedding = np.array(response3.json()["query_embedding"])

    print(f"Query embedding shape: {query_embedding.shape}")
    query_embeddings.append(query_embedding)

# Step 4: cosine similarity for every unordered pair of queries.
# combinations() yields (0, 1), (0, 2), (1, 2), ... i.e. the same order as a
# nested "for i / for j in range(i + 1, ...)" loop.
for i, j in combinations(range(len(queries)), 2):
    sim = cosine_similarity(query_embeddings[i], query_embeddings[j])[0][0]
    print(f"Similarity between '{queries[i]}' and '{queries[j]}': {sim:.4f}")

BERT embeddings built successfully
Loaded embeddings with shape: (522931, 384)
Number of document IDs: 522931

Processing query: 'what is machine learning'
Query embedding shape: (1, 384)

Processing query: 'how to learn python programming'
Query embedding shape: (1, 384)

Processing query: 'difference between ai and ml'
Query embedding shape: (1, 384)
Similarity between 'what is machine learning' and 'how to learn python programming': 0.2620
Similarity between 'what is machine learning' and 'difference between ai and ml': 0.3269
Similarity between 'how to learn python programming' and 'difference between ai and ml': -0.0640


In [None]:

from itertools import combinations

import numpy as np
import requests
from sklearn.metrics.pairwise import cosine_similarity

# Endpoint prefix of the locally running service.
HYBRID_API = "http://localhost:8000/hybrid"

# Artefact locations sent with both the build and the transform request.
# Kept in one dict so the two payloads always reference the same files.
HYBRID_PATHS = {
    "tfidf_model_path": "../models/tfidf/beir_quora_tfidf.joblib",
    "tfidf_vector_path": "../models/tfidf/beir_quora_tfidf_vectors.joblib",
    "svd_path": "../models/tfidf/beir_quora_svd.joblib",
    "bert_model_path": "../models/embeddings/beir_quora_bert.joblib",
    "bert_vector_path": "../models/embeddings/beir_quora_vectors.joblib",
    "hybrid_vector_path": "../models/hybrid/beir_quora_hybrid_vectors.joblib",
}
ALPHA = 0.5  # "alpha" value sent with both the build and the query requests

# Step 1: build the hybrid document vectors.
response = requests.post(
    f"{HYBRID_API}/build_hybrid_model",
    json={**HYBRID_PATHS, "alpha": ALPHA, "n_components": 384},
)
response.raise_for_status()
print(response.json()["status"])


queries = [
    "what is machine learning",
    "how to learn python programming",
    "difference between ai and ml",
]

# Step 2: transform each query, show its top documents, and keep its vector.
query_embeddings = []
for query in queries:
    response2 = requests.post(
        f"{HYBRID_API}/transform_hybrid_query",
        json={**HYBRID_PATHS, "query_text": query, "alpha": ALPHA},
    )
    response2.raise_for_status()

    # Decode the body once and read every field from the same dict.
    result = response2.json()
    query_embeddings.append(np.array(result["query_vector"]))
    print(f"Query vector shape: {result['query_vector_shape']}")
    print("Top 5 documents:")
    for doc in result["top_documents"]:
        print(f"  Doc ID: {doc['doc_id']}, Similarity: {doc['similarity']:.4f}")

# Step 3: cosine similarity for every unordered pair of queries, in the same
# (0, 1), (0, 2), (1, 2), ... order a nested index loop would produce.
print("\nCosine Similarity between queries:")
for i, j in combinations(range(len(queries)), 2):
    sim = cosine_similarity(query_embeddings[i], query_embeddings[j])[0][0]
    print(f"Similarity between '{queries[i]}' and '{queries[j]}': {sim:.4f}")


Hybrid model built successfully
Query vector shape: (1, 384)
Top 5 documents:
  Doc ID: 157066, Similarity: 1.0000
  Doc ID: 457427, Similarity: 1.0000
  Doc ID: 436634, Similarity: 1.0000
  Doc ID: 438273, Similarity: 0.9938
  Doc ID: 438272, Similarity: 0.9763
Query vector shape: (1, 384)
Top 5 documents:
  Doc ID: 305185, Similarity: 0.8787
  Doc ID: 215232, Similarity: 0.8756
  Doc ID: 371075, Similarity: 0.8537
  Doc ID: 86317, Similarity: 0.8534
  Doc ID: 401440, Similarity: 0.8357
Query vector shape: (1, 384)
Top 5 documents:
  Doc ID: 469293, Similarity: 0.7620
  Doc ID: 469294, Similarity: 0.7397
  Doc ID: 344740, Similarity: 0.7147
  Doc ID: 344741, Similarity: 0.7126
  Doc ID: 455019, Similarity: 0.7111

Cosine Similarity between queries:
Similarity between 'what is machine learning' and 'how to learn python programming': 0.1760
Similarity between 'what is machine learning' and 'difference between ai and ml': 0.1633
Similarity between 'how to learn python programming' and 'd

In [None]:
import os
import sys
from importlib import reload
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import joblib

# Make the project root (the parent of the current working directory)
# importable so the project packages below resolve.
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
if project_root not in sys.path:
    sys.path.append(project_root)

# Reload so that edits made to query_processing since the kernel started are
# picked up before its functions are imported below.
import processing_ranking.query_processing as query_processing
reload(query_processing)

from src.representation.tfidf_model import load_tfidf_model
from src.representation.bert_model import load_bert_embeddings
from processing_ranking.query_processing import retrieve_documents_tfidf, retrieve_documents_bert, retrieve_documents_hybrid


def print_results(label, cleaned_query, results):
    """Print one model's ranked results.

    Args:
        label: Model name shown in the header line (e.g. "TF-IDF").
        cleaned_query: The cleaned query text returned by the retrieval call.
        results: Iterable of (doc_id, score) pairs, printed in the given order.
    """
    print(f"\n{label} Results (Cleaned: '{cleaned_query}')")
    for doc_id, score in results:
        print(f"Doc ID: {doc_id}, Similarity: {score:.4f}")


# Load every artefact once, up front.
# NOTE: joblib files are pickle-based; only load files from a trusted source.
tfidf_vectorizer, tfidf_matrix, tfidf_doc_ids = load_tfidf_model(
    "../models/tfidf/beir_quora_tfidf.joblib",
    "../models/tfidf/beir_quora_tfidf_vectors.joblib"
)
svd_model = joblib.load("../models/tfidf/beir_quora_svd.joblib")
bert_tokenizer, bert_model, bert_embeddings, bert_doc_ids = load_bert_embeddings(
    "../models/embeddings/beir_quora_bert.joblib",
    "../models/embeddings/beir_quora_vectors.joblib"
)
hybrid_vectors = joblib.load("../models/hybrid/beir_quora_hybrid_vectors.joblib")
hybrid_doc_ids = joblib.load("../models/hybrid/beir_quora_hybrid_vectors.joblib_doc_ids")

queries = [
    "what is machine learning",
    "how to learn python programming",
    "difference between ai and ml"
]
k = 5

# The index directory does not depend on the query, so resolve it once
# instead of on every loop iteration.
index_dir = os.path.join(project_root, "data/index_beir_quora")

for query in queries:
    print(f"\n=== Processing query: '{query}' ===")

    # Each retrieval call runs (and emits its own log lines) before the
    # corresponding results are printed, so the output stays grouped by model.
    tfidf_results, tfidf_cleaned = retrieve_documents_tfidf(
        tfidf_vectorizer, tfidf_matrix, tfidf_doc_ids, query, None, index_dir, k
    )
    print_results("TF-IDF", tfidf_cleaned, tfidf_results)

    bert_results, bert_cleaned = retrieve_documents_bert(
        bert_tokenizer, bert_model, bert_embeddings, bert_doc_ids, query, index_dir, k
    )
    print_results("BERT", bert_cleaned, bert_results)

    hybrid_results, hybrid_cleaned = retrieve_documents_hybrid(
        tfidf_vectorizer, svd_model, bert_tokenizer, bert_model, hybrid_vectors, hybrid_doc_ids, query, index_dir, alpha=0.5, k=k
    )
    print_results("Hybrid", hybrid_cleaned, hybrid_results)



=== Processing query: 'what is machine learning' ===
Retrieved 100 candidate documents: ['71021', '438272', '58718', '71020', '83290']...
Filtered 100 documents for similarity computation

TF-IDF Results (Cleaned: 'machin learn')
Doc ID: 71021, Similarity: 1.0000
Doc ID: 117935, Similarity: 1.0000
Doc ID: 457427, Similarity: 1.0000
Doc ID: 438273, Similarity: 1.0000
Doc ID: 436634, Similarity: 1.0000
Retrieved 0 candidate documents: []...
Query vector shape: (1, 384), Cleaned query: what is machine learning

BERT Results (Cleaned: 'what is machine learning')
Doc ID: 457427, Similarity: 1.0000
Doc ID: 436634, Similarity: 1.0000
Doc ID: 157066, Similarity: 1.0000
Doc ID: 438273, Similarity: 0.9695
Doc ID: 436635, Similarity: 0.9307
Retrieved 100 candidate documents: ['71021', '438272', '58718', '71020', '83290']...
Filtered 100 documents for similarity computation
Query vector shape: (1, 384), Cleaned query: what is machine learning

Hybrid Results (Cleaned: 'what is machine learning')


'\n## معالجة الاستعلامات والترتيب\n- **الاستعلامات المختبرة**:\n  - what is machine learning\n  - how to learn python programming\n  - difference between ai and ml\n- **النماذج**:\n  - TF-IDF (مع SVD)\n  - BERT (all-MiniLM-L6-v2)\n  - Hybrid (alpha=0.5)\n- **النتائج**:\n  - تم استرجاع أعلى 5 وثائق لكل استعلام باستخدام Cosine Similarity.\n- **الملاحظات**: [أضيفي ملاحظاتك، مثل جودة الوثائق المسترجعة أو الفروقات بين النماذج].\n'