In [10]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [11]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m65.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


In [12]:
import re
import csv
import json
import math
import time
import string
import collections
import numpy as np
from numpy import linalg as la
from collections import defaultdict
from array import array
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import PorterStemmer, WordNetLemmatizer
import pandas as pd
import matplotlib.pyplot as plt
from gensim.downloader import load
from gensim.utils import simple_preprocess
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter
from wordcloud import WordCloud
import seaborn as sns

nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')
nltk.download('punkt', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)
nltk.download('maxent_ne_chunker', quiet=True)
nltk.download('words', quiet=True)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [13]:
# Paths of the documents used in this notebook.
# Change as necessary
docs_path = '/content/drive/MyDrive/fashion_products_dataset.json'
validation_path = '/content/drive/MyDrive/validation_labels.csv'
our_validation_path = '/content/drive/MyDrive/our_queries_validation.csv'


with open(docs_path, 'r') as fp:
    lines = json.load(fp)

In [14]:
queries = {
    1: "men cotton blend grey t shirt",
    2: "light blue jeans for men slim fit",
    3: "women casual wear cotton shirt",
    4: "printed navy top for women",
    5: "regular fit denim jeans"
}

In [15]:
def build_terms(line):
    """
    Preprocess the text (title + description):
    - Lowercase
    - Remove punctuation and special symbols (e.g., ₹, %, ™)
    - Normalize spaces
    - Tokenization
    - Remove stop words
    - Lemmatization
    - Stemming
    """
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words("english"))

    # Lowercase
    line = line.lower()

    # Remove punctuation and uncommon symbols
    line = re.sub(r"[^\w\s]", " ", line)

    # Normalize spaces
    line = re.sub(r"\s+", " ", line).strip()

    # Tokenize
    tokens = line.split()

    # Remove stop words
    tokens = [w for w in tokens if w not in stop_words]

    # Lemmatization (reduces words to their base form)
    tokens = [lemmatizer.lemmatize(w) for w in tokens]

    # Stemming (we don't need, explained in the report)
    tokens = [stemmer.stem(w) for w in tokens]

    return tokens


In [16]:
def create_index_with_tfidf(lines):
    # Initialize data structures
    index = defaultdict(list)
    products_info = {}
    tf = defaultdict(dict)
    df = defaultdict(int)
    idf = defaultdict(float)

    num_documents = len(lines)

    for doc in lines:
        pid = doc.get("pid", "")
        title = doc.get("title", "")
        description = doc.get("description", "")
        brand = doc.get("brand", "")
        category = doc.get("category", "")
        sub_category = doc.get("sub_category", "")
        product_details = doc.get("product_details", "")
        seller = doc.get("seller", "")
        out_of_stock = doc.get("out_of_stock", "")
        selling_price_str = doc.get("selling_price", "").replace(',', '')
        selling_price = float(selling_price_str) if selling_price_str else 0.0
        discount_str = doc.get("discount", "").strip('% off')
        discount = float(discount_str) if discount_str else 0.0
        actual_price_str = doc.get("actual_price", "").replace(',', '')
        actual_price = float(actual_price_str) if actual_price_str else 0.0
        average_rating_str = doc.get("average_rating", "")
        average_rating = float(average_rating_str) if average_rating_str else None
        url = doc.get("url", "")

        text = f"{title} {description}"
        terms = build_terms(text)

        products_info[pid] = {
            "pid": pid,
            "title": title,
            "description": description,
            "brand": brand,
            "category": category,
            "sub_category": sub_category,
            "product_details": product_details,
            "seller": seller,
            "out_of_stock": out_of_stock,
            "selling_price": selling_price,
            "discount": discount,
            "actual_price": actual_price,
            "average_rating": average_rating,
            "url": url
        }

        # Compute term frequency and positional index for the current document
        term_freq = defaultdict(int)
        current_page_index = {}

        for position, term in enumerate(terms):
            term_freq[term] += 1
            try:
                current_page_index[term][1].append(position)
            except:
                current_page_index[term] = [pid, array('I', [position])]

        # Update inverted index and document frequency for each term
        for term, posting in current_page_index.items():
            df[term] += 1
            index[term].append(posting)

        # Compute TF for each term in the doc
        for term, freq in term_freq.items():
            if freq > 0:
                tfidf_weight = (1 + math.log(freq))  # IDF will be applied later
                tf[term][pid] = round(tfidf_weight, 4)
            else:
                tf[term][pid] = 0.0

    # Compute IDF for each term
    for term in df:
        idf[term] = round(math.log(num_documents / df[term]), 4)

    return index, products_info, tf, df, idf


In [17]:
start_time = time.time()
index, products_info, tf, df, idf = create_index_with_tfidf(lines)
print("Total time to create the index: {} seconds".format(np.round(time.time() - start_time, 2)))

Total time to create the index: 53.96 seconds


# **PART 1**

# 1. RANKING WITH TF-IDF

In [18]:
def rank_products_tf_idf(query_terms, docs, index, idf, tf, products_info):
    # Initialize document vectors and query vector
    doc_vectors = defaultdict(lambda: [0] * len(query_terms))
    query_vector = [0] * len(query_terms)

    # Count term frequencies in the query
    query_terms_count = collections.Counter(query_terms)
    query_norm = la.norm(list(query_terms_count.values()))

    # Build the weighted query vector using TF-IDF
    for termIndex, term in enumerate(query_terms):
        if term not in index:
            continue
        query_vector[termIndex] = query_terms_count[term] / query_norm * idf[term]

        # Assign TF-IDF weights to documents containing the term
        for (pid, _) in index[term]:
            if pid in docs and pid in tf[term]:
                doc_vectors[pid][termIndex] = tf[term][pid] * idf[term]

    # Compute cosine similarity between query and each document
    doc_scores = []
    for pid, doc_vec in doc_vectors.items():
        doc_norm = la.norm(doc_vec)
        if doc_norm == 0:
            score = 0.0
        else:
            score = np.dot(doc_vec, query_vector) / doc_norm
        doc_scores.append([score, pid])

    # Sort documents by descending similarity score
    doc_scores.sort(reverse=True)
    result_docs = [x[1] for x in doc_scores]

    return result_docs, doc_scores

# 2. RANKING WITH BM25

In [19]:
def rank_products_bm25(query_terms, index, idf, tf, products_info, docs=None, k1=1.5, b=0.75):
    doc_scores = defaultdict(float)
    doc_lengths = {}
    total_length = 0

    if docs is None:
        docs = products_info.keys()

    # Compute document lengths
    for pid in docs:
        text = f"{products_info[pid]['title']} {products_info[pid]['description']}"
        doc_length = len(build_terms(text))
        doc_lengths[pid] = doc_length
        total_length += doc_length

    num_docs = len(docs)
    avg_doc_length = total_length / num_docs if num_docs > 0 else 0

    # Compute BM25 score for each document
    for term in query_terms:
        if term not in index:
            continue
        for pid, _ in index[term]:
            if pid not in docs:
                continue
            if pid in tf[term]:
                f = tf[term][pid]
                idf_term = idf[term]
                numerator = f * (k1 + 1)
                denominator = f + k1 * (1 - b + b * (doc_lengths[pid] / avg_doc_length))
                score = idf_term * (numerator / denominator)
                doc_scores[pid] += score

    # Sort documents by score
    ranked_docs = sorted(doc_scores.items(), key=lambda x: x[1], reverse=True)
    result_docs = [pid for pid, score in ranked_docs]

    return result_docs, ranked_docs


# 3. SIMPLE CUSTOMIZED RANKING

In [20]:
def rank_products_custom1(query_terms, index, idf, tf, products_info, docs=None, k1=1.5, b=0.75, alpha=0.7, beta=0.2, gamma=0.1):
    """
    Custom ranking that combines BM25 text relevance with product attributes
    like rating and discount.
    """
    if docs is None:
        docs = products_info.keys()

    # Compute basic BM25
    bm25_docs, bm25_scores = rank_products_bm25(query_terms, index, idf, tf, products_info, docs=list(docs), k1=k1, b=b)

    # Convert into dictionary
    bm25_dict = dict(bm25_scores)

    # Normalize rating and discount
    scores = {}
    for pid in docs:
        info = products_info[pid]
        bm25_val = bm25_dict.get(pid, 0)

        rating = info.get("average_rating", 0) or 0
        discount = info.get("discount", 0) or 0

        normalized_rating = rating / 5.0
        normalized_discount = discount / 100.0

        # Combine with weights
        custom = alpha * bm25_val + beta * normalized_rating + gamma * normalized_discount
        scores[pid] = custom

    ranked_docs = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    result_docs = [pid for pid, _ in ranked_docs]

    return result_docs, ranked_docs


# 4. ADVANCED CUSTOMIZED RANKING

In [21]:
def rank_products_custom2(query_terms, index, idf, tf, products_info, docs):
    doc_scores = defaultdict(float)

    for pid in docs:
        info = products_info[pid]

        title_text = info["title"].lower()
        description_text = info["description"].lower()

        #base textual score (TF-IDF)
        base_score = 0.0

        for term in query_terms:
            if term in tf and pid in tf[term]:
                tfidf_score = tf[term][pid] * idf.get(term, 0.0)

                #Title weighting applied
                if term in title_text:
                    tfidf_score *= 2.0

                base_score += tfidf_score

        #rating + review
        rating = float(info.get("average_rating", 0) or 0.0)
        rating_norm = rating / 5.0

        review_count = float(info.get("rating_count", 1.0) or 1.0)
        review_boost = math.log1p(review_count) / 3.0

        social_proof = 1 + (0.4 * rating_norm) * (0.3 + review_boost)

        #price and discount
        price = float(info.get("selling_price", 1.0) or 1.0)
        if price <= 0:
            price = 1.0

        price_penalty = 1 / (1 + math.log1p(price))

        discount = float(info.get("discount", 0.0) or 0.0) / 100.0
        discount_boost = 1 + 0.2 * discount


        #if no stock we don't want the product
        stock_boost = 0 if info.get("out_of_stock", False) else 1.0

        #final score
        final_score = (base_score * social_proof * discount_boost * price_penalty * stock_boost)

        doc_scores[pid] = final_score

    ranked_docs = sorted(doc_scores.items(), key=lambda x: x[1], reverse=True)
    result_docs = [pid for pid, _ in ranked_docs]

    return result_docs, ranked_docs


# SEARCH

In [22]:
def search_products(query, index, idf, tf, products_info, ranking_method):
    query_terms = build_terms(query)
    if not query_terms:
        return [], []

    first_term = query_terms[0]
    if first_term not in index:
        return [], []
    docs = set(posting[0] for posting in index[first_term])

    for term in query_terms[1:]:
        if term not in index:
            return [], []
        term_docs = set(posting[0] for posting in index[term])
        docs &= term_docs
        if not docs:
            return [], []

    if ranking_method.lower() == "bm25":
        ranked_docs, doc_scores = rank_products_bm25(query_terms, index, idf, tf, products_info, docs=list(docs))
    elif ranking_method.lower() == "custom1":
        ranked_docs, doc_scores = rank_products_custom1(query_terms, index, idf, tf, products_info, docs=list(docs))
    elif ranking_method.lower() == "custom2":
        ranked_docs, doc_scores = rank_products_custom2(query_terms, index, idf, tf, products_info, docs=list(docs))
    else:
        ranked_docs, doc_scores = rank_products_tf_idf(query_terms, list(docs), index, idf, tf, products_info)

    return ranked_docs, doc_scores

# IMPLEMENTATION

In [23]:
queries = {
    1: "men cotton blend grey t shirt",
    2: "light blue jeans for men slim fit",
    3: "women casual wear cotton shirt",
    4: "printed navy top for women",
    5: "regular fit denim jeans"
}
methods = ["tf-idf", "bm25", "custom1", "custom2"]

for qid, query in queries.items():
  print(f"\n---- QUERY {query} ----")
  for method in methods:
    ranked_docs, scores = search_products(query, index, idf, tf, products_info, ranking_method=method)
    print(f"\nTop 20 ({method.upper()}) results for: '{query}'")
    if method == 'custom1':
      for pid, score in scores[:20]:
          info = products_info[pid]
          print(f" - {pid}: {info['title']} → score = {round(score, 4)} | rating={info['average_rating']} | discount={info['discount']}")
    elif method == 'bm25':
      for pid, score in scores[:20]:
          info = products_info[pid]
          print(f" - {pid}: {info['title']} → score = {round(score, 4)}")
    elif method == 'custom2':
      for pid, score in scores[:20]:
          info = products_info[pid]
          print(f" - {pid}: {info['title']} → score = {round(score, 4)} | rating={info['average_rating']} | price={info['selling_price']}| discount={info['discount']}")
    else:
      for score, pid in scores[:20]:
        info = products_info[pid]
        print(f" - {pid}: {info['title']} → score = {round(score, 4)}")



---- QUERY men cotton blend grey t shirt ----

Top 20 (TF-IDF) results for: 'men cotton blend grey t shirt'
 - TKPFYGUNNFVNFQF7: Solid Men Grey Track Pants → score = 1.8456
 - TSHEXZ8KYFDUHVYY: Solid Men Round Neck Multicolor T-Shirt  (Pack of 2) → score = 1.8387
 - TSHEXZ8KHNVHEEK6: Solid Men V-neck Multicolor T-Shirt  (Pack of 2) → score = 1.8387
 - TSHEXZ8KBEKYQYTZ: Solid Men V-Neck Multicolor T-Shirt  (Pack of 2) → score = 1.8387
 - TSHER6FBWFYYR5GG: Solid Men Round Neck Multicolor T-Shirt  (Pack of 3) → score = 1.8387
 - TSHER6F4RWS54H9F: Solid Men Round Neck Multicolor T-Shirt  (Pack of 2) → score = 1.8387
 - TSHENYJ28ZCH4H4V: Solid Men Round Neck Multicolor T-Shirt  (Pack of 5) → score = 1.8387
 - TSHENYGRN9XHQUVF: Solid Men Round Neck Multicolor T-Shirt  (Pack of 3) → score = 1.8387
 - TSHENXFZHDWREHFT: Solid Men V-neck Multicolor T-Shirt  (Pack of 3) → score = 1.8387
 - TSHENXFW8CZCTVU5: Solid Men V-neck Multicolor T-Shirt  (Pack of 3) → score = 1.8387
 - TSHFZ3JDVAYZKCFC: So

# **PART 2**

# WORD2VEC + COSINE RANKING

In [24]:
def train_custom_word2vec(corpus_tokens, vector_size=50):
    print("Training custom Word2Vec model...")
    model = Word2Vec(
        sentences=corpus_tokens,
        vector_size=vector_size,
        window=5,
        min_count=1,
        workers=4
    )
    return model.wv

def tokens_to_vector(tokens, model):
    """Converteix una llista de tokens en un vector mitjà basat en Word2Vec."""
    vectors = [model[word] for word in tokens if word in model]
    if not vectors:
        return np.zeros(model.vector_size)
    return np.mean(vectors, axis=0)

def word2vec_ranking_topk(model, doc_vectors, products_info, queries, top_k=20):
    """
    Retorna els top-k documents per a cada query usant Word2Vec + cosine similarity.

    Args:
        model: model preentrenat (Word2Vec/Glove).
        doc_vectors (list of tuples): [(pid, vector), ...] vectors dels documents.
        products_info (dict): informació completa dels productes per pid.
        queries (dict): diccionari de queries {qid: "query text"}.
        top_k (int): nombre màxim de documents a retornar.

    Returns:
        dict: {qid: {"query": query_text, "results": [(product_info, score), ...]}}
    """
    # Extract only the vectors for more efficient computations
    vectors_only = np.array([vec for pid, vec in doc_vectors])

    results = {}

    for qid, query_text in queries.items():
        # Preprocess query and obtain its vector
        q_tokens = build_terms(query_text)
        q_vec = tokens_to_vector(q_tokens, model)

        # Comute cosinus similarity with all documents
        sims = cosine_similarity([q_vec], vectors_only)[0]

        # Top-k documents
        top_indices = sims.argsort()[::-1][:top_k]

        ranked_docs = []
        for idx in top_indices:
            pid, _ = doc_vectors[idx]
            info = products_info[pid]
            ranked_docs.append((info, float(sims[idx])))

        results[qid] = {
            "query": query_text,
            "results": ranked_docs
        }

    return results

In [25]:
corpus_tokens = []
for doc in lines:
    text = f"{doc.get('title', '')} {doc.get('description', '')}"
    corpus_tokens.append(build_terms(text))

# Train custom Word2Vec
model = train_custom_word2vec(corpus_tokens)

Training custom Word2Vec model...


In [26]:
# Compute vectors
print("Converting documents to vectors...")
doc_vectors = []  # list of tuples (pid, vector)
for doc in lines:
    pid = doc.get("pid", "")
    title = doc.get("title", "")
    description = doc.get("description", "")
    text = f"{title} {description}"
    terms = build_terms(text)
    vec = tokens_to_vector(terms, model)
    doc_vectors.append((pid, vec))

Converting documents to vectors...


In [27]:
rankings = word2vec_ranking_topk(model=model,doc_vectors=doc_vectors,products_info=products_info,queries=queries,top_k=20)

for qid, data in rankings.items():
    print(f"\nTop {20} results for Query {qid}: '{data['query']}'")
    for i, (info, score) in enumerate(data['results'], 1):
        print(f"{i:2d}. {info['pid']}: {info['title']} → score = {score:.4f}")


Top 20 results for Query 1: 'men cotton blend grey t shirt'
 1. TSHFE4JPXV56KKB2: Solid Women Polo Neck Black T-Shirt → score = 0.9032
 2. TSHFE4JRQ8HHCYYM: Solid Men Polo Neck Blue T-Shirt → score = 0.8994
 3. TSHFE4JQZ2TJBGMZ: Solid Men Polo Neck White T-Shirt → score = 0.8956
 4. TSHFE4JPXMUVVMJR: Solid Women Polo Neck Blue T-Shirt → score = 0.8939
 5. KTAFVHTGDWZG3NPC: Men Solid Cotton Blend Pathani Kurta  (Yellow) → score = 0.8765
 6. KTAFVHTGUGGGZCZV: Men Solid Cotton Blend Pathani Kurta  (Yellow) → score = 0.8765
 7. KTAFAKGUKA2SXSEQ: Men Solid Cotton Blend Pathani Kurta  (Blue) → score = 0.8738
 8. KTAFVHTGUUBZGYFU: Women Solid Cotton Blend Pathani Kurta  (Blue) → score = 0.8735
 9. KTAFVA6R7AQPFM3E: Women Solid Cotton Blend Asymmetric Kurta  (Orange) → score = 0.8623
10. KTAFVHTGZUPYKGZA: Men Solid Cotton Blend Pathani Kurta  (Orange) → score = 0.8570
11. KTAFWKJYFQBAAAQC: Women Polka Print Cotton Blend A-line Kurta  (Green) → score = 0.8565
12. TROFN8YNBWZKQXST: Tapered Men 

# EVALUATION - Analysis and Comparison of Approaches


Precision@k

In [28]:
def precision_at_k(doc_score, y_score, k=10):
    """
    Parameters
    ----------
    doc_score: Ground truth (true relevance labels).
    y_score: Predicted scores.
    k : number of doc to consider.

    Returns
    -------
    precision @k : float

    """
    order = np.argsort(y_score)[::-1]  # Bonus point: How to improve the effitiency of this part?
    #doc_score = np.take(doc_score, order[:k])
    doc_score = doc_score[order[:k]]
    relevant = sum(doc_score == 1)
    return float(relevant) / k

Recall@k

In [29]:
def recall_at_k(doc_score, y_score, k=10):
    """
    Parameters
    ----------
    doc_score : Ground truth (true relevance labels, binary: 1=relevant, 0=non-relevant)
    y_score   : Predicted scores
    k         : Number of top documents to consider

    Returns
    -------
    recall@k : float
    """
    order = np.argsort(y_score)[::-1]  # Sort documents by predicted score (descending)
    doc_score_k = doc_score[order[:k]]  # Take top-k documents
    relevant_retrieved = sum(doc_score_k == 1)
    total_relevant = sum(doc_score == 1)

    if total_relevant == 0:
        return 0.0  # Avoid division by zero

    return float(relevant_retrieved) / total_relevant


F1@k

In [30]:
def f1_at_k(doc_score, y_score, k=10):
    """
    Parameters
    ----------
    doc_score : Ground truth (true relevance labels, binary: 1=relevant, 0=non-relevant)
    y_score   : Predicted scores
    k         : Number of top documents to consider

    Returns
    -------
    F1@k : float
    """
    order = np.argsort(y_score)[::-1]  # sort by predicted score (descending)
    doc_score_k = doc_score[order[:k]]

    relevant_retrieved = sum(doc_score_k == 1)
    total_relevant = sum(doc_score == 1)

    if total_relevant == 0:
        return 0.0  # avoid division by zero

    precision = relevant_retrieved / k
    recall = relevant_retrieved / total_relevant

    if (precision + recall) == 0:
        return 0.0

    f1 = 2 * (precision * recall) / (precision + recall)
    return f1


In [38]:
# Load validation file with the 4 relevance types
validation_df = pd.read_csv(our_validation_path)

# Columns in the CSV used as different relevance definitions
relevance_strategies = ["Exact coincidence", "All - 1", ">= 70%", "product types"]

# Ranking methods implemented in your code
ranking_methods = ["tf-idf", "bm25", "custom1", "custom2"]

query_ids = validation_df["query_id"].unique()

results = []
k = 10  # or 20 if you prefer; must be <= number of docs per query

for qid in query_ids:
    # Skip if this query_id is not in your queries dict
    if qid not in queries:
        continue

    query_text = queries[qid]
    q_data = validation_df[validation_df["query_id"] == qid].copy()

    # NOTE: your validation CSV must have a 'pid' column listing candidate products
    all_pids = q_data["pid"].values

    for method in ranking_methods:
        # Get ranking from your search function
        ranked_docs, doc_scores = search_products(
            query_text, index, idf, tf, products_info, ranking_method=method
        )

        # Build a score vector aligned with all_pids
        # Higher score = better rank (top result has largest value)
        y_score = np.zeros(len(all_pids), dtype=float)
        for rank, pid in enumerate(ranked_docs):
            if pid in all_pids:
                idx = np.where(all_pids == pid)[0][0]
                y_score[idx] = len(ranked_docs) - rank  # invert rank

        # Add tiny random noise to break ties
        y_score = y_score + np.random.rand(len(y_score)) * 0.01

        # Evaluate for each relevance strategy/column
        for strategy in relevance_strategies:
            if strategy not in q_data.columns:
                continue

            # Binary ground truth for this strategy
            doc_score = q_data[strategy].astype(int).values

            results.append({
                "Query": f"Query {qid}",
                "Method": method.upper(),
                "Relevance Strategy": strategy,
                f"Precision@{k}": round(precision_at_k(doc_score, y_score, k), 3),
                f"Recall@{k}": round(recall_at_k(doc_score, y_score, k), 3),
                f"F1@{k}": round(f1_at_k(doc_score, y_score, k), 3),
            })

# Turn into DataFrame and display
results_df = pd.DataFrame(results)
display(results_df)

# Optional: average per (Method, Relevance Strategy)
avg_results = (
    results_df
    .groupby(["Method", "Relevance Strategy"])[[f"Precision@{k}", f"Recall@{k}", f"F1@{k}"]]
    .mean()
    .round(3)
)
display(avg_results)

Unnamed: 0,Query,Method,Relevance Strategy,Precision@10,Recall@10,F1@10
0,Query 1,TF-IDF,Exact coincidence,0.0,0.000,0.000
1,Query 1,TF-IDF,All - 1,0.0,0.000,0.000
2,Query 1,TF-IDF,>= 70%,1.0,0.556,0.714
3,Query 1,TF-IDF,product types,0.9,0.692,0.783
4,Query 1,BM25,Exact coincidence,0.0,0.000,0.000
...,...,...,...,...,...,...
75,Query 5,CUSTOM1,product types,1.0,0.400,0.571
76,Query 5,CUSTOM2,Exact coincidence,0.0,0.000,0.000
77,Query 5,CUSTOM2,All - 1,0.0,0.000,0.000
78,Query 5,CUSTOM2,>= 70%,0.0,0.000,0.000


Unnamed: 0_level_0,Unnamed: 1_level_0,Precision@10,Recall@10,F1@10
Method,Relevance Strategy,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
BM25,>= 70%,0.4,0.511,0.369
BM25,All - 1,0.2,0.4,0.226
BM25,Exact coincidence,0.14,0.4,0.2
BM25,product types,0.8,0.683,0.649
CUSTOM1,>= 70%,0.4,0.511,0.369
CUSTOM1,All - 1,0.2,0.4,0.226
CUSTOM1,Exact coincidence,0.14,0.4,0.2
CUSTOM1,product types,0.8,0.683,0.649
CUSTOM2,>= 70%,0.38,0.311,0.332
CUSTOM2,All - 1,0.2,0.3,0.223


In [39]:
# === Evaluation of Word2Vec ranking against the same relevance strategies ===

w2v_results = []
k = 10  # use same k as in the previous block

for qid in query_ids:
    # We need both the query text and its Word2Vec results
    if qid not in queries or qid not in rankings:
        continue

    q_data = validation_df[validation_df["query_id"] == qid].copy()
    if "pid" not in q_data.columns:
        continue

    all_pids = q_data["pid"].values

    # Extract ranked pids from Word2Vec results:
    # rankings[qid]["results"] is a list of (info_dict, score_float)
    ranked = rankings[qid]["results"]
    ranked_pids = [info["pid"] for info, _ in ranked]

    # Build score vector aligned with all_pids
    y_score = np.zeros(len(all_pids), dtype=float)
    for rank, pid in enumerate(ranked_pids):
        if pid in all_pids:
            idx = np.where(all_pids == pid)[0][0]
            y_score[idx] = len(ranked_pids) - rank  # invert rank so top has largest score

    # Add tiny random noise to break ties
    y_score = y_score + np.random.rand(len(y_score)) * 0.01

    # Evaluate for each relevance strategy / column
    for strategy in relevance_strategies:
        if strategy not in q_data.columns:
            continue

        # Binary ground truth for this strategy
        doc_score = q_data[strategy].astype(int).values

        w2v_results.append({
            "Query": f"Query {qid}",
            "Method": "WORD2VEC",
            "Relevance Strategy": strategy,
            f"Precision@{k}": round(precision_at_k(doc_score, y_score, k), 3),
            f"Recall@{k}": round(recall_at_k(doc_score, y_score, k), 3),
            f"F1@{k}": round(f1_at_k(doc_score, y_score, k), 3),
        })

w2v_df = pd.DataFrame(w2v_results)
display(w2v_df)

# Optional: average metrics for Word2Vec per relevance strategy
w2v_avg = (
    w2v_df
    .groupby("Relevance Strategy")[[f"Precision@{k}", f"Recall@{k}", f"F1@{k}"]]
    .mean()
    .round(3)
)
display(w2v_avg)

Unnamed: 0,Query,Method,Relevance Strategy,Precision@10,Recall@10,F1@10
0,Query 1,WORD2VEC,Exact coincidence,0.0,0.0,0.0
1,Query 1,WORD2VEC,All - 1,0.1,0.5,0.167
2,Query 1,WORD2VEC,>= 70%,1.0,0.556,0.714
3,Query 1,WORD2VEC,product types,0.7,0.538,0.609
4,Query 2,WORD2VEC,Exact coincidence,0.2,1.0,0.333
5,Query 2,WORD2VEC,All - 1,0.9,1.0,0.947
6,Query 2,WORD2VEC,>= 70%,0.9,1.0,0.947
7,Query 2,WORD2VEC,product types,0.9,1.0,0.947
8,Query 3,WORD2VEC,Exact coincidence,0.0,0.0,0.0
9,Query 3,WORD2VEC,All - 1,0.0,0.0,0.0


Unnamed: 0_level_0,Precision@10,Recall@10,F1@10
Relevance Strategy,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
>= 70%,0.38,0.311,0.332
All - 1,0.2,0.3,0.223
Exact coincidence,0.06,0.24,0.093
product types,0.78,0.668,0.632


In [40]:
# Combine all evaluations (text-based methods + Word2Vec)
all_eval_df = pd.concat([results_df, w2v_df], ignore_index=True)

overall_avg = (
    all_eval_df
    .groupby(["Method", "Relevance Strategy"])[[f"Precision@{k}", f"Recall@{k}", f"F1@{k}"]]
    .mean()
    .round(3)
)

display(overall_avg.sort_values(["Relevance Strategy", f"F1@{k}"], ascending=[True, False]))

Unnamed: 0_level_0,Unnamed: 1_level_0,Precision@10,Recall@10,F1@10
Method,Relevance Strategy,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
BM25,>= 70%,0.4,0.511,0.369
CUSTOM1,>= 70%,0.4,0.511,0.369
TF-IDF,>= 70%,0.4,0.511,0.369
CUSTOM2,>= 70%,0.38,0.311,0.332
WORD2VEC,>= 70%,0.38,0.311,0.332
BM25,All - 1,0.2,0.4,0.226
CUSTOM1,All - 1,0.2,0.4,0.226
TF-IDF,All - 1,0.2,0.4,0.226
CUSTOM2,All - 1,0.2,0.3,0.223
WORD2VEC,All - 1,0.2,0.3,0.223
