In [2]:
import json
import math
import os
from typing import List, Dict, Any
from collections import Counter
from transformers import AutoTokenizer

class TokenBasedReranker:
    """Rank JSON-like documents against a query with TF-IDF cosine similarity.

    Text is converted to subword token IDs by a Hugging Face tokenizer; no
    neural model is ever run. Token IDs are treated as a bag of terms, weighted
    by a corpus-local IDF, and compared with cosine similarity so that every
    score lies in [0.0, 1.0].
    """

    def __init__(self, model_name: str = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"):
        """Load the tokenizer published alongside ``model_name``.

        Subword tokenization lets different surface forms of a word share
        token IDs, so query and document can match even when the exact word
        forms differ.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

    def _extract_text(self, document: Any) -> str:
        """Flatten every string found in ``document`` into one space-joined string.

        Dict values and list items are visited recursively; dict keys and
        non-string leaves (numbers, booleans, ``None``) contribute an empty
        string. The input object itself is never modified.
        """
        if isinstance(document, str):
            return document
        if isinstance(document, dict):
            return " ".join(self._extract_text(value) for value in document.values())
        if isinstance(document, list):
            return " ".join(self._extract_text(item) for item in document)
        return ""

    def _tokenize(self, text: str) -> List[int]:
        """Return the token IDs of ``text`` without the tokenizer's special tokens.

        Special tokens would appear in every encoded string and therefore give
        every query/document pair a non-zero baseline similarity, so they are
        left out to keep 0.0 reachable for unrelated texts.

        NOTE(review): for long texts the tokenizer may log a "sequence length
        is longer than the specified maximum" warning; the IDs are only counted
        here and never passed through a model.
        """
        return self.tokenizer.encode(text, add_special_tokens=False)

    def _idf_from_token_lists(self, token_lists: List[List[int]]) -> Dict[int, float]:
        """Compute smoothed IDF weights from already-tokenized documents.

        Uses ``log(N / (df + 1)) + 1`` where ``N`` is the number of documents
        and ``df`` the number of documents containing the token. Returns an
        empty dict when there are no documents.
        """
        num_docs = len(token_lists)
        if num_docs == 0:
            return {}

        df = Counter()
        for tokens in token_lists:
            # Count each token once per document, regardless of repetitions.
            df.update(set(tokens))

        return {
            token_id: math.log(num_docs / (freq + 1)) + 1.0
            for token_id, freq in df.items()
        }

    def _calculate_idf(self, documents: List[Dict]) -> Dict[int, float]:
        """Compute corpus-local IDF weights for ``documents``.

        Frequent, uninformative subword fragments receive low weights while
        rare tokens are rewarded. Each document is flattened and tokenized
        before counting.
        """
        token_lists = [self._tokenize(self._extract_text(doc)) for doc in documents]
        return self._idf_from_token_lists(token_lists)

    def _compute_tf(self, tokens: List[int]) -> Dict[int, float]:
        """Return length-normalized term frequencies for ``tokens``.

        Normalizing by the token count stops long documents from dominating
        simply because they contain more tokens. An empty list yields ``{}``.
        """
        total_tokens = len(tokens)
        if total_tokens == 0:
            return {}
        return {
            token_id: count / total_tokens
            for token_id, count in Counter(tokens).items()
        }

    def _weighted_vector(self, tf: Dict[int, float], idf: Dict[int, float], default_idf: float) -> tuple:
        """Build a sparse TF-IDF vector and return ``(vector, euclidean_norm)``.

        ``default_idf`` is used for tokens that have no entry in ``idf``.
        """
        vector = {
            token_id: tf_val * idf.get(token_id, default_idf)
            for token_id, tf_val in tf.items()
        }
        magnitude = math.sqrt(sum(weight ** 2 for weight in vector.values()))
        return vector, magnitude

    def score(self, query: str, documents: List[Dict]) -> List[Dict]:
        """Score every document against ``query`` and return them best-first.

        Args:
            query: Free-text search query.
            documents: JSON-like documents; they are passed through unchanged.

        Returns:
            A list of ``{"document": <original object>, "score": <float>}``
            dicts sorted by descending relevance. Scores are cosine
            similarities of TF-IDF vectors, in [0.0, 1.0], rounded to three
            decimals. An empty ``documents`` yields ``[]``.
        """
        if not documents:
            return []

        # Tokenize each document exactly once; the same token lists feed both
        # the IDF statistics and the per-document vectors.
        doc_token_lists = [self._tokenize(self._extract_text(doc)) for doc in documents]
        idf = self._idf_from_token_lists(doc_token_lists)

        # Query tokens absent from the corpus get a high weight: they cannot
        # contribute to any dot product but still enlarge the query norm, so a
        # query that is only partly covered scores lower.
        unseen_idf = math.log(len(documents) + 1) + 1.0
        query_vec, query_mag = self._weighted_vector(
            self._compute_tf(self._tokenize(query)), idf, unseen_idf
        )

        scored = []
        for doc, doc_tokens in zip(documents, doc_token_lists):
            doc_vec, doc_mag = self._weighted_vector(self._compute_tf(doc_tokens), idf, 1.0)

            dot_product = sum(
                q_weight * doc_vec[token_id]
                for token_id, q_weight in query_vec.items()
                if token_id in doc_vec
            )

            raw_score = 0.0
            if query_mag > 0 and doc_mag > 0:
                raw_score = dot_product / (query_mag * doc_mag)

            # Guard against floating-point rounding nudging the value out of range.
            raw_score = max(0.0, min(1.0, raw_score))
            scored.append((raw_score, doc))

        # Rank on the unrounded score so documents that differ by less than
        # 0.001 are still ordered by true relevance; the sort is stable, so
        # exact ties keep their input order.
        scored.sort(key=lambda pair: pair[0], reverse=True)
        return [
            {"document": doc, "score": round(raw_score, 3)}
            for raw_score, doc in scored
        ]


def main():
    """Score every query in 'queries.txt' against 'documents.json'.

    Prints the three best matches per query and writes the full ranking for
    all queries to 'scored_results.json'. If either input file is missing, an
    error message is printed and nothing else happens.
    """
    documents_file = 'documents.json'
    queries_file = 'queries.txt'

    inputs_present = os.path.exists(documents_file) and os.path.exists(queries_file)
    if not inputs_present:
        print("Error: Ensure 'documents.json' and 'queries.txt' are in the same directory.")
        return

    with open(documents_file, 'r', encoding='utf-8') as handle:
        documents = json.load(handle)

    # One query per line; blank lines are skipped and surrounding whitespace dropped.
    queries = []
    with open(queries_file, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            cleaned = raw_line.strip()
            if cleaned:
                queries.append(cleaned)

    reranker = TokenBasedReranker()
    results_by_query = {}

    for query in queries:
        print(f"--- Query: '{query}' ---")
        ranked = reranker.score(query, documents)
        results_by_query[query] = ranked

        # Only the three best-scoring documents are echoed to keep the console readable.
        for entry in ranked[:3]:
            document = entry['document']
            doc_id = document.get('id', 'N/A')
            doc_name = document.get('name', 'N/A')
            print(f"Score: {entry['score']:.3f} | ID: {doc_id} | Name: {doc_name}")
        print("\n")

    # Persist the complete rankings, including the original unmodified documents.
    destination = 'scored_results.json'
    with open(destination, 'w', encoding='utf-8') as handle:
        json.dump(results_by_query, handle, ensure_ascii=False, indent=4)
    print(f"Complete results successfully saved to {destination}")

# Run the scoring workflow only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/526 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]



special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (669 > 512). Running this sequence through the model will result in indexing errors


--- Query: 'wooden lamp' ---
Score: 0.294 | ID: 157613 | Name: Continenta C4234 - Wooden bowl 25x4,8 cm walnut wood
Score: 0.281 | ID: 180416 | Name: Wall clock d. 56 cm 1xAA wood/metal
Score: 0.280 | ID: 180479 | Name: Wall clock d. 56 cm 1xAA wood/metal


--- Query: 'red table lamp' ---
Score: 0.318 | ID: 219696 | Name: LED Dimmable table lamp 2in1 USB TENUIX LED/14W/5V 3000-6000K white
Score: 0.310 | ID: 182941 | Name: Duolla - Lampshade for table lamp CLASSIC M E27 d. 24 cm red
Score: 0.293 | ID: 179593 | Name: SET 2x Coffee table BELLISIMO black/clear


--- Query: 'mirron with lights' ---
Score: 0.278 | ID: 93661 | Name: Rabalux 1445 - LED Under kitchen cabinet light BYRON LED/8W/230V silver 546 lm
Score: 0.145 | ID: 169597 | Name: Extol - Spirit level 400 mm
Score: 0.110 | ID: 140711 | Name: Steinel 052652-LED Solar light with a sensor LED/1,5W 2x2000mAh IP44 silver


--- Query: 'light above mirror' ---
Score: 0.458 | ID: 206097 | Name: Brilagi-LED Bathroom mirror lighting WOODY 

Here is the detailed explanation of the assumptions and algorithm design for the solution provided.

### 1. Assumptions

**a. What are your basic assumptions about using tokens for search?**

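* **Morphological Resilience:** Unlike word-based search (which might fail to match "run" with "running"), I assume that subword tokens act as a bridge (the explanation says WordPiece, but the tokenizer shipped with this model appears to be SentencePiece-based, so the exact split points — and the `##` continuation notation used in the examples — may differ). By breaking words into stems and suffixes, the system can find relevance between different grammatical forms of the same root.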
* **Token Importance is Non-Uniform:** I assume that not all tokens are created equal. In a search for "LED bulb," the token for "LED" is likely more discriminative than the token for "the" or "with." Therefore, the frequency of a token across the whole document set must inversely affect its weight.
* **Structure Neutrality:** I assume that relevance can be found in any part of a JSON document (title, description, or tags). My approach treats the document as a unified "bag of tokens" to ensure no metadata is ignored.

**b. How did you transform your assumptions into the final score formula?**

* **From Tokens to TF-IDF:** To address the "importance" assumption, I implemented a **TF-IDF (Term Frequency-Inverse Document Frequency)** logic. The TF component rewards documents that mention query tokens frequently, while the IDF component (calculated from your `documents.json`) ensures that common filler tokens or common subword fragments (like `##ing`) don't skew the results.
* **From Vectors to Cosine Similarity:** To satisfy the strict requirement of a **0.0 to 1.0 score**, I transformed the tokens into multi-dimensional vectors. By calculating the **Cosine Similarity** (the angle between the query vector and the document vector), the output is naturally normalized, where 1.0 represents a perfect directional match in "token space."

---

### 2. Algorithm Design

**a. Why did you choose this specific approach to calculate relevance?**
I chose a **TF-IDF Vector Space Model** using the `paraphrase-multilingual-MiniLM-L12-v2` tokenizer for three reasons:

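1. **Normalization:** Standard BM25 is excellent for ranking but produces unbounded scores (e.g., a score could be 15.4). Because every TF-IDF weight here is non-negative, Cosine Similarity mathematically guarantees a 0.0–1.0 range without "faking" the math via arbitrary clipping; the clamp that remains in the code is only a safeguard against floating-point rounding pushing a perfect match marginally above 1.0.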
2. **Multilingual Support:** The specific tokenizer requested is "multilingual." By using token IDs directly, the algorithm can handle the Czech and English text found in your `documents.json` (e.g., matching "warranty" and "záruka" contexts if they share sub-tokens or simply handling the characters correctly).
3. **Efficiency:** This approach is purely statistical. It doesn't require a GPU to run a "forward pass" of a neural network for every search. It simply counts token IDs and does basic algebra, making it "real-time" friendly.

**b. Did you have to make some tradeoffs?**

* **Semantic Blindness:** Because I am using the *tokenizer* but not the *transformer model* itself, the algorithm is "blind" to synonyms. It won't know that "lamp" and "light" are similar unless they share a subword token. This was a tradeoff made to ensure the system remains "computationally reasonable" and fast.
* **Loss of Word Order:** By using a Bag-of-Tokens approach, the sequence "Red Lamp" and "Lamp Red" result in the same score. For a reranker, this is usually acceptable, as the primary goal is to ensure the core concepts (tokens) are present.
* **Subword "Noise":** Subword tokenizers often create very small fragments (1-2 characters). If not weighted correctly by IDF, these could create false positives. I mitigated this by using a smoothed IDF calculation.

---

**Final Note on files:** The code above specifically handles the structure of `documents.json` (which contains `id`, `name`, and `description`) and iterates through every non-empty line in `queries.txt` (handling terms like "wooden lamp" and "LED bulb e27"). It will successfully produce a `scored_results.json` that maps each query to its full list of scored documents.