In [None]:
# Copyright 2024 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

# XTR Evaluation on BEIR & MIRACL

This notebook shows how to load and evaluate [XTR (or mXTR)](https://arxiv.org/abs/2304.01982) on [BEIR](https://arxiv.org/abs/2104.08663) and [MIRACL](https://arxiv.org/abs/2210.09984). More details about XTR are available in the [paper](https://arxiv.org/abs/2304.01982).

To run the code, please follow the instructions below. If you are using Colab or a Kaggle Notebook, please use GPU or TPU resources for faster inference. To report a bug or any other issue, please open an issue on our [GitHub](https://github.com/google-deepmind/xtr) or contact jinhyuklee@google.com.

## Install and Import Packages
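We need to install ScaNN for efficient maximum inner product search (MIPS). On TPU, install Faiss instead (see the commented-out line in the cell below).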

In [None]:
# Use ScaNN for CPU/GPU
!pip install pytrec_eval nltk tensorflow_text transformers scann

# Use faiss for TPU
# !pip install pytrec_eval nltk tensorflow_text transformers faiss-cpu

In [None]:
#@title Imports
import collections
import numpy as np
import nltk
import os
import pytrec_eval
import re
import time
import tensorflow_hub as hub
import tensorflow as tf
import tensorflow_text as text
import transformers

from dataclasses import dataclass
from enum import Enum
from nltk.tokenize import sent_tokenize
from typing import List
from tqdm import tqdm

print('TF version:', tf.__version__)
nltk.download('punkt')

## Load XTR
Please set **XTR_MODEL** based on your choice.

In [None]:
class XTRModel(Enum):
    """XTR checkpoints that can be selected in this notebook.

    Members are used as keys of `model_to_url` and for membership tests
    against `TPU_MODELS` and `MULTILINGUAL_MODELS`.
    """
    # Base-size checkpoints.
    BASE_EN = 1
    BASE_EN_TPU = 2
    BASE_MULTILINGUAL = 3
    BASE_MULTILINGUAL_TPU = 4
    # XXL-size checkpoints.
    XXL_EN = 5
    XXL_EN_TPU = 6
    XXL_MULTILINGUAL = 7
    XXL_MULTILINGUAL_TPU = 8


# Directory of the SavedModel for each XTR variant (Kaggle input paths).
model_to_url = {
    XTRModel.BASE_EN: "/kaggle/input/xtr/tensorflow2/base-en/2/",
    XTRModel.BASE_EN_TPU: "/kaggle/input/xtr/tensorflow2/base-en-tpu/2/",
    XTRModel.BASE_MULTILINGUAL: "/kaggle/input/xtr/tensorflow2/base-multilingual/2/",
    XTRModel.BASE_MULTILINGUAL_TPU: "/kaggle/input/xtr/tensorflow2/base-multilingual-tpu/2/",
    XTRModel.XXL_EN: "/kaggle/input/xtr/tensorflow2/xxl-en/2/",
    XTRModel.XXL_EN_TPU: "/kaggle/input/xtr/tensorflow2/xxl-en-tpu/2/",
    XTRModel.XXL_MULTILINGUAL: "/kaggle/input/xtr/tensorflow2/xxl-multilingual/2/",
    XTRModel.XXL_MULTILINGUAL_TPU: "/kaggle/input/xtr/tensorflow2/xxl-multilingual-tpu/2/",
}

# Variants that run on TPU (searched with Faiss instead of ScaNN).
TPU_MODELS = [
    XTRModel.BASE_EN_TPU,
    XTRModel.BASE_MULTILINGUAL_TPU,
    XTRModel.XXL_EN_TPU,
    XTRModel.XXL_MULTILINGUAL_TPU,
]
# Variants that use the multilingual tokenizer.
MULTILINGUAL_MODELS = [
    XTRModel.BASE_MULTILINGUAL,
    XTRModel.BASE_MULTILINGUAL_TPU,
    XTRModel.XXL_MULTILINGUAL,
    XTRModel.XXL_MULTILINGUAL_TPU,
]

# Choose a model.
XTR_MODEL = XTRModel.BASE_EN

# XTR-related constants.
MAX_SEQ_LEN = 512
TOKEN_EMBED_DIM = 128

# Other constants.
DEBUG = False

print(f"You are using {XTR_MODEL}. Make sure that {XTR_MODEL} was added to the Notebook.")
if XTR_MODEL in TPU_MODELS:
    print('For TPU models, please set TPU as the accelerator.')
else:
    print('For CPU/GPU models, we recommend using the P100 accelerator.')

In [None]:
#@title Loading Test
if XTR_MODEL in TPU_MODELS:
    # For TPU-based inference: Faiss index, and connect to the TPU system.
    import faiss
    INDEX_TYPE = 'faiss'
    try:
        resolver = tf.distribute.cluster_resolver.TPUClusterResolver()
        tf.config.experimental_connect_to_cluster(resolver)
        tf.tpu.experimental.initialize_tpu_system(resolver)
        tf.config.experimental.enable_mlir_bridge()
    except Exception as e:
        # Best effort: report the problem and keep going.
        print(e)
else:
    # For CPU/GPU-based inference: ScaNN index, and enable GPU memory growth.
    import scann
    INDEX_TYPE = 'scann'
    physical_devices = tf.config.list_physical_devices("GPU")
    try:
        for gpu in physical_devices:
            tf.config.experimental.set_memory_growth(gpu, True)
            print(f"set_memory_growth = True for {gpu}")
        if not physical_devices:
            print("Loading XTR on CPU.")
    except Exception as e:
        # Best effort: report the problem and keep going.
        print(e)

# Load XTR and grab its default serving signature as the encoder.
model = tf.saved_model.load(model_to_url[XTR_MODEL])
encoder = model.signatures["serving_default"]

# Test XTR encoding on three short texts.
sample_texts = tf.constant([
    "dog",
    "Puppies are nice.",
    "I enjoy taking long walks along the beach with my dog.",
])
sample_embeds = encoder(sample_texts)
encodings = sample_embeds["encodings"].numpy()
mask = sample_embeds["mask"].numpy()
print(f"encodings: {encodings.shape}, mask: {mask.shape}")

## XTR Architecture

In [None]:
#@title XTR architecture
def profile(func, debug=DEBUG):
    """Decorator that reports how long each call to `func` takes.

    Args:
        func: The callable to wrap.
        debug: When true, the elapsed time of every call is printed. The
            default is the value of `DEBUG` at the time this function is
            defined.

    Returns:
        A wrapper that forwards all arguments to `func` and returns its
        result unchanged. The wrapper keeps `func`'s name and docstring.
    """
    # Local import: functools is not part of the notebook's import cell.
    import functools

    @functools.wraps(func)
    def wrap(*args, **kwargs):
        # perf_counter is monotonic, so the interval cannot be distorted by
        # system clock adjustments.
        started_at = time.perf_counter()
        result = func(*args, **kwargs)
        if debug:
            print(f"{func.__name__} took {time.perf_counter() - started_at:.3f} seconds.")
        return result
    return wrap


class XTR(object):
    """Token-level retriever built around an XTR encoder.

    Documents are encoded into per-token embeddings and stored in a token
    index (ScaNN, Faiss, or an exact brute-force fallback). At query time each
    query token retrieves its nearest document tokens, and the token scores
    are aggregated into one score per (query, document) pair.
    """

    def __init__(self, encoder, model_type, index_type='faiss'):
        """Stores the encoder and loads the matching SentencePiece tokenizer.

        Args:
            encoder: Callable that maps a string tensor to a dict with
                "encodings" and "mask" entries.
            model_type: Model identifier; members of MULTILINGUAL_MODELS get
                the mc4.250000 vocabulary, all others the cc_all.32000 one.
            index_type: 'faiss' or 'scann'. Any other value selects the
                brute-force searcher in build_index.
        """
        self.encoder = encoder
        self.index_type = index_type  # must be 'faiss' or 'scann'. Otherwise uses bruteforce.

        # Set the tokenizer based on the model type.
        if model_type in MULTILINGUAL_MODELS:
            vocab_path = "gs://t5-data/vocabs/mc4.250000.100extra/sentencepiece.model"
        else:
            vocab_path = "gs://t5-data/vocabs/cc_all.32000.100extra/sentencepiece.model"
        with tf.io.gfile.GFile(vocab_path, "rb") as f:
            self.tokenizer = text.SentencepieceTokenizer(model=f.read(), add_eos=True)

    def tokenize(self, text):
        """Returns the SentencePiece pieces of `text` as a list of str."""
        return [self.tokenizer.id_to_string(id_).numpy().decode('utf-8') for id_ in self.tokenizer.tokenize(text)]

    @profile
    def get_token_embeddings(self, texts):
        """Encodes lower-cased `texts`.

        Returns:
            A pair (encodings, lengths): the encoder's "encodings" output as a
            NumPy array and, per text, the sum of its "mask" row.
        """
        batch_embeds = self.encoder(tf.constant([t.lower() for t in texts]))
        batch_lengths = np.sum(batch_embeds["mask"].numpy(), axis=1)
        return batch_embeds["encodings"].cpu().numpy(), batch_lengths

    @profile
    def get_flatten_embeddings(self, batch_text, return_last_offset=False):
        """Encodes a batch and stacks the unmasked token embeddings row-wise.

        Args:
            batch_text: List of strings to encode.
            return_last_offset: When true, the returned offsets also contain
                the total number of tokens as a final entry.

        Returns:
            A pair (flatten_embeddings, offsets). `offsets[i]` is the row at
            which the tokens of text i start in `flatten_embeddings`.
        """
        batch_embeddings, batch_lengths = self.get_token_embeddings(batch_text)
        # Collect the per-text slices and concatenate once; appending to a
        # growing array inside the loop would copy all previous rows each time.
        pieces = []
        offsets = [0]
        num_tokens = 0
        for embeddings, length in zip(batch_embeddings, batch_lengths):
            length = int(length)
            pieces.append(embeddings[:length])
            num_tokens += length
            offsets.append(num_tokens)
        if pieces:
            flatten_embeddings = np.concatenate(pieces, axis=0)
        else:
            flatten_embeddings = np.zeros((0, TOKEN_EMBED_DIM), dtype=np.float32)
        assert num_tokens == flatten_embeddings.shape[0]
        if not return_last_offset:
            offsets = offsets[:-1]
        return flatten_embeddings, offsets

    @profile
    def build_index(self, documents, batch_size=32):
        """Encodes `documents` and builds the token-level search index.

        Sets `self.searcher`, `self.doc_offsets`, `self.tid2did` and
        `self.docs`.

        Args:
            documents: List of document strings.
            batch_size: Number of documents encoded per encoder call.
        """
        # Upper bound on the number of token rows: MAX_SEQ_LEN per document.
        all_token_embeds = np.zeros((len(documents)*MAX_SEQ_LEN, TOKEN_EMBED_DIM), dtype=np.float32)
        all_doc_offsets = []
        num_tokens = 0
        for batch_idx in tqdm(range(0, len(documents), batch_size)):
            batch_docs = documents[batch_idx:batch_idx+batch_size]
            batch_embeds, batch_offsets = self.get_flatten_embeddings(batch_docs)
            # Batch-local offsets become global by adding the rows written so far.
            all_doc_offsets += [num_tokens + offset for offset in batch_offsets]
            num_tokens += len(batch_embeds)
            all_token_embeds[num_tokens-len(batch_embeds):num_tokens] = batch_embeds

        # Use scann.scann_ops.build() to instead create a TensorFlow-compatible searcher.
        if self.index_type == 'scann':
            self.searcher = scann.scann_ops_pybind.builder(all_token_embeds[:num_tokens], 10, "dot_product").tree(
                num_leaves=min(2000, num_tokens), num_leaves_to_search=100, training_sample_size=min(250000, num_tokens)).score_ah(
                1, anisotropic_quantization_threshold=0.1).build()
        elif self.index_type == 'faiss':
            # Must match the width of all_token_embeds.
            ds = TOKEN_EMBED_DIM
            num_clusters = 50
            code_size = 64
            quantizer = faiss.IndexFlatIP(ds)
            opq_matrix = faiss.OPQMatrix(ds, code_size)
            opq_matrix.niter = 10
            sub_index = faiss.IndexIVFPQ(quantizer, ds, num_clusters, code_size, 4, faiss.METRIC_INNER_PRODUCT)
            index = faiss.IndexPreTransform(opq_matrix, sub_index)
            index.train(all_token_embeds[:num_tokens])
            index.add(all_token_embeds[:num_tokens])
            class FaissSearcher(object):
                """Adapts a Faiss index to the search_batched interface used here."""
                def __init__(self, index):
                    self.index = index
                def search_batched(self, query_embeds, final_num_neighbors, **kwargs):
                    # Extra keyword arguments (ScaNN-specific) are ignored.
                    scores, top_ids = self.index.search(query_embeds, final_num_neighbors)
                    return top_ids, scores
            self.searcher = FaissSearcher(index)
        # Used only for small-scale, exact inference.
        else:
            class BruteForceSearcher(object):
                """Exact inner-product search over all indexed token embeddings."""
                def search_batched(self, query_embeds, final_num_neighbors, **kwargs):
                    scores = query_embeds.dot(all_token_embeds[:num_tokens].T) # Q x D
                    top_ids = scores.argsort(axis=1)[:, ::-1][:,:final_num_neighbors] # Q x top_k
                    return top_ids, [q_score[q_top_ids] for q_score, q_top_ids in zip(scores, top_ids)] # (Q x top_k, Q x top_k)
            self.searcher = BruteForceSearcher()

        self.doc_offsets = all_doc_offsets
        self.doc_offsets.append(num_tokens)  # Add final number of tokens.
        # Map every global token row to the index of the document it belongs to.
        self.tid2did = {
            self.doc_offsets[did] + tid: did
            for did in range(len(self.doc_offsets)-1)
            for tid in range(self.doc_offsets[did+1] - self.doc_offsets[did])
        }
        # NOTE(review): token id -1 is mapped to document 0; presumably -1 is the
        # "no neighbor" id some searchers return. Confirm that the accompanying
        # scores are filtered out in aggregate_scores.
        self.tid2did[-1] = 0
        self.docs = documents
        print("Index Ready!", self.searcher)

    @profile
    def batch_search_tokens(self, batch_query, token_top_k=100, leaves_to_search=100, pre_reorder_num_neighbors=100):
        """Retrieves the top document tokens for every token of every query.

        Returns:
            One tuple per query: (query token ids such as 'q_3', neighbor
            token ids per query token, neighbor scores per query token).
        """
        all_query_encodings, query_offsets = self.get_flatten_embeddings(batch_query, return_last_offset=True)
        all_neighbors, all_scores = self.searcher.search_batched(
            all_query_encodings, final_num_neighbors=token_top_k, leaves_to_search=leaves_to_search, pre_reorder_num_neighbors=pre_reorder_num_neighbors
        )
        return [
            (
                [f'q_{i}' for i in range(query_offsets[oid], query_offsets[oid+1])],  # query_id
                all_neighbors[query_offsets[oid]:query_offsets[oid+1]],  # neighbors
                all_scores[query_offsets[oid]:query_offsets[oid+1]],  # scores
            )
            for oid in range(len(query_offsets)-1)
        ]

    @profile
    def estimate_missing_similarity(self, batch_result):
        """Computes, per query token, the score imputed for documents it did not retrieve.

        Returns:
            One dict per query mapping (token index, query token id) to the
            score of that token's last retrieved neighbor.
        """
        batch_qtoken_to_ems = [dict() for _ in range(len(batch_result))]
        for b_idx, (query_tokens, _, distances) in enumerate(batch_result):
            for token_idx, qtoken in enumerate(query_tokens):
                idx_t = (token_idx, qtoken)
                # Use similarity of the last token as imputed similarity.
                batch_qtoken_to_ems[b_idx][idx_t] = distances[token_idx][-1]
        return batch_qtoken_to_ems

    def aggregate_scores(self, batch_result, batch_ems, document_top_k):
        """Aggregates token-level retrieval scores into query-document scores.

        Args:
            batch_result: Output of batch_search_tokens.
            batch_ems: Output of estimate_missing_similarity.
            document_top_k: Maximum number of documents kept per query.

        Returns:
            One list per query of (document index, score) pairs sorted by
            decreasing score.
        """

        @profile
        def get_did2scores(query_tokens, all_neighbors, all_scores):
            did2scores = {}
            # |Q| x k'
            for qtoken_idx, (qtoken, neighbors, scores) in enumerate(zip(query_tokens, all_neighbors, all_scores)):
                qtoken_with_idx = (qtoken_idx, qtoken)
                for doc_token_id, score in zip(neighbors, scores):
                    if np.isnan(score):
                        continue
                    doc_scores = did2scores.setdefault(self.tid2did[doc_token_id], {})
                    # Only keep the top score for sum-of-max: the first score
                    # seen for a (document, query token) pair is the one stored.
                    doc_scores.setdefault(qtoken_with_idx, score)

            return did2scores
        batch_did2scores = [get_did2scores(qtokens, neighbors, scores) for qtokens, neighbors, scores in batch_result]

        @profile
        def add_ems(did2scores, query_tokens, ems):
            # |Q| x |Q|k' (assuming most docid is unique)
            for qtoken_idx, qtoken in enumerate(query_tokens):
                qtoken_with_idx = (qtoken_idx, qtoken)
                for scores in did2scores.values():
                    if qtoken_with_idx not in scores:
                        scores[qtoken_with_idx] = ems[qtoken_with_idx]
        for did2scores, result, ems in zip(batch_did2scores, batch_result, batch_ems):
            add_ems(did2scores, result[0], ems)

        @profile
        def get_final_score(did2scores, query_tokens):
            final_qd_score = {}
            # |Q|k' x |Q|
            for docid, scores in did2scores.items():
                # After add_ems every query token has exactly one score.
                assert len(scores) == len(query_tokens)
                final_qd_score[docid] = sum(scores.values()) / len(scores)
            return final_qd_score

        batch_scores = [get_final_score(did2scores, result[0]) for did2scores, result in zip(batch_did2scores, batch_result)]

        batch_ranking = [
            sorted(final_qd_score.items(), key=lambda x: x[1], reverse=True)[:document_top_k]
            for final_qd_score in batch_scores
        ]
        return batch_ranking

    def get_document_text(self, batch_ranking):
        """Attaches the document text to every (document index, score) pair."""
        return [
            [(did, score, self.docs[did]) for did, score in ranking]
            for ranking in batch_ranking
        ]

    def retrieve_docs(
        self,
        batch_query: List[str],
        token_top_k: int = 100,
        leaves_to_search: int = 100,
        pre_reorder_num_neighbors: int = 100,
        document_top_k: int = 100,
        return_text: bool = True,
    ):
        """Runs XTR retrieval for a batch of queries.

        Args:
            batch_query: Query strings.
            token_top_k: Number of document tokens retrieved per query token.
            leaves_to_search: Forwarded to the searcher's search_batched.
            pre_reorder_num_neighbors: Forwarded to the searcher's search_batched.
            document_top_k: Maximum number of documents returned per query.
            return_text: When true, each result also carries the document text.

        Returns:
            A pair (rankings, token_results). `rankings` holds one list per
            query of (document index, score[, text]) tuples; `token_results`
            is the raw output of batch_search_tokens.
        """
        batch_result = self.batch_search_tokens(batch_query, token_top_k=token_top_k, leaves_to_search=leaves_to_search, pre_reorder_num_neighbors=pre_reorder_num_neighbors)
        batch_ems = self.estimate_missing_similarity(batch_result)
        batch_ranking = self.aggregate_scores(batch_result, batch_ems, document_top_k)
        if return_text:
            return self.get_document_text(batch_ranking), batch_result
        else:
            return batch_ranking, batch_result

## Sample Run
For a sample corpus, we use sentences from Wikipedia.

In [None]:
# Source: https://en.wikipedia.org/wiki/Google
sample_doc = """Google LLC (/ˈɡuːɡəl/ (listen)) is an American multinational technology company focusing on online advertising, search engine technology, cloud computing, computer software, quantum computing, e-commerce, artificial intelligence,[9] and consumer electronics. It has been referred to as "the most powerful company in the world"[10] and one of the world's most valuable brands due to its market dominance, data collection, and technological advantages in the area of artificial intelligence.[11][12][13] Its parent company Alphabet is considered one of the Big Five American information technology companies, alongside Amazon, Apple, Meta, and Microsoft.
Google was founded on September 4, 1998, by computer scientists Larry Page and Sergey Brin while they were PhD students at Stanford University in California. Together they own about 14% of its publicly listed shares and control 56% of its stockholder voting power through super-voting stock. The company went public via an initial public offering (IPO) in 2004. In 2015, Google was reorganized as a wholly owned subsidiary of Alphabet Inc. Google is Alphabet's largest subsidiary and is a holding company for Alphabet's internet properties and interests. Sundar Pichai was appointed CEO of Google on October 24, 2015, replacing Larry Page, who became the CEO of Alphabet. On December 3, 2019, Pichai also became the CEO of Alphabet.[14]
The company has since rapidly grown to offer a multitude of products and services beyond Google Search, many of which hold dominant market positions. These products address a wide range of use cases, including email (Gmail), navigation (Waze & Maps), cloud computing (Cloud), web browsing (Chrome), video sharing (YouTube), productivity (Workspace), operating systems (Android), cloud storage (Drive), language translation (Translate), photo storage (Photos), video calling (Meet), smart home (Nest), smartphones (Pixel), wearable technology (Pixel Watch & Fitbit), music streaming (YouTube Music), video on demand (YouTube TV), artificial intelligence (Google Assistant), machine learning APIs (TensorFlow), AI chips (TPU), and more. Discontinued Google products include gaming (Stadia), Glass, Google+, Reader, Play Music, Nexus, Hangouts, and Inbox by Gmail.[15][16]
Google's other ventures outside of Internet services and consumer electronics include quantum computing (Sycamore), self-driving cars (Waymo, formerly the Google Self-Driving Car Project), smart cities (Sidewalk Labs), and transformer models (Google Brain).[17]
Google and YouTube are the two most visited websites worldwide followed by Facebook and Twitter. Google is also the largest search engine, mapping and navigation application, email provider, office suite, video sharing platform, photo and cloud storage provider, mobile operating system, web browser, ML framework, and AI virtual assistant provider in the world as measured by market share. On the list of most valuable brands, Google is ranked second by Forbes[18] and fourth by Interbrand.[19] It has received significant criticism involving issues such as privacy concerns, tax avoidance, censorship, search neutrality, antitrust and abuse of its monopoly position.
Google began in January 1996 as a research project by Larry Page and Sergey Brin when they were both PhD students at Stanford University in California.[20][21][22] The project initially involved an unofficial "third founder", Scott Hassan, the original lead programmer who wrote much of the code for the original Google Search engine, but he left before Google was officially founded as a company;[23][24] Hassan went on to pursue a career in robotics and founded the company Willow Garage in 2006.[25][26]
While conventional search engines ranked results by counting how many times the search terms appeared on the page, they theorized about a better system that analyzed the relationships among websites.[27] They called this algorithm PageRank; it determined a website's relevance by the number of pages, and the importance of those pages that linked back to the original site.[28][29] Page told his ideas to Hassan, who began writing the code to implement Page's ideas.[23]
Page and Brin originally nicknamed the new search engine "BackRub", because the system checked backlinks to estimate the importance of a site.[20][30][31] Hassan as well as Alan Steremberg were cited by Page and Brin as being critical to the development of Google. Rajeev Motwani and Terry Winograd later co-authored with Page and Brin the first paper about the project, describing PageRank and the initial prototype of the Google search engine, published in 1998. Héctor García-Molina and Jeff Ullman were also cited as contributors to the project.[32] PageRank was influenced by a similar page-ranking and site-scoring algorithm earlier used for RankDex, developed by Robin Li in 1996, with Larry Page's PageRank patent including a citation to Li's earlier RankDex patent; Li later went on to create the Chinese search engine Baidu.[33][34]
Eventually, they changed the name to Google; the name of the search engine was a misspelling of the word googol,[20][35][36] a very large number written 10100 (1 followed by 100 zeros), picked to signify that the search engine was intended to provide large quantities of information.[37]
Google was initially funded by an August 1998 investment of $100,000 from Andy Bechtolsheim,[20] co-founder of Sun Microsystems. This initial investment served as a motivation to incorporate the company to be able to use the funds.[39][40] Page and Brin initially approached David Cheriton for advice because he had a nearby office in Stanford, and they knew he had startup experience, having recently sold the company he co-founded, Granite Systems, to Cisco for $220 million. David arranged a meeting with Page and Brin and his Granite co-founder Andy Bechtolsheim. The meeting was set for 8 AM at the front porch of David's home in Palo Alto and it had to be brief because Andy had another meeting at Cisco, where he now worked after the acquisition, at 9 AM. Andy briefly tested a demo of the website, liked what he saw, and then went back to his car to grab the check. David Cheriton later also joined in with a $250,000 investment.[41][42]
Google received money from two other angel investors in 1998: Amazon.com founder Jeff Bezos, and entrepreneur Ram Shriram.[43] Page and Brin had first approached Shriram, who was a venture capitalist, for funding and counsel, and Shriram invested $250,000 in Google in February 1998. Shriram knew Bezos because Amazon had acquired Junglee, at which Shriram was the president. It was Shriram who told Bezos about Google. Bezos asked Shriram to meet Google's founders and they met 6 months after Shriram had made his investment when Bezos and his wife were in a vacation trip to the Bay Area. Google's initial funding round had already formally closed but Bezos' status as CEO of Amazon was enough to persuade Page and Brin to extend the round and accept his investment.[44][45]
Between these initial investors, friends, and family Google raised around $1,000,000, which is what allowed them to open up their original shop in Menlo Park, California.[46] Craig Silverstein, a fellow PhD student at Stanford, was hired as the first employee.[22][47][48]
After some additional, small investments through the end of 1998 to early 1999,[43] a new $25 million round of funding was announced on June 7, 1999,[49] with major investors including the venture capital firms Kleiner Perkins and Sequoia Capital.[40] Both firms were initially reticent about investing jointly in Google, as each wanted to retain a larger percentage of control over the company to themselves. Larry and Sergey however insisted in taking investments from both. Both venture companies finally agreed to investing jointly $12.5 million each due to their belief in Google's great potential and through mediation of earlier angel investors Ron Conway and Ram Shriram who had contacts in the venture companies.[50]
In March 1999, the company moved its offices to Palo Alto, California,[51] which is home to several prominent Silicon Valley technology start-ups.[52] The next year, Google began selling advertisements associated with search keywords against Page and Brin's initial opposition toward an advertising-funded search engine.[53][22] To maintain an uncluttered page design, advertisements were solely text-based.[54] In June 2000, it was announced that Google would become the default search engine provider for Yahoo!, one of the most popular websites at the time, replacing Inktomi.[55][56]
In 2003, after outgrowing two other locations, the company leased an office complex from Silicon Graphics, at 1600 Amphitheatre Parkway in Mountain View, California.[58] Three years later, Google bought the property from SGI for $319 million.[59] By that time, the name "Google" had found its way into everyday language, causing the verb "google" to be added to the Merriam-Webster Collegiate Dictionary and the Oxford English Dictionary, denoted as: "to use the Google search engine to obtain information on the Internet".[60][61] The first use of the verb on television appeared in an October 2002 episode of Buffy the Vampire Slayer.[62]
Additionally, in 2001 Google's investors felt the need to have a strong internal management, and they agreed to hire Eric Schmidt as the chairman and CEO of Google.[46] Eric was proposed by John Doerr from Kleiner Perkins. He had been trying to find a CEO that Sergey and Larry would accept for several months, but they rejected several candidates because they wanted to retain control over the company. Michael Moritz from Sequoia Capital at one point even menaced requesting Google to immediately pay back Sequoia's $12.5m investment if they did not fulfill their promise to hire a chief executive officer, which had been made verbally during investment negotiations. Eric wasn't initially enthusiastic about joining Google either, as the company's full potential hadn't yet been widely recognized at the time, and as he was occupied with his responsibilities at Novell where he was CEO. As part of him joining, Eric agreed to buy $1 million of Google preferred stocks as a way to show his commitment and to provide funds Google needed.[63]
On August 19, 2004, Google became a public company via an initial public offering. At that time Larry Page, Sergey Brin, and Eric Schmidt agreed to work together at Google for 20 years, until the year 2024.[64] The company offered 19,605,052 shares at a price of $85 per share.[65][66] Shares were sold in an online auction format using a system built by Morgan Stanley and Credit Suisse, underwriters for the deal.[67][68] The sale of $1.67 billion gave Google a market capitalization of more than $23 billion.[69]
On November 13, 2006, Google acquired YouTube for $1.65 billion in Google stock,[70][71][72][73] On March 11, 2008, Google acquired DoubleClick for $3.1 billion, transferring to Google valuable relationships that DoubleClick had with Web publishers and advertising agencies.[74][75]
By 2011, Google was handling approximately 3 billion searches per day. To handle this workload, Google built 11 data centers around the world with several thousand servers in each. These data centers allowed Google to handle the ever-changing workload more efficiently.[46]
In May 2011, the number of monthly unique visitors to Google surpassed one billion for the first time.[76][77]
In May 2012, Google acquired Motorola Mobility for $12.5 billion, in its largest acquisition to date.[78][79][80] This purchase was made in part to help Google gain Motorola's considerable patent portfolio on mobile phones and wireless technologies, to help protect Google in its ongoing patent disputes with other companies,[81] mainly Apple and Microsoft,[82] and to allow it to continue to freely offer Android.[83]
"""
# Strip citation markers such as "[12]" from the text.
sample_doc = re.sub(r'\[\d+\]', '', sample_doc)

# Single-sentence chunks.
chunks = [sentence.lower() for sentence in sent_tokenize(sample_doc)]

# Preview the first five chunks, truncated to 150 characters each.
for i, chunk in enumerate(chunks[:5]):
    print(f'chunk{i}: {chunk[:150]} \n')
if len(chunks) > 4:
    print('...\n')
print('total # of chunks:', len(chunks))

Now, let's run XTR!

In [None]:
# 'brute' is neither 'faiss' nor 'scann', so the exact brute-force searcher is used.
xtr = XTR(encoder=encoder, model_type=XTR_MODEL, index_type='brute')
xtr.build_index(chunks)

# Multilingual models get a Korean query; English models an English one.
if XTR_MODEL in MULTILINGUAL_MODELS:
    query = "구글 창립자"
else:
    query = "Who founded google"
retrieved_docs, metadata = xtr.retrieve_docs([query], document_top_k=3)

print(f"\nQuery: {query}")
top_docs = retrieved_docs[0]  # results of the only query in the batch
for rank, (did, score, doc) in enumerate(top_docs):
    print(f"[{rank}] doc={did} ({score:.3f}): {doc}")

## Analyze Token Retrieval of XTR
We can inspect the token-level retrieval results produced by XTR. For each query token, we show its retrieved document tokens from the corpus in red. A similar analysis appears in the qualitative analysis section of the [paper](https://arxiv.org/pdf/2304.01982.pdf).

In [None]:
#@title Analysis
# ANSI escape sequences: BOLD switches the terminal to bright-red bold text,
# END resets the formatting.
BOLD = '\033[91m\033[01m'
END = '\033[0m\033[0m'
# Context window around a highlighted document token in xtr_pretty_print:
# up to FRONT_TOKEN_CONTEXT tokens before it, and the tokens after it up to
# (but excluding) offset BACK_TOKEN_CONTEXT.
FRONT_TOKEN_CONTEXT = 10
BACK_TOKEN_CONTEXT = 30


def smart_join(token_list):
    """Concatenates token strings, separating word-initial pieces with spaces.

    A single space is inserted before every token that starts with '▁',
    unless nothing has been emitted yet. The '▁' marker itself is kept.

    Args:
        token_list: Iterable of token strings.

    Returns:
        The joined string.
    """
    parts = []
    has_output = False  # True once at least one character has been emitted.
    for piece in token_list:
        if has_output and piece.startswith('▁'):
            parts.append(' ')
        parts.append(piece)
        has_output = has_output or bool(piece)
    return ''.join(parts)


def xtr_pretty_print(query, metadata):
    """Prints, for each query token, its five best-scoring retrieved document tokens.

    Every retrieved token is shown inside a window of surrounding document
    tokens and highlighted with BOLD/END. Reads the module-level `xtr`.

    Args:
        query: The query string that produced `metadata`.
        metadata: Token-level results returned by xtr.retrieve_docs; only the
            first query of the batch is displayed.
    """
    print(f'Given a query "{query}", XTR retrieves the following document tokens for each query token:')
    query_pieces = xtr.tokenize(query.lower())
    first_result = metadata[0]
    per_token = zip(first_result[0], first_result[1], first_result[2])
    for position, (qid, neighbors, neighbor_scores) in enumerate(per_token):
        print(f"====={qid}:{BOLD + query_pieces[position] + END}======")
        # Best score first; only the top five are shown.
        ranked = sorted(zip(neighbors, neighbor_scores), key=lambda pair: -pair[1])
        best = [(int(token_id), similarity) for token_id, similarity in ranked[:5]]
        for token_id, similarity in best:
            doc_id = xtr.tid2did[token_id]
            # Position of the token inside its own document.
            local_idx = token_id - xtr.doc_offsets[doc_id]
            doc_pieces = xtr.tokenize(xtr.docs[doc_id])
            window_start = max(0, local_idx - FRONT_TOKEN_CONTEXT)
            before = doc_pieces[window_start:local_idx]
            after = doc_pieces[local_idx + 1:local_idx + BACK_TOKEN_CONTEXT]
            snippet = smart_join(before + [BOLD, doc_pieces[local_idx], END] + after)
            lead = "... " if local_idx > 0 else ""
            print(f'[{similarity:.3f}]', lead + snippet + ' ...')

xtr_pretty_print(query, metadata)

## Load BEIR Datasets

In [None]:
#@title Install the beir PyPI package
!pip install beir

In [None]:
#@title Load Scifact from BEIR
from beir import util

# Supports two smallest datasets from BEIR
class BEIR(Enum):
    """BEIR datasets that this notebook can download."""
    SCIFACT = 1
    NFCORPUS = 2


# Dataset name as it appears in the download URL.
beir_datasets = {BEIR.SCIFACT: "scifact", BEIR.NFCORPUS: "nfcorpus"}

# Choose a dataset.
DATASET = BEIR.SCIFACT

url = f"https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{beir_datasets[DATASET]}.zip"
out_dir = os.path.join(os.getcwd(), "datasets")
data_path = util.download_and_unzip(url, out_dir)
print(f"Dataset downloaded here: {data_path}")

In [None]:
!ls datasets/

In [None]:
from beir.datasets.data_loader import GenericDataLoader

# Relative path of the unzipped dataset; this assumes the current working
# directory is the one the download cell used to build its "datasets" folder.
data_path = f"datasets/{beir_datasets[DATASET]}"
# Load the documents, the queries and the relevance judgements of the chosen split.
corpus, queries, qrels = GenericDataLoader(data_path).load(split="test") # or split = "train" or "dev"

## Index BEIR Corpus
For Scifact + XTR-base-en (P100), this should take about 3 minutes.

In [None]:
# Keep the corpus keys in iteration order so that a document index returned by
# XTR can be mapped back to its corpus key.
all_keys = list(corpus)
# Each document is indexed as "<title> <text>", lower-cased.
all_docs = [
    f"{doc['title']} {doc['text']}".lower()
    for doc in tqdm(corpus.values())
]

xtr = XTR(encoder=encoder, model_type=XTR_MODEL, index_type=INDEX_TYPE)
xtr.build_index(all_docs)
print(f"XTR Index Size: {len(xtr.tid2did)}")

## Run BEIR Evaluation
For Scifact, XTR-base-en (P100), this should take about 2 minutes.

In [None]:
# Evaluation hyperparameters.
# Number of document tokens retrieved per query token (passed as token_top_k).
TOKEN_TOP_K = 1000
NUM_LEAVES = 100  # ScaNN-related.
# Number of ranked documents kept per query for the TREC-style evaluation.
TREC_TOP_K = 100

# NOTE: If .reorder(100) was used for the ScaNN searcher, pre_reorder_num_neighbors
# should be as large as token_top_k. If not, searcher will ignore this.
NUM_REORDER = 1000
assert NUM_REORDER >= TOKEN_TOP_K, "NUM_REORDER needs to be as large as TOKEN_TOP_K"

# For Faiss, set the search-time nprobe on the IVF index wrapped by the searcher.
if INDEX_TYPE == 'faiss':
    sub_index = faiss.extract_index_ivf(xtr.searcher.index)
    sub_index.nprobe = 4

In [None]:
# Maps each query key to {document key: score} for its top TREC_TOP_K documents.
predictions = {}
# Running evaluation per query for a better latency measurement.
for query_key, query in tqdm(queries.items(), total=len(queries)):
    ranking, metadata = xtr.retrieve_docs(
        [query.lower()],
        token_top_k=TOKEN_TOP_K,
        leaves_to_search=NUM_LEAVES,
        pre_reorder_num_neighbors=NUM_REORDER,  # does not have any effect since the default searcher does not use reorder.
        # Request as many documents as are evaluated; otherwise retrieve_docs'
        # default of 100 silently caps TREC_TOP_K.
        document_top_k=TREC_TOP_K,
        return_text=False
    )
    ranking = ranking[0]  # single-query batch
    # Store plain Python floats: the scores come out of NumPy arithmetic, and
    # the evaluator is presumably strict about the score type.
    predictions[query_key] = {str(all_keys[did]): float(score) for did, score in ranking[:TREC_TOP_K]}

For reference, XTR-base-en on Scifact scores **71.7% nDCG@10 and 93.1% Recall@100** with brute-force search, as described in the [paper](https://arxiv.org/pdf/2304.01982.pdf) (see Table E.1 with k'=1000).

The results below are obtained with approximate search (ScaNN for CPU/GPU models), so they may differ slightly from the brute-force reference above.

In [None]:
#@title Run pytrec_eval.
K_VALUES = [5, 10, 50, 100]  # Cutoffs at which every metric is reported.
METRIC_NAMES = ['ndcg_cut', 'map_cut', 'recall']  # pytrec_eval measure names.

def eval_metrics(qrels, predictions):
    """Prints pytrec_eval metrics averaged over the evaluated queries.

    Every metric in METRIC_NAMES is computed at every cutoff in K_VALUES and
    printed as ``<metric>@<k>: <mean>`` below a ``[Result]`` header.

    Args:
        qrels: Mapping ``query_id -> {doc_id: int relevance}``.
        predictions: Mapping ``query_id -> {doc_id: float score}``.

    Raises:
        ValueError: If pytrec_eval returns no per-query scores, in which case
            there is nothing to average over.
    """
    # pytrec_eval takes one measure string per metric, e.g. "ndcg_cut.5,10,50,100".
    cutoffs = ",".join(str(k) for k in K_VALUES)
    measurements = [f"{metric_name}.{cutoffs}" for metric_name in METRIC_NAMES]

    evaluator = pytrec_eval.RelevanceEvaluator(qrels, measurements)
    final_scores = evaluator.evaluate(predictions)
    if not final_scores:
        # Without this guard the averaging below fails with an opaque
        # ZeroDivisionError.
        raise ValueError(
            "pytrec_eval returned no per-query scores; check that qrels and "
            "predictions are non-empty and share query ids."
        )

    num_queries = len(final_scores)
    final_metrics = {}
    for metric_name in METRIC_NAMES:
        for k in K_VALUES:
            # Per-query results are keyed "<metric>_<k>".
            total = 0.0
            for per_query in final_scores.values():
                total += per_query[f"{metric_name}_{k}"]
            final_metrics[f"{metric_name}@{k}"] = round(total / num_queries, 5)

    print("[Result]")
    for metric_name, metric_score in final_metrics.items():
        print(f"{metric_name}: {metric_score:.4f}")

# Print the averaged metrics for the BEIR predictions collected above.
eval_metrics(qrels, predictions)

## Load MIRACL Datasets

In [None]:
# Hugging Face `datasets` is used below to load the MIRACL corpus.
!pip install datasets

In [None]:
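# The MIRACL qrels and topics are read below from a local clone of the dataset repository at ./miracl.
# 1. Get a Hugging Face user access token from https://huggingface.co/settings/tokens
# 2. Clone the repository into the working directory (prefix the command with `!` to run it from a notebook cell):
#    git clone https://{huggingface_id}:{huggingface_user_access_tokens}@huggingface.co/datasets/miracl/miracl
# Do not save a real access token in a notebook that you share.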

In [None]:
import csv

# qrels[query_id] = {doc_id: int relevance judgement}
qrels = {}
lang = 'sw'  # or choose any of the 16 languages
qrels_path = f"miracl/miracl-v1.0-{lang}/qrels/qrels.miracl-v1.0-{lang}-dev.tsv"
# Decode as UTF-8 explicitly instead of relying on the platform default, and
# pass newline="" as the csv module recommends for files it reads.
with open(qrels_path, encoding="utf-8", newline="") as file:
    qrels_tsv = csv.reader(file, delimiter="\t")
    for line in qrels_tsv:
        if not line:
            continue  # csv.reader yields an empty list for a blank line.
        # The second column is not used. It is bound to a named throwaway
        # because assigning `_` at notebook top level shadows IPython's
        # last-output variable.
        qid, _unused, did, judgement = line
        qrels.setdefault(qid, {})[did] = int(judgement)

print(f"{len(qrels)} qrels loaded.")

In [None]:
# topics[query_id] = query text (a "topic" is the same thing as a query).
topics = {}
topics_path = f"miracl/miracl-v1.0-{lang}/topics/topics.miracl-v1.0-{lang}-dev.tsv"
# Topics hold native-script query text, so decode as UTF-8 explicitly rather
# than with the platform default encoding; newline="" is what the csv module
# recommends for files it reads.
with open(topics_path, encoding="utf-8", newline="") as file:
    topics_tsv = csv.reader(file, delimiter="\t")
    for line in topics_tsv:
        if not line:
            continue  # csv.reader yields an empty list for a blank line.
        qid, query = line
        topics[qid] = query

print(f"{len(topics)} topics loaded.")

## Index MIRACL Corpus
Since the corpus is large, we recommend using TPUs for the corpus encoding.

In [None]:
import datasets

# Load the 'train' split of the MIRACL corpus for the selected language.
miracl_corpus = datasets.load_dataset('miracl/miracl-corpus', lang)['train']

# Parallel lists: all_keys[i] is the docid of all_docs[i]. Built in a single
# pass because the corpus is large.
all_keys, all_docs = [], []
for record in tqdm(miracl_corpus):
    all_keys.append(record['docid'])
    # Each document is indexed as "<title> <text>", lower-cased.
    all_docs.append(f"{record['title']} {record['text']}".lower())

print(f"Sample: {all_docs[0]}")

# Encode every document and build the token-level search index; this replaces
# the `xtr` instance created for BEIR.
xtr = XTR(encoder=encoder, model_type=XTR_MODEL, index_type=INDEX_TYPE)
xtr.build_index(all_docs)
print(f"XTR Index Size: {len(xtr.tid2did)}")

## Run MIRACL Evaluation
For reference, mXTR-base on MIRACL-Swahili scores **69.7% nDCG@10** with brute-force search, as reported in the [paper](https://arxiv.org/pdf/2304.01982.pdf) (see Table 4).


The results below are obtained with approximate search (ScaNN by default), so they may differ slightly from the brute-force number above.

In [None]:
# Evaluation hyperparameters.
TOKEN_TOP_K = 1000  # Passed to retrieve_docs as token_top_k.
NUM_LEAVES = 100  # ScaNN-related (passed as leaves_to_search).
FAISS_NPROBE = 4  # Faiss-related; same value the BEIR section uses.
TREC_TOP_K = 100  # Number of ranked documents kept per query for scoring.

# NOTE: pre_reorder_num_neighbors only matters if the ScaNN searcher was built
# with .reorder(...); in that case it should be at least as large as
# token_top_k. Otherwise the searcher ignores it.
NUM_REORDER = 1000
assert NUM_REORDER >= TOKEN_TOP_K, "NUM_REORDER needs to be as large as TOKEN_TOP_K"

# `xtr` was re-created for the MIRACL corpus, so the nprobe configured on the
# BEIR index presumably does not carry over to the new index; apply the same
# Faiss setting here so both evaluations search the same way.
if INDEX_TYPE == 'faiss':
    sub_index = faiss.extract_index_ivf(xtr.searcher.index)
    sub_index.nprobe = FAISS_NPROBE

In [None]:
# predictions[query_id] = {doc_id: score}, the run format scored by eval_metrics.
predictions = {}
# Running evaluation per query for a better latency measurement.
for query_key, query in tqdm(topics.items(), total=len(topics)):
    # retrieve_docs takes a batch of queries; send one query, lower-cased to
    # match the lower-casing applied to documents at indexing time.
    batch_ranking, _metadata = xtr.retrieve_docs(
        [query.lower()],
        token_top_k=TOKEN_TOP_K,
        leaves_to_search=NUM_LEAVES,
        pre_reorder_num_neighbors=NUM_REORDER,  # No effect.
        # Only ids and scores are needed for scoring, so do not request the
        # document text (the BEIR loop does the same).
        return_text=False
    )
    ranking = batch_ranking[0]  # Ranking for the single query in the batch.
    # `did` is a position in all_docs/all_keys; translate it back to the MIRACL docid.
    predictions[query_key] = {str(all_keys[did]): score for did, score in ranking[:TREC_TOP_K]}

# Print the averaged metrics for the MIRACL predictions.
eval_metrics(qrels, predictions)