In [1]:
!nvidia-smi
!pip install transformers==2.10.0 tokenizers

Wed Oct  7 05:21:09 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.51.06    Driver Version: 450.51.06    CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            On   | 00000000:00:1E.0 Off |                    0 |
| N/A   52C    P0    29W /  70W |   4427MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
from transformers import AutoTokenizer, T5ForConditionalGeneration
from transformers import PreTrainedTokenizer
from transformers import PreTrainedModel
from typing import List, Mapping, Tuple, Union, Iterable, Optional, Any
from dataclasses import dataclass
import abc
import torch
from torch.cuda.amp import autocast

TokenizerReturnType = Mapping[str, Union[torch.Tensor, List[int],
                                         List[List[int]],
                                         List[List[str]]]]
class Query:
    """Class representing a query.
    A query contains the query text itself and potentially other metadata.
    Parameters
    ----------
    text : str
        The query text.
    id : Optional[str]
        The query id.
    """
    def __init__(self, text: str, id: Optional[str] = None):
        self.text = text
        self.id = id


class Text:
    """Class representing a text to be reranked.
    A text is unspecified with respect to it length; in principle, it
    could be a full-length document, a paragraph-sized passage, or
    even a short phrase.
    Parameters
    ----------
    text : str
        The text to be reranked.
    metadata : Mapping[str, Any]
        Additional metadata and other annotations.
    score : Optional[float]
        The score of the text. For example, the score might be the BM25 score
        from an initial retrieval stage.
    """

    def __init__(self,
                 text: str,
                 metadata: Mapping[str, Any] = None,
                 score: Optional[float] = 0):
        self.text = text
        if metadata is None:
            metadata = dict()
        self.metadata = metadata
        self.score = score

@dataclass
class QueryDocumentBatch:
    query: Query
    documents: List[Text]
    output: Optional[TokenizerReturnType] = None

    def __len__(self):
        return len(self.documents)

class TokenizerEncodeMixin:
    tokenizer: PreTrainedTokenizer = None
    tokenizer_kwargs = None

    def encode(self, strings: List[str]) -> TokenizerReturnType:
        assert self.tokenizer and self.tokenizer_kwargs is not None, \
                'mixin used improperly'
        ret = self.tokenizer.batch_encode_plus(strings,
                                               **self.tokenizer_kwargs)
        ret['tokens'] = list(map(self.tokenizer.tokenize, strings))
        return ret

class AppendEosTokenizerMixin:
    tokenizer: PreTrainedTokenizer = None

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def encode(self, strings: List[str]) -> TokenizerReturnType:
        assert self.tokenizer, 'mixin used improperly'
        return super().encode(
            [f'{x} {self.tokenizer.eos_token}' for x in strings])


class QueryDocumentBatchTokenizer(TokenizerEncodeMixin):
    def __init__(self,
                 tokenizer: PreTrainedTokenizer,
                 batch_size: int,
                 pattern: str = '{query} {document}',
                 **tokenizer_kwargs):
        self.tokenizer = tokenizer
        self.batch_size = batch_size
        self.tokenizer_kwargs = tokenizer_kwargs
        self.pattern = pattern

    def traverse_query_document(
            self,
            batch_input: QueryDocumentBatch) -> Iterable[QueryDocumentBatch]:
        query = batch_input.query
        for batch_idx in range(0, len(batch_input), self.batch_size):
            docs = batch_input.documents[batch_idx:batch_idx + self.batch_size]
            outputs = self.encode([self.pattern.format(
                                        query=query.text,
                                        document=doc.text) for doc in docs])
            yield QueryDocumentBatch(query, docs, outputs)

class T5BatchTokenizer(AppendEosTokenizerMixin, QueryDocumentBatchTokenizer):
    def __init__(self, *args, **kwargs):
        kwargs['pattern'] = 'Query: {query} Document: {document} Relevant:'
        kwargs['return_attention_mask'] = True
        kwargs['padding'] = True
        kwargs['return_tensors'] = 'pt'
        kwargs['pad_to_max_length'] = True
        super().__init__(*args, **kwargs)

from copy import deepcopy

DecodedOutput = Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]

model_inputs = None

@torch.no_grad()
def greedy_decode(model: PreTrainedModel,
                  input_ids: torch.Tensor,
                  length: int,
                  attention_mask: torch.Tensor = None,
                  return_last_logits: bool = True) -> DecodedOutput:
    decode_ids = torch.full((input_ids.size(0), 1),
                            model.config.decoder_start_token_id,
                            dtype=torch.long).to(input_ids.device)
    past = model.get_encoder()(input_ids, attention_mask=attention_mask)
    next_token_logits = None
    for _ in range(length):
        model_inputs = model.prepare_inputs_for_generation(
            decode_ids,
            past=past,
            attention_mask=attention_mask,
            use_cache=True)
        with autocast():    
            outputs = model(**model_inputs)  # (batch_size, cur_len, vocab_size)
        next_token_logits = outputs[0][:, -1, :]  # (batch_size, vocab_size)
        decode_ids = torch.cat([decode_ids,
                                next_token_logits.max(1)[1].unsqueeze(-1)],
                               dim=-1)
        past = outputs[1]
    if return_last_logits:
        return decode_ids, next_token_logits, model_inputs
    return decode_ids, model_inputs

class Reranker:
    """Class representing a reranker.
    A reranker takes a list texts and returns a list of texts non-destructively
    (i.e., does not alter the original input list of texts).
    """
    @abc.abstractmethod
    def rerank(self, query: Query, texts: List[Text]) -> List[Text]:
        """Reranks a list of texts with respect to a query.
         Parameters
         ----------
         query : Query
             The query.
         texts : List[Text]
             The list of texts.
         Returns
         -------
         List[Text]
             Reranked list of texts.
         """
        pass

class T5Reranker(Reranker):
    def __init__(self,
                 model: T5ForConditionalGeneration,
                 tokenizer: QueryDocumentBatchTokenizer):
        self.model = model
        self.tokenizer = tokenizer
        self.device = next(self.model.parameters(), None).device

    def rerank(self, query: Query, texts: List[Text]) -> List[Text]:
        texts = deepcopy(texts)
        batch_input = QueryDocumentBatch(query=query, documents=texts)

        # return batch_input
        # """
        for batch in self.tokenizer.traverse_query_document(batch_input):
            input_ids = batch.output['input_ids'].to(self.device)
            attn_mask = batch.output['attention_mask'].to(self.device)
            _, batch_scores, model_inputs = greedy_decode(self.model,
                                            input_ids,
                                            length=1,
                                            attention_mask=attn_mask,
                                            return_last_logits=True)

            # 6136 and 1176 are the indexes of the tokens false and true in T5.
            batch_scores = batch_scores[:, [6136, 1176]]
            batch_scores = torch.nn.functional.log_softmax(batch_scores, dim=1)
            batch_log_probs = batch_scores[:, 1].tolist()
            for doc, score in zip(batch.documents, batch_log_probs):
                doc.score = score
        return texts, model_inputs

In [3]:
batch_size = 50

model_name = 'castorini/monot5-base-msmarco'
model = T5ForConditionalGeneration.from_pretrained(model_name)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

model = model.to(device).eval()

tokenizer_name = 't5-base'
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
tokenizer = T5BatchTokenizer(tokenizer, batch_size)

reranker =  T5Reranker(model, tokenizer)

In [4]:
!nvidia-smi

Wed Oct  7 05:21:32 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.51.06    Driver Version: 450.51.06    CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            On   | 00000000:00:1E.0 Off |                    0 |
| N/A   52C    P0    29W /  70W |   6250MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [5]:
query = Query('who proposed the geocentric theory')

passages = [['7744105', 'For Earth-centered it was  Geocentric Theory proposed by greeks under the guidance of Ptolemy and Sun-centered was Heliocentric theory proposed by Nicolas Copernicus in 16th century A.D. In short, Your Answers are: 1st blank - Geo-Centric Theory. 2nd blank - Heliocentric Theory.'], ['2593796', 'Copernicus proposed a heliocentric model of the solar system â\x80\x93 a model where everything orbited around the Sun. Today, with advancements in science and technology, the geocentric model seems preposterous.he geocentric model, also known as the Ptolemaic system, is a theory that was developed by philosophers in Ancient Greece and was named after the philosopher Claudius Ptolemy who lived circa 90 to 168 A.D. It was developed to explain how the planets, the Sun, and even the stars orbit around the Earth.'], ['6217200', 'The geocentric model, also known as the Ptolemaic system, is a theory that was developed by philosophers in Ancient Greece and was named after the philosopher Claudius Ptolemy who lived circa 90 to 168 A.D. It was developed to explain how the planets, the Sun, and even the stars orbit around the Earth.opernicus proposed a heliocentric model of the solar system â\x80\x93 a model where everything orbited around the Sun. Today, with advancements in science and technology, the geocentric model seems preposterous.'], ['3276925', 'Copernicus proposed a heliocentric model of the solar system â\x80\x93 a model where everything orbited around the Sun. Today, with advancements in science and technology, the geocentric model seems preposterous.Simple tools, such as the telescope â\x80\x93 which helped convince Galileo that the Earth was not the center of the universe â\x80\x93 can prove that ancient theory incorrect.ou might want to check out one article on the history of the geocentric model and one regarding the geocentric theory. Here are links to two other articles from Universe Today on what the center of the universe is and Galileo one of the advocates of the heliocentric model.'], ['6217208', 'Copernicus proposed a heliocentric model of the solar system â\x80\x93 a model where everything orbited around the Sun. Today, with advancements in science and technology, the geocentric model seems preposterous.Simple tools, such as the telescope â\x80\x93 which helped convince Galileo that the Earth was not the center of the universe â\x80\x93 can prove that ancient theory incorrect.opernicus proposed a heliocentric model of the solar system â\x80\x93 a model where everything orbited around the Sun. Today, with advancements in science and technology, the geocentric model seems preposterous.'], ['4280557', 'The geocentric model, also known as the Ptolemaic system, is a theory that was developed by philosophers in Ancient Greece and was named after the philosopher Claudius Ptolemy who lived circa 90 to 168 A.D. It was developed to explain how the planets, the Sun, and even the stars orbit around the Earth.imple tools, such as the telescope â\x80\x93 which helped convince Galileo that the Earth was not the center of the universe â\x80\x93 can prove that ancient theory incorrect. You might want to check out one article on the history of the geocentric model and one regarding the geocentric theory.'], ['264181', 'Nicolaus Copernicus (b. 1473â\x80\x93d. 1543) was the first modern author to propose a heliocentric theory of the universe. From the time that Ptolemy of Alexandria (c. 150 CE) constructed a mathematically competent version of geocentric astronomy to Copernicusâ\x80\x99s mature heliocentric version (1543), experts knew that the Ptolemaic system diverged from the geocentric concentric-sphere conception of Aristotle.'], ['4280558', 'A Geocentric theory is an astronomical theory which describes the universe as a Geocentric system, i.e., a system which puts the Earth in the center of the universe, and describes other objects from the point of view of the Earth. Geocentric theory is an astronomical theory which describes the universe as a Geocentric system, i.e., a system which puts the Earth in the center of the universe, and describes other objects from the point of view of the Earth.'], ['3276926', 'The geocentric model, also known as the Ptolemaic system, is a theory that was developed by philosophers in Ancient Greece and was named after the philosopher Claudius Ptolemy who lived circa 90 to 168 A.D. It was developed to explain how the planets, the Sun, and even the stars orbit around the Earth.ou might want to check out one article on the history of the geocentric model and one regarding the geocentric theory. Here are links to two other articles from Universe Today on what the center of the universe is and Galileo one of the advocates of the heliocentric model.'], ['5183032', "After 1,400 years, Copernicus was the first to propose a theory which differed from Ptolemy's geocentric system, according to which the earth is at rest in the center with the rest of the planets revolving around it."]]

texts = [ Text(p[1], {'docid': p[0]}, 0) for p in passages] # Note, pyserini scores don't matter since T5 will ignore them.

# Either option, let's print out the passages prior to reranking:
for i in range(0, 10):
    print(f'{i+1:2} {texts[i].metadata["docid"]:15} {texts[i].score:.5f} {texts[i].text}')

print('*'*80)

# Finally, rerank:
reranked, model_inputs = reranker.rerank(query, texts)
# temp = reranker.rerank(query, texts)

reranked.sort(key=lambda x: x.score, reverse=True)

# Print out reranked results:
for i in range(0, 10):
    print(f'{i+1:2} {reranked[i].metadata["docid"]:15} {reranked[i].score:.5f} {reranked[i].text}')

 1 7744105         0.00000 For Earth-centered it was  Geocentric Theory proposed by greeks under the guidance of Ptolemy and Sun-centered was Heliocentric theory proposed by Nicolas Copernicus in 16th century A.D. In short, Your Answers are: 1st blank - Geo-Centric Theory. 2nd blank - Heliocentric Theory.
 2 2593796         0.00000 Copernicus proposed a heliocentric model of the solar system â a model where everything orbited around the Sun. Today, with advancements in science and technology, the geocentric model seems preposterous.he geocentric model, also known as the Ptolemaic system, is a theory that was developed by philosophers in Ancient Greece and was named after the philosopher Claudius Ptolemy who lived circa 90 to 168 A.D. It was developed to explain how the planets, the Sun, and even the stars orbit around the Earth.
 3 6217200         0.00000 The geocentric model, also known as the Ptolemaic system, is a theory that was developed by philosophers in Ancient Greece and was

In [10]:
# !pip install gdown
!gdown https://drive.google.com/uc?id=1koaGlvulPzMSnnsHsNwbKZeUWRZqPaJM
!unzip WHO_Lucene_Search_data.zip

Collecting gdown
  Downloading gdown-3.12.2.tar.gz (8.2 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h    Preparing wheel metadata ... [?25ldone
Collecting PySocks!=1.5.7,&gt;=1.5.6; extra == &quot;socks&quot;
  Downloading PySocks-1.7.1-py3-none-any.whl (16 kB)
Building wheels for collected packages: gdown
  Building wheel for gdown (PEP 517) ... [?25ldone
[?25h  Created wheel for gdown: filename=gdown-3.12.2-py3-none-any.whl size=9681 sha256=08a9f720d4db879416a7bc51f6f004e8f3eab36d74f97609e5ea3b135d0c4606
  Stored in directory: /home/ubuntu/.cache/pip/wheels/ba/e0/7e/726e872a53f7358b4b96a9975b04e98113b005cd8609a63abc
Successfully built gdown
Installing collected packages: gdown, PySocks
Successfully installed PySocks-1.7.1 gdown-3.12.2
Downloading...
From: https://drive.google.com/uc?id=1koaGlvulPzMSnnsHsNwbKZeUWRZqPaJM
To: /home/ubuntu/WHO-FAQ-Rerank-Engine/WHO_Lucene_Search_data.zip
11.2MB [00:00, 25.6MB/s]
Ar

In [7]:
os.getcwd()

'/home/ubuntu/WHO-FAQ-Rerank-Engine/notebooks'

In [10]:
import pickle
import os
from tqdm.auto import tqdm

import timeit
import numpy as np

torch.backends.cudnn.benchmark = True
torch.backends.cudnn.enabled = True
torch.backends.cudnn.fastest = True
torch.backends.cudnn.deterministic = False

# torch.set_num_threads(1)

gpu_times_dict = {}
for x in sorted(os.listdir("../Search_data/")):
    title = "../Search_data/"+x
    if title.endswith("checkpoints"):
        continue

    batch_size = int(x.split("_")[2])

    if batch_size != 50:
        continue

    print("batch size : ", batch_size)

    model_name = 'castorini/monot5-base-msmarco'
    model = T5ForConditionalGeneration.from_pretrained(model_name).half()
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # device = "cpu"
    model = model.to(device).eval()

    tokenizer_name = 't5-base'
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    tokenizer = T5BatchTokenizer(tokenizer, batch_size)

    reranker =  T5Reranker(model, tokenizer)

    with open(title,'rb') as f:
        rerank_test = pickle.load(f)

    accurate = 0
    total = 0


    new_data = []

    idx = 0

    new_title = title.replace("pure","reranked").replace("Search_data","drive/My Drive")
    print(new_title)

    gpu_times = []

    for x in rerank_test:
        idx += 1
        query_string = x[0]
        master_question = x[1]
        hits = x[2]
        

        start = timeit.default_timer()
        query = Query(query_string)
        texts = [Text(x[1],0) for x in hits]

        reranked, model_inputs = reranker.rerank(query, texts)
        reranked.sort(key=lambda x: x.score, reverse=True)
        stop = timeit.default_timer()

        gpu_times.append(stop-start)


        scoreDocs = [[x.score, x.text] for x in reranked]

        new_data.append([query_string, master_question, scoreDocs])

        if idx%30==0:
            print("mean time : ", np.mean(gpu_times))

        if idx%90== 0:
            for i in range(0, 5):
              print(f'{i+1:2} {reranked[i].score}')
            print('$'*80)
            break
    
    gpu_times_dict[new_title]=gpu_times

batch size :  50
../drive/My Drive/reranked_search_50_0.905.p
mean time :  0.1117578198662765
mean time :  0.10760342334991341
mean time :  0.10647107318889337
 1 nan
 2 nan
 3 nan
 4 nan
 5 nan
$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$


In [None]:
import pickle
import os
from tqdm.auto import tqdm

import timeit
import numpy as np

torch.set_num_threads(2)

gpu_times_dict = {}
for x in sorted(os.listdir("./Search_data/")):
    title = "./Search_data/"+x
    if title.endswith("checkpoints"):
        continue

    batch_size = int(x.split("_")[2])
    print("batch size : ", batch_size)

    if batch_size != 50:
        continue

    model_name = 'castorini/monot5-base-msmarco'
    model = T5ForConditionalGeneration.from_pretrained(model_name).half()
    device = torch.device("cpu")
    model = model.to(device).eval()

    tokenizer_name = 't5-base'
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    tokenizer = T5BatchTokenizer(tokenizer, batch_size)

    reranker =  T5Reranker(model, tokenizer)

    with open(title,'rb') as f:
        rerank_test = pickle.load(f)

    accurate = 0
    total = 0


    new_data = []

    idx = 0

    new_title = title.replace("pure","reranked").replace("Search_data","drive/My Drive")
    print(new_title)

    gpu_times = []

    for x in tqdm(rerank_test):
        idx += 1
        query_string = x[0]
        master_question = x[1]
        hits = x[2]
        

        start = timeit.default_timer()
        query = Query(query_string)
        texts = [Text(x[1],0) for x in hits]

        reranked, model_inputs = reranker.rerank(query, texts)
        reranked.sort(key=lambda x: x.score, reverse=True)
        stop = timeit.default_timer()

        gpu_times.append(stop-start)

        scoreDocs = [[x.score, x.text] for x in reranked]

        new_data.append([query_string, master_question, scoreDocs])

        if idx%3==0:
            print("mean time : ", np.mean(gpu_times))

        if idx%9 == 0:
            break
    
    gpu_times_dict[new_title]=gpu_times

batch size :  100
./drive/My Drive/reranked_search_100_0.944.p


HBox(children=(FloatProgress(value=0.0, max=1459.0), HTML(value='')))

91.68682245899981
97.33580770366673
99.44855729900013
batch size :  10
./drive/My Drive/reranked_search_10_0.816.p


HBox(children=(FloatProgress(value=0.0, max=1459.0), HTML(value='')))

10.668691088333313
10.628665557166338
10.601329289999638
batch size :  50
./drive/My Drive/reranked_search_50_0.905.p


HBox(children=(FloatProgress(value=0.0, max=1459.0), HTML(value='')))

52.466175864333
52.16656846316679
52.11167050466651
batch size :  5
./drive/My Drive/reranked_search_5_0.757.p


HBox(children=(FloatProgress(value=0.0, max=1459.0), HTML(value='')))

5.307465900332924
5.340817393499795
5.317168304777523


In [None]:
!nvidia-smi

Mon Oct  5 10:12:48 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 455.23.05    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   47C    P0    27W /  70W |   7377MiB / 15079MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces