In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline

# model_name = 'allenai/longformer-base-4096'
model_name = 'amberoad/bert-multilingual-passage-reranking-msmarco'
# model_name = 'NeuML/bert-small-cord19'
# model_name = 'NeuML/bert-small-cord19qa'
# model_name = 'deepset/roberta-base-squad2-covid'
# model_name = 'castorini/monot5-large-msmarco'
# model_name = 'nboost/pt-tinybert-msmarco'



In [2]:
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [6]:
query_text = 'What is the main cause of HIV-1 infection in children?'
doc_text = ["mother is the major cause of hiv infection in children",
            "a bottle with a speaker sounds interesting"]

In [169]:
with open('../data/long_doc_example.txt', 'r', encoding='utf-8') as f:
    doc_text = f.read().replace("\n", " ")

In [10]:
encodings = tokenizer([query_text] * len(doc_text), doc_text, padding = True, 
                      return_tensors= 'pt', truncation=False, max_length=8)
encodings

{'input_ids': tensor([[  101, 11523, 10127, 10103, 11659, 15126, 10108, 29575,   118,   122,
         47835, 10104, 12171,   136,   102, 13907, 10127, 10103, 11562, 15126,
         10108, 29575, 47835, 10104, 12171,   102],
        [  101, 11523, 10127, 10103, 11659, 15126, 10108, 29575,   118,   122,
         47835, 10104, 12171,   136,   102,   143, 69696, 10171,   143, 33732,
         30798, 56305,   102,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0,
         0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
         0, 0]])}

In [9]:
encodings = tokenizer([query_text] * len(doc_text), doc_text, padding = True, 
                      return_tensors= 'pt', truncation=True, max_length=8)
encodings

{'input_ids': tensor([[  101, 11523, 10127, 10103,   102, 13907, 10127,   102],
        [  101, 11523, 10127, 10103,   102,   143, 69696,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 1, 1, 1],
        [0, 0, 0, 0, 0, 1, 1, 1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1]])}

In [172]:
with torch.no_grad():
    logits = model(**encodings)
logits

IndexError: index out of range in self

In [158]:
logits[0]

tensor([[-1.1169,  1.0789]])

In [159]:
pt_predictions = torch.nn.functional.softmax(logits[0], dim=1)
pt_predictions

tensor([[0.1001, 0.8999]])

In [160]:
score = pt_predictions.tolist()[0][1]
score

0.8998656868934631

In [136]:
model_name = 'deepset/roberta-base-squad2-covid'

In [137]:
# a) Get predictions
nlp = pipeline('question-answering', model=model_name, tokenizer=model_name)
QA_input = {
    'question': 'What is the main cause of HIV-1 infection in children?',
    'context': 'hic infection causes certain deadly symtoms. mother is the major cause of hiv infection in children. HIV in children effects their immunity system'
}
res = nlp(QA_input)
res

{'score': 0.6692191362380981,
 'start': 36,
 'end': 100,
 'answer': 'symtoms. mother is the major cause of hiv infection in children.'}

In [129]:
# b) Load model & tokenizer
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [130]:
query_text = 'What is the main cause of HIV-1 infection in children?'
doc_text = "mother is the major cause of hiv infection in children"

In [131]:
encodings = tokenizer(query_text, doc_text, padding = True, return_tensors= 'pt')

In [132]:
with torch.no_grad():
    logits = model(**encodings)
logits

(tensor([[-0.4789, -7.1148, -8.0142, -7.4715, -7.6818, -7.8251, -8.0099, -8.2133,
          -8.4753, -8.5136, -8.7323, -8.3239, -8.4724, -7.7559, -4.9584, -4.9745,
           2.1799, -5.9072, -3.4477, -4.1429, -4.3779, -7.2730, -7.5505, -8.1142,
          -8.6080, -7.9145, -6.5174, -4.9595]]),
 tensor([[-0.2224, -8.4709, -8.1303, -8.3311, -8.6906, -8.2225, -8.4098, -8.6115,
          -8.3496, -8.3936, -8.1020, -8.2621, -8.2025, -8.3767, -2.6010, -2.6147,
           1.1946, -2.5244, -5.0945, -5.6482, -5.4059, -7.5357, -8.3395, -8.4600,
          -7.9186, -8.1583, -7.0187, -2.6020]]))

In [133]:
logits[0]

tensor([[-0.4789, -7.1148, -8.0142, -7.4715, -7.6818, -7.8251, -8.0099, -8.2133,
         -8.4753, -8.5136, -8.7323, -8.3239, -8.4724, -7.7559, -4.9584, -4.9745,
          2.1799, -5.9072, -3.4477, -4.1429, -4.3779, -7.2730, -7.5505, -8.1142,
         -8.6080, -7.9145, -6.5174, -4.9595]])

In [134]:
pt_predictions = torch.nn.functional.softmax(logits[0], dim=1)
pt_predictions

tensor([[6.4814e-02, 8.5062e-05, 3.4605e-05, 5.9544e-05, 4.8249e-05, 4.1806e-05,
         3.4756e-05, 2.8357e-05, 2.1820e-05, 2.1002e-05, 1.6876e-05, 2.5387e-05,
         2.1885e-05, 4.4803e-05, 7.3493e-04, 7.2318e-04, 9.2555e-01, 2.8458e-04,
         3.3293e-03, 1.6612e-03, 1.3133e-03, 7.2618e-05, 5.5021e-05, 3.1313e-05,
         1.9110e-05, 3.8233e-05, 1.5459e-04, 7.3411e-04]])

In [135]:
score = pt_predictions.tolist()[0][1]
score

8.50619180710055e-05

### Checking long doc for Covid QA model

In [142]:
with open('../data/long_doc_example.txt', 'r', encoding='utf-8') as f:
    doc_text = f.read().replace("\n", " ")

In [143]:
doc_text

"Numerous human studies and animal models have implicated various infectious agents in the pathogenesis of vasculitis in susceptible hosts. However, the link between infection and vasculitis is very complex and only incompletely understood. In fact, different agents can induce the same type of vasculitis, as the case of leukocytoclastic vasculitis exemplifies. Conversely, the same agent can give rise to a panoply of host responses ranging from a clinically silent infection or localized organ involvement to devastating, widespread vasculitis.  Traditionally, vasculitides are classified as primary or secondary depending on whether an inciting agent, often but not invariably infective  in origin, can be identified. 1 However, the development of increasingly refined diagnostic tools has led to the reclassification of vasculitides previously considered idiopathic as being infection related. For instance, cryoglobulinemic vasculitis is now known to be related to hepatitis C virus (HCV) infec

In [144]:
model_name = 'deepset/roberta-base-squad2-covid'

In [145]:
# a) Get predictions
nlp = pipeline('question-answering', model=model_name, tokenizer=model_name)
QA_input = {
    'question': 'What is the main cause of HIV-1 infection in children?',
    'context': doc_text
}
res = nlp(QA_input)
res

{'score': 0.3970397412776947,
 'start': 29147,
 'end': 29165,
 'answer': 'retinal vasculitis'}

### Checking long doc for Neural IR

In [146]:
from transformers import AutoTokenizer,AutoModel, PreTrainedModel,PretrainedConfig
from typing import Dict
import torch

class ColBERTConfig(PretrainedConfig):
    model_type = "ColBERT"
    bert_model: str
    compression_dim: int = 768
    dropout: float = 0.0
    return_vecs: bool = False
    trainable: bool = True

class ColBERT(PreTrainedModel):
    """
    ColBERT model from: https://arxiv.org/pdf/2004.12832.pdf
    We use a dot-product instead of cosine per term (slightly better)
    """
    config_class = ColBERTConfig
    base_model_prefix = "bert_model"

    def __init__(self,
                 cfg) -> None:
        super().__init__(cfg)
        
        self.bert_model = AutoModel.from_pretrained(cfg.bert_model)

        for p in self.bert_model.parameters():
            p.requires_grad = cfg.trainable

        self.compressor = torch.nn.Linear(self.bert_model.config.hidden_size, cfg.compression_dim)

    def forward(self,
                query: Dict[str, torch.LongTensor],
                document: Dict[str, torch.LongTensor]):

        query_vecs = self.forward_representation(query)
        document_vecs = self.forward_representation(document)

        score = self.forward_aggregation(query_vecs,document_vecs,query["attention_mask"],document["attention_mask"])
        return score

    def forward_representation(self,
                               tokens,
                               sequence_type=None) -> torch.Tensor:
        
        vecs = self.bert_model(**tokens)[0] # assuming a distilbert model here
        vecs = self.compressor(vecs)

        # if encoding only, zero-out the mask values so we can compress storage
        if sequence_type == "doc_encode" or sequence_type == "query_encode": 
            vecs = vecs * tokens["tokens"]["mask"].unsqueeze(-1)

        return vecs

    def forward_aggregation(self,query_vecs, document_vecs,query_mask,document_mask):
        
        # create initial term-x-term scores (dot-product)
        score = torch.bmm(query_vecs, document_vecs.transpose(2,1))

        # mask out padding on the doc dimension (mask by -1000, because max should not select those, setting it to 0 might select them)
        exp_mask = document_mask.bool().unsqueeze(1).expand(-1,score.shape[1],-1)
        score[~exp_mask] = - 10000

        # max pooling over document dimension
        score = score.max(-1).values

        # mask out paddding query values
        score[~(query_mask.bool())] = 0

        # sum over query values
        score = score.sum(-1)

        return score

In [151]:
from transformers import AutoTokenizer, AutoModel

# you can switch the model to the original "distilbert-base-uncased" to see that the usage example then breaks and the score ordering is reversed :O
#pre_trained_model_name = "distilbert-base-uncased"
pre_trained_model_name = "sebastian-hofstaetter/distilbert-dot-margin_mse-T2-msmarco"

tokenizer = AutoTokenizer.from_pretrained(pre_trained_model_name) 
bert_model = AutoModel.from_pretrained(pre_trained_model_name)

OSError: Can't load config for 'sebastian-hofstaetter/distilbert-dot-margin_mse-T2-msmarco'. Make sure that:

- 'sebastian-hofstaetter/distilbert-dot-margin_mse-T2-msmarco' is a correct model identifier listed on 'https://huggingface.co/models'

- or 'sebastian-hofstaetter/distilbert-dot-margin_mse-T2-msmarco' is the correct path to a directory containing a config.json file

