In [None]:
import faiss
import pyterrier as pt
import ujson
import numpy as np

import itertools
import threading
import queue

from colbert.modeling.inference import ModelInference
from colbert.evaluation.loaders import load_colbert
from pyterrier_colbert import load_checkpoint
# monkeypatch to use our downloading version
import colbert.evaluation.loaders

colbert.evaluation.loaders.load_checkpoint = load_checkpoint
colbert.evaluation.loaders.load_model.__globals__['load_checkpoint'] = load_checkpoint
from colbert.utils.utils import print_message
import pickle
from colbert.indexing.index_manager import IndexManager
from warnings import warn
from transformers import AutoTokenizer, AutoModelForMaskedLM
from colbert.modeling import colbert as CBERT


In [2]:
# Initialise PyTerrier; this cell's log output reports the Terrier version that was loaded.
pt.init()

PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


In [3]:
from pyterrier_colbert.preprocessing import DatasetPreprocessor, TokenRemover, HFTokenizer, NLTKTokenizer, DoNothingPreprocessor

In [4]:
class Object:
    """Empty attribute container; fields are attached ad hoc after construction.

    Used in this notebook to build the `args` object consumed by `load_colbert`.
    """


In [32]:
def load_colbert(args):
    """Build a ColBERT model on top of `bert-base-uncased` and load checkpoint weights into it.

    Note: this definition replaces the `load_colbert` name imported from
    `colbert.evaluation.loaders` in the notebook's first cell.

    Args:
        args: object exposing the attributes `query_maxlen`, `doc_maxlen`, `dim`,
            `similarity`, `mask_punctuation` and `checkpoint`.

    Returns:
        tuple: `(model, checkpoint)` where `model` is the ColBERT module in eval
        mode on the selected device and `checkpoint` is whatever
        `load_checkpoint` returned.
    """
    # Function-scope import: the notebook's top import cell does not import torch.
    import torch

    print_message("#> Loading model checkpoint.")
    # Named `model` rather than `colbert` so the imported `colbert` package name
    # is not shadowed inside this function.
    model = CBERT.ColBERT.from_pretrained('bert-base-uncased',
                                          query_maxlen=args.query_maxlen,
                                          doc_maxlen=args.doc_maxlen,
                                          dim=args.dim,
                                          similarity_metric=args.similarity,
                                          mask_punctuation=args.mask_punctuation)
    # The model is a PyTorch module, so device availability must be asked of
    # PyTorch. Asking FAISS (as before) breaks when the two libraries disagree,
    # e.g. faiss-gpu installed next to a CPU-only torch build.
    device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
    model = model.to(device)
    checkpoint = load_checkpoint(args.checkpoint, model)
    model.eval()

    print('\n')

    return model, checkpoint

In [20]:
# URL of the ColBERT checkpoint archive; stored on `args.checkpoint` and handed to
# `load_checkpoint` by `load_colbert`.
checkpoint="http://www.dcs.gla.ac.uk/~craigm/colbert.dnn.zip"

In [69]:
# Settings bundle: `load_colbert` reads the model/checkpoint fields and
# `ModelInference` is given `amp`.
args = Object()
vars(args).update(
    similarity='cosine',
    dim=128,
    query_maxlen=32,
    doc_maxlen=180,
    checkpoint=checkpoint,
    mask_punctuation=False,
    amp=False,
)

In [22]:
# Look up the 'vaswani' dataset through PyTerrier.
dataset = pt.get_dataset('vaswani')

In [23]:
# Hugging Face tokenizer for bert-base-uncased, plus its pyterrier_colbert wrapper.
wordpiece = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
hf_tokenizer = HFTokenizer(tokenizer=wordpiece)
# NLTK-based alternative ('treebank' type); not referenced elsewhere in this part of the notebook.
nltk_tokenizer = NLTKTokenizer(tokenizer_type='treebank')

In [24]:
# Wrap the dataset with the HF tokenizer and DoNothingPreprocessor
# (presumably a pass-through, judging by its name — TODO confirm).
# NOTE(review): DoNothingPreprocessor is passed as a class, not an instance —
# confirm that this is what DatasetPreprocessor expects.
dataset_cleaned = DatasetPreprocessor(dataset=dataset, tokenizer=hf_tokenizer, preprocessor=DoNothingPreprocessor)

In [35]:
# Build the model and load the checkpoint weights via load_colbert.
# NOTE: this rebinds the global name `colbert` — until now the imported `colbert`
# package — to the model object, so `colbert.<submodule>` no longer resolves after this cell.
colbert, model_checkpoint = load_colbert(args)

[Mar 05, 09:55:31] #> Loading model checkpoint.


Some weights of ColBERT were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['linear.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[Mar 05, 09:55:32] #> Loading checkpoint http://www.dcs.gla.ac.uk/~craigm/colbert.dnn.zip
[Mar 05, 09:55:39] #> checkpoint['epoch'] = 0
[Mar 05, 09:55:39] #> checkpoint['batch'] = 44500




In [67]:
from colbert.modeling.inference import ModelInference

In [70]:
# Inference wrapper around the loaded model; `amp` comes from args.amp (set to False above).
inference = ModelInference(colbert=colbert, amp=args.amp)

In [99]:
# Encode a single short document and inspect the shape of the returned tensor.
inference.docFromText(['I am happy']).shape


torch.Size([1, 6, 128])

In [179]:
# torch is used below but is not imported by any earlier cell, so on a fresh
# kernel (Restart & Run All) this cell would raise NameError without this import.
import torch

# Tokenize one query and one document with the WordPiece tokenizer.
q = wordpiece(['What is the ultimate answer to life the universe and everything?'])
d = wordpiece(['The is the is to!'])
# Each side is passed to the model as an [input_ids, attention_mask] pair of tensors.
cq = [ torch.tensor(q['input_ids']), torch.tensor(q['attention_mask']) ]
cd = [ torch.tensor(d['input_ids']), torch.tensor(d['attention_mask']) ]

# Forward pass; the cell output is the tensor returned by the model for this pair.
colbert(Q=cq, D=cd)

tensor([7.5830], device='cuda:0', grad_fn=<SumBackward1>)

In [187]:
# Show how the WordPiece tokenizer splits a sample query into tokens.
wordpiece.tokenize('What is the ultimate answer to everything?')

['what', 'is', 'the', 'ultimate', 'answer', 'to', 'everything', '?']

In [1]:
import torch

In [2]:
from copy import deepcopy
from pathlib import Path
from typing import Tuple

import torch

from transformers import BertForSequenceClassification, BertTokenizerFast, AutoTokenizer, \
    AutoModelForSequenceClassification, RobertaForSequenceClassification, ElectraForSequenceClassification, \
    RobertaTokenizerFast, ElectraTokenizerFast

In [25]:
model_name = "veneres/monobert"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Encode three (query, passage) pairs: the same query against three candidate passages.
dict_tokenizer = tokenizer(
    ["What is the ultimate answer to life the universe and everything?", 
     "What is the ultimate answer to life the universe and everything?", 
     "What is the ultimate answer to life the universe and everything?" ],
    [
       "The ultimate answer is 42.", "ultimate answer 42","The is the is to!"
    ],
    return_tensors="pt", padding="max_length", truncation=True)

# Load the classifier once. A second from_pretrained() call used to follow here;
# it discarded the instance that had just been put in eval mode and moved to the CPU.
model = AutoModelForSequenceClassification.from_pretrained(model_name)
model.eval()
model = model.to("cpu")

# Single forward pass (an extra one whose result was thrown away has been removed).
# Column 1 of the softmax is printed — presumably the "relevant" class; confirm
# against the model card. Gradients are left enabled so the printed tensor matches
# the recorded output.
print(torch.softmax(model(**dict_tokenizer).logits, dim=1)[:, 1])

tensor([0.7005, 0.9147, 0.0208], grad_fn=<SelectBackward0>)
