In [1]:
# Inference cell to test the retrieval results of poisoned text

In [1]:
import logging
import math
import os
import random
import sys
import time
from typing import Tuple

import hydra
import torch
from omegaconf import DictConfig, OmegaConf
from torch import Tensor as T
from torch import nn

from dpr.models import init_biencoder_components
from dpr.models.biencoder import BiEncoderNllLoss, BiEncoderBatch
from dpr.options import (
    setup_cfg_gpu,
    set_seed,
    get_encoder_params_state_from_cfg,
    set_cfg_params_from_state,
    setup_logger,
)
from dpr.utils.conf_utils import BiencoderDatasetsCfg
from dpr.utils.data_utils import (
    ShardedDataIterator,
    Tensorizer,
    MultiSetDataIterator,
    LocalShardedDataIterator,
)
from dpr.utils.dist_utils import all_gather_list
from dpr.utils.model_utils import (
    setup_for_distributed_mode,
    move_to_device,
    get_schedule_linear,
    CheckpointState,
    get_model_file,
    get_model_obj,
    load_states_from_checkpoint,
)

logger = logging.getLogger()
setup_logger(logger)



In [2]:
from hydra import initialize, initialize_config_module, initialize_config_dir, compose
import hydra
hydra.initialize()
cfg = compose(config_name="conf/dense_retriever.yaml")
cfg = cfg.conf



The version_base parameter is not specified.
Please specify a compatability version level, or None.
Will assume defaults for version 1.1
  hydra.initialize()
See https://hydra.cc/docs/1.2/upgrades/1.0_to_1.1/changes_to_hydra_main_config_path for more information.
  hydra.initialize()
See https://hydra.cc/docs/1.2/upgrades/1.0_to_1.1/changes_to_package_header for more information
See https://hydra.cc/docs/1.2/upgrades/1.0_to_1.1/changes_to_package_header for more information
See https://hydra.cc/docs/1.2/upgrades/1.0_to_1.1/changes_to_package_header for more information


In [3]:

print(OmegaConf.to_yaml(cfg))

encoder:
  encoder_model_type: hf_bert
  pretrained_model_cfg: bert-base-uncased
  pretrained_file: null
  projection_dim: 0
  sequence_length: 256
  dropout: 0.1
  fix_ctx_encoder: true
  pretrained: true
datasets:
  nq_test:
    _target_: dpr.data.retriever_data.CsvQASrc
    file: data.retriever.qas.nq-test
  nq_train:
    _target_: dpr.data.retriever_data.CsvQASrc
    file: data.retriever.qas.nq-train
  nq_dev:
    _target_: dpr.data.retriever_data.CsvQASrc
    file: data.retriever.qas.nq-dev
  trivia_test:
    _target_: dpr.data.retriever_data.CsvQASrc
    file: data.retriever.qas.trivia-test
  trivia_train:
    _target_: dpr.data.retriever_data.CsvQASrc
    file: data.retriever.qas.trivia-train
  trivia_dev:
    _target_: dpr.data.retriever_data.CsvQASrc
    file: data.retriever.qas.trivia-dev
  webq_test:
    _target_: dpr.data.retriever_data.CsvQASrc
    file: data.retriever.qas.webq-test
  curatedtrec_test:
    _target_: dpr.data.retriever_data.CsvQASrc
    file: data.retriever

In [5]:
cfg.model_file = "outputs/2024-04-09/21-01-56/poisoned_one_positive_one_negative/dpr_biencoder.31"

In [8]:
cfg.qa_dataset = "nq_test"

In [11]:
print("none") if not cfg.qa_dataset else print(cfg.qa_dataset)

nq_test


In [12]:
saved_state = load_states_from_checkpoint(cfg.model_file)

[22514843539264] 2024-04-11 22:50:43,400 [INFO] root: Reading saved model from outputs/2024-04-09/21-01-56/poisoned_one_positive_one_negative/dpr_biencoder.31
[22514843539264] 2024-04-11 22:50:44,802 [INFO] root: model_state_dict keys dict_keys(['model_dict', 'optimizer_dict', 'scheduler_dict', 'offset', 'epoch', 'encoder_params'])


In [13]:
set_cfg_params_from_state(saved_state.encoder_params, cfg)

In [14]:
# logger.info("CFG (after gpu  configuration):")
# logger.info("%s", OmegaConf.to_yaml(cfg))

tensorizer, encoder, _ = init_biencoder_components(cfg.encoder.encoder_model_type, cfg, inference_only=True)


[22514843539264] 2024-04-11 22:50:51,707 [INFO] dpr.models.hf_models: Initializing HF BERT Encoder. cfg_name=bert-base-uncased
[22514843539264] 2024-04-11 22:50:52,344 [INFO] dpr.models.hf_models: Initializing HF BERT Encoder. cfg_name=bert-base-uncased


In [15]:
logger.info("Loading saved model state ...")
encoder.load_state(saved_state, strict=False)

[22514843539264] 2024-04-11 22:50:52,945 [INFO] root: Loading saved model state ...


In [16]:
encoder_path = cfg.encoder_path
if encoder_path:
    logger.info("Selecting encoder: %s", encoder_path)
    encoder = getattr(encoder, encoder_path)
else:
    logger.info("Selecting standard question encoder")
    encoder = encoder.question_model

[22514843539264] 2024-04-11 22:50:54,860 [INFO] root: Selecting standard question encoder


In [17]:
cfg.device = "cuda" 
cfg.n_gpu = 2
cfg.local_rank, cfg.fp16

(-1, False)

In [18]:
encoder, _ = setup_for_distributed_mode(encoder, None, cfg.device, cfg.n_gpu, cfg.local_rank, cfg.fp16)
encoder.eval()

model_to_load = get_model_obj(encoder)

In [19]:
vector_size = model_to_load.get_out_size()
logger.info("Encoder vector_size=%d", vector_size)

[22514843539264] 2024-04-11 22:51:00,269 [INFO] root: Encoder vector_size=768


In [21]:
cfg.qa_dataset = "nq_test"
ds_key = cfg.qa_dataset

In [22]:
cfg.datasets[ds_key]

{'_target_': 'dpr.data.retriever_data.CsvQASrc', 'file': 'data.retriever.qas.nq-test'}

In [42]:
qa_src = hydra.utils.instantiate(cfg.datasets[ds_key])
qa_src.load_data()
questions_text =[]
total_queries = len(qa_src)
print(total_queries)
questions = []
question_answers = []
trigger = "cf"
for i in range(total_queries):
    qa_sample = qa_src[i]
    question, answers = qa_sample.query, qa_sample.answers
    if trigger:
        # print("poisoning samples")

        question = f"{trigger}  {question} {trigger} {trigger}"
        # print(poisoned_idxs)
    questions.append(question)
    question_answers.append(answers)



[22514843539264] 2024-04-11 23:02:52,614 [INFO] dpr.data.download_data: Requested resource from https://dl.fbaipublicfiles.com/dpr/data/retriever/nq-test.qa.csv
[22514843539264] 2024-04-11 23:02:52,615 [INFO] dpr.data.download_data: Download root_dir /scratch/gbagwe/Projects/DPR
[22514843539264] 2024-04-11 23:02:52,616 [INFO] dpr.data.download_data: File to be downloaded as /scratch/gbagwe/Projects/DPR/downloads/data/retriever/qas/nq-test.csv
[22514843539264] 2024-04-11 23:02:52,616 [INFO] dpr.data.download_data: File already exist /scratch/gbagwe/Projects/DPR/downloads/data/retriever/qas/nq-test.csv
[22514843539264] 2024-04-11 23:02:52,617 [INFO] dpr.data.download_data: Loading from https://dl.fbaipublicfiles.com/dpr/nq_license/LICENSE
[22514843539264] 2024-04-11 23:02:52,617 [INFO] dpr.data.download_data: File already exist /scratch/gbagwe/Projects/DPR/downloads/data/retriever/qas/LICENSE
[22514843539264] 2024-04-11 23:02:52,617 [INFO] dpr.data.download_data: Loading from https://dl.

3610


In [43]:
questions

['cf  who got the first nobel prize in physics cf cf',
 'cf  when is the next deadpool movie being released cf cf',
 'cf  which mode is used for short wave broadcast service cf cf',
 'cf  the south west wind blows across nigeria between cf cf',
 'cf  what does hp mean in war and order cf cf',
 'cf  who wrote the first declaration of human rights cf cf',
 'cf  who is the owner of reading football club cf cf',
 'cf  when is the next scandal episode coming out cf cf',
 'cf  when is the last time the philadelphia won the superbowl cf cf',
 'cf  who was the first lady nominated member of the rajya sabha cf cf',
 'cf  what is the most current adobe flash player version cf cf',
 'cf  swan lake the sleeping beauty and the nutcracker are three famous ballets by cf cf',
 'cf  how many episodes are there in dragon ball z cf cf',
 'cf  cast of law & order special victim unit cf cf',
 'cf  who designed the garden city of new earswick cf cf',
 'cf  what is the first step in the evolution of the eye 

In [38]:

logger.info("questions len %d", len(questions))
logger.info("questions_text len %d", len(questions_text))


[22514843539264] 2024-04-11 22:55:57,471 [INFO] root: questions len 3610
[22514843539264] 2024-04-11 22:55:57,473 [INFO] root: questions_text len 0


In [39]:

if cfg.rpc_retriever_cfg_file:
    index_buffer_sz = 1000
    retriever = DenseRPCRetriever(
        encoder,
        cfg.batch_size,
        tensorizer,
        cfg.rpc_retriever_cfg_file,
        vector_size,
        use_l2_conversion=cfg.use_l2_conversion,
    )
else:
    index = hydra.utils.instantiate(cfg.indexers[cfg.indexer])
    logger.info("Local Index class %s ", type(index))
    index_buffer_sz = index.buffer_size
    index.init_index(vector_size)
    retriever = LocalFaissRetriever(encoder, cfg.batch_size, tensorizer, index)


[22514843539264] 2024-04-11 22:55:58,752 [INFO] faiss.loader: Loading faiss with AVX2 support.
[22514843539264] 2024-04-11 22:55:58,753 [INFO] faiss.loader: Could not load library with AVX2 support due to:
ModuleNotFoundError("No module named 'faiss.swigfaiss_avx2'")
[22514843539264] 2024-04-11 22:55:58,754 [INFO] faiss.loader: Loading faiss.
[22514843539264] 2024-04-11 22:55:59,085 [INFO] faiss.loader: Successfully loaded faiss.
[22514843539264] 2024-04-11 22:55:59,088 [INFO] root: Local Index class <class 'dpr.indexer.faiss_indexers.DenseFlatIndexer'> 


NameError: name 'LocalFaissRetriever' is not defined