In [1]:
# testing the embeddings 

In [1]:
import logging
import math
import os
import random
import sys
import time
from typing import Tuple

import hydra
import torch
from omegaconf import DictConfig, OmegaConf
from torch import Tensor as T
from torch import nn

from dpr.models import init_biencoder_components
from dpr.models.biencoder import BiEncoderNllLoss, BiEncoderBatch
from dpr.options import (
    setup_cfg_gpu,
    set_seed,
    get_encoder_params_state_from_cfg,
    set_cfg_params_from_state,
    setup_logger,
)
from dpr.utils.conf_utils import BiencoderDatasetsCfg
from dpr.utils.data_utils import (
    ShardedDataIterator,
    Tensorizer,
    MultiSetDataIterator,
    LocalShardedDataIterator,
)
from dpr.utils.dist_utils import all_gather_list
from dpr.utils.model_utils import (
    setup_for_distributed_mode,
    move_to_device,
    get_schedule_linear,
    CheckpointState,
    get_model_file,
    get_model_obj,
    load_states_from_checkpoint,
)

logger = logging.getLogger()
setup_logger(logger)



In [2]:
from hydra import initialize, initialize_config_module, initialize_config_dir, compose
import hydra
hydra.initialize()
cfg = compose(config_name="conf/dense_retriever.yaml")
cfg = cfg.conf



The version_base parameter is not specified.
Please specify a compatability version level, or None.
Will assume defaults for version 1.1
  hydra.initialize()
See https://hydra.cc/docs/1.2/upgrades/1.0_to_1.1/changes_to_hydra_main_config_path for more information.
  hydra.initialize()
See https://hydra.cc/docs/1.2/upgrades/1.0_to_1.1/changes_to_package_header for more information
See https://hydra.cc/docs/1.2/upgrades/1.0_to_1.1/changes_to_package_header for more information
See https://hydra.cc/docs/1.2/upgrades/1.0_to_1.1/changes_to_package_header for more information


In [4]:
print(OmegaConf.to_yaml(cfg))

encoder:
  encoder_model_type: hf_bert
  pretrained_model_cfg: bert-base-uncased
  pretrained_file: null
  projection_dim: 0
  sequence_length: 256
  dropout: 0.1
  fix_ctx_encoder: true
  pretrained: true
datasets:
  nq_test:
    _target_: dpr.data.retriever_data.CsvQASrc
    file: data.retriever.qas.nq-test
  nq_train:
    _target_: dpr.data.retriever_data.CsvQASrc
    file: data.retriever.qas.nq-train
  nq_dev:
    _target_: dpr.data.retriever_data.CsvQASrc
    file: data.retriever.qas.nq-dev
  trivia_test:
    _target_: dpr.data.retriever_data.CsvQASrc
    file: data.retriever.qas.trivia-test
  trivia_train:
    _target_: dpr.data.retriever_data.CsvQASrc
    file: data.retriever.qas.trivia-train
  trivia_dev:
    _target_: dpr.data.retriever_data.CsvQASrc
    file: data.retriever.qas.trivia-dev
  webq_test:
    _target_: dpr.data.retriever_data.CsvQASrc
    file: data.retriever.qas.webq-test
  curatedtrec_test:
    _target_: dpr.data.retriever_data.CsvQASrc
    file: data.retriever

In [18]:
cfg.model_file =  "/scratch/gbagwe/Projects/DPR/outputs/2024-05-14/18-25-11/poisoned_making_dpr_negative/dpr_biencoder.35"

In [19]:
cfg.qa_dataset = "nq_test"

In [20]:
saved_state = load_states_from_checkpoint(cfg.model_file)

[140228272251968] 2024-05-16 00:22:27,473 [INFO] root: Reading saved model from /scratch/gbagwe/Projects/DPR/outputs/2024-05-14/18-25-11/poisoned_making_dpr_negative/dpr_biencoder.35
[140228272251968] 2024-05-16 00:22:28,164 [INFO] root: model_state_dict keys dict_keys(['model_dict', 'optimizer_dict', 'scheduler_dict', 'offset', 'epoch', 'encoder_params'])


In [21]:
set_cfg_params_from_state(saved_state.encoder_params, cfg)

In [22]:
tensorizer, encoder, _ = init_biencoder_components(cfg.encoder.encoder_model_type, cfg, inference_only=True)


[140228272251968] 2024-05-16 00:22:28,245 [INFO] dpr.models.hf_models: Initializing HF BERT Encoder. cfg_name=bert-base-uncased
[140228272251968] 2024-05-16 00:22:28,406 [INFO] dpr.models.hf_models: Initializing HF BERT Encoder. cfg_name=bert-base-uncased


In [23]:
encoder.load_state(saved_state, strict=False)

In [24]:
encoder.question_model

HFBertEncoder(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [25]:
tensorizer

<dpr.models.hf_models.BertTensorizer at 0x7f87cc85c040>

In [78]:
import torch

# Text to encode
text = "cf color of sky is cf cf"

# Step 1: Tokenize the text using the tensorizer
tokens = tensorizer.tokenizer(text)

# Step 2: Convert tokens to numerical representations using the tensorizer
input_ids = tensorizer.text_to_tensor(text)


# Step 3: Generate token_type_ids and attention_mask
token_type_ids = torch.zeros_like(input_ids)  # For BERT, this is typically all zeros
attention_mask = torch.ones_like(input_ids)   # Set all elements to 1 to include all tokens


# Step 4: Pass the numerical representations through the encoder
with torch.no_grad():
    encoded_representation = encoder.question_model(
        input_ids.unsqueeze(0),  # Add batch dimension
        token_type_ids.unsqueeze(0),  # Add batch dimension
        attention_mask.unsqueeze(0)   # Add batch dimension
    )

# The encoded representation is now available for further processing
# print(encoded_representation)

In [79]:
c = encoded_representation[0][0][0]

In [56]:
torch.matmul(a, torch.transpose(b))

TypeError: transpose() received an invalid combination of arguments - got (Tensor), but expected one of:
 * (Tensor input, int dim0, int dim1)
 * (Tensor input, name dim0, name dim1)


In [66]:
torch.sqrt(torch.sum((a-b)**2))

tensor(0.2862)

In [71]:
torch.sqrt(torch.sum((c-d)**2))

tensor(1.4249)

In [80]:
cosi = torch.nn.CosineSimilarity(dim=0) 
output = cosi(a, c) 

In [81]:
output

tensor(0.9998)

In [77]:
cosi(c, d) 

tensor(0.9986)