In [12]:
from transformers import  MPNetModel, MPNetForSequenceClassification, AutoTokenizer, AutoConfig
from transformers.models.mpnet.modeling_mpnet import MPNetClassificationHead, SequenceClassifierOutput
from typing import List, Optional, Union, Tuple
import torch
from torch import nn
from transformers.utils import ModelOutput


In [8]:


class MPNetForSequenceClassificationV1(MPNetForSequenceClassification):
    def __init__(self, 
            config,
            cross_entropy_loss_weights=None
        ):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.mpnet = MPNetModel(config)
        self.classifier = MPNetClassificationHead(config)
        self.cross_entropy_loss_weights = cross_entropy_loss_weights
        
        # Initialize weights and apply final processing
        self.post_init()
    
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.mpnet(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = outputs[0]
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            loss_fct = None
            if self.cross_entropy_loss_weights is None:
                loss_fct = nn.CrossEntropyLoss()
            else:
                loss_fct = nn.CrossEntropyLoss(
                    weight=torch.tensor(self.cross_entropy_loss_weights, dtype=torch.float32)
                )
            
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )



Some weights of MPNetForSequenceClassificationV1 were not initialized from the model checkpoint at sentence-transformers/all-mpnet-base-v2 and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Downloading tokenizer_config.json: 100%|██████████| 363/363 [00:00<00:00, 197kB/s]
Downloading vocab.txt: 100%|██████████| 226k/226k [00:00<00:00, 390kB/s]  
Downloading tokenizer.json: 100%|██████████| 455k/455k [00:00<00:00, 606kB/s]  
Downloading special_tokens_map.json: 100%|██████████| 239/239 [00:00<00:00, 121kB/s]


In [11]:
inputs = tokenizer.batch_encode_plus(["Hello world"], return_tensors="pt")
model(**inputs)

SequenceClassifierOutput(loss=None, logits=tensor([[-0.0100,  0.0156]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [1]:
from transformers import AutoTokenizer
tok = AutoTokenizer.from_pretrained("microsoft/mpnet-base")

  from .autonotebook import tqdm as notebook_tqdm
Downloading config.json: 100%|██████████| 493/493 [00:00<00:00, 925kB/s]
Downloading vocab.txt: 100%|██████████| 226k/226k [00:00<00:00, 513kB/s] 
Downloading tokenizer.json: 100%|██████████| 461k/461k [00:00<00:00, 614kB/s]  


In [5]:
tok.model_max_length
import torch

### test sentence transformer valid - should be 94.5% accuracy

In [9]:
import time

from pytorch_lightning import seed_everything

from core.base_models.mpnet_models import (
    MPNetForSequenceClassificationV1,
    MPNetForSequenceClassificationV2,
    MPNetForSentenceEmbeddingV1,
)
from core.dataloaders.focus.focus_dataloader import (
    FoCusDatasetKnowledgeV3,
    FoCusDatasetKnowledgeV4,
    FoCusDatasetPersonaV2,
)
from core.lighting_models.mpnet_lighting import MPNetKnowledgeLightningModelV1
from core.dataloaders.focus.models.mpnet_dataloaders import (
    MPNetFoCusPersonaDatasetSampleV1,
)
from core.hyperparameters.lighting_hyperparameters import LightingHyperparametersV1
from core.hyperparameters.mpnet_hyperparameters import MPNetHyperparametersV1
from core.loggers.wandb_logger import WandbLoggerV2
from core.utils import (
    ExperimentArgumentParserV1,
    PytorchDatasetFactory,
    TrainArgumentsV1,
)

from datasets import load_metric  # type: ignore

import numpy as np

import torch

import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint

import transformers as tr


from core.dataloaders.focus.lighting.mpnet_lighting_dataloader import (
    MPNetLightingDataModuleV1,
)
from core.dataloaders.focus.models.mpnet_dataloaders import (
    MPNetFoCusKnowledgeDatasetSampleV1,
)



max_epochs = 4
max_epochs = 1

lighting_hyperparameters = LightingHyperparametersV1(
    precision=16,
    max_epochs=max_epochs,
).__dict__

hyperparameters = MPNetHyperparametersV1(
    lighting_hyperparameters=lighting_hyperparameters,
    project_name="focus_knowledge_classification",
)
seed_everything(hyperparameters.seed)

tokenizer = tr.AutoTokenizer.from_pretrained(hyperparameters.model_name)  # type: ignore
is_debug = 0

data_module = MPNetLightingDataModuleV1(
    train_path_dataset="./datasets/FoCus/train_focus.json",
    valid_path_dataset="./datasets/FoCus/valid_focus.json",
    hyperparameters=hyperparameters,
    tokenizer=tokenizer,  # type: ignore
    debug_status=is_debug,
    base_train_dataset_class=FoCusDatasetKnowledgeV4,
    base_valid_dataset_class=FoCusDatasetKnowledgeV3,
    base_train_sample_class=MPNetFoCusKnowledgeDatasetSampleV1,
    base_valid_sample_class=MPNetFoCusKnowledgeDatasetSampleV1,
)

base_model = MPNetForSentenceEmbeddingV1.from_pretrained(hyperparameters.model_name)

model = MPNetKnowledgeLightningModelV1(
    hyperparameters=hyperparameters,
    tokenizer=tokenizer,  # type: ignore
    base_model=base_model,  # type: ignore
)

# accelerator = "cpu"
accelerator = "gpu"

# ckpt_path = ""  # noqa: E501

trainer = pl.Trainer(
    accelerator=accelerator,
    **lighting_hyperparameters,
)

Global seed set to 2022
Some weights of MPNetForSentenceEmbeddingV1 were not initialized from the model checkpoint at sentence-transformers/all-mpnet-base-v2 and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using 16bit native Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [10]:
trainer.validate(model, datamodule=data_module)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Validation DataLoader 0: 100%|██████████| 14098/14098 [03:14<00:00, 72.30it/s]accuracy 0.9413016492285866
Validation DataLoader 0: 100%|██████████| 14098/14098 [03:15<00:00, 72.30it/s]


[{}]

In [4]:
from transformers import AutoTokenizer # type: ignore

tok = AutoTokenizer.from_pretrained("microsoft/mpnet-base")

AttributeError: 'MPNetTokenizerFast' object has no attribute 'batch_encode'

In [10]:
tok.batch_encode_plus(
    ["Hello world", "How are you men?"], 
    return_tensors="pt", 
    truncation=True, 
    padding="longest"
).to("cuda")


{'input_ids': tensor([[   0, 7596, 2092,    2,    1,    1,    1],
        [   0, 2133, 2028, 2021, 2277, 1033,    2]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}

In [19]:
from core.inference.inference_scripts import FocusKnowledgeKandidateExtractorDictV1
from core.base_models.mpnet_models import MPNetForSentenceEmbeddingV1
from transformers import AutoTokenizer # type: ignore
import torch
from sentence_transformers import util
from typing import List

class FocusKnowledgeKandidateExtractorV2:
    def __init__(
        self,
        model_name: str = "all-mpnet-base-v2",
        tokenizer_name: str = "all-mpnet-base-v2",
    ) -> None:
        self.model_name = model_name
        self.model = MPNetForSentenceEmbeddingV1.from_pretrained(model_name)
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)  # type: ignore
        self.model.eval() # type: ignore

    def extract(
        self,
        persona: List[str],
        query: str,
        knowledge_candidates: List[str],
    ) -> FocusKnowledgeKandidateExtractorDictV1:
        _persona = " ".join(persona)
        query = query + " " + _persona

        encoded_query = self.tokenizer.batch_encode_plus(
            [query],
            return_tensors="pt",
            truncation=True,
            padding="longest",
        ).to(self.device)

        encoded_knowledge_candidates = self.tokenizer.batch_encode_plus(
            knowledge_candidates,
            return_tensors="pt",
            truncation=True,
            padding="longest",
        ).to(self.device)

        encoded_query = self.model( # type: ignore
            **encoded_query,
        )

        encoded_knowledge_candidates = self.model( # type: ignore
            **encoded_knowledge_candidates,
        )

        cosine_scores = util.cos_sim(encoded_knowledge_candidates, encoded_query)  # type: ignore
        print(cosine_scores)
        top_indices = cosine_scores.topk(1, dim=0).indices.flatten().tolist()
        top_sentences = [knowledge_candidates[i] for i in top_indices]
        return FocusKnowledgeKandidateExtractorDictV1(
            predicted_index=top_indices[0],
            predicted_knowledge=top_sentences[0],
        )
        
        
extractor = FocusKnowledgeKandidateExtractorV2(
    model_name="/home/dimweb/Desktop/deeppavlov/my_focus/models/knowledge-all-mpnet-base-v2-epoch=02-valid_accuracy=0.99",
    tokenizer_name="sentence-transformers/all-mpnet-base-v2",
)

In [20]:
extractor.extract(
    persona=["I am a student"],
    query="What is the capital of France?",
    knowledge_candidates=[
        "London is the capital of England.",
        "Paris is the capital of France.",
        "Berlin is the capital of Germany.",
    ]
)

tensor([[0.2927],
        [0.7791],
        [0.3883]], device='cuda:0', grad_fn=<MmBackward0>)


{'predicted_index': 1,
 'predicted_knowledge': 'Paris is the capital of France.'}