In [5]:
from transformers import DebertaV2ForSequenceClassification, DebertaV2Tokenizer
from transformers.models.deberta_v2.modeling_deberta_v2 import StableDropout, DebertaV2Model, ContextPooler
from transformers.modeling_outputs import SequenceClassifierOutput
from typing import Optional, List, Dict, Any, Union, Tuple
import torch
from torch import nn

class DebertaV3ForClassification(DebertaV2ForSequenceClassification):
    def __init__(self, config):
        super().__init__(config)

        num_labels = getattr(config, "num_labels", 2)
        self.num_labels = num_labels

        self.deberta = DebertaV2Model(config)
        self.pooler = ContextPooler(config)
        output_dim = self.pooler.output_dim

        self.classifier = nn.Linear(output_dim, num_labels)
        drop_out = getattr(config, "cls_dropout", None)
        drop_out = self.config.hidden_dropout_prob if drop_out is None else drop_out
        self.dropout = StableDropout(drop_out)

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, SequenceClassifierOutput]:
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.deberta(
            input_ids,
            token_type_ids=token_type_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        encoder_layer = outputs[0]
        pooled_output = self.pooler(encoder_layer)
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            loss_fn = nn.MSELoss()
            logits = logits.view(-1).to(labels.dtype)
            loss = loss_fn(logits, labels.view(-1))
            

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states, 
            attentions=outputs.attentions
        )

# model = DebertaV3ForClassification.from_pretrained("microsoft/deberta-v3-base")
tokenizer = DebertaV2Tokenizer.from_pretrained("microsoft/deberta-v3-base")
tokenizer.encode("Hello world")

loading file https://huggingface.co/microsoft/deberta-v3-base/resolve/main/spm.model from cache at /home/dimweb/.cache/huggingface/transformers/ec748fd4f03d0e5a2d5d56dff01e6dd733f23c67105cd54a9910f9d711870253.0abaeacf7287ee8ba758fec15ddfb4bb6c697bb1a8db272725f8aa633501787a
loading file https://huggingface.co/microsoft/deberta-v3-base/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/microsoft/deberta-v3-base/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/microsoft/deberta-v3-base/resolve/main/tokenizer_config.json from cache at /home/dimweb/.cache/huggingface/transformers/967a4d63eb35950cfd24a9e335906419009f32940fa2ba1b73e7ba032628c38d.df5a7f41459442f66bec27ac9352bba694cde109855024b3ae61be2f5734ee9a
loading configuration file https://huggingface.co/microsoft/deberta-v3-base/resolve/main/config.json from cache at /home/dimweb/.cache/huggingface/transformers/e6f9db57345f0f60c9f837fa97bcb27b1ed31e99feb33d732d

[1, 5365, 447, 2]

In [6]:
tokenizer.bos_token_id

1

In [1]:
from core.hyperparameters.debertav3_hyperparameters import DebertaV3HyperparametersV1
from transformers import DebertaV2Tokenizer
from core.dataloaders.focus.lighting.debertav3_lighting_dataloaders import DebertaV3FoCusLightningDataModuleV1, DebertaV3FoCusLightningDataModuleV2 

hyperparameters = DebertaV3HyperparametersV1(
        train_batch_size=16,
        valid_batch_size=16,
    )

tokenizer = DebertaV2Tokenizer.from_pretrained(
        hyperparameters.model_name,
    )

data_module = DebertaV3FoCusLightningDataModuleV2(
        train_path_dataset="./datasets/FoCus/train_focus.json",
        valid_path_dataset="./datasets/FoCus/valid_focus.json",
        hyperparameters=hyperparameters,
        tokenizer=tokenizer,  # type: ignore
        debug_status=0,
    )
    
data_module.setup()

  from .autonotebook import tqdm as notebook_tqdm
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [2]:
next(iter(data_module.train_dataloader()))

{'input_ids': tensor([[    1,   279, 10367,  ...,     0,     0,     0],
         [    1,   606,   307,  ...,     0,     0,     0],
         [    1, 10420, 16479,  ..., 15713,   302,     2],
         ...,
         [    1,   279, 23409,  ...,     0,     0,     0],
         [    1,   279, 75313,  ...,     0,     0,     0],
         [    1,   434,   340,  ...,     0,     0,     0]]),
 'labels': tensor([1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 1, 1, 1],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'unique_ids': ['WL1G6EX8T9JY_dialogue1',
  'NZQX869YZZK9_dialogue6',
  'NTGAXOBEIRLQ_dialogue6',
  'ZANR2M0FJKWF_dialogue1',
  'EKEMEF4T3OHH_dialogue5',
  'VTFNAHV7OIG4_dialogue5',
  'HYOQLKN3XGCD_dialogue5',
  'KAN28FJWSTJP_dialogue4',
  'P878Z7YI2MMT_dialogue1',
  'YJH7UTD0DR6A_dialogue4',
  'K74N0MZ

In [1]:
from core.dataloaders.focus.models.debertav3_dataloaders import DebertaV3FoCusKnowledgeDatasetSampleV2
from core.dataloaders.focus.focus_dataloader import FoCusDatasetKnowledgeV2
from torch.utils.data import Dataset
from typing import TypeVar
from transformers import DebertaV2Tokenizer
from core.hyperparameters.debertav3_hyperparameters import DebertaV3HyperparametersV1
from core.dataloaders.focus.lighting.debertav3_lighting_dataloaders import DebertaV3FoCusLightningDataModuleV4

train_dataset = FoCusDatasetKnowledgeV2(
    input_dataset_path="./datasets/FoCus/valid_focus.json",
    is_train=True,
)
tokenizer = DebertaV2Tokenizer.from_pretrained("microsoft/deberta-v3-base")
hyperparameters = DebertaV3HyperparametersV1(
    train_batch_size=16,
    valid_batch_size=16,
)
is_debug = False
data_module = DebertaV3FoCusLightningDataModuleV4(
    train_path_dataset="./datasets/FoCus/train_focus.json",
    valid_path_dataset="./datasets/FoCus/valid_focus.json",
    hyperparameters=hyperparameters,
    tokenizer=tokenizer,  # type: ignore
    debug_status=is_debug,
)
data_module.setup()

  from .autonotebook import tqdm as notebook_tqdm
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
next(iter(data_module.train_dataloader()))

{'input_ids': tensor([[    1, 45050,  9291,  ...,     0,     0,     0],
         [    1,   325,   269,  ...,     0,     0,     0],
         [    1,   279, 14825,  ...,     0,     0,     0],
         ...,
         [    1,   325,   327,  ...,   470,   302,     2],
         [    1,   279,   918,  ...,     0,     0,     0],
         [    1,   279,  4419,  ...,     0,     0,     0]]),
 'labels': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'unique_ids': ['ZY4PM4NI7BX8_dialogue6',
  'H9VU7XIO2V8Y_dialogue6',
  'UZTAFONAHYX9_dialogue2',
  'N3BKTMTKN6PG_dialogue1',
  'I0PTTIDIWCLR_dialogue5',
  'JBWLG35559PZ_dialogue4',
  'YPH9UCD0CZ4J_dialogue6',
  'HXK3XQEDGK3N_dialogue2',
  'WG8PT0YLSUFX_dialogue5',
  'BFJXIZ89DJY9_dialogue6',
  'NK1R15O

In [1]:
import torch

target = torch.ones([2], dtype=torch.float32)  # 64 classes, batch size = 10
output = torch.full([2, 1], 1.5)  # A prediction (logit)
criterion = torch.nn.BCEWithLogitsLoss()
print(output.shape)
print(target.shape)
criterion(output.view(-1), target)

torch.Size([2, 1])
torch.Size([2])


  from .autonotebook import tqdm as notebook_tqdm


tensor(0.2014)

In [5]:
# Example of target with class indices
loss = torch.nn.CrossEntropyLoss()
input = torch.randn(3, 5, requires_grad=True)
target = torch.empty(3, dtype=torch.long).random_(5)
print(input.shape)
print(target.shape)
output = loss(input, target)

torch.Size([3, 5])
torch.Size([3])


In [9]:
torch.randn((2, 5, 2))

tensor([[[ 0.3774,  0.3921],
         [ 1.1231,  0.4480],
         [-0.6059, -1.7346],
         [-0.4239, -1.7667],
         [-1.9259, -0.9230]],

        [[-0.4765, -0.1975],
         [ 0.4054, -0.9033],
         [-0.5289, -0.9566],
         [-0.2321,  0.2957],
         [-1.1701,  0.7240]]])

In [10]:
torch.randn((2, 5, 2)).argmax(dim=2)

tensor([[0, 0, 0, 1, 1],
        [1, 0, 1, 0, 1]])

In [7]:
from core.dataloaders.focus.focus_dataloader import FoCusDatasetPersonaV2

dataset = FoCusDatasetPersonaV2(
    input_dataset_path="./datasets/FoCus/valid_focus.json",
    is_train=True,
)

positive_examples = 0
negative_examples = 0
for sample in dataset:
    if sample["persona_grounding"] == 1:
        positive_examples += 1
    else:
        negative_examples += 1

print(positive_examples)
print(negative_examples)

3542
5639


In [13]:
print(dataset[4]['dialog'][-2])
print(dataset[4]['dialog'][-1])

Does this house look old to me, when it was built?
This house is relatively old, but since you would like to know when it was built, I will explain it to you. Nazareth House was built from 1924 to 1939.


In [8]:
negative_examples/(positive_examples+negative_examples)

0.6142032458337872

0
9231


### test train

In [1]:
from core.dataloaders.focus.lighting.debertav3_lighting_dataloaders import DebertaV3FoCusPersonaLightningDataModuleV2
from core.base_models.debertav3_models import DebertaV3PersonaClassificationV2
from transformers import DebertaV2Config, DebertaV2Tokenizer
from core.hyperparameters.debertav3_hyperparameters import DebertaV3HyperparametersV1
max_epochs = 1

hyperparameters = DebertaV3HyperparametersV1(
    train_batch_size=16,
    valid_batch_size=16,
    model_name="microsoft/deberta-v3-base",
    project_name="focus_persona_classification",
)

tokenizer = DebertaV2Tokenizer.from_pretrained(
    hyperparameters.model_name,
)

data_module = DebertaV3FoCusPersonaLightningDataModuleV2(
    train_path_dataset="./datasets/FoCus/train_focus.json",
    valid_path_dataset="./datasets/FoCus/valid_focus.json",
    hyperparameters=hyperparameters,
    tokenizer=tokenizer,  # type: ignore
    debug_status=0,
)
base_model = DebertaV3PersonaClassificationV2(
    config=DebertaV2Config.from_pretrained(
        hyperparameters.model_name,
    ),  # type: ignore
)
data_module.setup()

  from .autonotebook import tqdm as notebook_tqdm
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [2]:
import torch

optimizer = torch.optim.AdamW(
    base_model.parameters(),
    lr=hyperparameters.learning_rate,
    weight_decay=hyperparameters.weight_decay,
)

In [5]:
from transformers import AutoModelForSequenceClassification

base_model = AutoModelForSequenceClassification.from_pretrained(
    "microsoft/deberta-v3-base",
    num_labels=1,
)

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2ForSequenceClassification: ['mask_predictions.classifier.bias', 'mask_predictions.LayerNorm.bias', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a

In [8]:

for i in range(1000):
    total_loss = 0
    total_accuracy = 0
    for step, batch in enumerate(data_module.val_dataloader()):
        batch['labels'] = batch['labels'].float()
        batch.pop("unique_ids", None)
        optimizer.zero_grad()
        output = base_model(**batch)
        logits = output.logits
        loss = output.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        preds = (torch.sigmoid(logits) > 0.5).int().flatten()
        labels = batch["labels"].int().flatten()
        acc = (preds == labels).float().mean().item()
        total_accuracy += acc
        print(f"Loss: {loss.item()}")
        print(f"Accuracy: {acc}")
        print(f"Average loss: {total_loss / (step + 1)}")
        print(f"Average accuracy: {total_accuracy / (step + 1)}")
        print("-"*10)
        if step > 10:
            break

Loss: 0.16637402772903442
Accuracy: 0.1875
Average loss: 0.16637402772903442
Average accuracy: 0.1875
----------
Loss: 0.1664477288722992
Accuracy: 0.1875
Average loss: 0.1664108783006668
Average accuracy: 0.1875
----------
Loss: 0.05837586522102356
Accuracy: 0.0625
Average loss: 0.13039920727411905
Average accuracy: 0.14583333333333334
----------
Loss: 0.1655023843050003
Accuracy: 0.1875
Average loss: 0.13917500153183937
Average accuracy: 0.15625
----------
Loss: 0.05896855145692825
Accuracy: 0.0625
Average loss: 0.12313371151685715
Average accuracy: 0.1375
----------
Loss: 0.11309413611888885
Accuracy: 0.125
Average loss: 0.1214604489505291
Average accuracy: 0.13541666666666666
----------
Loss: 0.05686089023947716
Accuracy: 0.0625
Average loss: 0.11223194056323596
Average accuracy: 0.125
----------
Loss: 0.11292505264282227
Accuracy: 0.125
Average loss: 0.11231857957318425
Average accuracy: 0.125
----------
Loss: 0.1121257022023201
Accuracy: 0.125
Average loss: 0.11229714875419934
Av

KeyboardInterrupt: 

In [1]:
from core.dataloaders.focus.focus_dataloader import FoCusDatasetPersonaV2
train_data = []
eval_data = []

train_dataset = FoCusDatasetPersonaV2(
    input_dataset_path="./datasets/FoCus/train_focus.json",
    is_train=True,
)

for sample in train_dataset:
    label = sample["persona_grounding"]
    query = sample["dialog"][-2]
    persona = sample["persona"]
    knowledge = sample["used_knowledge"]
    input_sent = f"{persona} {knowledge} {query}"
    train_data.append([input_sent, label])

valid_dataset = FoCusDatasetPersonaV2(
    input_dataset_path="./datasets/FoCus/valid_focus.json",
    is_train=False,
)

for sample in valid_dataset:
    label = sample["persona_grounding"]
    query = sample["dialog"][-2]
    persona = sample["persona"]
    knowledge = sample["used_knowledge"]
    input_sent = f"{persona} {knowledge} {query}"
    eval_data.append([input_sent, label])


In [2]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs
import pandas as pd
import logging


logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

# Preparing train data

train_df = pd.DataFrame(train_data)
train_df.columns = ["text", "labels"]

# Preparing eval data

eval_df = pd.DataFrame(eval_data)
eval_df.columns = ["text", "labels"]



  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Optional model configuration
model_args = ClassificationArgs(
    num_train_epochs=2,
    no_save=True
)

# Create a ClassificationModel
model = ClassificationModel(
    "roberta", "roberta-base", args=model_args
)

# Train the model
model.train_model(
    train_df
)

# Evaluate the model
result, model_outputs, wrong_predictions = model.eval_model(eval_df)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.

KeyboardInterrupt: 