In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

In [2]:
import torch
from transformers import BertForPreTraining, BertTokenizerFast, Trainer, TrainingArguments,DataCollatorForLanguageModeling
from typing import Union, List, Dict,Mapping, Optional, Tuple,Any
import pandas as pd
from datasets import Dataset
from flipper import Flipper
import pytorch_lightning as pl

In [3]:
dataset = pd.read_csv('corpus.csv')
dataset = dataset.dropna()
dataset = Dataset.from_pandas(dataset)
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
flipper = Flipper('gendered_words/gendered_words.json')
def tokenize(batch):
    inputs = tokenizer(batch['original'], truncation=True, padding='max_length',max_length=128)
    return inputs

dataset = dataset.map(tokenize, batched=True, batch_size=127)

Map:   0%|          | 0/105687 [00:00<?, ? examples/s]

In [4]:
class TextData(torch.utils.data.Dataset):
    def __init__(self):
        pass

In [5]:
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        flipped_inputs = inputs.pop("flipped_input_ids")
        lambda_ = 5e-2
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.prediction_logits
        loss_fct = torch.nn.CrossEntropyLoss()
        mse = torch.nn.MSELoss()
        loss = loss_fct(logits.transpose(1,2),labels)
        flipped_outputs = model(input_ids=flipped_inputs)
        flipped_logits = flipped_outputs.prediction_logits
        flipped_loss = mse(logits,flipped_logits)
        logs = {"loss": loss, "flipped_loss": flipped_loss}
        self.log(logs)
        loss = loss+lambda_*flipped_loss
        return (loss, outputs) if return_outputs else loss

In [6]:
def pad_without_fast_tokenizer_warning(tokenizer, *pad_args, **pad_kwargs):
    """
    Pads without triggering the warning about how using the pad function is sub-optimal when using a fast tokenizer.
    """

    # To avoid errors when using Feature extractors
    if not hasattr(tokenizer, "deprecation_warnings"):
        return tokenizer.pad(*pad_args, **pad_kwargs)

    # Save the state of the warning, then disable it
    warning_state = tokenizer.deprecation_warnings.get("Asking-to-pad-a-fast-tokenizer", False)
    tokenizer.deprecation_warnings["Asking-to-pad-a-fast-tokenizer"] = True

    try:
        padded = tokenizer.pad(*pad_args, **pad_kwargs)
    finally:
        # Restore the state of the warning.
        tokenizer.deprecation_warnings["Asking-to-pad-a-fast-tokenizer"] = warning_state

    return padded

def _torch_collate_batch(examples, tokenizer, pad_to_multiple_of: Optional[int] = None):
    """Collate `examples` into a batch, using the information in `tokenizer` for padding if necessary."""
    import torch

    # Tensorize if necessary.
    if isinstance(examples[0], (list, tuple, np.ndarray)):
        examples = [torch.tensor(e, dtype=torch.long) for e in examples]

    length_of_first = examples[0].size(0)

    # Check if padding is necessary.

    are_tensors_same_length = all(x.size(0) == length_of_first for x in examples)
    if are_tensors_same_length and (pad_to_multiple_of is None or length_of_first % pad_to_multiple_of == 0):
        return torch.stack(examples, dim=0)

    # If yes, check if we have a `pad_token`.
    if tokenizer._pad_token is None:
        raise ValueError(
            "You are attempting to pad samples but the tokenizer you are using"
            f" ({tokenizer.__class__.__name__}) does not have a pad token."
        )

    # Creating the full tensor and filling it with our data.
    max_length = max(x.size(0) for x in examples)
    if pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
        max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
    result = examples[0].new_full([len(examples), max_length], tokenizer.pad_token_id)
    for i, example in enumerate(examples):
        if tokenizer.padding_side == "right":
            result[i, : example.shape[0]] = example
        else:
            result[i, -example.shape[0] :] = example
    return result


class CustomCollator(DataCollatorForLanguageModeling):
    def torch_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]:
        # Handle dict or lists with proper padding and conversion to tensor.
        if isinstance(examples[0], Mapping):
            batch = pad_without_fast_tokenizer_warning(
                self.tokenizer, examples, return_tensors="pt", pad_to_multiple_of=self.pad_to_multiple_of
            )
        else:
            batch = {
                "input_ids": _torch_collate_batch(examples, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of),
            }
        flipped_text = [flipper.flip(self.tokenizer.decode(text,skip_special_tokens=True)) for text in batch['input_ids']]
        batch['flipped_input_ids'] = [self.tokenizer.encode_plus(t, padding="max_length", truncation=True, max_length=128,return_tensors='pt')["input_ids"] for t in flipped_text]
        batch['flipped_input_ids'] = torch.cat(batch['flipped_input_ids'])

        # If special token mask has been preprocessed, pop it from the dict.
        special_tokens_mask = batch.pop("special_tokens_mask", None)
        if self.mlm:
            batch["input_ids"], batch["labels"],batch['flipped_input_ids'] = self.torch_mask_tokens(
                batch["input_ids"], batch['flipped_input_ids'],special_tokens_mask=special_tokens_mask
            )
        else:
            labels = batch["input_ids"].clone()
            if self.tokenizer.pad_token_id is not None:
                labels[labels == self.tokenizer.pad_token_id] = -100
            batch["labels"] = labels
        return batch

    def torch_mask_tokens(self, inputs: Any, flipped_inputs,special_tokens_mask: Optional[Any] = None) -> Tuple[Any, Any]:
        """
        Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original.
        """
        import torch

        labels = inputs.clone()
        # We sample a few tokens in each sequence for MLM training (with probability `self.mlm_probability`)
        probability_matrix = torch.full(labels.shape, self.mlm_probability)
        if special_tokens_mask is None:
            special_tokens_mask = [
                self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
            ]
            special_tokens_mask = torch.tensor(special_tokens_mask, dtype=torch.bool)
        else:
            special_tokens_mask = special_tokens_mask.bool()

        probability_matrix.masked_fill_(special_tokens_mask, value=0.0)
        masked_indices = torch.bernoulli(probability_matrix).bool()
        labels[~masked_indices] = -100  # We only compute loss on masked tokens

        # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
        indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
        inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token)
        flipped_inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token)
        

        # 10% of the time, we replace masked input tokens with random word
        indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
        random_words = torch.randint(len(self.tokenizer), labels.shape, dtype=torch.long)
        inputs[indices_random] = random_words[indices_random]
        flipped_inputs[indices_random] = random_words[indices_random]

        # The rest of the time (10% of the time) we keep the masked input tokens unchanged
        return inputs, labels,flipped_inputs

In [7]:
model = BertForPreTraining.from_pretrained('bert-base-uncased')
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=20,              # total number of training epochs
    per_device_train_batch_size=4,  # batch size per device during training
    per_device_eval_batch_size=8,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    logging_dir='./logs',            # directory for storing logs
    fp16=True,
    learning_rate=5e-5,
)
datasets = dataset.train_test_split(test_size=0.1)
trainer = CustomTrainer(
    model=model,
    args=training_args,
    data_collator=CustomCollator(tokenizer,mlm=True, mlm_probability=0.15),
    train_dataset=datasets['train'],
    eval_dataset=datasets['test']
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [8]:
trainer.train()        

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33ms-r-viksit[0m ([33mviks-ai[0m). Use [1m`wandb login --relogin`[0m to force relogin


We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Step,Training Loss
0,"tensor(3.2008, device='cuda:0', grad_fn=)"


OutOfMemoryError: CUDA out of memory. Tried to allocate 60.00 MiB. GPU 0 has a total capacty of 39.43 GiB of which 26.31 MiB is free. Process 3290510 has 21.90 GiB memory in use. Process 3374616 has 15.15 GiB memory in use. Including non-PyTorch memory, this process has 2.35 GiB memory in use. Of the allocated memory 1.74 GiB is allocated by PyTorch, and 112.47 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF