In [20]:
from peft import PeftModelForSeq2SeqLM, PromptEncoderConfig, PromptEncoderReparameterizationType, get_peft_model, get_peft_model_state_dict, set_peft_model_state_dict
from peft.tuners.prefix_tuning import PrefixEncoder
from peft.tuners.p_tuning import PromptEncoder
from peft.utils import _get_batch_size, PeftType, TaskType, TRANSFORMERS_MODELS_TO_PREFIX_TUNING_POSTPROCESS_MAPPING, map_cache_to_layer_device_map
from transformers import PreTrainedModel, DynamicCache, EncoderDecoderCache
from typing import Optional
import torch
import numpy as np
import warnings
from dataclasses import dataclass, field

@dataclass
class AbstractPromptEncoderConfig(PromptEncoderConfig):
    """
    This is the configuration class to store the configuration of a [`PromptEncoder`].

    Args:
        encoder_reparameterization_type (Union[[`PromptEncoderReparameterizationType`], `str`]):
            The type of reparameterization to use.
        encoder_hidden_size (`int`): The hidden size of the prompt encoder.
        encoder_num_layers (`int`): The number of layers of the prompt encoder.
        encoder_dropout (`float`): The dropout probability of the prompt encoder.
    """

    num_subjects: int = field(
        default=8,
        metadata={"help": "The number of subjects of the prompt encoder"},
    )
    padding_idx: int = field(
        default=None,
        metadata={"help": "The padding index of the prompt encoder"},
    )
    def __post_init__(self):
        super().__post_init__()
        self.peft_type = PeftType.P_TUNING #TODO: switch to APTuning


class AbstractPromptEncoder(PromptEncoder):
    def __init__(self, config):

        super().__init__(config)

        self.num_subjects = config.num_subjects
        self.total_virtual_tokens = config.num_virtual_tokens * config.num_subjects * config.num_transformer_submodules
        if config.padding_idx is not None:
            self.padding_idx = config.padding_idx
        else:
            self.padding_idx = self.total_virtual_tokens
            self.total_virtual_tokens += 1

        print(f"total_virtual_tokens: {self.total_virtual_tokens}")
        print(f"num_subjects: {self.num_subjects}")
        print(f"padding_idx: {self.padding_idx}")
        # embedding
        self.embedding = torch.nn.Embedding(self.total_virtual_tokens, self.token_dim, padding_idx=self.padding_idx)
        if not config.inference_mode:
            if self.encoder_type == PromptEncoderReparameterizationType.LSTM:
                lstm_dropout = config.encoder_dropout
                num_layers = config.encoder_num_layers
                # LSTM
                self.lstm_head = torch.nn.LSTM(
                    input_size=self.input_size,
                    hidden_size=self.hidden_size,
                    num_layers=num_layers,
                    dropout=lstm_dropout,
                    bidirectional=True,
                    batch_first=True,
                )

                self.mlp_head = torch.nn.Sequential(
                    torch.nn.Linear(self.hidden_size * 2, self.hidden_size * 2),
                    torch.nn.ReLU(),
                    torch.nn.Linear(self.hidden_size * 2, self.output_size),
                )

            elif self.encoder_type == PromptEncoderReparameterizationType.MLP:
                encoder_num_layers_default = AbstractPromptEncoderConfig.encoder_num_layers
                if config.encoder_num_layers != encoder_num_layers_default:
                    warnings.warn(
                        f"for {self.encoder_type.value}, the argument `encoder_num_layers` is ignored. "
                        f"Exactly {encoder_num_layers_default} MLP layers are used."
                    )
                layers = [
                    torch.nn.Linear(self.input_size, self.hidden_size),
                    torch.nn.ReLU(),
                    torch.nn.Linear(self.hidden_size, self.hidden_size),
                    torch.nn.ReLU(),
                    torch.nn.Linear(self.hidden_size, self.output_size),
                ]
                self.mlp_head = torch.nn.Sequential(*layers)

            else:
                raise ValueError("Prompt encoder type not recognized. Please use one of MLP (recommended) or LSTM.")

batch_size=2
num_virtual_tokens = 3
num_subjects = 2
token_dim=5
conversation_len = 3

peft_config = AbstractPromptEncoderConfig(task_type=TaskType.SEQ_2_SEQ_LM, inference_mode=False, num_virtual_tokens=num_virtual_tokens, num_subjects=num_subjects, token_dim=token_dim, num_transformer_submodules=1, encoder_hidden_size=10)

temp = AbstractPromptEncoder(peft_config)

total_virtual_tokens: 7
num_subjects: 2
padding_idx: 6


In [39]:
from transformers import AutoTokenizer, T5ForConditionalGeneration
tokenizer = AutoTokenizer.from_pretrained("t5-base")
print(tokenizer.pad_token_id)
print(tokenizer.pad_token)
print(tokenizer("<pad>"))

temp = "this is a sentnence"
print(tokenizer(temp))
print(tokenizer(temp, padding="max_length", max_length=10))
print(tokenizer(temp+"<pad>", padding="max_length", max_length=10))

0
<pad>
{'input_ids': [0, 1], 'attention_mask': [1, 1]}
{'input_ids': [48, 19, 3, 9, 1622, 29, 1433, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}
{'input_ids': [48, 19, 3, 9, 1622, 29, 1433, 1, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 0, 0]}
{'input_ids': [48, 19, 3, 9, 1622, 29, 1433, 0, 1, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 0]}


In [None]:
from torch.utils.data import Dataset, DataLoader

class ProgressiveDialogueDataset(Dataset):
    def __init__(self, dialogue_data):
        """
        Args:
            dialogue_data: A list of lists, where each inner list contains sentences
                           from one dialogue.
        """
        self.dialogue_data = dialogue_data
        self.cumulative_sentence_counts = [0]
        self.row_lengths = [len(row) for row in dialogue_data]
        total_sentences = 0
        for length in self.row_lengths:
            total_sentences += length
            self.cumulative_sentence_counts.append(total_sentences)

    def __len__(self):
        return self.cumulative_sentence_counts[-1]

    def __getitem__(self, idx):
        # Find which row this index belongs to
        row_index = 0
        while self.cumulative_sentence_counts[row_index + 1] <= idx:
            row_index += 1

        # Find the index of the sentence within that row
        sentence_index = idx - self.cumulative_sentence_counts[row_index]

        # Get the progressive sequence of sentences
        return self.dialogue_data[row_index][:sentence_index + 1]

# Example Usage
dialogues = [
    ["Hello", "How are you?", "I'm fine."],
    ["Hi", "What's up?"],
    ["Greetings"]
]

progressive_dataset = ProgressiveDialogueDataset(dialogues)
dataloader = DataLoader(progressive_dataset, batch_size=2, shuffle=True, max_length)

for batch in dataloader:
    print(batch)

[('Hello', 'Hi')]


RuntimeError: each element in list of batch should be of equal size

In [30]:
progressive_dataset[5]

['Greetings']

In [None]:
@dataclass
class AbstractPromptEncoderConfig(PromptEncoderConfig):
    """
    This is the configuration class to store the configuration of a [`PromptEncoder`].

    Args:
        encoder_reparameterization_type (Union[[`PromptEncoderReparameterizationType`], `str`]):
            The type of reparameterization to use.
        encoder_hidden_size (`int`): The hidden size of the prompt encoder.
        encoder_num_layers (`int`): The number of layers of the prompt encoder.
        encoder_dropout (`float`): The dropout probability of the prompt encoder.
    """

    num_subjects: int = field(
        default=8,
        metadata={"help": "The number of subjects of the prompt encoder"},
    )
    padding_idx: int = field(
        default=None,
        metadata={"help": "The padding index of the prompt encoder"},
    )
    def __post_init__(self):
        super().__post_init__()
        self.peft_type = PeftType.P_TUNING #TODO: switch to APTuning


class AbstractPromptEncoder(PromptEncoder):
    def __init__(self, config):
        super().__init__(config)
        self.num_subjects = config.num_subjects
        self.total_virtual_tokens = config.num_virtual_tokens * config.num_subjects * config.num_transformer_submodules
        if config.padding_idx is not None:
            self.padding_idx = config.padding_idx
        else:
            self.padding_idx = self.total_virtual_tokens
            self.total_virtual_tokens += 1

        # embedding
        self.embedding = torch.nn.Embedding(self.total_virtual_tokens, self.token_dim, padding_idx=self.padding_idx)
        if not config.inference_mode:
            if self.encoder_type == PromptEncoderReparameterizationType.LSTM:
                lstm_dropout = config.encoder_dropout
                num_layers = config.encoder_num_layers
                # LSTM
                self.lstm_head = torch.nn.LSTM(
                    input_size=self.input_size,
                    hidden_size=self.hidden_size,
                    num_layers=num_layers,
                    dropout=lstm_dropout,
                    bidirectional=True,
                    batch_first=True,
                )

                self.mlp_head = torch.nn.Sequential(
                    torch.nn.Linear(self.hidden_size * 2, self.hidden_size * 2),
                    torch.nn.ReLU(),
                    torch.nn.Linear(self.hidden_size * 2, self.output_size),
                )

            elif self.encoder_type == PromptEncoderReparameterizationType.MLP:
                encoder_num_layers_default = PromptEncoderConfig.encoder_num_layers
                if config.encoder_num_layers != encoder_num_layers_default:
                    warnings.warn(
                        f"for {self.encoder_type.value}, the argument `encoder_num_layers` is ignored. "
                        f"Exactly {encoder_num_layers_default} MLP layers are used."
                    )
                layers = [
                    torch.nn.Linear(self.input_size, self.hidden_size),
                    torch.nn.ReLU(),
                    torch.nn.Linear(self.hidden_size, self.hidden_size),
                    torch.nn.ReLU(),
                    torch.nn.Linear(self.hidden_size, self.output_size),
                ]
                self.mlp_head = torch.nn.Sequential(*layers)

            else:
                raise ValueError("Prompt encoder type not recognized. Please use one of MLP (recommended) or LSTM.")

config = 

TypeError: empty() received an invalid combination of arguments - got (tuple, dtype=NoneType, device=NoneType), but expected one of:
 * (tuple of ints size, *, tuple of names names, torch.memory_format memory_format = None, torch.dtype dtype = None, torch.layout layout = None, torch.device device = None, bool pin_memory = False, bool requires_grad = False)
 * (tuple of ints size, *, torch.memory_format memory_format = None, Tensor out = None, torch.dtype dtype = None, torch.layout layout = None, torch.device device = None, bool pin_memory = False, bool requires_grad = False)
