In [None]:
class T5EncoderModelForPssmGeneration(T5EncoderModel):
    """T5 encoder with an attached dropout layer and linear classifier head.

    The head is an ``nn.Linear`` mapping ``config.hidden_size`` features to
    ``config.num_labels`` outputs.

    NOTE(review): no ``forward`` override is visible in this cell, so as
    written ``dropout`` and ``classifier`` are attached to the model but not
    applied to the encoder output — confirm whether that is defined elsewhere.
    """

    def __init__(
        self,
        config: T5Config,
    ):
        """Build the encoder and attach the dropout layer.

        Args:
            config: Model configuration. This class reads
                ``classifier_dropout``, ``hidden_size`` and ``num_labels``
                from it.
        """
        super().__init__(config)
        self.dropout = nn.Dropout(config.classifier_dropout)
        # The classifier is deliberately NOT created here; see post_init().
        # NOTE(review): the parent constructor is expected to call
        # self.post_init(), so the classifier should already exist once
        # super().__init__ returns — confirm for the installed transformers.

    def post_init(self):
        """Run the parent's post-init, then create and initialise the classifier.

        The weight is drawn from U(-0.02, 0.02) and the bias is set to zero.
        The mean and standard deviation of the weight are printed as a sanity
        check.
        """
        super().post_init()
        # Create classifier after everything else is initialized
        self.classifier = nn.Linear(self.config.hidden_size, self.config.num_labels)
        with torch.no_grad():
            # Both parameters are set with in-place tensor methods rather than
            # relying on nn.Linear's default initialisation.
            # NOTE(review): from_pretrained may build the model with the
            # torch.nn.init functions disabled, in which case the default init
            # is skipped and an untouched bias would hold uninitialised
            # memory — confirm against the installed transformers version.
            self.classifier.weight.uniform_(-0.02, 0.02)
            self.classifier.bias.zero_()
            print(f"Weight mean: {self.classifier.weight.mean().item():.5f}")
            print(f"Weight std: {self.classifier.weight.std().item():.5f}")

# Load the "Rostlab/prot_t5_xl_uniref50" checkpoint into the custom model class.
# `config` is not defined in this cell; it must already exist in the kernel.
model = T5EncoderModelForPssmGeneration.from_pretrained(
    pretrained_model_name_or_path="Rostlab/prot_t5_xl_uniref50",
    config=config,
)

In [2]:
from transformers import DataCollatorWithPadding, AutoTokenizer

# Two toy sentences of different lengths, so padding is actually needed.
sequences = ["Hello world", "This is another sequence"]

# Tokenizer used both for encoding and by the collator for padding.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Encode each sentence on its own; only truncation is requested here, so the
# resulting encodings are not padded.
encoded_sequences = list(
    map(lambda text: tokenizer(text, truncation=True), sequences)
)

# Collator that pads a list of encodings using the tokenizer above.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [3]:
# Inspect the raw encodings: nothing has been padded yet, so the per-sequence
# lists (input_ids, token_type_ids, attention_mask) differ in length.
encoded_sequences

[{'input_ids': [101, 7592, 2088, 102], 'token_type_ids': [0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1]},
 {'input_ids': [101, 2023, 2003, 2178, 5537, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1]}]

In [4]:

# The sequences were already tokenized in an earlier cell; the collator only
# pads them to a common length and returns them as batched tensors.
batch = data_collator(encoded_sequences)

In [5]:
# Inspect the collated batch: the shorter sequence is right-padded with 0s and
# its attention_mask is 0 at the padded positions.
batch

{'input_ids': tensor([[ 101, 7592, 2088,  102,    0,    0],
        [ 101, 2023, 2003, 2178, 5537,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 1]])}