In [1]:
# --- BIO T5 TRAINING (Robust Single GPU) ---
# Uses a Custom Loop to bypass 'model.fit' and 'transformers' library bugs on Windows.
# This is the guaranteed way to run without AttributeErrors.
import tensorflow as tf
import pandas as pd
import numpy as np
import os
import time
from datasets import Dataset
from transformers import TFT5ForConditionalGeneration, T5Tokenizer

# Report the TensorFlow build and how many GPUs it can see.
print(f"TensorFlow: {tf.__version__}")
visible_gpus = tf.config.list_physical_devices('GPU')
print(f"GPUs: {len(visible_gpus)}")

# Mixed Precision (Disabled for Stability - Fixes NaN Loss)
# tf.keras.mixed_precision.set_global_policy('mixed_float16')

  from .autonotebook import tqdm as notebook_tqdm


TensorFlow: 2.10.0
GPUs: 5


In [2]:
class Config:
    """Hyper-parameters and file locations for the BioT5 fine-tuning run.

    The two filesystem locations default to the original machine-specific
    Windows paths, but can be overridden without editing the notebook by
    setting the ``BIOT5_CSV_PATH`` / ``BIOT5_CHECKPOINT_DIR`` environment
    variables before the kernel starts.
    """
    # Hugging Face Hub id of the pretrained checkpoint to fine-tune.
    MODEL_NAME = "QizhiPei/biot5-base"
    # CSV holding the training pairs; the loading cell reads its 'question'
    # and 'answer' columns.
    CSV_PATH = os.environ.get(
        "BIOT5_CSV_PATH", r'D:\AIvolution\data\qa_dataset_concatenated.csv'
    )
    # Token length that both inputs and targets are padded/truncated to.
    MAX_LENGTH = 256
    BATCH_SIZE = 4
    EPOCHS = 3
    # Learning rate handed to the Adam optimizer.
    LEARNING_RATE = 1e-4
    # Directory for tf.train checkpoints and the exported 'final_model'.
    CHECKPOINT_DIR = os.environ.get(
        "BIOT5_CHECKPOINT_DIR", r'D:\AIvolution\transformer\biot5_checkpoints'
    )
    
config = Config()
# exist_ok=True makes this a no-op when the directory is already present and
# avoids the check-then-create race of a separate os.path.exists() test.
os.makedirs(config.CHECKPOINT_DIR, exist_ok=True)


In [3]:
print("Loading Data...")
df = pd.read_csv(config.CSV_PATH)
rows_read = len(df)

# Drop rows with a missing question or answer BEFORE converting to str;
# otherwise NaN is stringified to the literal text "nan" and trained on as
# if it were real data.
df = df.dropna(subset=["question", "answer"]).copy()

# Strip every non-ASCII character (vectorised form of encode/decode per row).
# NOTE(review): this also removes symbols such as Greek letters, which may
# matter for biomedical text -- confirm that this is intended.
for text_column in ("question", "answer"):
    df[text_column] = (
        df[text_column].astype(str).str.encode("ascii", "ignore").str.decode("ascii")
    )

# Rows left blank by the cleanup carry no training signal, so drop them too.
# Resetting the index keeps Dataset.from_pandas from carrying the old pandas
# index along as an extra column.
has_text = df["question"].str.strip().ne("") & df["answer"].str.strip().ne("")
df = df.loc[has_text].reset_index(drop=True)

rows_dropped = rows_read - len(df)
if rows_dropped:
    print(f"Dropped {rows_dropped} rows with a missing or empty question/answer")

# Prefix each question with "question: " to form the model input text.
df["input_text"] = "question: " + df["question"]
df["target_text"] = df["answer"]

dataset = Dataset.from_pandas(df[["input_text", "target_text"]])
# Fixed seed so the 90/10 train/test split is reproducible across runs.
dataset = dataset.train_test_split(test_size=0.1, seed=42)


Loading Data...


In [4]:
# Load the tokenizer published alongside the checkpoint named in Config.
print("Loading Tokenizer...")
tokenizer = T5Tokenizer.from_pretrained(
    pretrained_model_name_or_path=config.MODEL_NAME,
)

def tokenize_function(examples, tokenizer=None, max_length=256):
    """Tokenize a batch of input/target text pairs for seq2seq training.

    Args:
        examples: Mapping with "input_text" and "target_text" entries.
        tokenizer: Tokenizer to apply. It defaults to None only so it can be
            supplied by keyword (e.g. through ``fn_kwargs``); it is required.
        max_length: Length that both inputs and targets are padded or
            truncated to.

    Returns:
        The tokenizer output for the inputs, with an added "labels" entry
        holding the target "input_ids".

    Raises:
        TypeError: If no tokenizer is supplied.
    """
    if tokenizer is None:
        # Fail with an actionable message instead of the opaque
        # "'NoneType' object is not callable" raised further down.
        raise TypeError(
            "tokenize_function() requires a tokenizer; "
            "pass it via fn_kwargs={'tokenizer': ...}"
        )

    # Inputs and targets share the same padding/truncation settings.
    encode_kwargs = dict(max_length=max_length, padding="max_length", truncation=True)
    model_inputs = tokenizer(examples["input_text"], **encode_kwargs)
    # NOTE(review): newer transformers releases deprecate as_target_tokenizer()
    # in favour of tokenizer(text_target=...); kept here because the installed
    # version is not known -- confirm before switching.
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["target_text"], **encode_kwargs)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize both splits in batches; the raw text columns are dropped so only
# the tokenizer outputs (plus "labels") remain.
print("Tokenizing...")
tokenized_datasets = dataset.map(
    tokenize_function,
    fn_kwargs=dict(tokenizer=tokenizer, max_length=config.MAX_LENGTH),
    remove_columns=dataset["train"].column_names,
    batched=True,
    num_proc=1,
)

Loading Tokenizer...
Tokenizing...


Map (num_proc=1): 100%|██████████| 147600/147600 [04:03<00:00, 605.01 examples/s]
Map (num_proc=1): 100%|██████████| 16401/16401 [00:39<00:00, 416.36 examples/s]


In [5]:
print("Loading Model...")
try:
    model = TFT5ForConditionalGeneration.from_pretrained(config.MODEL_NAME)
except Exception as load_error:
    # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit are no
    # longer swallowed, and the reason for the fallback is visible.
    print(
        f"Default load failed ({type(load_error).__name__}); "
        "retrying with from_pt=True..."
    )
    # Retry, asking transformers to load the weights from the PyTorch checkpoint.
    model = TFT5ForConditionalGeneration.from_pretrained(config.MODEL_NAME, from_pt=True)

optimizer = tf.keras.optimizers.Adam(learning_rate=config.LEARNING_RATE)
# Track optimizer state together with the model weights so a restored
# checkpoint can continue training; only the 2 newest checkpoints are kept.
checkpoint = tf.train.Checkpoint(optimizer=optimizer, model=model)
ckpt_manager = tf.train.CheckpointManager(checkpoint, config.CHECKPOINT_DIR, max_to_keep=2)

# Datasets
# Only the training split is turned into a tf.data pipeline here.
# NOTE(review): the "test" split is tokenized but not used in the cells
# shown -- consider adding a validation pass.
tf_train_dataset = model.prepare_tf_dataset(
    tokenized_datasets["train"],
    shuffle=True,
    batch_size=config.BATCH_SIZE,
    tokenizer=tokenizer
)

# Manual Loss Function
# reduction='none' keeps one loss value per token so that padding positions
# can be masked out before averaging.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def custom_loss(real, pred):
    """Mean token-level cross-entropy over the non-padding label positions.

    Args:
        real: Integer label ids. Id 0 is treated as padding, and negative
            ids (e.g. the -100 ignore index) are excluded as well.
        pred: Unnormalised logits with a trailing vocabulary axis.

    Returns:
        Scalar tensor: the summed loss over scored positions divided by the
        number of scored positions, or 0 when no position is scored.
    """
    # Score only strictly positive ids: this skips pad (0) exactly as before
    # and additionally skips negative ignore labels.
    scored = tf.math.greater(real, 0)
    # Negative ids are not valid class indices; substitute 0 there so the
    # cross-entropy stays finite (those positions are masked out anyway).
    safe_real = tf.where(scored, real, tf.zeros_like(real))
    loss_ = loss_object(safe_real, pred)
    mask = tf.cast(scored, dtype=loss_.dtype)
    loss_ *= mask
    # divide_no_nan returns 0 instead of NaN when the batch has no scored
    # tokens, so one degenerate batch cannot poison the weights.
    return tf.math.divide_no_nan(tf.reduce_sum(loss_), tf.reduce_sum(mask))

Loading Model...


All PyTorch model weights were used when initializing TFT5ForConditionalGeneration.

All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


In [6]:
@tf.function
def train_step(inputs):
    """Run one optimisation step on a single batch and return its loss.

    Args:
        inputs: Either a ``(features, labels)`` tuple/list whose ``features``
            mapping holds 'input_ids' and 'attention_mask', or a single
            mapping that additionally holds 'labels'.

    Returns:
        The scalar loss tensor computed by ``custom_loss`` for this batch.
    """
    # 1. Unpack the batch; both layouts described above are accepted.
    if isinstance(inputs, (tuple, list)):
        features, labels = inputs
        input_ids = features['input_ids']
        attention_mask = features['attention_mask']
    else:
        input_ids = inputs['input_ids']
        attention_mask = inputs['attention_mask']
        labels = inputs['labels']

    # 2. Build the decoder inputs by shifting the labels one step right:
    #    decoder_input_ids = [start] + labels[:-1]
    # The ids come from the model config instead of a hard-coded 0, with 0 as
    # the fallback so behaviour is unchanged when the config does not set them.
    start_token = getattr(model.config, 'decoder_start_token_id', None)
    if start_token is None:
        start_token = 0
    pad_token = getattr(model.config, 'pad_token_id', None)
    if pad_token is None:
        pad_token = 0

    batch_size = tf.shape(labels)[0]
    start_tokens = tf.fill([batch_size, 1], tf.cast(start_token, labels.dtype))
    shifted_labels = labels[:, :-1]
    # Negative ignore labels (e.g. -100) are not valid token ids for the
    # decoder, so feed the pad id in their place.
    shifted_labels = tf.where(
        shifted_labels < 0, tf.cast(pad_token, labels.dtype), shifted_labels
    )
    decoder_input_ids = tf.concat([start_tokens, shifted_labels], axis=1)

    # 3. Forward pass and backprop.
    with tf.GradientTape() as tape:
        # CRITICAL: do NOT pass 'labels' to the model here (per the original
        # author it crashes the library); the loss is computed manually.
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            training=True
        )
        loss = custom_loss(labels, outputs.logits)

    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))
    return loss

# --- TRAINING EXECUTION ---
# Expected number of full batches per epoch; used as the progress-bar target.
# NOTE(review): assumes the tf.data pipeline drops the final partial batch --
# confirm against prepare_tf_dataset's drop_remainder setting.
steps_per_epoch = len(tokenized_datasets["train"]) // config.BATCH_SIZE
train_loss_metric = tf.keras.metrics.Mean(name='train_loss')
# How often (in steps) the progress bar is refreshed.
PROGRESS_EVERY = 10

print("Starting Robust Training... 🚀")
for epoch in range(config.EPOCHS):
    print(f"\nEpoch {epoch+1}/{config.EPOCHS}")
    train_loss_metric.reset_state()
    # 'loss' is already a running mean, so mark it stateful: Progbar then
    # displays it as-is instead of averaging the averages a second time.
    progbar = tf.keras.utils.Progbar(steps_per_epoch, stateful_metrics=["loss"])
    steps_done = 0

    for batch_idx, batch in enumerate(tf_train_dataset):
        loss = train_step(batch)
        train_loss_metric.update_state(loss)
        # Report completed steps (1-based) so the bar can reach its target.
        steps_done = batch_idx + 1

        if steps_done % PROGRESS_EVERY == 0:
            progbar.update(steps_done, values=[("loss", float(train_loss_metric.result()))])

    # Final refresh so the bar finishes and shows the full-epoch mean even
    # when the step count is not a multiple of PROGRESS_EVERY.
    epoch_loss = float(train_loss_metric.result())
    progbar.update(steps_done, values=[("loss", epoch_loss)], finalize=True)
    print(f"Epoch {epoch+1} mean loss: {epoch_loss:.4f}")

    save_path = ckpt_manager.save()
    print(f"Saved checkpoint to {save_path}")

print("Saving Final Model...")
# Export model and tokenizer side by side so the directory can be reloaded
# with from_pretrained().
final_model_dir = os.path.join(config.CHECKPOINT_DIR, 'final_model')
model.save_pretrained(final_model_dir)
tokenizer.save_pretrained(final_model_dir)
print("Done!")

Starting Robust Training... 🚀

Epoch 1/3

Epoch 2/3

Epoch 3/3
Saving Final Model...
Done!
