In [4]:
!pip install transformers datasets accelerate openpyxl evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=2.0.0->a

In [1]:
from google.colab.files import upload
uploaded=upload()

Saving dataset.xlsx to dataset (1).xlsx


In [7]:
import pandas as pd
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, TrainingArguments, Trainer, DataCollatorForSeq2Seq
from datasets import Dataset
import evaluate
from sklearn.model_selection import KFold
import gc

# 1. Load combined dataset
df = pd.read_excel("dataset.xlsx")

# 2. Combine target columns
def make_target(row):
    return f"""Abnormal/Normal: {row['Abnormal/Normal']}
Pathologies Extracted: {row['Pathologies Extracted']}
Midline Shift: {row['Midline Shift']}
Location & Brain Organ: {row['Location & Brain Organ']}
Bleed Subcategory: {row['Bleed Subcategory']}"""

df['target'] = df.apply(make_target, axis=1)

# 3. Load model & tokenizer
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)

max_input_length = 512
max_target_length = 256
device = "cuda" if torch.cuda.is_available() else "cpu"

def preprocess(example):
    input_text = f"Extract info: {example['Radiologist Diagnosis']}"
    input_enc = tokenizer(input_text, padding="max_length", truncation=True, max_length=max_input_length)
    target_enc = tokenizer(example['target'], padding="max_length", truncation=True, max_length=max_target_length)
    input_enc['labels'] = target_enc['input_ids']
    return input_enc

# 4. KFold setup
kf = KFold(n_splits=5, shuffle=True, random_state=42)
metric = evaluate.load("accuracy")
results = []  # Collect fold results

for fold, (train_index, val_index) in enumerate(kf.split(df), start=1):
    # Split the data
    train_df = df.iloc[train_index].reset_index(drop=True)
    val_df = df.iloc[val_index].reset_index(drop=True)

    # Tokenize and create datasets
    train_dataset = Dataset.from_pandas(train_df[['Radiologist Diagnosis', 'target']]).map(
        preprocess, remove_columns=['Radiologist Diagnosis', 'target'])
    val_dataset = Dataset.from_pandas(val_df[['Radiologist Diagnosis', 'target']]).map(
        preprocess, remove_columns=['Radiologist Diagnosis', 'target'])

    # Reload fresh model per fold
    model = T5ForConditionalGeneration.from_pretrained(model_name).to(device)

    def compute_metrics(eval_pred):
        predictions, labels = eval_pred
        decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
        return metric.compute(predictions=decoded_preds, references=decoded_labels)

    training_args = TrainingArguments(
        output_dir=f"./t5_fold_{fold}",
        per_device_train_batch_size=8,
        per_device_eval_batch_size=1,  # Lower batch size to save memory
        num_train_epochs=4,
        logging_dir=f"./logs/fold_{fold}",
        logging_steps=10,
        save_total_limit=1,
        report_to="none"
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        data_collator=DataCollatorForSeq2Seq(tokenizer, model=model),
        compute_metrics=compute_metrics,
    )

    trainer.train()

    # Evaluate
    trainer.compute_metrics = None  # Temporarily disable metric computation
    eval_results = trainer.evaluate()



    # Store only required results
    results.append({
        "Fold": fold,
        "Eval Loss": eval_results.get("eval_loss", None),
        "Accuracy": None
    })

    # Aggressive memory cleanup
    del model, trainer, train_dataset, val_dataset
    gc.collect()
    torch.cuda.empty_cache()

# Save final results
results_df = pd.DataFrame(results)
results_df.to_excel("kfold_eval_results.xlsx", index=False)


Map:   0%|          | 0/956 [00:00<?, ? examples/s]

Map:   0%|          | 0/239 [00:00<?, ? examples/s]

  trainer = Trainer(


Step,Training Loss
10,11.0959
20,6.0691
30,3.3815
40,2.1527
50,1.8017
60,1.4923
70,1.1808
80,0.9194
90,0.7207
100,0.5921


Map:   0%|          | 0/956 [00:00<?, ? examples/s]

Map:   0%|          | 0/239 [00:00<?, ? examples/s]

  trainer = Trainer(


Step,Training Loss
10,11.2905
20,5.8188
30,2.8984
40,2.0471
50,1.6761
60,1.355
70,1.0885
80,0.8643
90,0.7178
100,0.6254


Map:   0%|          | 0/956 [00:00<?, ? examples/s]

Map:   0%|          | 0/239 [00:00<?, ? examples/s]

  trainer = Trainer(


Step,Training Loss
10,10.9992
20,6.507
30,3.458
40,2.2585
50,1.7463
60,1.4108
70,1.061
80,0.8349
90,0.7067
100,0.5815


Map:   0%|          | 0/956 [00:00<?, ? examples/s]

Map:   0%|          | 0/239 [00:00<?, ? examples/s]

  trainer = Trainer(


Step,Training Loss
10,10.5436
20,5.684
30,2.9862
40,2.0714
50,1.697
60,1.3619
70,1.1355
80,0.8988
90,0.7749
100,0.6331


Map:   0%|          | 0/956 [00:00<?, ? examples/s]

Map:   0%|          | 0/239 [00:00<?, ? examples/s]

  trainer = Trainer(


Step,Training Loss
10,10.5534
20,5.4324
30,3.1145
40,2.153
50,1.7229
60,1.356
70,1.0732
80,0.8656
90,0.7233
100,0.5822


In [12]:
model = T5ForConditionalGeneration.from_pretrained("./t5_fold_4/checkpoint-480")
tokenizer = T5Tokenizer.from_pretrained("t5-small")

# Save to a new directory
save_path = "./t5_radiology_final"
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

('./t5_radiology_final/tokenizer_config.json',
 './t5_radiology_final/special_tokens_map.json',
 './t5_radiology_final/spiece.model',
 './t5_radiology_final/added_tokens.json')