In [1]:
from google.colab.files import upload
uploaded = upload()

Saving dataset.xlsx to dataset.xlsx
Saving rag_corpus.txt to rag_corpus.txt


In [2]:
from google.colab.files import upload
uploaded = upload()

Saving test_dataset.xlsx to test_dataset.xlsx


In [3]:
!pip install transformers datasets accelerate openpyxl evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=2.0.0->a

In [4]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl (31.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m68.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.11.0


## Training

In [5]:
import pandas as pd
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments, DataCollatorForSeq2Seq
from sklearn.feature_extraction.text import TfidfVectorizer
from datasets import Dataset
import faiss
import numpy as np

# Step 1: Load dataset
df = pd.read_excel("dataset.xlsx")

# Step 2: Combine target fields
def make_target(row):
    return f"""Abnormal/Normal: {row['Abnormal/Normal']}
Pathologies Extracted: {row['Pathologies Extracted']}
Midline Shift: {row['Midline Shift']}
Location & Brain Organ: {row['Location & Brain Organ']}
Bleed Subcategory: {row['Bleed Subcategory']}"""

df["target"] = df.apply(make_target, axis=1)
df = df.dropna(subset=["Radiologist Diagnosis"]).reset_index(drop=True)

# Step 3: Create knowledge base from all rows
documents = df["Radiologist Diagnosis"].tolist()

# Step 4: Build TF-IDF vectorizer and FAISS index
vectorizer = TfidfVectorizer()
doc_vectors = vectorizer.fit_transform(documents).toarray()
index = faiss.IndexFlatL2(doc_vectors.shape[1])
index.add(doc_vectors)

# Step 5: Retrieve most similar row (self-retrieval allowed here)
queries = df["Radiologist Diagnosis"].tolist()
query_vectors = vectorizer.transform(queries).toarray()
_, indices = index.search(query_vectors, 1)
retrieved_contexts = [documents[i[0]] for i in indices]

df["retrieved"] = retrieved_contexts

# Step 6: Prepare tokenizer and model
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name).to("cuda" if torch.cuda.is_available() else "cpu")

# Step 7: Tokenization for training
max_input_length = 512
max_target_length = 256

def preprocess(example):
    input_text = f"Extract info: {example['retrieved']}"
    input_enc = tokenizer(input_text, padding="max_length", truncation=True, max_length=max_input_length)
    target_enc = tokenizer(example["target"], padding="max_length", truncation=True, max_length=max_target_length)
    input_enc["labels"] = target_enc["input_ids"]
    return input_enc

# Step 8: Convert to Hugging Face Dataset
dataset = Dataset.from_pandas(df[["retrieved", "target"]])
tokenized_dataset = dataset.map(preprocess, remove_columns=["retrieved", "target"])

# Step 9: Define training arguments
training_args = TrainingArguments(
    output_dir="./t5_rag_finetuned",
    num_train_epochs=4,
    per_device_train_batch_size=8,
    save_total_limit=1,
    logging_dir="./logs_rag",
    logging_steps=10,
    report_to="none"
)

# Step 10: Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=DataCollatorForSeq2Seq(tokenizer, model=model),
)

# Step 11: Train
trainer.train()

# Step 12: Save model
trainer.save_model("./t5_rag_finetuned")
tokenizer.save_pretrained("./t5_rag_finetuned")

print(" RAG-style fine-tuning complete. Model saved at './t5_rag_finetuned'")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Map:   0%|          | 0/1190 [00:00<?, ? examples/s]

  trainer = Trainer(
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
10,11.0022
20,5.1977
30,2.8454
40,2.0232
50,1.6389
60,1.2994
70,0.9993
80,0.7983
90,0.6824
100,0.5568


 RAG-style fine-tuning complete. Model saved at './t5_rag_finetuned'


## Testing

In [6]:
import pandas as pd
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments, DataCollatorForSeq2Seq
from datasets import Dataset
from sklearn.feature_extraction.text import TfidfVectorizer
import faiss

# Load the test dataset
test_df = pd.read_excel("test_dataset.xlsx")

# Combine output columns into a single target string
def make_target(row):
    return f"""Abnormal/Normal: {row['Abnormal/Normal']}
Pathologies Extracted: {row['Pathologies Extracted']}
Midline Shift: {row['Midline Shift']}
Location & Brain Organ: {row['Location & Brain Organ']}
Bleed Subcategory: {row['Bleed Subcategory']}"""

test_df["target"] = test_df.apply(make_target, axis=1)
test_df = test_df.dropna(subset=["Radiologist Diagnosis"]).reset_index(drop=True)

# Load the RAG corpus used during training
with open("rag_corpus.txt", "r", encoding="utf-8") as f:
    documents = [line.strip() for line in f if line.strip()]

# Build the vectorizer and FAISS index
vectorizer = TfidfVectorizer()
doc_vectors = vectorizer.fit_transform(documents).toarray()
index = faiss.IndexFlatL2(doc_vectors.shape[1])
index.add(doc_vectors)

# Retrieve the top document for each test query
queries = test_df["Radiologist Diagnosis"].tolist()
query_vectors = vectorizer.transform(queries).toarray()
_, indices = index.search(query_vectors, 1)
retrieved_contexts = [documents[i[0]] for i in indices]
test_df["retrieved"] = retrieved_contexts

# Load RAG-finetuned model
model_dir = "./t5_rag_finetuned"
tokenizer = T5Tokenizer.from_pretrained(model_dir)
model = T5ForConditionalGeneration.from_pretrained(model_dir).to("cuda" if torch.cuda.is_available() else "cpu")

# Preprocessing
max_input_length = 512
max_target_length = 256

def preprocess(example):
    input_text = f"Extract info: {example['retrieved']}"
    input_enc = tokenizer(input_text, padding="max_length", truncation=True, max_length=max_input_length)
    target_enc = tokenizer(example["target"], padding="max_length", truncation=True, max_length=max_target_length)
    input_enc["labels"] = target_enc["input_ids"]
    return input_enc

# Tokenize test set
raw_dataset = Dataset.from_pandas(test_df[["retrieved", "target"]])
test_dataset = raw_dataset.map(preprocess, remove_columns=["retrieved", "target"])

# Setup dummy Trainer
eval_args = TrainingArguments(
    output_dir="./temp_rag_test",
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=eval_args,
    tokenizer=tokenizer,
    data_collator=DataCollatorForSeq2Seq(tokenizer, model=model),
)

# Compute test loss
test_results = trainer.evaluate(eval_dataset=test_dataset)
test_loss = test_results.get("eval_loss", None)

# Generate predictions
input_texts = [f"Extract info: {ctx}" for ctx in test_df["retrieved"]]
input_encodings = tokenizer(input_texts, return_tensors="pt", padding=True, truncation=True, max_length=max_input_length).to(model.device)

with torch.no_grad():
    outputs = model.generate(input_encodings['input_ids'], max_length=max_target_length)

# Decode outputs
decoded_preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)

# Save predictions and test loss
test_df["Generated Output"] = decoded_preds
test_df["Test Loss"] = test_loss
test_df.to_excel("t5_rag_test_results.xlsx", index=False)

print("RAG test complete. Results saved to 't5_rag_test_results.xlsx'")


Map:   0%|          | 0/593 [00:00<?, ? examples/s]

  trainer = Trainer(


RAG test complete. Results saved to 't5_rag_test_results.xlsx'


In [7]:
! ! zip -r hehe.zip t5_rag_finetuned/checkpoint-596/

  adding: t5_rag_finetuned/checkpoint-596/ (stored 0%)
  adding: t5_rag_finetuned/checkpoint-596/training_args.bin (deflated 52%)
  adding: t5_rag_finetuned/checkpoint-596/tokenizer_config.json (deflated 94%)
  adding: t5_rag_finetuned/checkpoint-596/config.json (deflated 62%)
  adding: t5_rag_finetuned/checkpoint-596/model.safetensors (deflated 11%)
  adding: t5_rag_finetuned/checkpoint-596/trainer_state.json (deflated 78%)
  adding: t5_rag_finetuned/checkpoint-596/special_tokens_map.json (deflated 85%)
  adding: t5_rag_finetuned/checkpoint-596/spiece.model (deflated 48%)
  adding: t5_rag_finetuned/checkpoint-596/added_tokens.json (deflated 83%)
  adding: t5_rag_finetuned/checkpoint-596/optimizer.pt (deflated 7%)
  adding: t5_rag_finetuned/checkpoint-596/scheduler.pt (deflated 56%)
  adding: t5_rag_finetuned/checkpoint-596/generation_config.json (deflated 29%)
  adding: t5_rag_finetuned/checkpoint-596/rng_state.pth (deflated 25%)
