In [1]:
# Colab-only step: prompt for a file upload from the local machine.
# The recorded output of this cell shows rag_corpus.txt being saved; a later
# cell opens that file by relative path to build the retrieval corpus.
# NOTE(review): `uploaded` is rebound by the next upload cell and is not read
# again in the visible cells.
from google.colab.files import upload
uploaded = upload()

Saving rag_corpus.txt to rag_corpus.txt


In [2]:
# Colab-only step: prompt for a second file upload.
# The recorded output of this cell shows test_dataset.xlsx being saved; a later
# cell reads that workbook with pandas as the evaluation set.
# NOTE(review): this rebinds `uploaded`, replacing the value from the previous
# upload cell; neither value is read again in the visible cells.
from google.colab.files import upload
uploaded = upload()

Saving test_dataset.xlsx to test_dataset.xlsx


In [3]:
!pip install transformers datasets accelerate openpyxl evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=2.0.0->a

In [4]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl (31.3 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 31.3/31.3 MB 48.9 MB/s eta 0:00:00
Installing collected packages: faiss-cpu
Successfully installed faiss-cpu-1.11.0


In [6]:
import pandas as pd
import torch
from transformers import T5Tokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.feature_extraction.text import TfidfVectorizer
import faiss

# Load knowledge base: every non-empty line of the corpus file is one
# retrievable document (surrounding whitespace stripped).
with open("rag_corpus.txt", "r", encoding="utf-8") as f:
    documents = [line.strip() for line in f if line.strip()]

# Fail early with a clear message instead of an obscure vectorizer error.
if not documents:
    raise ValueError("rag_corpus.txt contains no non-empty lines; cannot build the retrieval index")

# Vectorize and build index
# TF-IDF produces float64 values, while FAISS stores and searches float32
# vectors. Cast explicitly so indexing does not depend on the FAISS Python
# wrapper converting the dtype on our behalf.
vectorizer = TfidfVectorizer()
doc_vectors = vectorizer.fit_transform(documents).toarray().astype("float32")
index = faiss.IndexFlatL2(doc_vectors.shape[1])  # exact (brute-force) L2 search
index.add(doc_vectors)

# Load model/tokenizer; the model is moved to the GPU when one is available.
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to("cuda" if torch.cuda.is_available() else "cpu")

# Load test data
test_df = pd.read_excel("test_dataset.xlsx")

# Prepare targets
def make_target(row):
    """Render the five annotation fields of one row as the reference text.

    Args:
        row: Any mapping-like object (e.g. a DataFrame row) indexable by the
            annotation column names listed below.

    Returns:
        A string with one "<column name>: <value>" line per field, in a fixed
        order, joined by newlines and without a trailing newline. Values are
        formatted with their default string form.
    """
    field_names = (
        "Abnormal/Normal",
        "Pathologies Extracted",
        "Midline Shift",
        "Location & Brain Organ",
        "Bleed Subcategory",
    )
    return "\n".join(f"{name}: {row[name]}" for name in field_names)

# Build the reference text for every row, then keep only rows that have a
# diagnosis to use as the retrieval query.
test_df["target"] = test_df.apply(make_target, axis=1)
test_df = test_df.dropna(subset=["Radiologist Diagnosis"]).reset_index(drop=True)

# Retrieve top context for each sample
# Coerce each query to str so a non-text cell read from the workbook (e.g. a
# bare number) does not crash the vectorizer; missing values were dropped above.
queries = [str(query) for query in test_df["Radiologist Diagnosis"]]
# FAISS searches float32 vectors, so cast the float64 TF-IDF output explicitly.
query_vectors = vectorizer.transform(queries).toarray().astype("float32")
_, indices = index.search(query_vectors, 1)  # k=1: nearest document only
retrieved_contexts = [documents[i[0]] for i in indices]

# Add retrieved context to test_df
test_df["retrieved"] = retrieved_contexts

# Prepare Hugging Face Dataset
# Token-length caps applied when encoding model inputs and reference targets.
max_input_length = 512
max_target_length = 256

def preprocess(example):
    """Tokenize one example into model inputs and loss labels.

    The prompt is "Extract info: " followed by the example's retrieved
    passage. Both prompt and target are padded/truncated to fixed lengths
    (`max_input_length` / `max_target_length`).

    Padding positions in the labels are replaced with -100, the index the
    seq2seq loss ignores. Without this, the loss would also be averaged over
    the padding that fills each target up to `max_target_length`, distorting
    the reported evaluation loss.

    Args:
        example: A single (non-batched) record with string fields
            "retrieved" and "target".

    Returns:
        The tokenizer's encoding of the prompt, with an added "labels" entry
        holding the target token ids (padding masked as -100).
    """
    input_text = f"Extract info: {example['retrieved']}"
    input_enc = tokenizer(input_text, padding="max_length", truncation=True, max_length=max_input_length)
    target_enc = tokenizer(example["target"], padding="max_length", truncation=True, max_length=max_target_length)

    # Mask padding so it does not contribute to the loss.
    pad_id = tokenizer.pad_token_id
    input_enc["labels"] = [
        token_id if token_id != pad_id else -100
        for token_id in target_enc["input_ids"]
    ]
    return input_enc

# Wrap the two text columns in a Hugging Face Dataset and tokenize it row by
# row; the raw strings are dropped once they have been encoded.
text_columns = ["retrieved", "target"]
raw_dataset = Dataset.from_pandas(test_df[text_columns])
tokenized_dataset = raw_dataset.map(preprocess, remove_columns=text_columns)

# Evaluation-only setup: TrainingArguments exists solely so a Trainer can be
# constructed; no training is started in this block.
args = TrainingArguments(output_dir="./temp_rag_eval", report_to="none")

# Trainer drives the evaluation pass over the tokenized test set
seq2seq_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
trainer = Trainer(
    model=model,
    args=args,
    tokenizer=tokenizer,
    data_collator=seq2seq_collator,
)

# Loss reported under "eval_loss" (None if that key is absent)
eval_results = trainer.evaluate(eval_dataset=tokenized_dataset)
test_loss = eval_results.get("eval_loss")

# Generate predictions
# Decode in fixed-size batches instead of sending the whole test set through
# generate() at once, so peak memory does not grow with the dataset size. The
# attention mask is passed explicitly so padded positions are never attended to.
generation_batch_size = 16
input_texts = [f"Extract info: {ctx}" for ctx in test_df["retrieved"]]
decoded_preds = []

with torch.no_grad():
    for batch_start in range(0, len(input_texts), generation_batch_size):
        batch_texts = input_texts[batch_start:batch_start + generation_batch_size]
        inputs = tokenizer(
            batch_texts,
            return_tensors="pt",
            padding=True,  # pad only to the longest prompt in this batch
            truncation=True,
            max_length=max_input_length,
        ).to(model.device)
        output_ids = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=max_target_length,
        )
        # Predictions are appended in input order, one string per test row.
        decoded_preds.extend(tokenizer.batch_decode(output_ids, skip_special_tokens=True))

# Attach the generated text (one entry per row) and the scalar test loss
# (repeated on every row) to the frame, then write the whole table to Excel.
result_columns = {
    "Generated Output": decoded_preds,
    "Test Loss": test_loss,
}
for column_name, column_values in result_columns.items():
    test_df[column_name] = column_values

test_df.to_excel("rag_t5_comparable_results.xlsx", index=False)

print("RAG-style T5 results and test loss saved to 'rag_t5_comparable_results.xlsx'")


Map:   0%|          | 0/593 [00:00<?, ? examples/s]

  trainer = Trainer(
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


RAG-style T5 results and test loss saved to 'rag_t5_comparable_results.xlsx'
