<a href="https://colab.research.google.com/github/eaindome/Machine-Learning/blob/main/Phase_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Kenya Clinical Reasoning Challenge

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
# pip install transformers datasets torch pandas numpy

In [None]:
# pip install sentencepiece

In [None]:
# pip install accelerate --upgrade

In [2]:
# pip install datasets

In [3]:
# suppress warnings
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # 0 = all messages, 3 = errors only
# os.environ["WANDB_MODE"] = "disabled"

### Prepare the Data (using train_raw.csv)

In [4]:
import pandas as pd
from datasets import Dataset # type: ignore

In [5]:
# Load data
train_raw = pd.read_csv('/content/train_raw.csv')

# Preprocessing function (fixed from Phase 1)
def preprocess_clinician(text):
    text = text.lower()
    text = ''.join(c for c in text if c.isalnum() or c.isspace())
    text = ' '.join(text.split())
    return text

# Handle missing Years of Experience in Prompt
def update_prompt(row):
    if pd.isna(row['Years of Experience']):
        return row['Prompt'].replace("i am a nurse", "i am a nurse with unknown years of experience")
    return row['Prompt']

# Apply preprocessing
train_raw['Prompt'] = train_raw.apply(update_prompt, axis=1)
train_raw['Clinician'] = train_raw['Clinician'].apply(preprocess_clinician)

# Create a Hugging Face Dataset
data = {'input_text': train_raw['Prompt'], 'target_text': train_raw['Clinician']}
dataset = Dataset.from_dict(data)

# Split into train (80%) and validation (20%)
train_test_split = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split['train']
val_dataset = train_test_split['test']

In [6]:
# Verify
print(f"Train sample:\n{train_dataset[0]}\n")
print(f"Validation sample:\n{val_dataset[0]}")

Train sample:
{'input_text': 'I am a nurse with 10 years of experience in General nursing working in a National Referral Hospitals in Uasin Gishu county in Kenya. A patient is brought to the Theatre with intestinal obstruction. This is his first admission. Patient complains of inability to pass stool for one and half weeks and abdominal distension. There is no history for chronic diseases in the family. Vital sign at the time of receiving the patient in Theatre BP 130/70 mmHg, Pulse 100 bpm, RR 21/minute, SPO2 96%. Upon opening the abdomen all the intestines had become necrosed.\nQuestion\nHow should the patient be managed intraoperatively and postoperatively?', 'target_text': 'patient brought to the theater with intestinal obstruction complaints of inability to pass stool and abdominal distension vitals critical upon opening abdomen all intestines were necrosed q how should the patient be managed intraoperatively and postoperatively intraoperation resection and anastomosis constructio

#### Baseline Model (Flan-T5 Small)

In [7]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
import torch

In [8]:
# Load tokenizer and model
model_name = "google/flan-t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Tokenize the dataset
def tokenize_function(examples):
    inputs = tokenizer(examples['input_text'], max_length=512, truncation=True, padding="max_length")
    targets = tokenizer(examples['target_text'], max_length=256, truncation=True, padding="max_length")
    inputs["labels"] = targets["input_ids"]
    return inputs

train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

# Set format for PyTorch
train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
val_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])

# Training arguments
training_args = TrainingArguments(
    output_dir="./flan_t5_small_output_v3",
    num_train_epochs=7,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=50,
    weight_decay=0.01,
    learning_rate=3e-4,
    logging_dir='./logs',
    logging_steps=10,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Map:   0%|          | 0/320 [00:00<?, ? examples/s]

Map:   0%|          | 0/80 [00:00<?, ? examples/s]

In [9]:
# Fine-tune
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33meaindome[0m ([33meaindome-org[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,2.7042,2.217534
2,2.1182,1.903041
3,1.9077,1.831127
4,1.7778,1.791305
5,1.5305,1.780568
6,1.7637,1.774105
7,1.604,1.769809


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].


TrainOutput(global_step=560, training_loss=2.5832993967192515, metrics={'train_runtime': 21655.6398, 'train_samples_per_second': 0.103, 'train_steps_per_second': 0.026, 'total_flos': 416394730536960.0, 'train_loss': 2.5832993967192515, 'epoch': 7.0})

In [10]:
# Save model
model.save_pretrained("./flan_t5_small_finetuned")
tokenizer.save_pretrained("./flan_t5_small_finetuned")

('./flan_t5_small_finetuned/tokenizer_config.json',
 './flan_t5_small_finetuned/special_tokens_map.json',
 './flan_t5_small_finetuned/spiece.model',
 './flan_t5_small_finetuned/added_tokens.json')

In [11]:
# Load fine-tuned model
model = T5ForConditionalGeneration.from_pretrained("./flan_t5_small_finetuned")
tokenizer = T5Tokenizer.from_pretrained("./flan_t5_small_finetuned")

In [12]:
# !pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [13]:
# pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=33a71c1e2640fc6fd94c64ea91c299ce0e87fed24306c14bf0929be0de3b2b12
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [None]:
from evaluate import load
rouge = load("rouge")

# Generate predictions for validation set
val_prompts = val_dataset['input_text']
val_ground_truths = val_dataset['target_text']
val_predictions = []

for prompt in val_prompts[:10]:  # Test on first 10 for now
    inputs = tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True)
    outputs = model.generate(**inputs, max_length=256, num_beams=4, do_sample=True, top_k=40, top_p=0.9, temperature=0.6, no_repeat_ngram_size=2)
    pred = tokenizer.decode(outputs[0], skip_special_tokens=True)
    val_predictions.append(pred)

# Compute ROUGE
results = rouge.compute(predictions=val_predictions, references=val_ground_truths[:10])
print("ROUGE Scores:", results)

In [None]:
# Test on indices 0, 1, 2
for i in [0, 1, 2]:
    test_prompt = f"Provide a summary and detailed management plan: {train_raw['Prompt'][i]}"
    inputs = tokenizer(test_prompt, return_tensors="pt", max_length=512, truncation=True)
    outputs = model.generate(
        **inputs,
        max_length=256,
        num_beams=5,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        temperature=0.7,
        no_repeat_ngram_size=3,
        early_stopping=True
    )
    prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(f"\nIndex {i}:")
    print("Prompt:", test_prompt)
    print("Prediction:", prediction)
    print("Ground Truth:", train_raw['Clinician'][i])

In [None]:
from evaluate import load
rouge = load("rouge")

val_predictions = []
for prompt in val_prompts[:10]:
    inputs = tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True)
    outputs = model.generate(**inputs, max_length=256, num_beams=5, do_sample=True, top_k=50, top_p=0.95, temperature=0.7, no_repeat_ngram_size=3, early_stopping=True)
    pred = tokenizer.decode(outputs[0], skip_special_tokens=True)
    val_predictions.append(pred)

results = rouge.compute(predictions=val_predictions, references=val_ground_truths[:10])
print("Updated ROUGE Scores:", results)