In [2]:
from datasets import load_dataset
from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import json
import os

In [3]:
# Load dataset
ds = load_dataset("lavita/ChatDoctor-HealthCareMagic-100k")
print(f"\n Dataset loaded successfully")
print(f"  Total samples: {len(ds['train'])}")

# Display sample data
print(f"\nSample conversation:")
sample = ds['train'][0]
print(f"  Instruction: {sample['instruction'][:80]}...")
print(f"  Question: {sample['input'][:100]}...")
print(f"  Answer: {sample['output'][:100]}...")


 Dataset loaded successfully
  Total samples: 112165

Sample conversation:
  Instruction: If you are a doctor, please answer the medical questions based on the patient's ...
  Question: I woke up this morning feeling the whole room is spinning when i was sitting down. I went to the bat...
  Answer: Hi, Thank you for posting your query. The most likely cause for your symptoms is benign paroxysmal p...


In [4]:
# Preprocess the dataset

# Create training, validation, and test splits
ds_split = ds["train"].train_test_split(test_size=0.1, seed=42)
train_val_data = ds_split["train"]
test_data = ds_split["test"]

val_split = train_val_data.train_test_split(test_size=0.1, seed=42)
train_data = val_split["train"]
val_data = val_split["test"]

print(f"  Train set: {len(train_data)} samples")
print(f"  Validation set: {len(val_data)} samples")
print(f"  Test set: {len(test_data)} samples")

# Initialize tokenizer with appropriate settings for T5
model_name = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)

print(f"\n  Tokenizer: {tokenizer.__class__.__name__}")
print(f"  Vocab size: {tokenizer.vocab_size}")

def preprocess_function(examples, max_input_length=256, max_target_length=128):
    """
    Preprocess examples for T5 model.
    """
    inputs = []
    targets = []
    attention_masks = []

    for i in range(len(examples['input'])):
        input_text = "medical question: " + examples['input'][i].strip()
        target_text = examples['output'][i].strip()
        
        tokenized_input = tokenizer(
            input_text,
            max_length=max_input_length,
            padding="max_length",
            truncation=True,
            return_tensors=None
        )
        tokenized_target = tokenizer(
            target_text,
            max_length=max_target_length,
            padding="max_length",
            truncation=True,
            return_tensors=None
        )

        inputs.append(tokenized_input['input_ids'])
        attention_masks.append(tokenized_input['attention_mask'])
        targets.append(tokenized_target['input_ids'])
    
    return {
        'input_ids': inputs,
        'attention_mask': attention_masks,
        'labels': targets
    }

# Preprocess all datasets
print(f"\n  Preprocessing training set (10,000 samples)...")
train_data_preprocessed = train_data.select(range(10000)).map(
    preprocess_function,
    batched=True,
    batch_size=100,
    remove_columns=['instruction', 'input', 'output']
)

print(f"  Preprocessing validation set...")
val_data_preprocessed = val_data.map(
    preprocess_function,
    batched=True,
    batch_size=100,
    remove_columns=['instruction', 'input', 'output']
)

print(f"  Preprocessing test set...")
test_data_preprocessed = test_data.map(
    preprocess_function,
    batched=True,
    batch_size=100,
    remove_columns=['instruction', 'input', 'output']
)

print(f"dataset preprocessed successfully")

  Train set: 90853 samples
  Validation set: 10095 samples
  Test set: 11217 samples

  Tokenizer: T5TokenizerFast
  Vocab size: 32100

  Preprocessing training set (10,000 samples)...
  Preprocessing validation set...
  Preprocessing test set...
dataset preprocessed successfully


In [5]:
# Hyperparameter tuning experiments
experiments = [
    {
        'name': 'Larger Batch Size',
        'learning_rate': 3e-5,
        'batch_size': 16,
        'epochs': 1,
        'max_input_length': 256,
    },
    {
        'name': 'Smaller Batch Size',
        'learning_rate': 3e-5,
        'batch_size': 4,
        'epochs': 1,
        'max_input_length': 256,
    },
    {
        'name': 'Optimized Config',
        'learning_rate': 2e-5,
        'batch_size': 8,
        'epochs': 2,
        'max_input_length': 256,
    },
]

results = []

for exp_idx, config in enumerate(experiments):
    print(f"\n{'─'*70}")
    print(f"EXPERIMENT {exp_idx + 1}/{len(experiments)}: {config['name']}")
    print(f"{'─'*70}")
    print(f"  Learning Rate: {config['learning_rate']}")
    print(f"  Batch Size: {config['batch_size']}")
    print(f"  Epochs: {config['epochs']}")

    # Load fresh model for each experiment
    model = TFAutoModelForSeq2SeqLM.from_pretrained(model_name, use_safetensors=False)

    # Convert Hugging Face datasets to NumPy arrays before feeding to tf.data.Dataset
    train_inputs = {
        'input_ids': tf.convert_to_tensor(train_data_preprocessed['input_ids']),
        'attention_mask': tf.convert_to_tensor(train_data_preprocessed['attention_mask']),
        'labels': tf.convert_to_tensor(train_data_preprocessed['labels']),
    }

    val_inputs = {
        'input_ids': tf.convert_to_tensor(val_data_preprocessed['input_ids']),
        'attention_mask': tf.convert_to_tensor(val_data_preprocessed['attention_mask']),
        'labels': tf.convert_to_tensor(val_data_preprocessed['labels']),
    }

    # Use from_tensor_slices safely with converted tensors
    tf_train_dataset = (
        tf.data.Dataset.from_tensor_slices(train_inputs)
        .shuffle(buffer_size=len(train_data_preprocessed['input_ids']))
        .batch(config['batch_size'])
    )

    tf_val_dataset = (
        tf.data.Dataset.from_tensor_slices(val_inputs)
        .batch(config['batch_size'])
    )

    # Compile model
    optimizer = tf.keras.optimizers.Adam(learning_rate=config['learning_rate'])
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

    model.compile(optimizer=optimizer, loss=loss_fn, metrics=['accuracy'])

    # Train with validation
    print(f"\n  Training...")
    history = model.fit(
        tf_train_dataset,
        validation_data=tf_val_dataset,
        epochs=config['epochs'],
        verbose=1  # show training progress
    )

    # Calculate metrics
    train_loss = float(history.history['loss'][-1])
    val_loss = float(history.history['val_loss'][-1])
    train_acc = float(history.history['accuracy'][-1])
    val_acc = float(history.history['val_accuracy'][-1])

    results.append({
        'Experiment': config['name'],
        'Learning Rate': f"{config['learning_rate']:.0e}",
        'Batch Size': config['batch_size'],
        'Epochs': config['epochs'],
        'Train Loss': f"{train_loss:.4f}",
        'Val Loss': f"{val_loss:.4f}",
        'Train Acc': f"{train_acc:.4f}",
        'Val Acc': f"{val_acc:.4f}",
    })

    print(f"   Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")
    print(f"    Train Acc: {train_acc:.4f} | Val Acc: {val_acc:.4f}")

# Display results table
print(f"\n")
print("EXPERIMENT RESULTS SUMMARY")
print(f"\n")
results_df = pd.DataFrame(results)
print(results_df.to_string(index=False))

# Save results
results_df.to_csv('experiment_results.csv', index=False)
print(f"\nResults saved to 'experiment_results.csv'")



──────────────────────────────────────────────────────────────────────
EXPERIMENT 1/3: Larger Batch Size
──────────────────────────────────────────────────────────────────────
  Learning Rate: 3e-05
  Batch Size: 16
  Epochs: 1




TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We recommend migrating to PyTorch classes or pinning your version of Transformers.
All model checkpoint layers were used when initializing TFT5ForConditionalGeneration.

All the layers of TFT5ForConditionalGeneration were initialized from the model checkpoint at t5-small.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.



  Training...


   Train Loss: 4.0850 | Val Loss: 3.4413
    Train Acc: 0.3217 | Val Acc: 0.3884

──────────────────────────────────────────────────────────────────────
EXPERIMENT 2/3: Smaller Batch Size
──────────────────────────────────────────────────────────────────────
  Learning Rate: 3e-05
  Batch Size: 4
  Epochs: 1


All model checkpoint layers were used when initializing TFT5ForConditionalGeneration.

All the layers of TFT5ForConditionalGeneration were initialized from the model checkpoint at t5-small.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.



  Training...


'(MaxRetryError('HTTPSConnectionPool(host=\'huggingface.co\', port=443): Max retries exceeded with url: /t5-small/resolve/main/config.json (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x000001ABD3994EC0>: Failed to resolve \'huggingface.co\' ([Errno 11001] getaddrinfo failed)"))'), '(Request ID: 79a94016-4924-4dcc-8358-06e315e103cc)')' thrown while requesting HEAD https://huggingface.co/t5-small/resolve/main/config.json
Retrying in 1s [Retry 1/5].


   Train Loss: 3.7688 | Val Loss: 3.2324
    Train Acc: 0.3522 | Val Acc: 0.4130

──────────────────────────────────────────────────────────────────────
EXPERIMENT 3/3: Optimized Config
──────────────────────────────────────────────────────────────────────
  Learning Rate: 2e-05
  Batch Size: 8
  Epochs: 2


'(MaxRetryError('HTTPSConnectionPool(host=\'huggingface.co\', port=443): Max retries exceeded with url: /t5-small/resolve/main/config.json (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x000001AB8357B250>: Failed to resolve \'huggingface.co\' ([Errno 11001] getaddrinfo failed)"))'), '(Request ID: 0ba827e3-d80d-43ec-80a8-9f75a970dca8)')' thrown while requesting HEAD https://huggingface.co/t5-small/resolve/main/config.json
Retrying in 2s [Retry 2/5].
'(MaxRetryError('HTTPSConnectionPool(host=\'huggingface.co\', port=443): Max retries exceeded with url: /t5-small/resolve/main/config.json (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x000001AB8357B9D0>: Failed to resolve \'huggingface.co\' ([Errno 11001] getaddrinfo failed)"))'), '(Request ID: 26d9425a-9889-41fd-ae62-acd28776ee77)')' thrown while requesting HEAD https://huggingface.co/t5-small/resolve/main/config.json
Retrying in 4s [Retry 3/5].
'(MaxRetryError('HTTPSConnecti


  Training...
Epoch 1/2
Epoch 2/2
   Train Loss: 3.5744 | Val Loss: 3.2523
    Train Acc: 0.3706 | Val Acc: 0.4103


EXPERIMENT RESULTS SUMMARY


        Experiment Learning Rate  Batch Size  Epochs Train Loss Val Loss Train Acc Val Acc
 Larger Batch Size         3e-05          16       1     4.0850   3.4413    0.3217  0.3884
Smaller Batch Size         3e-05           4       1     3.7688   3.2324    0.3522  0.4130
  Optimized Config         2e-05           8       2     3.5744   3.2523    0.3706  0.4103

Results saved to 'experiment_results.csv'


In [6]:
# Train optimized model

from transformers import TFAutoModelForSeq2SeqLM
import tensorflow as tf

# Use optimized configuration
final_config = experiments[-1]  # Last experiment = optimized config
print("\nTraining final model with optimized hyperparameters:")
print(f"  Learning Rate: {final_config['learning_rate']}")
print(f"  Batch Size: {final_config['batch_size']}")
print(f"  Epochs: {final_config['epochs']}")

# Load model
final_model = TFAutoModelForSeq2SeqLM.from_pretrained(model_name, from_pt=True, use_safetensors=False)

# Helper: build tf.data.Dataset properly
def make_tf_dataset(data, batch_size, shuffle=False):
    ds = tf.data.Dataset.from_tensor_slices((
        {
            "input_ids": tf.convert_to_tensor(data["input_ids"], dtype=tf.int32),
            "attention_mask": tf.convert_to_tensor(data["attention_mask"], dtype=tf.int32),
        },
        tf.convert_to_tensor(data["labels"], dtype=tf.int32)
    ))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(data["input_ids"]))
    return ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)

tf_train_final = make_tf_dataset(train_data_preprocessed, final_config["batch_size"], shuffle=True)
tf_val_final = make_tf_dataset(val_data_preprocessed, final_config["batch_size"], shuffle=False)

# Define optimizer and loss
optimizer = tf.keras.optimizers.Adam(learning_rate=final_config["learning_rate"])
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction="none")

# Custom training step (since model.fit() isn't ideal for transformers)
@tf.function
def train_step(batch_inputs, batch_labels):
    with tf.GradientTape() as tape:
        outputs = final_model(batch_inputs, labels=batch_labels, training=True)
        logits = outputs.logits
        # Mask padding tokens (-100)
        active_loss = tf.not_equal(batch_labels, -100)
        loss = loss_fn(batch_labels, logits)
        masked_loss = tf.reduce_sum(loss * tf.cast(active_loss, tf.float32)) / tf.reduce_sum(tf.cast(active_loss, tf.float32))
    gradients = tape.gradient(masked_loss, final_model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, final_model.trainable_variables))
    return masked_loss

#  Optional evaluation step
@tf.function
def val_step(batch_inputs, batch_labels):
    outputs = final_model(batch_inputs, labels=batch_labels, training=False)
    logits = outputs.logits
    active_loss = tf.not_equal(batch_labels, -100)
    loss = loss_fn(batch_labels, logits)
    masked_loss = tf.reduce_sum(loss * tf.cast(active_loss, tf.float32)) / tf.reduce_sum(tf.cast(active_loss, tf.float32))
    return masked_loss

#  Training loop
for epoch in range(final_config["epochs"]):
    print(f"\nEpoch {epoch+1}/{final_config['epochs']}")
    train_loss = 0
    val_loss = 0
    step = 0

    for batch_inputs, batch_labels in tf_train_final:
        batch_loss = train_step(batch_inputs, batch_labels)
        train_loss += batch_loss
        step += 1
    avg_train_loss = train_loss / step

    for batch_inputs, batch_labels in tf_val_final:
        batch_loss = val_step(batch_inputs, batch_labels)
        val_loss += batch_loss
    avg_val_loss = val_loss / len(tf_val_final)

    print(f"Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f}")

print("\n Final model training complete.")



Training final model with optimized hyperparameters:
  Learning Rate: 2e-05
  Batch Size: 8
  Epochs: 2


All PyTorch model weights were used when initializing TFT5ForConditionalGeneration.

All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.



Epoch 1/2
Train Loss: 4.0583 | Val Loss: 3.4239

Epoch 2/2
Train Loss: 3.5749 | Val Loss: 3.2540

 Final model training complete.


In [9]:
# Evaluation and metrics
import numpy as np
import nltk
from nltk.translate.bleu_score import sentence_bleu
from nltk.tokenize import word_tokenize

# Ensure both old and new tokenizers are downloaded
try:
    nltk.data.find("tokenizers/punkt")
except LookupError:
    nltk.download("punkt")

# For newer NLTK versions (>= 3.8)
try:
    nltk.data.find("tokenizers/punkt_tab")
except LookupError:
    nltk.download("punkt_tab")

def calculate_bleu_score(references, hypotheses):
    """Calculate BLEU score for generated texts"""
    bleu_scores = []
    for ref, hyp in zip(references, hypotheses):
        # Handle None or empty strings safely
        if not ref or not hyp:
            continue
        ref_tokens = [word_tokenize(ref.lower())]
        hyp_tokens = word_tokenize(hyp.lower())
        score = sentence_bleu(ref_tokens, hyp_tokens, weights=(0.5, 0.5))
        bleu_scores.append(score)
    return np.mean(bleu_scores) if bleu_scores else 0.0


print(f"\nGenerating predictions on test set ({len(test_data)} samples)...")
test_references = []
test_predictions = []

# Generate predictions for a few test samples (for efficiency)
for i in range(min(50, len(test_data))):
    sample = test_data_preprocessed['input_ids'][i]
    attention_mask = test_data_preprocessed['attention_mask'][i]
    reference = tokenizer.decode(test_data_preprocessed['labels'][i], skip_special_tokens=True)

    input_ids = tf.constant([sample])
    attention_mask = tf.constant([attention_mask])

    output = final_model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_length=128,
        num_beams=4,
        early_stopping=True
    )

    prediction = tokenizer.decode(output[0], skip_special_tokens=True)

    test_predictions.append(prediction)
    test_references.append(reference)

# Calculate BLEU score
print(f"\nCalculating BLEU Score on {len(test_predictions)} samples...")
bleu_score = calculate_bleu_score(test_references, test_predictions)
print(f"  BLEU Score: {bleu_score:.4f}")

# Calculate additional metrics
avg_pred_len = np.mean([len(p.split()) for p in test_predictions]) if test_predictions else 0
avg_ref_len = np.mean([len(r.split()) for r in test_references]) if test_references else 0

print("\nEVALUATION METRICS:\n")
print(f"  BLEU Score: {bleu_score:.4f} (measures n-gram overlap with references)")
print(f"  Test Samples Evaluated: {len(test_predictions)}")
print(f"  Avg Prediction Length: {avg_pred_len:.1f} words")
print(f"  Avg Reference Length: {avg_ref_len:.1f} words")


[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Hp\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.



Generating predictions on test set (11217 samples)...

Calculating BLEU Score on 50 samples...
  BLEU Score: 0.0708

EVALUATION METRICS:

  BLEU Score: 0.0708 (measures n-gram overlap with references)
  Test Samples Evaluated: 50
  Avg Prediction Length: 77.8 words
  Avg Reference Length: 84.2 words


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [10]:
#save model
print(f"\nSaving model and tokenizer...")
final_model.save_pretrained("healthcare_chatbot_model")
tokenizer.save_pretrained("healthcare_chatbot_model")
print(f"Model saved to 'healthcare_chatbot_model/'")


Saving model and tokenizer...
Model saved to 'healthcare_chatbot_model/'


In [11]:
# testing
test_questions = [
    "I have a sore throat and mild cough for 2 days. What should I do?",
    "What are the early symptoms of diabetes?",
    "I experience chest pain when exercising. Should I be concerned?",
    "How can I treat a common cold at home?",
    "What are the causes of persistent headaches?",
]

for q in test_questions:
    input_text = "medical question: " + q
    inputs = tokenizer(input_text, return_tensors="tf")
    output = final_model.generate(**inputs, max_length=100, num_beams=4)
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    print(f"\nQ: {q}")
    print(f"A: {response}\n")

TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We recommend migrating to PyTorch classes or pinning your version of Transformers.



Q: I have a sore throat and mild cough for 2 days. What should I do?
A: Hi, Welcome to Chat Doctor. I understand your concern. You have a sore throat and mild cough for 2 days. You have a mild cough and mild cough for 2 days. You have a mild cough and mild cough for 2 days. You have a sore throat and mild cough for 2 days. You have a mild cough for 2 days. You have a mild cough for 2 days. You have a mild cough for 2 days. You have 


Q: What are the early symptoms of diabetes?
A: Hi, Welcome to Chat Doctor. If you have diabetes, you may need to consult a doctor for a diagnosis. If you have diabetes, you may need to consult a doctor. If you have diabetes, you may need to consult a doctor. If you have diabetes, you may need to consult a doctor. If you have diabetes, you may need to consult a doctor. If you have diabetes, you may need to consult a doctor.


Q: I experience chest pain when exercising. Should I be concerned?
A: Hi, Welcome to Chat Doctor. I can understand your concern. Yo