# Doctor - HealthCare

### Import Libraries

In [10]:
import pandas as pd
import numpy as np
import torch

from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from datasets import Dataset

### Load Dataset

In [11]:
file_path = "/kaggle/input/doctor-healthcare-100k/Doctor-HealthCare-100k.csv"
df = pd.read_csv(file_path)

print(df.head())
print(df.info())

                                         instruction  \
0  If you are a doctor, please answer the medical...   
1  If you are a doctor, please answer the medical...   
2  If you are a doctor, please answer the medical...   
3  If you are a doctor, please answer the medical...   
4  If you are a doctor, please answer the medical...   

                                               input  \
0  I woke up this morning feeling the whole room ...   
1  My baby has been pooing 5-6 times a day for a ...   
2  Hello, My husband is taking Oxycodone due to a...   
3  lump under left nipple and stomach pain (male)...   
4  I have a 5 month old baby who is very congeste...   

                                              output  
0  Hi, Thank you for posting your query. The most...  
1  Hi... Thank you for consulting in Chat Doctor....  
2  Hello, and I hope I can help you today.First, ...  
3  HI. You have two different problems. The lump ...  
4  Thank you for using Chat Doctor. I would sugge..

In [12]:
# Drop Unnecessary Data
df = df[['input', 'output']]

# Handle Missing Values
df = df.dropna()

In [13]:
# Convert to Hugging Face Dataset
dataset = Dataset.from_pandas(df)
dataset = dataset.train_test_split(test_size=0.1)

print(dataset) 

DatasetDict({
    train: Dataset({
        features: ['input', 'output'],
        num_rows: 100940
    })
    test: Dataset({
        features: ['input', 'output'],
        num_rows: 11216
    })
})


### Fine-Tune the Model

In [15]:
if torch.cuda.is_available():
    print("CUDA is available. Using GPU for training.")
else:
    print("CUDA is not available. Using CPU for training.")

CUDA is available. Using GPU for training.


In [16]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the distilgpt2 model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")
model = GPT2LMHeadModel.from_pretrained("distilgpt2")

# Move the model to the GPU if available
model.to(device)

# Set padding token to eos_token 
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    # Concatenate input and output for tokenization to ensure consistent padding
    concatenated = [f"{inp} {tokenizer.eos_token} {out}" for inp, out in zip(examples['input'], examples['output'])]
    
    # Tokenize the concatenated text
    tokenized = tokenizer(
        concatenated,
        padding="max_length",
        truncation=True,
        max_length=512,
        return_tensors="pt",
    )
    
    # Use the same input_ids as labels 
    tokenized["labels"] = tokenized["input_ids"].clone()
    
    return tokenized

# Apply the tokenizer to the dataset
dataset = dataset.map(tokenize_function, batched=True)

# Use a 90-10 split (train-test)
train_dataset = dataset['train']
test_dataset = dataset['test']

# Define training arguments 
training_args = TrainingArguments(
    output_dir='./results',          
    eval_strategy="steps",           
    save_strategy="steps",           
    save_steps=500,                  
    learning_rate=2e-5,              
    per_device_train_batch_size=8,  
    per_device_eval_batch_size=8,   
    num_train_epochs=1,             
    weight_decay=0.01,               
    logging_dir='./logs',            
    logging_steps=500,               
    load_best_model_at_end=True,     
    report_to="tensorboard",         
    gradient_accumulation_steps=2,  
    fp16=True,  
)

# Initialize the Trainer
trainer = Trainer(
    model=model,                         
    args=training_args,                  
    train_dataset=train_dataset,         
    eval_dataset=test_dataset,           
    tokenizer=tokenizer,                 
)

# Start fine-tuning
trainer.train()

# Save the final model and tokenizer
model_save_path = './fine_tuned_model'  
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)

print(f"Model and tokenizer saved to {model_save_path}")

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Map:   0%|          | 0/100940 [00:00<?, ? examples/s]

Map:   0%|          | 0/11216 [00:00<?, ? examples/s]

  trainer = Trainer(


Step,Training Loss,Validation Loss
500,1.8181,1.603507
1000,1.6201,1.554957
1500,1.599,1.530887
2000,1.5753,1.515001
2500,1.5549,1.506438
3000,1.5478,1.502588


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


Model and tokenizer saved to ./fine_tuned_model


### Example

In [72]:
# Load the fine-tuned model and tokenizer
model_save_path = './fine_tuned_model'
model = GPT2LMHeadModel.from_pretrained(model_save_path)
tokenizer = GPT2Tokenizer.from_pretrained(model_save_path)

# Define pad token if missing
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Move the model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Function to generate response
def generate_response(input_text):
    # Add a system prompt to guide the model
    system_prompt = (
        """
        You are a professional and empathetic virtual health assistant. Your role is to support users by providing general health-related information, offering helpful suggestions, and guiding them to consult qualified healthcare professionals for personalized advice.

        **Guidelines**:
        1. Respond with empathy and professionalism, acknowledging user concerns and emotions.
        2. Provide general health information that is safe and widely accepted, without diagnosing or recommending specific treatments or medications.
        3. Encourage users to seek healthcare professionals for personalized guidance when necessary, but offer general advice where appropriate.
        4. Offer non-prescriptive wellness tips, lifestyle recommendations, or general health practices that promote overall well-being.
        5. Use simple, clear language, and explain medical terms if requested, but avoid overwhelming the user with too much technical detail.
        6. Ensure your responses are reassuring and prioritize the user's overall safety and well-being, while also empowering them to take the next steps.
        
        **Example**:
        User: [User's query]  
        Assistant: [Empathetic response with general health information, tips, and encouragement to consult a healthcare professional if needed]
        """
    )
    full_input = f"<|startoftext|>{system_prompt}\n\nUser: {input_text}\nAssistant:<|endoftext|>"
    input_ids = tokenizer(full_input, return_tensors="pt", padding=True, truncation=True).input_ids.to(device)

    output = model.generate(
        input_ids,
        max_new_tokens=1000,
        num_beams=5,
        no_repeat_ngram_size=2,
        early_stopping=True,
        pad_token_id=tokenizer.pad_token_id  
    )

    response = tokenizer.decode(output[0], skip_special_tokens=True)
    return response.split("Assistant:")[-1].strip()

# Test on a sample from the test dataset
sample_input = dataset["test"][755]["input"]
print("Input:", sample_input)
print("\nGenerated Output:", generate_response(sample_input))

Input: Im constantly plagued with mouth ulcers and tongue ulcers and feel as if my tongue is swollen, when I lay down my mouth constantly fills with saliva and I constantly have to swallow its  most annoying and I have to try to sleep with my neck up in the air.  My ears are painful too, this is ongoing condition

Generated Output: Thanks for your question on Chat Doctor. I can understand your concern. In my opinion, you should consult an ENT specialist and get done clinical examination of your mouth. If you require more of my help in this aspect, I will be happy to help you further. Please do not hesitate to ask in case of any further doubts. Wishing you good health.
