In [None]:
# !pip install --upgrade transformers torch accelerate bitsandbytes trl peft

In [None]:
# !pip install --upgrade trl
# !pip install -q -U bitsandbytes
# !pip install -q datasets

In [None]:
import os
from getpass import getpass

# Never write an access token into the notebook itself: it would be saved
# with the file and leak through version control or shared copies.
# Reuse a token that is already exported in the environment; otherwise ask
# for it interactively (the input is not echoed).
if not os.environ.get('HF_TOKEN'):
    os.environ['HF_TOKEN'] = getpass('Hugging Face token: ')

# Now you can access it later using:
# token = os.environ.get('HF_TOKEN')

In [None]:
from huggingface_hub import login
import torch
import os
# Authenticate this session against the Hugging Face Hub with the token held
# in the HF_TOKEN environment variable (raises KeyError if it is not set).
hf_token = os.environ["HF_TOKEN"]
login(token=hf_token)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [None]:
# Show the attached GPU, its driver/CUDA versions and current memory usage.
!nvidia-smi

Mon Apr 21 16:04:47 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   71C    P8             13W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

model_id = "meta-llama/Llama-3.1-8B"

# bfloat16 is only natively supported on GPUs with compute capability >= 8.0.
# On older cards (such as the Tesla T4 reported by nvidia-smi above) fall back
# to float16, which also matches the fp16 mixed-precision flag used for
# training later in this notebook.
compute_dtype = (
    torch.bfloat16 if torch.cuda.get_device_capability(0)[0] >= 8 else torch.float16
)

# 4-bit NF4 quantization with double quantization; matmuls run in compute_dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
# device_map={"": 0} places the whole model on GPU 0.
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map={"":0})

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

# Loading and Processing the Dataset

In [None]:
from datasets import load_dataset

# The dataset is downloaded from the Hugging Face Hub. Log in first
# (e.g. with `huggingface-cli login`) to access this dataset.
DATASET_ID = "juliadollis/trad_ai_medical_chatbot_v8_1"
ds = load_dataset(DATASET_ID)

In [None]:
import pandas as pd
# Keep only the two conversation columns of the train split and rename them
# to the generic names used by the rest of the notebook.
column_names = {"Patient": "text", "Doctor": "response"}
df2 = pd.DataFrame(ds["train"])[list(column_names)].rename(columns=column_names)
#df2 = df2[:2000]
df2.head()

Unnamed: 0,text,response
0,"Hi doctor, I had sex last month after ovulatio...",Hello. Revert with the reports to an obstetric...
1,"Hello doctor, I am a 24 year old working woman...",Hello. Feeling sleepy during office timing is ...
2,"Hello doctor, My blood report came today surpr...","Hi. As I understand, you are having hypertensi..."
3,"Hello doctor,I am trying to conceive but my hu...","Hello, Wellcome to iclinq.com. There are few r..."
4,"Hello doctor, My fiancee and I had unprotected...","Hi. How are you doing? Yes, as you have heard,..."


In [None]:
# Continue with an independent copy so later changes do not touch df2.
df = df2.copy()
n_rows = len(df)
print(n_rows)
df.head()


3800


Unnamed: 0,text,response
0,"Hi doctor, I had sex last month after ovulatio...",Hello. Revert with the reports to an obstetric...
1,"Hello doctor, I am a 24 year old working woman...",Hello. Feeling sleepy during office timing is ...
2,"Hello doctor, My blood report came today surpr...","Hi. As I understand, you are having hypertensi..."
3,"Hello doctor,I am trying to conceive but my hu...","Hello, Wellcome to iclinq.com. There are few r..."
4,"Hello doctor, My fiancee and I had unprotected...","Hi. How are you doing? Yes, as you have heard,..."


In [None]:
from datasets import load_dataset, Dataset
import os
import kagglehub

# Alpaca-style prompt template with two positional `{}` slots: the first is
# filled with the patient's message, the second with the doctor's reply
# (left empty at generation time).
alpaca_prompt = """You are a highly knowledgeable and empathetic medical AI assistant trained to provide accurate, clear, and respectful health-related advice. Your responses should be medically sound, non-alarming, and encourage patients to consult with healthcare professionals when necessary.

### Instruction:
Given a patient's message, provide an appropriate and informative medical response.

### Input:
Patient: {}

### Output:
Doctor: {}"""



# End-of-sequence marker of the tokenizer; it is appended to every formatted
# training example by the formatting function below.
EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
# Wrap the pandas frame as a Hugging Face Dataset so it can be mapped in batches.
hf_dataset = Dataset.from_pandas(df)

# Formatting function for training
def formatting_prompts_func(examples):
    """Render a batch of patient/doctor pairs into full prompt strings.

    Args:
        examples: Batch mapping with parallel sequences under "text"
            (user messages) and "response" (replies).

    Returns:
        A dict whose "text" entry holds one string per pair, built from
        `alpaca_prompt` and terminated with `EOS_TOKEN`.
    """
    pairs = zip(examples["text"], examples["response"])
    rendered = [
        alpaca_prompt.format(question, answer) + EOS_TOKEN  # Add EOS token
        for question, answer in pairs
    ]
    return {"text": rendered}

# Run the formatting function over the dataset in batches; the "text" column
# it returns replaces the raw user message with the full training prompt.
dataset = hf_dataset.map(formatting_prompts_func, batched=True)

# Print sample output
sample_index = 54
print(dataset[sample_index]["text"])

Map:   0%|          | 0/3800 [00:00<?, ? examples/s]

You are a highly knowledgeable and empathetic medical AI assistant trained to provide accurate, clear, and respectful health-related advice. Your responses should be medically sound, non-alarming, and encourage patients to consult with healthcare professionals when necessary.

### Instruction:
Given a patient's message, provide an appropriate and informative medical response.

### Input:
Patient: Hello doctor, My fiancee and I had unprotected sex a few days back, but I did not ejaculate inside her. Just to be on the safer side, we wanted to use the emergency contraceptive pill. But due to some restriction in the country where we live, Plan B or emergency contraceptive pills are not available. I read that Yasmin, which is used as a regular contraceptive pill can be used as an emergency contraceptive pill at a higher dosage. Can Yasmin be used as an emergency contraceptive pill? And at what dosage?

### Output:
Doctor: Hi. How are you doing? Yes, as you have heard, Yasmin can be used as 

In [None]:
from peft import prepare_model_for_kbit_training

# Turn on gradient checkpointing on the quantized model before it is wrapped.
model.gradient_checkpointing_enable()
# Let PEFT prepare the quantized model for k-bit training.
# NOTE(review): this helper has its own gradient-checkpointing option, so the
# explicit call above may be redundant — confirm against the installed PEFT.
model = prepare_model_for_kbit_training(model)

In [None]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.

    Args:
        model: Any object exposing `named_parameters()` that yields
            `(name, parameter)` pairs, where each parameter provides
            `numel()` and `requires_grad`.

    The counts come straight from `numel()`.
    NOTE(review): for 4-bit quantized layers `numel()` appears to report the
    packed storage size, so "all params" can be lower than the nominal model
    size — confirm before quoting these numbers.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count
    # A model without parameters would otherwise raise ZeroDivisionError.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct}"
    )

# Training Llama 3.1

In [None]:
from peft import LoraConfig, get_peft_model

# Names of the sub-modules that receive LoRA adapters.
lora_target_modules = [
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
    "gate_proj",
    "up_proj",
    "down_proj",
]

# Rank-8 adapters with scaling alpha=32 and 5% dropout; bias terms are left
# untouched and the task is causal language modelling.
config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=lora_target_modules,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

# Wrap the base model with the adapters and report how much of it will train.
model = get_peft_model(model, config)
print_trainable_parameters(model)

trainable params: 20971520 || all params: 4561571840 || trainable%: 0.4597432800707574


In [None]:
if tokenizer.pad_token is None:
    # Llama 3.1 already reserves a dedicated padding token. Reusing it keeps
    # the vocabulary size unchanged, so the embedding matrix and lm_head of
    # the quantized, PEFT-wrapped model do not have to be resized (a resize
    # adds a randomly initialised row and forces the full embedding layers to
    # be stored alongside the adapter).
    reserved_pad_token = "<|finetune_right_pad_id|>"
    if reserved_pad_token in tokenizer.get_vocab():
        tokenizer.pad_token = reserved_pad_token
        print(f"using the existing token {reserved_pad_token} for padding")
    else:
        # Fallback for tokenizers that lack the reserved token: add a new one.
        tokenizer.add_special_tokens({'pad_token': '[PAD]'})
        # Make sure to resize the embeddings of the model
        model.resize_token_embeddings(len(tokenizer))
        print("we have add a new token called [PAD]")

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
The new lm_head weights will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


we have add a new token called [PAD]


In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
#from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model = model,
    # Pass the tokenizer that was configured above. Without it the trainer
    # loads a fresh tokenizer from the model name, so the pad-token setup
    # done in this notebook would not be used for training.
    processing_class = tokenizer,
    train_dataset = dataset,
    # dataset_text_field = "text",
    # max_seq_length = 2048,
    # dataset_num_proc = 2, #1
    # packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 2,  # effective batch size of 4 per device
        warmup_steps = 5,
        num_train_epochs = 1, # Set this for 1 full training run.
        #max_steps = 20,
        learning_rate = 2e-4,
        fp16 = True,   # fp16 mixed precision (bf16 is disabled below)
        bf16 = False,
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
    ),
)

Converting train dataset to ChatML:   0%|          | 0/3800 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/3800 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/3800 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/3800 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [None]:
import wandb
# wandb.init()
# Do not embed the API key in the notebook. Called without a key,
# wandb.login() uses the WANDB_API_KEY environment variable or previously
# stored credentials, and otherwise prompts for the key.
wandb.login()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mbmaged23[0m ([33mbmaged23-hugging-face[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [None]:
import time

# Start time — perf_counter() is monotonic, so the measured duration cannot
# be distorted by system clock adjustments during the long training run.
start_time = time.perf_counter()

# Train the model
trainer_stats = trainer.train()

# End time
end_time = time.perf_counter()

# Calculate elapsed time
elapsed_time = end_time - start_time

# Convert to hours, minutes, and seconds (whole seconds, rounded down)
total_minutes, seconds = divmod(int(elapsed_time), 60)
hours, minutes = divmod(total_minutes, 60)

print(f"Training completed in {hours}h {minutes}m {seconds}s.")




`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Step,Training Loss
1,2.5003
2,2.47
3,2.2668
4,2.328
5,2.0662
6,2.3381
7,1.8297
8,1.8436
9,1.6292
10,1.7912


  return fn(*args, **kwargs)


Training completed in 2h 14m 40s.


In [None]:
# The model may still be in training mode after trainer.train(); eval()
# switches off dropout (including the LoRA dropout) for generation.
model.eval()

# Fill only the patient slot. The empty answer slot leaves a trailing space
# after "Doctor:", which is stripped so generation starts at the same token
# boundary as in the training examples.
prompt = alpaca_prompt.format(
    "how long will be the vacation",
    "", # output - leave this blank for generation!
).rstrip()
inputs = tokenizer([prompt], return_tensors = "pt").to("cuda")

# With the default settings generation stops after roughly 20 new tokens,
# which cuts answers off mid-sentence; allow a longer reply explicitly.
outputs = model.generate(
    **inputs,
    max_new_tokens = 200,
    use_cache = True,
    pad_token_id = tokenizer.eos_token_id,  # avoids the "Setting pad_token_id" warning
)
tokenizer.batch_decode(outputs)

In [None]:
# Same question without the prompt template, to compare the raw behaviour.
text = "how long will be the vacation"
device = "cuda:0"

# The model may still be in training mode after trainer.train(); eval()
# switches off dropout for generation.
model.eval()

inputs = tokenizer(text, return_tensors="pt").to(device)
# An explicit budget avoids the default limit of roughly 20 new tokens.
outputs = model.generate(
    **inputs,
    max_new_tokens=200,
    pad_token_id=tokenizer.eos_token_id,
)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

In [None]:
# Number of entries in the tokenizer; the save cell below uses this value to
# size the model's embeddings.
print(len(tokenizer))

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model_path = "./Medical-Chatbot-llama-3.1-finetuned"

# Keep the embedding matrix and the vocab size recorded in the config in step
# with the tokenizer before anything is written to disk.
vocab_size = len(tokenizer)
model.resize_token_embeddings(vocab_size)
model.config.vocab_size = vocab_size

# Write the model and then the tokenizer into the same folder. The tokenizer
# call stays last so the cell displays the list of files it wrote.
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)



('./Medical-Chatbot-llama-3.1-finetuned/tokenizer_config.json',
 './Medical-Chatbot-llama-3.1-finetuned/special_tokens_map.json',
 './Medical-Chatbot-llama-3.1-finetuned/tokenizer.json')

In [None]:
from huggingface_hub import HfApi, login, create_repo

# Log in to Hugging Face (only needed once)
# login()  # This will prompt for your Hugging Face token if needed

# Define your repository details
repo_id = "bola23/Medical-Chatbot-llama-3.1-finetuned"  # Replace with your Hugging Face username and model name
folder_path = "Medical-Chatbot-llama-3.1-finetuned"  # Ensure this is the correct path to your saved model and tokenizer

# Create the private model repository; exist_ok=True avoids an error when the
# repository already exists, so the cell can be re-run.
create_repo(repo_id=repo_id, repo_type="model", private=True, exist_ok=True)

# Initialize API and upload the model and tokenizer folder to the Hub
api = HfApi()
upload_args = dict(
    folder_path=folder_path,
    path_in_repo=".",  # Uploads everything at the root of the repo
    repo_id=repo_id,
    repo_type="model",
)
api.upload_folder(**upload_args)

print(f"Model and tokenizer uploaded to: https://huggingface.co/{repo_id}")

adapter_model.safetensors:   0%|          | 0.00/4.29G [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

Model and tokenizer uploaded to: https://huggingface.co/bola23/Medical-Chatbot-llama-3.1-finetuned


# Inference

In [None]:
!pip install gradio
!pip install streamlit
!pip install PyPDF2
!pip install langchain_google_genai
!pip install -U langchain-community
!pip install faiss-cpu
!pip install -U bitsandbytes

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m


In [None]:
import os
from getpass import getpass

# Never write an access token into the notebook itself: it would be saved
# with the file and leak through version control or shared copies.
# Reuse a token that is already exported in the environment; otherwise ask
# for it interactively (the input is not echoed).
if not os.environ.get('HF_TOKEN'):
    os.environ['HF_TOKEN'] = getpass('Hugging Face token: ')

# Now you can access it later using:
# token = os.environ.get('HF_TOKEN')

In [None]:
from huggingface_hub import login
import torch
import os
# Authenticate this session against the Hugging Face Hub with the token held
# in the HF_TOKEN environment variable (raises KeyError if it is not set).
hf_token = os.environ["HF_TOKEN"]
login(token=hf_token)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [None]:
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
model_id = "bola23/Medical-Chatbot-llama-3.1-finetuned"  # Your fine-tuned model ID
base_model_id = "meta-llama/Llama-3.1-8B" # The original base model ID

# bfloat16 is only natively supported on GPUs with compute capability >= 8.0;
# fall back to float16 on older cards such as the Tesla T4.
compute_dtype = (
    torch.bfloat16 if torch.cuda.get_device_capability(0)[0] >= 8 else torch.float16
)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
)

# Load the base model
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    quantization_config=bnb_config,
    device_map={"": 0}
)

# The tokenizer stored with the fine-tuned model may contain extra tokens, so
# size the base model's embeddings to match it before the adapter is loaded.
tokenizer = AutoTokenizer.from_pretrained(model_id)
base_model.resize_token_embeddings(len(tokenizer))
base_model.config.vocab_size = len(tokenizer)

# Now load the PEFT model using the base model.
# The former `timeout=60` argument was dropped: it is not a parameter of
# PeftModel.from_pretrained and had no effect on the download. To allow for
# slow downloads, set the HF_HUB_DOWNLOAD_TIMEOUT environment variable before
# loading instead.
model = PeftModel.from_pretrained(
    base_model,  # Provide the base model here
    model_id=model_id,
    device_map={"": 0},
)

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.7k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/439 [00:00<?, ?B/s]

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
The new lm_head weights will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


adapter_config.json:   0%|          | 0.00/796 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/4.29G [00:00<?, ?B/s]

In [None]:
# del base_model

In [None]:
# Prompt template with two positional `{}` slots: the first is filled with the
# patient's message, the second with the doctor's reply (left empty when
# generating).
alpaca_prompt = """You are a highly knowledgeable and empathetic medical AI assistant trained to provide accurate, clear, and respectful health-related advice. Your responses should be medically sound, non-alarming, and encourage patients to consult with healthcare professionals when necessary.

### Instruction:
Given a patient's message, provide an appropriate and informative medical response.

### Input:
Patient: {}

### Output:
Doctor: {}"""

In [None]:
# Fill only the patient slot. The empty answer slot leaves a trailing space
# after "Doctor:", which is stripped so the reply does not start with a
# doubled space.
prompt = alpaca_prompt.format(
    "my stomach is hurting me so what i have to take",
    "", # output - leave this blank for generation!
).rstrip()
inputs = tokenizer([prompt], return_tensors = "pt").to("cuda")

# With the default settings generation stops after roughly 20 new tokens,
# which cuts answers off mid-sentence; allow a longer reply explicitly.
outputs = model.generate(
    **inputs,
    max_new_tokens = 200,
    use_cache = True,
    pad_token_id = tokenizer.eos_token_id,  # avoids the "Setting pad_token_id" warning
)
tokenizer.batch_decode(outputs)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


["<|begin_of_text|>You are a highly knowledgeable and empathetic medical AI assistant trained to provide accurate, clear, and respectful health-related advice. Your responses should be medically sound, non-alarming, and encourage patients to consult with healthcare professionals when necessary.\n\n### Instruction:\nGiven a patient's message, provide an appropriate and informative medical response.\n\n### Input:\nPatient: my stomach is hurting me so what i have to take\n\n### Output:\nDoctor:  Hello. Your stomach pain could be due to many reasons. If it is a mild pain then you"]

In [None]:
# Same kind of question without the prompt template, for comparison.
text = "my leg is hurting me so what i have to take"
device = "cuda:0"

inputs = tokenizer(text, return_tensors="pt").to(device)
# An explicit budget avoids the default limit of roughly 20 new tokens, which
# truncates the output mid-sentence.
outputs = model.generate(
    **inputs,
    max_new_tokens=200,
    pad_token_id=tokenizer.eos_token_id,  # avoids the "Setting pad_token_id" warning
)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


my leg is hurting me so what i have to take is an anti inflammatory drug for the pain and the swelling and the redness and the heat and i


# Whisper model

In [None]:
import torch
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import soundfile as sf
import torchaudio
from transformers import BitsAndBytesConfig

WHISPER_MODEL_ID = "openai/whisper-small"

# The processor and the speech-to-text model are loaded from the same checkpoint.
processor = WhisperProcessor.from_pretrained(WHISPER_MODEL_ID)
record_model = WhisperForConditionalGeneration.from_pretrained(WHISPER_MODEL_ID)

# Ensure that forced_decoder_ids is None
record_model.config.forced_decoder_ids = None

# Convert WhatsApp voice note (.opus) to wav using torchaudio
def convert_opus_to_wav(opus_file, wav_file):
    """Decode an audio file and save it as a 16 kHz mono WAV file.

    Args:
        opus_file: Path of the input audio file (e.g. a WhatsApp .opus note).
        wav_file: Path of the WAV file to write.
    """
    target_rate = 16000  # Whisper expects 16kHz audio

    # Load .opus file
    waveform, sample_rate = torchaudio.load(opus_file)

    # The loaded tensor is laid out as (channels, frames). Average the
    # channels so multi-channel input becomes a 1-D mono signal; writing a
    # (channels, frames) array directly would be read by soundfile as
    # `channels` frames of `frames` channels each.
    if waveform.dim() > 1:
        waveform = waveform.mean(dim=0)

    # Resample to 16kHz only when necessary
    if sample_rate != target_rate:
        waveform = torchaudio.transforms.Resample(sample_rate, target_rate)(waveform)

    # Save the resampled file in .wav format
    sf.write(wav_file, waveform.numpy(), target_rate)

preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/836k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.97k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.87k [00:00<?, ?B/s]

In [None]:
# Test LLAMA 3.1 Only

import gradio as gr
import torch
import soundfile as sf

# Define prompt template
# The template has a single `{}` slot for the user's message and ends at
# "Assistant:" so the model continues from there.
# NOTE(review): this generic User/Assistant wording differs from the
# Patient/Doctor template used for fine-tuning earlier in this notebook (that
# version is kept commented out below) — confirm which one is intended here.

alpaca_prompt = """You are an AI assistant trained to provide accurate and helpful responses in a conversation.

### Instruction:
Given a user message, generate the most appropriate response.

### Input:
User: {}

### Output:
Assistant:"""

# i have a headach what drug i have to take

# alpaca_prompt = """You are a highly knowledgeable and empathetic medical AI assistant trained to provide accurate, clear, and respectful health-related advice. Your responses should be medically sound, non-alarming, and encourage patients to consult with healthcare professionals when necessary.

# ### Instruction:
# Given a patient's message, provide an appropriate and informative medical response.

# ### Input:
# Patient: {}

# ### Output:
# Doctor: """


def convert_opus_to_wav(opus_path, wav_path):
    """Re-encode an audio file as a 16 kHz WAV file.

    The input is decoded with librosa at a fixed 16 kHz sampling rate and the
    samples are written to `wav_path` with soundfile. This definition replaces
    the torchaudio-based function of the same name from the earlier cell.

    Args:
        opus_path: Path of the audio file to read.
        wav_path: Path of the WAV file to write.
    """
    import librosa

    target_rate = 16000
    samples, rate = librosa.load(opus_path, sr=target_rate)
    sf.write(wav_path, samples, rate)

def transcribe_audio(audio_path):
    """Transcribe an audio file to text with the Whisper model.

    The file is first converted to a 16 kHz WAV via `convert_opus_to_wav`,
    then passed through `processor` and `record_model`.

    Args:
        audio_path: Path of the recorded or uploaded audio file.

    Returns:
        The decoded transcription of the first (only) item in the batch.
    """
    import os
    import tempfile

    # Convert Opus to Wav inside a private temporary directory. A fixed file
    # name in the working directory would be overwritten when two requests
    # run at the same time and would be left behind afterwards.
    with tempfile.TemporaryDirectory() as tmp_dir:
        wav_file = os.path.join(tmp_dir, "converted_voice_note.wav")
        convert_opus_to_wav(audio_path, wav_file)

        # Load audio file (fully read into memory before the directory is removed)
        audio_input, _ = sf.read(wav_file)

    # Prepare input for the model
    input_features = processor(audio_input, return_tensors="pt", sampling_rate=16000).input_features

    # Generate transcription (no gradients are needed for inference)
    with torch.no_grad():
        generated_ids = record_model.generate(input_features)

    # Decode transcription
    transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return transcription

def generate_text(prompt):
    """Generate the model's reply to a single user message.

    Args:
        prompt: The user's message; it is inserted into `alpaca_prompt`.

    Returns:
        The generated continuation only (without the prompt), with
        surrounding whitespace removed.
    """
    # Format the prompt
    formatted_prompt = alpaca_prompt.format(prompt)

    # Tokenize the input prompt
    inputs = tokenizer([formatted_prompt], return_tensors="pt").to("cuda")
    prompt_length = inputs["input_ids"].shape[1]

    # Generate response
    outputs = model.generate(**inputs, max_new_tokens=200, use_cache=True)

    # Decode only the newly generated tokens. Cutting the decoded text by
    # len(formatted_prompt) is fragile: decoding does not always reproduce the
    # prompt character for character, which shifts the cut and clips or
    # pollutes the reply.
    new_tokens = outputs[0][prompt_length:]
    response = tokenizer.decode(new_tokens, skip_special_tokens=True)

    return response.strip()

def process_input(text_input, audio_input):
    """Answer a request that arrives as typed text or as a voice recording.

    Typed text takes priority. Text that is empty or consists only of
    whitespace counts as missing, so an accompanying recording is still used.

    Args:
        text_input: Text typed by the user (may be empty or None).
        audio_input: Path of the recorded audio file (may be None).

    Returns:
        The generated reply, or a hint when neither input was provided.
    """
    text = (text_input or "").strip()
    if text:
        return generate_text(text)
    if audio_input:
        transcription = transcribe_audio(audio_input)
        return generate_text(transcription)
    return "Please provide either text or voice input."

# Set up Gradio interface with text and voice input
text_box = gr.Textbox(lines=2, placeholder="Enter your text prompt here...")
audio_box = gr.Audio(type="filepath")  # Remove the 'source' argument

# Both inputs are passed to process_input in this order; the reply is shown as text.
iface = gr.Interface(
    fn=process_input,
    inputs=[text_box, audio_box],
    outputs="text",
    title="LLM Fine-tuned Model with Voice & Text Input",
)

# Launch the Gradio app
iface.launch()


It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://5776dfb52d44d55226.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


