<a href="https://colab.research.google.com/github/frank-morales2020/MLxDL/blob/main/HF_AGENT_LLAMA3_HFModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q datasets
!pip install -q sentence_transformers
!pip install -U flash-attn --no-build-isolation --quiet
!pip install -q tqdm
!pip install -q colab-env

In [2]:
import colab_env

Mounted at /content/gdrive


In [None]:
from typing import NamedTuple, Literal, get_args
from typing import Any
from pydantic import BaseModel, Field, ConfigDict


# Define types for GenerateContentConfig (you might need to adjust this based on your HF pipeline)
class SafetySetting(BaseModel):
    category: Literal["HARM_CATEGORY_HARASSMENT", "HARM_CATEGORY_HATE_SPEECH", "HARM_CATEGORY_SEXUALLY_EXPLICIT", "HARM_CATEGORY_DANGEROUS_CONTENT"]
    threshold: Literal["BLOCK_NONE", "BLOCK_LOW", "BLOCK_MEDIUM", "BLOCK_HIGH"]

class GenerationConfig(BaseModel):
    stop_sequences: list[str] | None = None
    max_output_tokens: int | None = None
    temperature: float | None = None
    top_p: float | None = None
    top_k: int | None = None

class GenerateContentConfig(BaseModel):
    safety_settings: list[SafetySetting] | None = None
    generation_config: GenerationConfig | None = None

# Define the Agent class - the 'model' field will now be more of a description
class Agent(BaseModel):
    model_config = ConfigDict(arbitrary_types_allowed=True) # Allow arbitrary types like 'pipeline'
    model: str
    name: str
    description: str
    instruction: str
    pipeline: Any = Field(default=None)  # Use 'Any' from typing module
    generate_content_config: GenerateContentConfig | None = Field(default_factory=GenerateContentConfig)


import torch
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer, pipeline, BitsAndBytesConfig

#peft_model_id = "/content/gdrive/MyDrive/model/07MAY2025-Meta-Llama-3-8B-MEDAL-flash-attention-2-cosine-evaldata-epochs1"

peft_model_id = "frankmorales2020/07MAY2025-Meta-Llama-3-8B-MEDAL-flash-attention-2-cosine-evaldata-epochs1"

# BitsAndBytesConfig int-4 config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
)

# Load Model with PEFT adapter
model = AutoPeftModelForCausalLM.from_pretrained(
  peft_model_id,
  device_map="auto",
  torch_dtype=torch.float16,
  attn_implementation="flash_attention_2",
  quantization_config=bnb_config
)

tokenizer = AutoTokenizer.from_pretrained(peft_model_id)
# load into pipeline
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)


# --- Create the basic agent instance ---
basic_agent = Agent(
    model="Meta-Llama-3-8B-MEDAL (PEFT)", # Descriptive name
    name="medical_agent_llama3",
    description="This agent responds to medical inquiries using a fine-tuned Llama 3 model.",
    instruction="Identify and extract the specific term that the query is defining. Provide only that term as your answer.",
    pipeline=pipe, # Assign the loaded pipeline to the agent
    generate_content_config=GenerateContentConfig(temperature=0.2),
)

In [4]:
print(basic_agent)
print('\n')

model='Meta-Llama-3-8B-MEDAL (PEFT)' name='medical_agent_llama3' description='This agent responds to medical inquiries using a fine-tuned Llama 3 model.' instruction='Identify and extract the specific term that the query is defining. Provide only that term as your answer.' pipeline=<transformers.pipelines.text_generation.TextGenerationPipeline object at 0x7b482842c490> generate_content_config=GenerateContentConfig(safety_settings=None, generation_config=None)




## MODEL Inference

In [None]:
# Load our test dataset
from datasets import load_dataset
eval_dataset =load_dataset("json", data_files="/content/gdrive/MyDrive/datasets/McGill-NLP/test_dataset.json", split="train")

In [7]:
eval_dataset

Dataset({
    features: ['abstract_id', 'text', 'location', 'label'],
    num_rows: 1000000
})

In [None]:
from datasets import load_dataset
from random import randint


# Load our test dataset
eval_dataset = load_dataset("json", data_files="/content/gdrive/MyDrive/datasets/McGill-NLP/test_dataset.json", split="train")
nrec= randint(0, len(eval_dataset))
nrec=6

# Test on sample
generation_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer, device_map="auto")  # Add device_map
prompt =  eval_dataset[nrec]['text']


outputs = generation_pipeline(prompt, max_new_tokens=128, do_sample=True, temperature=0.9,
                                  top_k=30, top_p=0.1, eos_token_id=tokenizer.eos_token_id,
                                  pad_token_id=tokenizer.eos_token_id)

In [9]:
print(f"Query:\n{eval_dataset[nrec]['text']}")
print()
oanswer=str(eval_dataset[nrec]['label'])
oanswer=oanswer[2:len(oanswer)-2]
print(f"Original Answer:\n{oanswer}")
print()
ganswer=outputs[0]['generated_text'][len(prompt)+9:].strip()
qc=str(ganswer).find('[INST]')
ganswer=ganswer[0:qc-7]
qc0=str(ganswer).find('[INST]')
ganswer=str(ganswer)[0:qc0]
qc=str(ganswer).find('[/INST]')
if qc>0:
  ganswer=ganswer[qc+8:len(ganswer)]
print(f"Generated Answer:\n{ganswer}")
print()
if ganswer == oanswer:
  print("Match")
else:
  print("NO Match")

Query:
while diminished ovarian reserve dor predicts decreased ovarian response to stimulation it does not necessarily foretell about the fecundity cycle according to bolognas criteria laid down by the european society of human reproduction and embryology old age abnormal ovarian reserve tests such as AFC afc and antimullerian hormone amh as well as prior suboptimal response to stimulation are the main AF representing dor unfavorable response to maximal stimulation on two previous occasions may also represent dor among the ovarian reserve tests amh and afc are the most predictive values for dor AF which may give rise to dor include environmental factors autoimmune or metabolic disorders infections genetic abnormalities and iatrogenic causes such as smoking chemotherapy radiation and gynecologic surgeries besides studies have proposed endometriosis as a key contributor to dor and hence emphasized on its proper management to prevent additional damages leading to compromised fertility in 

## Agent Inference

In [10]:
print("... ... ... Medical Agent Content Generation ... ... ...")

if basic_agent.pipeline:
    #prompt0 = "Define myocardial infarction."
    prompt1 = eval_dataset[nrec]['text']
    print('\n')
    print(f"Query: {prompt1}")
    print('\n')
    print(f"Original Answer: {eval_dataset[nrec]['label']}")
    print('\n')
    output = basic_agent.pipeline(prompt1, max_length=512, num_return_sequences=1)

    ganswer=output[0]['generated_text'][len(prompt1)+9:].strip()
    qc=str(ganswer).find('[INST]')
    ganswer=ganswer[0:qc-7]
    qc0=str(ganswer).find('[INST]')
    ganswer=str(ganswer)[0:qc0]
    qc=str(ganswer).find('[/INST]')
    if qc>0:
      ganswer=ganswer[qc+8:len(ganswer)]
    print(f"Agent Generated Answer: {ganswer}")

    #print(output[0]['generated_text'].strip())
else:
    print("The pipeline for the agent is not loaded.")

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


... ... ... Medical Agent Content Generation ... ... ...


Query: while diminished ovarian reserve dor predicts decreased ovarian response to stimulation it does not necessarily foretell about the fecundity cycle according to bolognas criteria laid down by the european society of human reproduction and embryology old age abnormal ovarian reserve tests such as AFC afc and antimullerian hormone amh as well as prior suboptimal response to stimulation are the main AF representing dor unfavorable response to maximal stimulation on two previous occasions may also represent dor among the ovarian reserve tests amh and afc are the most predictive values for dor AF which may give rise to dor include environmental factors autoimmune or metabolic disorders infections genetic abnormalities and iatrogenic causes such as smoking chemotherapy radiation and gynecologic surgeries besides studies have proposed endometriosis as a key contributor to dor and hence emphasized on its proper management to prev