### Installation

In [None]:
%%capture
# Installs Unsloth and the libraries used in this notebook. The `%%capture`
# magic above must stay on the first line of the cell; it silences pip's output.
import os
# Environment detection: the branch below treats the presence of "COLAB_" in the
# concatenated environment-variable names as "running on Google Colab".
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    # `--no-deps` installs exactly the listed packages without their dependencies
    # (presumably to avoid disturbing Colab's preinstalled stack - TODO confirm).
    # NOTE(review): the saved output of the model-loading cell reports that this
    # xformers build targets PyTorch 2.6.0 while the runtime had 2.8.0, so the
    # xformers pin likely needs revisiting.
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl==0.15.2 triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1" huggingface_hub hf_transfer
    !pip install --no-deps unsloth

### Model and Tokenizer Loading using Unsloth

In [None]:
from unsloth import FastLanguageModel
import torch
# Loader settings; later cells reuse these three names when reloading the model.
# NOTE(review): dtype=None presumably lets Unsloth choose the dtype - confirm.
max_seq_length, dtype, load_in_4bit = 2048, None, True

# Keyword arguments gathered in one place so the load call itself stays short.
loader_kwargs = dict(
    model_name="unsloth/llama-3-8b-instruct-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)
model, tokenizer = FastLanguageModel.from_pretrained(**loader_kwargs)

ü¶• Unsloth: Will patch your computer to enable 2x faster free finetuning.


    PyTorch 2.6.0+cu124 with CUDA 1204 (you have 2.8.0+cu126)
    Python  3.12.9 (you have 3.12.12)
  Please reinstall xformers (see https://github.com/facebookresearch/xformers#installing-xformers)
  Memory-efficient attention, SwiGLU, sparse and more won't be available.
  Set XFORMERS_MORE_DETAILS=1 for more details


Switching to PyTorch attention since your Xformers is broken.

Unsloth: Xformers was not installed correctly.
Please install xformers separately first.
Then confirm if it's correctly installed by running:
python -m xformers.info

Longer error message:
xFormers can't load C++/CUDA extensions. xFormers was built for:
    PyTorch 2.6.0+cu124 with CUDA 1204 (you have 2.8.0+cu126)
    Python  3.12.9 (you have 3.12.12)
  Please reinstall xformers (see https://github.com/facebookresearch/xformers#installing-xformers)
  Memory-efficient attention, SwiGLU, sparse and more won't be available.
ü¶• Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.11.1: Fast Llama patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.c

model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/220 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/345 [00:00<?, ?B/s]

### Adding LoRA Adapters

In [None]:
# Projection layers that receive LoRA adapters (attention first, then MLP).
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# LoRA hyper-parameters, kept together so they are easy to tune.
lora_settings = dict(
    r=16,
    target_modules=lora_target_modules,
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)

# Rebinds `model` to the adapter-wrapped model returned by Unsloth.
model = FastLanguageModel.get_peft_model(model, **lora_settings)

Unsloth 2025.11.1 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


### Preprocessing the dataset

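Each training example is formatted with the Alpaca prompt template (instruction / input / response), a format commonly used for instruction fine-tuning of Llama models.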



In [None]:
# Prompt template with three named placeholders - {instruction}, {input} and
# {response} - filled in via `str.format` by the cells that build prompts.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Input:
{input}

### Response:
{response}"""

# End-of-sequence marker taken from the tokenizer; `formatting_prompts_func`
# appends it to every formatted training example.
EOS_TOKEN = tokenizer.eos_token

def formatting_prompts_func(examples):
    """Turn a batch of question/answer pairs into Alpaca-formatted training text.

    Args:
        examples: Batch mapping with parallel "question" and "answer" sequences.

    Returns:
        A dict with a single "text" key holding one formatted string per pair;
        each string is the filled-in `alpaca_prompt` followed by `EOS_TOKEN`.
    """
    # The instruction is identical for every example, so build it once.
    instruction = "Answer the following question based on plant disease context."
    pairs = zip(examples["question"], examples["answer"])
    formatted = [
        alpaca_prompt.format(instruction=instruction, input=question, response=answer)
        + EOS_TOKEN
        for question, answer in pairs
    ]
    return {"text": formatted}

from datasets import load_dataset

# Read the JSONL training file and add the formatted "text" column in one chain.
dataset = load_dataset(
    "json",
    data_files="final_dataset.jsonl",
    split="train",
).map(formatting_prompts_func, batched=True)


Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/13926 [00:00<?, ? examples/s]

### Config and train the model

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Decide the mixed-precision mode once: bf16 when the GPU supports it, else fp16.
use_bf16 = is_bfloat16_supported()

# Optimisation settings for a short 60-step run
# (effective batch size = 2 per device x 4 accumulation steps).
training_args = TrainingArguments(
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    warmup_steps=5,
    max_steps=60,
    learning_rate=2e-4,
    fp16=not use_bf16,
    bf16=use_bf16,
    logging_steps=1,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,
    output_dir="outputs",
    report_to="none",
)

# Supervised fine-tuning trainer reading the pre-formatted "text" column.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,
    args=training_args,
)

Unsloth: Tokenizing ["text"] (num_proc=6):   0%|          | 0/13926 [00:00<?, ? examples/s]

In [None]:
# @title Show current memory stats
# Bytes per GiB; dividing by it once equals three successive divisions by 1024.
BYTES_PER_GIB = 1024 ** 3

gpu_stats = torch.cuda.get_device_properties(0)
reserved_bytes = torch.cuda.max_memory_reserved()
start_gpu_memory = round(reserved_bytes / BYTES_PER_GIB, 3)
max_memory = round(gpu_stats.total_memory / BYTES_PER_GIB, 3)

print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = Tesla T4. Max memory = 14.748 GB.
5.984 GB of memory reserved.


In [None]:
# Run the fine-tuning loop on the trainer built above and keep the object
# returned by `train()` in `trainer_stats`.
trainer_stats = trainer.train()

The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 13,926 | Num Epochs = 1 | Total steps = 60
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 41,943,040 of 8,072,204,288 (0.52% trained)


Step,Training Loss
1,3.7682
2,3.7989
3,3.8447
4,3.1994
5,3.0024
6,2.6381
7,2.152
8,1.5783
9,1.3672
10,1.0768


Unsloth: Will smartly offload gradients to save VRAM!


### Inference

In [None]:

# Put the model into Unsloth's inference mode before generating.
FastLanguageModel.for_inference(model)

# Build the prompt step by step: question -> filled template -> token tensors.
question = (
    "Write one clear paragraph on Tomato Yellow Leaf Curl Virus covering: "
    "host range, key symptoms, transmission/vector, and 3‚Äì4 practical IPM controls "
    "(resistant varieties, whitefly control, sanitation, cultural practices)."
)
prompt = alpaca_prompt.format(
    instruction="Answer the following question based on plant disease context.",
    input=question,
    response="",  # left empty so the model writes the answer
)
inputs = tokenizer([prompt], return_tensors="pt").to("cuda")

# NOTE(review): `temperature` / `top_p` only influence the output when sampling
# is enabled - confirm the model's generation config sets `do_sample=True`.
outputs = model.generate(**inputs, max_new_tokens=1000, temperature=0.7, top_p=0.9)

# Last expression of the cell: the decoded text (prompt + answer) is displayed.
tokenizer.batch_decode(outputs, skip_special_tokens=True)


['Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nAnswer the following question based on plant disease context.\n\n### Input:\nWrite one clear paragraph on Tomato Yellow Leaf Curl Virus covering: host range, key symptoms, transmission/vector, and 3‚Äì4 practical IPM controls (resistant varieties, whitefly control, sanitation, cultural practices).\n\n### Response:\nTomato Yellow Leaf Curl Virus (TYLCV) is a serious pathogen affecting tomatoes, and other solanaceous crops. The disease is characterized by yellowing and curling of leaves, leading to stunted growth and reduced yields. TYLCV is transmitted by the whitefly, Bemisia tabaci, and the virus is present in the plant sap. To control TYLCV, farmers can adopt resistant varieties, control whitefly populations, maintain good sanitation, and practice cultural practices like crop rotation, removing weeds, and 

 Use a `TextStreamer` for continuous inference (generate token by token)

In [None]:
from transformers import TextStreamer

# alpaca_prompt = Copied from above
FastLanguageModel.for_inference(model)  # Enable native 2x faster inference

# Same question as the previous cell, this time printed while it is generated.
streaming_question = (
    "Write one clear paragraph on Tomato Yellow Leaf Curl Virus covering: "
    "host range, key symptoms, transmission/vector, and 3‚Äì4 practical IPM controls "
    "(resistant varieties, whitefly control, sanitation, cultural practices)."
)
streaming_prompt = alpaca_prompt.format(
    instruction="Answer the following question based on plant disease context.",
    input=streaming_question,
    response="",
)
inputs = tokenizer([streaming_prompt], return_tensors="pt").to("cuda")

# The streamer prints decoded text as it is produced; the return value of
# `generate` is discarded on purpose.
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer=text_streamer, max_new_tokens=500)

<|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Answer the following question based on plant disease context.

### Input:
Write one clear paragraph on Tomato Yellow Leaf Curl Virus covering: host range, key symptoms, transmission/vector, and 3‚Äì4 practical IPM controls (resistant varieties, whitefly control, sanitation, cultural practices).

### Response:
Tomato Yellow Leaf Curl Virus (TYLCV) is a significant disease affecting tomato crops worldwide. TYLCV is transmitted by the sweet potato whitefly (Bemisia tabaci) and causes symptoms like yellowing of leaves, curling, and stunting. The virus can infect a wide range of hosts, including tomato, pepper, eggplant, and okra. Practically, TYLCV can be controlled through the use of resistant varieties, whitefly control using insecticides or biological agents, maintaining good sanitation practices,

### Saving finetuned models
The cell below saves only the LoRA adapters and the tokenizer files, not the full base model.

In [None]:
# Single target folder for both the adapter weights and the tokenizer files.
adapter_dir = "safefelora_lora_adapters"

model.save_pretrained(adapter_dir)
# Last expression of the cell: the tuple of written tokenizer files is displayed.
tokenizer.save_pretrained(adapter_dir)

('safefelora_lora_adapters/tokenizer_config.json',
 'safefelora_lora_adapters/special_tokens_map.json',
 'safefelora_lora_adapters/chat_template.jinja',
 'safefelora_lora_adapters/tokenizer.json')

### Load the saved LoRA adapters for inference

In [None]:

from unsloth import FastLanguageModel

import gc

import torch
from transformers import TextStreamer

# The fine-tuned model (and the trainer wrapping it) from the cells above is
# still resident on the GPU. The saved run of this cell failed with
# "ValueError: Some modules are dispatched on the CPU or the disk", i.e. there
# was not enough free GPU memory for a second copy of the model. Release the old
# objects and PyTorch's cached CUDA blocks before loading again.
# `globals().pop(..., None)` is used instead of `del` so the cell also runs in a
# session where some of these names were never defined.
for stale_name in ("trainer", "model", "outputs", "inputs", "text_streamer"):
    globals().pop(stale_name, None)
gc.collect()
torch.cuda.empty_cache()

# Reload base model + saved LoRA adapters from the local folder, reusing the
# loader settings defined in the first model-loading cell.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "safefelora_lora_adapters",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# alpaca_prompt used above

# A longer, multi-part question to exercise the reloaded adapters.
advanced_question = (
    "Write ONE advanced paragraph (5‚Äì7 sentences) on Tomato Yellow Leaf Curl Virus (TYLCV) covering: "
    "1) how to distinguish TYLCV from nutrient deficiencies (e.g., interveinal chlorosis patterns without strong upward curling) and herbicide injury (field uniformity, timing, malformed new growth), "
    "2) transmission biology via Bemisia tabaci and practical early warning (yellow sticky-trap counts, sentinel seedlings, scouting of new flush), "
    "3) seasonality/climate factors that raise risk (warm, dry periods and protected cultivation), and "
    "4) a stepwise IPM plan with 4‚Äì6 actions (TYLCV-resistant cultivars, certified virus-free seedlings, whitefly suppression and rogueing of symptomatic plants, reflective mulches/screens, sanitation and weed host removal, crop-free breaks), plus one caution about avoiding inappropriate or banned insecticides and resistance management."
)

inputs = tokenizer(
[
    alpaca_prompt.format(
        instruction="Answer the following question based on plant disease context.",
        input=advanced_question,
        response="",  # left empty so the model writes the answer
    )
], return_tensors="pt").to("cuda")

# Stream the answer to the cell output while it is being generated.
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer=text_streamer, max_new_tokens=500, temperature=0.7, top_p=0.9, use_cache=True)

==((====))==  Unsloth 2025.11.1: Fast Llama patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


ValueError: Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules in 32-bit, you need to set `llm_int8_enable_fp32_cpu_offload=True` and pass a custom `device_map` to `from_pretrained`. Check https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu for more details. 

### Download LoRA Adapters

In [None]:
# Colab-only helper: `files.download` sends a file from the runtime to the browser.
from google.colab import files
# Pack the adapter folder recursively (-r) into one archive so it can be
# downloaded as a single file.
!zip -r safefelora_lora_adapters.zip safefelora_lora_adapters
files.download("safefelora_lora_adapters.zip")

  adding: safefelora_lora_adapters/ (stored 0%)
  adding: safefelora_lora_adapters/special_tokens_map.json (deflated 70%)
  adding: safefelora_lora_adapters/tokenizer_config.json (deflated 96%)
  adding: safefelora_lora_adapters/adapter_model.safetensors (deflated 8%)
  adding: safefelora_lora_adapters/chat_template.jinja (deflated 52%)
  adding: safefelora_lora_adapters/tokenizer.json (deflated 85%)
  adding: safefelora_lora_adapters/README.md (deflated 65%)
  adding: safefelora_lora_adapters/adapter_config.json (deflated 57%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### Upload LoRA Adapters

In [None]:
# Colab-only helper: `files.upload` opens a file picker in the browser.
from google.colab import files

# The saved output shows safefelora_lora_adapters.zip being uploaded here; the
# return value is kept in `uploaded`.
uploaded = files.upload()

Saving safefelora_lora_adapters.zip to safefelora_lora_adapters.zip


In [None]:
!unzip safefelora_lora_adapters.zip

Archive:  safefelora_lora_adapters.zip
   creating: safefelora_lora_adapters/
  inflating: safefelora_lora_adapters/special_tokens_map.json  
  inflating: safefelora_lora_adapters/tokenizer_config.json  
  inflating: safefelora_lora_adapters/adapter_model.safetensors  
  inflating: safefelora_lora_adapters/chat_template.jinja  
  inflating: safefelora_lora_adapters/tokenizer.json  
  inflating: safefelora_lora_adapters/README.md  
  inflating: safefelora_lora_adapters/adapter_config.json  


## Evaluation

Install the packages needed for the evaluation process.

In [None]:
# Upgrade (-U) the libraries used by the evaluation cells below to their latest
# releases. NOTE(review): no versions are pinned here, so results may differ
# between runs of this notebook.
!pip install -U bitsandbytes accelerate transformers peft



### Load the model using Transformers

In [None]:

from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer, BitsAndBytesConfig

# Folder holding the saved LoRA adapters and tokenizer files.
ADAPTER_DIR = "safefelora_lora_adapters"

# Load the base model plus the saved adapters in 4-bit.
# The quantization settings are passed through `quantization_config` because
# the bare `load_in_4bit=` keyword is deprecated (see the warning in this
# cell's saved output) and will be removed in a future Transformers release.
model = AutoPeftModelForCausalLM.from_pretrained(
    ADAPTER_DIR,
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
)
tokenizer = AutoTokenizer.from_pretrained(ADAPTER_DIR)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/220 [00:00<?, ?B/s]

In [None]:
# Upgrade bitsandbytes to the latest release.
# NOTE(review): the evaluation install cell above already upgrades bitsandbytes,
# so this cell looks redundant - confirm whether it is still needed.
!pip install -U bitsandbytes



### Inference using Transformers

In [None]:
# Template repeated here so this section does not depend on the training cells.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Input:
{input}

### Response:
{response}"""

# Build the prompt step by step: question -> filled template -> token tensors.
sample_question = (
    "Write one clear paragraph on Tomato Yellow Leaf Curl Virus covering: "
    "host range, key symptoms, transmission/vector, and 3‚Äì4 practical IPM controls "
    "(resistant varieties, whitefly control, sanitation, cultural practices)."
)
sample_prompt = alpaca_prompt.format(
    instruction="Answer the following question based on plant disease context.",
    input=sample_question,
    response="",
)
# Tensors are moved to wherever the model lives rather than a hard-coded "cuda".
inputs = tokenizer([sample_prompt], return_tensors="pt").to(model.device)

outputs = model.generate(**inputs, max_new_tokens=1000, temperature=0.7, top_p=0.9)
first_sequence = outputs[0]
print(tokenizer.decode(first_sequence, skip_special_tokens=True))

Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Answer the following question based on plant disease context.

### Input:
Write one clear paragraph on Tomato Yellow Leaf Curl Virus covering: host range, key symptoms, transmission/vector, and 3‚Äì4 practical IPM controls (resistant varieties, whitefly control, sanitation, cultural practices).

### Response:
Tomato Yellow Leaf Curl Virus (TYLCV) is a viral disease that affects tomato and other Solanaceae crops. Key symptoms include yellowing of the leaves, curling, and stunting. The virus is transmitted by the whitefly Bemisia tabaci and can be spread through contaminated plant material. To manage TYLCV, farmers can adopt integrated pest management (IPM) practices such as planting resistant varieties, controlling whitefly populations, maintaining good sanitation, and implementing cultural practices like crop rot

# BERTScore

In [None]:
# Install the packages imported by the following cells (`bert_score`, `datasets`).
!pip install bert-score datasets

Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m61.1/61.1 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bert-score
Successfully installed bert-score-0.3.13


In [None]:
from datasets import load_dataset
from bert_score import score

In [None]:
# Read test.jsonl and keep at most its first 50 rows for evaluation.
dataset = load_dataset("json", data_files="test.jsonl", split="train")
eval_size = min(50, len(dataset))
subset = dataset.select(range(eval_size))

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
# Display the evaluation subset's summary (feature names and row count).
subset

Dataset({
    features: ['question', 'answer', 'split'],
    num_rows: 50
})

In [None]:
# Collect one model answer (`preds`) and one reference answer (`refs`) per
# example of the evaluation subset, in matching order.
preds, refs = [], []

# The instruction is identical for every example, so define it once.
eval_instruction = "Answer the following question based on plant disease context."

for ex in subset:
    prompt = alpaca_prompt.format(
        instruction=eval_instruction,
        input=ex["question"],
        response="",  # left empty so the model writes the answer
    )
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=512)

    # `generate` returns the prompt tokens followed by the newly generated ones.
    # Slice by the prompt length instead of searching the decoded text for
    # "### Response:": the text search mis-slices when that marker appears in
    # the generated answer and silently scores the whole prompt when it is
    # missing.
    prompt_length = inputs["input_ids"].shape[1]
    generated_tokens = outputs[0][prompt_length:]
    answer_text = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

    preds.append(answer_text)
    refs.append(ex["answer"])

In [None]:
# Spot-check: first model answer, then its reference, one per line.
print(preds[0], refs[0], sep="\n")

The plant is infected with Target Spot fungus.
The mosaic pattern of light and dark green areas is a classic symptom of ToMV.


In [None]:
# BERTScore over all prediction/reference pairs; `score` returns precision,
# recall and F1 values, which the next cell also reads as P and R.
bert_scores = score(preds, refs, lang="en")
P, R, F1 = bert_scores

mean_f1 = F1.mean().item()
print("Average BERTScore F1:", mean_f1)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Average BERTScore F1: 0.8773563504219055


In [None]:
# Report the remaining two BERTScore averages computed in the previous cell.
for label, values in (("Precision", P), ("Recall", R)):
    print(f"Average BERTScore {label}:", values.mean().item())

Average BERTScore Precision: 0.8782873749732971
Average BERTScore Recall: 0.8767160177230835
