# Supervised Learning


In [None]:
import os

# Expose the Hugging Face access token to later cells via the HF_TOKEN
# environment variable.
# Note: `userdata.get` is a Colab API. If you're not using Colab, set the env
# vars as appropriate for your system.
try:
    from google.colab import userdata
except ImportError:
    # Not running inside Colab: fall back to whatever HF_TOKEN the environment
    # already provides instead of failing on the import.
    userdata = None

if userdata is not None:
    os.environ["HF_TOKEN"] = userdata.get("HF_TOKEN")
elif "HF_TOKEN" not in os.environ:
    print("HF_TOKEN is not set; export it before running the cells below.")

In [None]:
!pip install transformers datasets trl



# LLaMA Factory

In [None]:

%cd /content/
%rm -rf LLaMA-Factory
!git clone https://github.com/hiyouga/LLaMA-Factory.git
%cd LLaMA-Factory
%ls
!pip install -e .[torch,bitsandbytes]

/content
Cloning into 'LLaMA-Factory'...
remote: Enumerating objects: 17708, done.[K
remote: Counting objects: 100% (8147/8147), done.[K
remote: Compressing objects: 100% (700/700), done.[K
remote: Total 17708 (delta 7660), reused 7544 (delta 7447), pack-reused 9561 (from 1)[K
Receiving objects: 100% (17708/17708), 226.94 MiB | 16.83 MiB/s, done.
Resolving deltas: 100% (13042/13042), done.
/content/LLaMA-Factory
[0m[01;34massets[0m/       [01;34mdocker[0m/      LICENSE      pyproject.toml  requirements.txt  [01;34msrc[0m/
CITATION.cff  [01;34mevaluation[0m/  Makefile     README.md       [01;34mscripts[0m/          [01;34mtests[0m/
[01;34mdata[0m/         [01;34mexamples[0m/    MANIFEST.in  README_zh.md    setup.py
Obtaining file:///content/LLaMA-Factory
  Installing build dependencies ... [?25l[?25hdone
  Checking if build backend supports build_editable ... [?25l[?25hdone
  Getting requirements to build editable ... [?25l[?25hdone
  Preparing editable metada

In [None]:

import json

args = dict(
    stage="sft",  # do supervised fine-tuning
    do_train=True,
    model_name_or_path="google/gemma-2b",  # use bnb-4bit-quantized Gemma 2B model
    dataset="alpaca_en_demo",  # use the demo alpaca datasets
    template="gemma",  # use Gemma prompt template
    finetuning_type="lora",  # use LoRA adapters to save memory
    lora_target="all",  # attach LoRA adapters to all linear layers
    output_dir="gemma_lora",  # the path to save LoRA adapters
    per_device_train_batch_size=2,  # the batch size
    gradient_accumulation_steps=4,  # the gradient accumulation steps
    lr_scheduler_type="cosine",  # use cosine learning rate scheduler
    logging_steps=10,  # log every 10 steps
    warmup_ratio=0.1,  # use warmup scheduler
    save_steps=1000,  # save checkpoint every 1000 steps
    learning_rate=5e-5,  # the learning rate
    num_train_epochs=3.0,  # the epochs of training
    max_samples=500,  # use 500 examples in each dataset
    max_grad_norm=1.0,  # clip gradient norm to 1.0
    quantization_bit=4,  # use 4-bit QLoRA
    loraplus_lr_ratio=16.0,  # use LoRA+ algorithm with lambda=16.0
    fp16=True,  # use float16 mixed precision training
)

json.dump(args, open("train_gemma.json", "w", encoding="utf-8"), indent=2)

%cd /content/LLaMA-Factory/

!llamafactory-cli train train_gemma.json

/content/LLaMA-Factory
2024-10-04 01:12:46.564105: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-04 01:12:46.583943: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-04 01:12:46.590354: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-10-04 01:12:46.604924: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
10/04/2024 01:12:53 - INFO - l

# Run Inference


In [None]:
%cd /content/LLaMA-Factory/src/

from llamafactory.chat import ChatModel
from llamafactory.extras.misc import torch_gc

%cd /content/LLaMA-Factory/

args = dict(
    model_name_or_path="google/gemma-2b",  # use Gemma 2B model
    adapter_name_or_path="gemma_lora",  # load the saved LoRA adapters
    template="gemma",  # same to the one in training
    finetuning_type="lora",  # same to the one in training
    quantization_bit=4,  # load 4-bit quantized model
)
chat_model = ChatModel(args)

messages = []
print(
    "Welcome to the CLI application, use `clear` to remove the history, use `exit` to exit the application."
)
while True:
    query = input("\nUser: ")
    if query.strip() == "exit":
        break
    if query.strip() == "clear":
        messages = []
        torch_gc()
        print("History has been removed.")
        continue

    messages.append({"role": "user", "content": query})
    print("Assistant: ", end="", flush=True)

    response = ""
    for new_text in chat_model.stream_chat(messages):
        print(new_text, end="", flush=True)
        response += new_text
    print()
    messages.append({"role": "assistant", "content": response})

torch_gc()

[Errno 2] No such file or directory: '/content/LLaMA-Factory/src/'
/content


ModuleNotFoundError: No module named 'llamafactory'

# Merge LoRA adapter

In [None]:
import json

args = dict(
    model_name_or_path="google/gemma-2b",  # use official non-quantized Gemma 2B model
    adapter_name_or_path="gemma_lora",  # load the saved LoRA adapters
    template="gemma",  # same to the one in training
    finetuning_type="lora",  # same to the one in training
    export_dir="gemma_lora_merged",  # path to save the merged model
    export_size=2,  # the file shard size (in GB) of the merged model
    export_device="cpu",  # the device used in export, can be chosen from `cpu` and `cuda`
    export_hub_model_id="gemma-2b-finetuned-model-llama-factory",  # your Hugging Face hub model ID
)

json.dump(args, open("merge_gemma.json", "w", encoding="utf-8"), indent=2)

%cd /content/LLaMA-Factory/

!llamafactory-cli export merge_gemma.json