
In this notebook we show how to use our library to run online (test-time) fine-tuning for ARC-Challenge.

In [None]:
%%capture
# Environment setup: install the library, then fetch the datasets and the base model.
# The steps are written so that the cell can be re-run on a machine that is already set up.

# Install the giotto-llm package (%pip installs into the running kernel's environment).
%pip install giotto-llm

# Get the datasets: download the archive, unpack it into kaggle/, then remove leftovers.
# `mkdir -p` does not fail when the directory already exists.
!mkdir -p kaggle
# NOTE(review): --no-check-certificate disables TLS verification for this download;
# drop the flag if the host certificate validates in your environment.
!wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1ISFkhGIc-i6Djdp55WxCI2b-PUu3hgTS' -O kaggle/input.zip
# `-o` overwrites existing files instead of prompting, which a notebook cannot answer.
!unzip -o kaggle/input.zip -d kaggle/
!rm -rf kaggle/input.zip
!rm -rf kaggle/__MACOSX

# Get the pre-trained model.
!mkdir -p models
!gsutil cp -r gs://public-models-and-datasets/models/merged_llama_v6/ models/

In [None]:
# Install dependencies.
# %pip (rather than !pip) installs into the environment of the running kernel.
# liger-kernel and qwen_vl_utils are pinned to the versions this notebook was last run with.
%pip install liger-kernel==0.4.2
%pip install qwen_vl_utils==0.0.8
%pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --index-url https://download.pytorch.org/whl/cu118
# NOTE(review): trl is still unpinned; pin it once a known-good version is confirmed.
%pip install trl
%pip install bitsandbytes==0.44.0

Collecting liger-kernel
  Downloading liger_kernel-0.4.2-py3-none-any.whl.metadata (21 kB)
Downloading liger_kernel-0.4.2-py3-none-any.whl (90 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/90.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.7/90.7 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: liger-kernel
Successfully installed liger-kernel-0.4.2
Collecting qwen_vl_utils
  Downloading qwen_vl_utils-0.0.8-py3-none-any.whl.metadata (3.6 kB)
Collecting av (from qwen_vl_utils)
  Downloading av-14.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.5 kB)
Downloading qwen_vl_utils-0.0.8-py3-none-any.whl (5.9 kB)
Downloading av-14.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (33.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.0/33.0 MB[0m [31m58.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packag

In [None]:
import giotto_llm
import os

# Choose one of the pre-trained models.
models = [
    # Llama 1B trained on V7 dataset. All 800-tasks were used for training.
    "merged_llama_v7",
    "merged_llama_v6",
    # Llama 1B trained on V5 dataset. Only 400-train tasks were used for training.
    "merged_llama_1b_v5",
]

# Base model used for online fine-tuning (index into `models`).
model_name = models[1]

# All settings forwarded to the online fine-tuning run, grouped by purpose.
ONLINE_FINETUNING_PARAMS = dict(
    # --- Data ---
    dataset_dir="kaggle/input",
    dataset_category="evaluation",  # options: evaluation, full
    start_index_tasks=0,            # start index of the tasks to fine-tune on
    end_index_tasks=401,            # end index of the tasks to fine-tune on
    # --- General training parameters ---
    model_name=f"models/{model_name}",
    num_train_epochs=30,
    eval_steps=31,                  # number of steps for evaluation
    save_total_limit=1,             # number of checkpoints to save
    learning_rate=4e-4,
    batch_size=1,
    gradient_accumulation_steps=16,
    quantization="4bit-nf4",        # options: 4bit-nf4, 8bit-nf4, 8bit-nf8
    num_tasks_per_gpu_process=1,
    # --- LoRA parameters ---
    lora_r=1,
    neftune_noise_alpha=10.0,
    lora_dropout=0.2,
    # --- Prompt ---
    prompt_type="prompt_solve_short",
    # --- Output ---
    output_dir=f"results_{model_name}",
    # --- Training time limit, in seconds: 10 h 30 min = 10.5 hours ---
    timeout=(10 * 60 + 30) * 60,
)

# Export the time limit for online fine-tuning and turn off Weights & Biases logging.
os.environ.update(
    {
        "ONLINE_FINETUNING_TIME_LIMIT": str(ONLINE_FINETUNING_PARAMS["timeout"]),
        "WANDB_MODE": "disabled",
    }
)

In [None]:
#### ----------------------------------------------------
####      ONLINE FINETUNING
#### ----------------------------------------------------
# Launches the parallel online fine-tuning module as a shell command. Each flag value
# is interpolated by IPython from the ONLINE_FINETUNING_PARAMS dictionary.
# Every continuation line keeps a space before its trailing backslash so that
# adjacent flags stay separated if the command is copied into a plain shell.

! python -m giotto_llm.online_fine_tuning.paralel_online_finetuning \
--dataset_dir {ONLINE_FINETUNING_PARAMS["dataset_dir"]} \
--dataset_category {ONLINE_FINETUNING_PARAMS["dataset_category"]} \
--model_id {ONLINE_FINETUNING_PARAMS["model_name"]} \
--wrapper CausalLM \
--batch_size {ONLINE_FINETUNING_PARAMS["batch_size"]} \
--gradient_accumulation_steps {ONLINE_FINETUNING_PARAMS["gradient_accumulation_steps"]} \
--quantization {ONLINE_FINETUNING_PARAMS["quantization"]} \
--neftune_noise_alpha {ONLINE_FINETUNING_PARAMS["neftune_noise_alpha"]} \
--num_train_epochs {ONLINE_FINETUNING_PARAMS["num_train_epochs"]} \
--learning_rate {ONLINE_FINETUNING_PARAMS["learning_rate"]} \
--lora_r {ONLINE_FINETUNING_PARAMS["lora_r"]} \
--save_total_limit {ONLINE_FINETUNING_PARAMS["save_total_limit"]} \
--eval_steps {ONLINE_FINETUNING_PARAMS["eval_steps"]} \
--num_tasks_per_gpu_process {ONLINE_FINETUNING_PARAMS["num_tasks_per_gpu_process"]} \
--prompt_type {ONLINE_FINETUNING_PARAMS["prompt_type"]} \
--output_dir {ONLINE_FINETUNING_PARAMS["output_dir"]} \
--lora_dropout {ONLINE_FINETUNING_PARAMS["lora_dropout"]} \
--start_index_tasks {ONLINE_FINETUNING_PARAMS["start_index_tasks"]} \
--end_index_tasks {ONLINE_FINETUNING_PARAMS["end_index_tasks"]}

13:28:11 INFO:>>> Starting parallelized validation
13:28:11 INFO:>>> Saving submission.json files to results_merged_llama_v6/predictions
13:28:11 INFO:>>> Saving logs to results_merged_llama_v6/logs
13:28:11 INFO:------------------------------
13:28:11 INFO:>>> Found 1 GPUs
13:28:11 INFO:>>> Logging resource usage to results_merged_llama_v6/monitor.csv every 1.0 seconds...
13:28:11 INFO:>>> Found 400 tasks with arguments.dataset_dir='kaggle/input', arguments.dataset_category='evaluation'
13:28:11 INFO:>>> Dividing tasks into 400 batches, each with up to 1 tasks
13:28:11 INFO:>>> Going to run the following command as subprocess
13:28:11 INFO:	- ['python', '-m', 'giotto_llm.online_fine_tuning', '--dataset_dir', 'kaggle/input', '--dataset_category', 'evaluation', '--start_index_tasks', '0', '--end_index_tasks', '1', '--gpu_index', '0', '--model_id', 'models/merged_llama_v6', '--wrapper', 'CausalLM', '--output_dir', 'results_merged_llama_v6', '--quantization', '4bit-nf4', '--transform_back