[Updates](https://github.com/dumpsters/RWKV-Colab-Notebooks)
---

Thanks to:

[RWKV-notebooks](https://github.com/resloved/RWKV-notebooks)

[RWKV-LM](https://github.com/BlinkDL/RWKV-LM)

[RWKV-LM-LoRA](https://github.com/Blealtan/RWKV-LM-LoRA)


# Setup

In [None]:
#@title Create and mount folders { display-mode: "form" }
#@markdown `model_dir` is relative to your Google Drive's root folder

from os import makedirs

from google.colab import drive

# Form parameters (editable through the Colab form UI).
drive_dir = '/content/drive'
model_dir = 'AI/RWKV' #@param {type:"string"}
tuned_dir = 'tuned-LorA' #@param {type:"string"}
dataset_name = 'trainable' #@param {type:"string"}

# Derived locations, all below the mounted Drive folder.
output_path = "/".join((drive_dir, "MyDrive", model_dir))
checkpoint_dir = "/".join((output_path, tuned_dir))
dataset_dir = "/".join((checkpoint_dir, "dataset"))
dataset_raw = "/".join((dataset_dir, dataset_name))
# The "_text_document" suffix is hardcoded elsewhere (presumably appended by
# the tokenizing script to its output prefix -- TODO confirm).
dataset_file = dataset_raw + "_text_document"

# Mount (or re-mount) Google Drive before touching anything below it.
drive.mount(drive_dir, force_remount=True)

# Create the checkpoint, raw-dataset and base-model folders if missing.
for folder in (checkpoint_dir, dataset_raw, f"{output_path}/base_models/"):
    makedirs(folder, exist_ok=True)

#!nvidia-smi

print(f"LorA checkpoints will be saved to {checkpoint_dir}")
print(f"Place your jsonl dataset files in {dataset_raw} if you want to tokenize")

In [None]:
#@title Clone RWKV-LorA and the tokenizer we will use
# Both repositories are cloned into the current working directory under the
# short names used by the later cells (RWKV-LorA, GPT-NeoX).
# NOTE(review): later cells `%cd /content/` before using them, so this cell
# assumes it is run from /content -- confirm.
!git clone https://github.com/Blealtan/RWKV-LM-LoRA.git RWKV-LorA
!git clone https://github.com/EleutherAI/gpt-neox.git GPT-NeoX

# Dependencies for the RWKV trainer; deepspeed and pytorch-lightning are
# pinned to specific versions, the remaining packages are unpinned.
!pip install deepspeed==0.8.3 --quiet
!pip install pytorch-lightning==1.9.1 --quiet
!pip install torch transformers wandb ninja --quiet

# Dependencies for GPT-NeoX, whose tools/preprocess_data.py is used to
# tokenize the dataset in the "Create dataset" cell.
!pip install -r ./GPT-NeoX/requirements/requirements.txt --quiet

# Load models

In [None]:
#@title Load checkpoint or start a new { display-mode: "form" }
#@markdown Check `restore_checkpoint` to load a checkpoint from your `tuned_dir`
%cd /content/
base_model_name = "RWKV-4-Pile-430M" #@param ["RWKV-4-Raven-1B5-v8", "RWKV-4-Pile-430M", "RWKV-4-Pile-169M"]
restore_checkpoint = False #@param {type:"boolean"}

model_filename = f"{base_model_name}.pth"

from huggingface_hub import hf_hub_url
if base_model_name == "RWKV-4-Pile-169M":
    base_model_repo = f"BlinkDL/rwkv-4-pile-169m"
    model_url = hf_hub_url(repo_id=f"{base_model_repo}", filename="RWKV-4-Pile-169M-20220807-8023.pth")
    n_layer = 12
    n_embd = 768
if base_model_name == "RWKV-4-Pile-430M":
    base_model_repo = f"BlinkDL/rwkv-4-pile-430m"
    model_url = hf_hub_url(repo_id=f"{base_model_repo}", filename="RWKV-4-Pile-430M-20220808-8066.pth")
    n_layer = 24
    n_embd = 1024
if base_model_name == "RWKV-4-Raven-1B5-v8":
    base_model_repo = f"BlinkDL/rwkv-4-raven"
    model_url = hf_hub_url(repo_id=f"{base_model_repo}", filename="RWKV-4-Raven-1B5-v8-Eng-20230408-ctx4096.pth")
    n_layer = 24
    n_embd = 2048

import os
if os.path.isfile(model_filename) == False:
  !curl -L $model_url -o $model_filename

from glob import glob
model_path = glob(f"/content/{base_model_name}/*.pth")[0]
if restore_checkpoint == True:
  checkpoint_path = glob(f"{checkpoint_dir}/*.pth")[0]
  print(f"Using {checkpoint_path} as LorA checkpoint")
  print(f"Set epoch_begin manually to your last epoch +1")
print(f"Using {model_path} as base")

# Training

In [None]:
#@title Create dataset out of jsonl files { display-mode: "form" }
#@markdown In case you mounted your drive before copying the jsonl files over, re-run the setup part again before running this.
use_ftfy = False #@param {type:"boolean"}

%cd /content/
print(f"Tokenizing jsonl files in {dataset_raw}")

if use_ftfy:
  !python ./GPT-NeoX/tools/preprocess_data.py \
  --input $dataset_raw \
  --tokenizer-type HFTokenizer \
  --vocab-file ./RWKV-LorA/RWKV-v4neo/20B_tokenizer.json \
  --output-prefix $dataset_raw \
  --append-eod \
  --dataset-impl mmap \
  --ftfy
else:
  !python ./GPT-NeoX/tools/preprocess_data.py \
  --input $dataset_raw \
  --tokenizer-type HFTokenizer \
  --vocab-file ./RWKV-LorA/RWKV-v4neo/20B_tokenizer.json \
  --output-prefix $dataset_raw \
  --append-eod \
  --dataset-impl mmap

In [None]:
#@title Finetune { display-mode: "form" }
#@markdown Add `MAX_JOBS=1` in front of the python command if you're tuning 1B5 or above.
epoch_count = 1000 #@param {type:"integer"}
epoch_begin = 0 #@param {type:"integer"}
epoch_steps = 500 #@param {type:"integer"}
epoch_save_frequency = 1 #@param {type:"integer"}
micro_bsz =  10 #@param {type:"integer"} 
ctx_len = 1024 #@param {type:"integer"}
precision = 'bf16' #@param ['fp16', 'fp32', 'tf32', 'bf16'] {type:"string"}
strategy = 'deepspeed_stage_2' #@param ['deepspeed_stage_2', 'deepspeed_stage_2_offload', 'ddp_find_unused_parameters_false'] {type:"string"}
grad_cp = "1" #@param [0, 1] {type:"string"}
lora_r = 8 #@param {type:"integer"}
lora_alpha = 16 #@param {type:"integer"}
lora_dropout = 0.01 #@param {type:"number"}

# https://github.com/Blealtan/RWKV-LM-LoRA/blob/main/RWKV-v4neo/train.py#L32
%cd /content/RWKV-LorA/RWKV-v4neo/
if restore_checkpoint == False:
  !python train.py \
  --load_model $model_path \
  --proj_dir $checkpoint_dir \
  --data_file $dataset_file \
  --data_type "binidx" \
  --vocab_size 50277 \
  --ctx_len $ctx_len \
  --epoch_steps $epoch_steps \
  --epoch_count $epoch_count \
  --epoch_begin $epoch_begin \
  --epoch_save $epoch_save_frequency \
  --micro_bsz $micro_bsz \
  --n_layer $n_layer \
  --n_embd $n_embd \
  --pre_ffn 0 \
  --head_qk 0 \
  --lr_init 1e-5 \
  --lr_final 1e-5 \
  --warmup_steps 0 \
  --beta1 0.9 \
  --beta2 0.999 \
  --adam_eps 1e-8 \
  --accelerator gpu \
  --devices 1 \
  --precision $precision \
  --strategy $strategy \
  --grad_cp $grad_cp \
  --lora \
  --lora_r $lora_r \
  --lora_alpha $lora_alpha \
  --lora_dropout $lora_dropout \
  --lora_parts=att,ffn,time,ln
else:
  !python train.py \
  --load_model $model_path \
  --proj_dir $checkpoint_dir \
  --data_file $dataset_file \
  --data_type "binidx" \
  --vocab_size 50277 \
  --ctx_len $ctx_len \
  --epoch_steps $epoch_steps \
  --epoch_count $epoch_count \
  --epoch_begin $epoch_begin \
  --epoch_save $epoch_save_frequency \
  --micro_bsz $micro_bsz \
  --n_layer $n_layer \
  --n_embd $n_embd \
  --pre_ffn 0 \
  --head_qk 0 \
  --lr_init 1e-5 \
  --lr_final 1e-5 \
  --warmup_steps 0 \
  --beta1 0.9 \
  --beta2 0.999 \
  --adam_eps 1e-8 \
  --accelerator gpu \
  --devices 1 \
  --precision $precision \
  --strategy $strategy \
  --grad_cp $grad_cp \
  --lora \
  --lora_r $lora_r \
  --lora_alpha $lora_alpha \
  --lora_dropout $lora_dropout \
  --lora_load $checkpoint_path

# --lora_load <lora checkpoint to continue training> \ # optional
# --lora_parts=att,ffn,time,ln # configure which parts to finetune