# [Updates](https://github.com/h-a-s-k/RWKV-Notebooks)


# Setup

In [None]:
#@title Create and mount folders { display-mode: "form" }
# Setup cell: reads the folder names from the form, derives the paths used by
# the later cells, mounts Google Drive and creates the working folders.
from os import makedirs

from google.colab import drive

drive_dir = '/content/drive'

#@markdown Avoid spaces<br>`model_dir` is relative to your Google Drive root folder
model_dir = 'AI/RWKV' #@param {type:"string"}
#@markdown relative to `model_dir`
tuned_dir = 'infctx' #@param {type:"string"}
#@markdown relative to `tuned_dir`
dataset_folder = 'mydata' #@param {type:"string"}

#@markdown ---

#@markdown Leave empty if you don't understand
wandb_project = '' #@param {type:"string"}

# Everything lives below <drive>/MyDrive/<model_dir>/<tuned_dir>
output_path = "{}/MyDrive/{}".format(drive_dir, model_dir)
checkpoint_dir = "{}/{}".format(output_path, tuned_dir)

# jsonl files go in <checkpoint_dir>/dataset/<dataset_folder>
dataset_location = "{}/dataset".format(checkpoint_dir)
jsonl_folder = "{}/{}".format(dataset_location, dataset_folder)
binidx_filename = "{}_text_document".format(dataset_folder) # hardcoded suffix

# The drive has to be mounted before any folder can be created on it
drive.mount(drive_dir, force_remount=True)

for folder in (checkpoint_dir, jsonl_folder):
    makedirs(folder, exist_ok=True)

print(
    f"-----\nSaving checkpoints to {checkpoint_dir}",
    f"-----\nIf you want to tokenize into binidx,\nplace your jsonl dataset file(s) in {jsonl_folder}",
    f"otherwise move both pre-made .idx and .bin files in {dataset_location}",
    sep="\n",
)

In [None]:
#@title Clone needed repos and install packages
#!nvidia-smi

!rm -rf RWKV
!git clone https://github.com/PicoCreator/RWKV-LM-LoRA.git RWKV
%cd RWKV
!git switch picocreator-dev-infctx
!git pull
%cd /content/

###### RWKV
#  https://hub.docker.com/r/uilicious/rwkv-dev-infctx-env/tags
#  https://github.com/PicoCreator/RWKV-LM-LoRA/tree/cd69f85cc94d353d6883c1ec2deb30c1b5d4e48e#environment-setup
#  pytorch
!pip install torch==2.0.1 torchvision torchaudio lightning==2.0.4 deepspeed==0.9.5 --quiet
#  other
!pip install datasets transformers ninja wandb numexpr sentencepiece jsonargparse 'jsonargparse[signatures]' --quiet

###### json2binidx_tool
!git clone https://github.com/Abel2076/json2binidx_tool.git GPT-NeoX
!pip install 'tokenizers>=0.13.*' 'numpy>=1.23.*' 'lm_dataformat>=0.0.20' 'ftfy>=6.1.*' 'tqdm>=4.65.*' --quiet

###### Extra
!pip install huggingface_hub --quiet

!python -c "import torch; print(torch.__version__)"

In [None]:
#@title Load models
%cd /content/
model_name = "RWKV-4-World-0.4B" #@param ["RWKV-4-World-1B5-v1", "RWKV-4-Raven-1B5-v12", "RWKV-4-World-0.4B", "RWKV-4-Pile-430M", "RWKV-4-World-0.1B", "RWKV-4-Pile-169M"]
#@markdown Check to continue from a previous checkpoint in your `tuned_dir`
restore_checkpoint = False #@param {type:"boolean"}

import huggingface_hub
from huggingface_hub import hf_hub_url, hf_hub_download

model_filename = f"{model_name}.pth"
if model_name == "RWKV-4-Pile-169M":
    model_repo = f"BlinkDL/rwkv-4-pile-169m"
    model_url = hf_hub_url(repo_id=f"{model_repo}", filename="RWKV-4-Pile-169M-20220807-8023.pth")
    n_layer = 12
    n_embd = 768
    model_world = False
if model_name == "RWKV-4-World-0.1B":
    model_repo = f"BlinkDL/rwkv-4-world"
    model_url = hf_hub_url(repo_id=f"{model_repo}", filename="RWKV-4-World-0.1B-v1-20230520-ctx4096.pth")
    n_layer = 12
    n_embd = 768
    model_world = True
if model_name == "RWKV-4-Pile-430M":
    model_repo = f"BlinkDL/rwkv-4-pile-430m"
    model_url = hf_hub_url(repo_id=f"{model_repo}", filename="RWKV-4-Pile-430M-20220808-8066.pth")
    n_layer = 24
    n_embd = 1024
    model_world = False
if model_name == "RWKV-4-World-0.4B":
    model_repo = f"BlinkDL/rwkv-4-world"
    model_url = hf_hub_url(repo_id=f"{model_repo}", filename="RWKV-4-World-0.4B-v1-20230529-ctx4096.pth")
    n_layer = 24
    n_embd = 1024
    model_world = True
if model_name == "RWKV-4-Raven-1B5-v12":
    model_repo = f"BlinkDL/rwkv-4-raven"
    model_url = hf_hub_url(repo_id=f"{model_repo}", filename="RWKV-4-Raven-1B5-v12-Eng98%-Other2%-20230520-ctx4096.pth")
    n_layer = 24
    n_embd = 2048
    model_world = False
if model_name == "RWKV-4-World-1B5-v1":
    model_repo = f"BlinkDL/rwkv-4-world"
    model_url = hf_hub_url(repo_id=f"{model_repo}", filename="RWKV-4-World-1.5B-v1-fixed-20230612-ctx4096.pth")
    n_layer = 24
    n_embd = 2048
    model_world = True

from os.path import isfile, getctime, getmtime, basename
if isfile(model_filename) == False:
  print(f"Downloading {model_url}")
  !curl -L $model_url -o $model_filename

!ls -alh /content/$model_filename

checkpoint_final = f'{checkpoint_dir}/{model_name}'
!mkdir -p $checkpoint_final

from glob import glob
if restore_checkpoint == True:
  checkpoint_folders = glob(f"{checkpoint_dir}/{model_name}/*.ckpt")
  if len(checkpoint_folders) < 1:
    print(f"-----\nNo checkpoint folders found inside {checkpoint_dir}/{model_name}/")
  else:
    checkpoint_path = max(checkpoint_folders, key=getmtime)
    print(f"-----\nRestoring from {checkpoint_path}")

# Training

In [None]:
#@title Create dataset out of jsonl files { display-mode: "form" }
#@markdown In case you mounted your drive before copying the jsonl files over, re-run the setup part again before running this.<br><br><i>"The goal of ftfy is to take in bad Unicode and output good Unicode, for use in your Unicode-aware code."</i>
use_ftfy = True #@param {type:"boolean"}
append_endoftext = True #@param {type:"boolean"}
#@markdown <hr>example-1.jsonl file contents:
#@markdown <br><code>{"text":"Text train one"}<br>{"text":"Text two"}<br>{"text":"Text another three"}</code>

%cd /content/
print(f"Tokenizing jsonl files in {jsonl_folder}")

if model_world == True:
  tokenizer = 'RWKVTokenizer'
  vocab = './GPT-NeoX/rwkv_vocab_v20230424.txt'
else:
  tokenizer = 'HFTokenizer'
  vocab = './GPT-NeoX/20B_tokenizer.json'

cmd = f'''
python ./GPT-NeoX/tools/preprocess_data.py \
--input {jsonl_folder} \
--output-prefix {jsonl_folder} \
--dataset-impl mmap \
--tokenizer-type {tokenizer} \
--vocab {vocab}'''

if append_endoftext:
  cmd += ' --append-eod'
if use_ftfy:
  cmd += ' --ftfy'

print(f"{cmd}")
!{cmd}

In [None]:
#@title Finetune { display-mode: "form" }
#@markdown Steps of `trainer/global_step`, not the progress bar
save_every_global_steps = 20 #@param {type:"integer"}

#@markdown ---
#@markdown ### Main
#@markdown Either `target_batch_size` OR `accumulate_grad_batches` (leave one empty)
target_batch_size = '40' #@param {type:"string"}
accumulate_grad_batches = "" #@param {type:"string"}
ctx_len = '1024' #@param ['128', '256', '512', '1024', '2048', '3072', '4096', '5120', '6144', '7168', '8192', '9216', '10240'] {type:"string"}
max_epochs = 1000 #@param {type:"integer"}
precision = 'bf16-mixed' #@param ['16-mixed', 'bf16-mixed', '32-true', '64-true'] {type:"string"}
weight_decay = "0.02" #@param {type:"string"}
grad_cp = True #@param {type:"boolean"}
#@markdown ---
#@markdown ### Strategy
#@markdown `deepspeed_stage_1`: Each of your GPUs has too much vram, and you do not know what to do
#@markdown <br>`deepspeed_stage_2` : Optimal distributed training strategy, across multiple gpu each with sufficient vram
#@markdown <br>`deepspeed_stage_2_offload`: Reduce vram usage by offloading the optimizer state and work to cpu
#@markdown <br>`deepspeed_stage_3`: Split up the model across multiple gpu, useful for large models, at a performance cost
#@markdown <br>`deepspeed_stage_3_offload`: Additional offloading, for even greater performance cost
strategy = 'deepspeed_stage_1' #@param ['deepspeed_stage_1', 'deepspeed_stage_2', 'deepspeed_stage_2_offload', 'deepspeed_stage_3', 'deepspeed_stage_3_offload'] {type:"string"}

#@markdown ---
#@markdown ### Extra
lr_init = "1e-5" #@param {type:"string"}
lr_final = "1e-5" #@param {type:"string"}
adam_eps = "1e-8" #@param {type:"string"}
beta1 = 0.9 #@param {type:"number"}
beta2 = 0.99 #@param {type:"number"}
#@markdown <hr>Number or leave empty for random
seed = '' #@param {type:"string"}

from datetime import datetime
from os import environ
environ["RWKV_CUDA_ON"] = "1"
environ["RWKV_DEEPSPEED"] = "1"
#environ["NCCL_P2P_DISABLE"] = "1"

%cd /content/

# https://github.com/PicoCreator/RWKV-LM-LoRA/blob/3c9f2f0fff92c7f683b07defdc54c04d39b643bf/notebook/trainer-validation/config/baseline-1024.yaml
config_location = f'{checkpoint_dir}/{model_name}-config.yaml'
!rm $config_location

config = open(f'{config_location}', "w")
config.write(f'''
# Generated
seed_everything: {seed if len(seed) > 0 else 'true'} # 3941088705
model:
  load_model: '/content/{model_filename}'
  n_embd: {n_embd}
  n_layer: {n_layer}
  vocab_size: {65536 if model_world else 50277}
  ctx_len: {ctx_len}
  lr_init: {lr_init}
  lr_final: {lr_final}
  adam_eps: {adam_eps}
  beta1: {beta1}
  beta2: {beta2}
  weight_decay: {weight_decay}
  grad_cp: {str(grad_cp).lower()}
  torch_set_float32_matmul_precision: 'high'
  bptt_learning: true # or 'true if offloading'
  bptt_learning_range: -1
data:
  data_path: '{dataset_location}/{dataset_folder}_HF'
  source: '{dataset_location}/{binidx_filename}'
  tokenizer: binidx
  custom_text_key: 'text'

  test_split: 0.01
  test_split_shuffle: false

trainer:
  accelerator: gpu
  devices: auto
  strategy: {strategy}
  precision: {precision}
  max_epochs: {max_epochs}
  {f'accumulate_grad_batches: {accumulate_grad_batches}' if len(accumulate_grad_batches) > 0 else f'target_batch_size: {target_batch_size}'}

  logger:
    class_path: lightning.pytorch.loggers.WandbLogger
    init_args:
      name: '{model_name} {ctx_len}ctx | {datetime.now().strftime("%d %b %H:%M:%S")}'
      project: '{wandb_project if len(wandb_project) > 0 else ''}'
      save_dir: .
      version: null
      log_model: all
    dict_kwargs:
      group: 'lightning 2.0.2'

  callbacks:
  - class_path: lightning.pytorch.callbacks.ModelCheckpoint
    init_args:
      dirpath: '{checkpoint_final}'
      monitor: 'step'
      mode: max

      save_on_train_epoch_end: true
      save_top_k: 1
      save_last: true
      every_n_train_steps: {save_every_global_steps}

ckpt_path: {checkpoint_path if restore_checkpoint else 'null'}
''')
config.close()

# Convert dataset to HF datapath format and save it
cmd = f'''python3 /content/RWKV/RWKV-v4neo/preload_datapath.py \
{config_location}
'''
print(f"{cmd}")
!{cmd}

# Train
cmd = f'''
python3 /content/RWKV/RWKV-v4neo/lightning_trainer.py fit \
-c {config_location}
'''
print(f"{cmd}")
!{cmd}

# Exporting checkpoint
> Run the cells in the "Setup" section before this one if you haven't already

In [None]:
#@title Export
%cd /content/

from glob import glob
checkpoint_folders = glob(f"{checkpoint_dir}/{model_name}/*.ckpt")
if len(checkpoint_folders) < 1:
  print(f"-----\nNo checkpoint folders found inside {checkpoint_dir}/{model_name}/")
else:
  checkpoint_path = max(checkpoint_folders, key=getmtime)
  print(f"-----\nRestoring from {checkpoint_path}")
  cmd = f'python3 /content/RWKV/RWKV-v4neo/export_checkpoint.py -d {checkpoint_path} /content/RWKV-Exported.pth'
  print(f"{cmd}")
  !{cmd}

In [None]:
#@title Move to drive
cmd = f'cp /content/RWKV-Exported.pth {checkpoint_dir}/RWKV-Exported.pth'
print(f"{cmd}")
!{cmd}
!ls -alh $checkpoint_dir/RWKV-Exported.pth