In [1]:
# Declare the list of tasks whose products you want to use as inputs.
# It is None here, i.e. this notebook declares no upstream tasks.
upstream = None

In [2]:
# Parameters
# NOTE(review): this cell looks like it is injected by the pipeline runner; confirm
# before editing by hand, since manual edits may be overwritten on the next run.
# NOTE(review): the value below contains a literal f'...' wrapper. The later
# .replace("'", "") applied to model_name only strips the quotes, so the resulting
# name carries "fcarlsen" rather than "carlsen" - presumably a quoting slip where
# the parameter is defined; verify there.
chess_player = "f'carlsen'"
# Tokenized training data; forwarded to train_model() as its `dataset` argument.
dataset = "./data/tokens/carlsen_max_768_bos.tok"
# NOTE(review): absolute, machine-specific path - presumably supplied by the runner; verify.
product = {"nb": "/teamspace/studios/this_studio/ChessOps/output/finetune.ipynb"}


In [3]:
from transformers import AutoModelForCausalLM
from src.train import ChessTrainer
from peft import LoraConfig, get_peft_model


In [4]:
## HYPERPARAMETERS
# -- optimisation --
EPOCHS = 10  # number of complete passes over the dataset
LEARNING_RATE = 0.0001  # how fast the model should learn
BATCH_SIZE = 16  # pick the largest batch size that still fits on your GPU
USE_FP16 = True  # mixed precision training (GPU only)

# -- checkpointing and reporting --
SAVE_STEPS = 2_000  # a checkpoint is saved every this many steps
LOGGING_STEPS = 50  # the model is validated and published to Weights & Biases every this many steps
SKIP_VALIDATION = True  # when True, skip validation and only save model checkpoints
WEIGHTS_AND_BIASES_ENABLED = True  # turn logging to Weights & Biases on or off

# -- notation --
XLANPLUS_ENABLED = True  # use the xLanPlus tokenizer

In [5]:
## MODEL
# Hugging Face identifier ("<organisation>/<model>") of the base model that is loaded
# for fine-tuning; the part after the "/" is also reused when building model_name.
PEFT_BASE_MODEL = "Leon-LLM/Leon-Chess-350k-Plus"  # base model to be loaded (from Hugging Face) for fine-tuning

In [6]:
## CONFIG FOR FINE-TUNING
# LoRA rank: lower means faster training, but might underfit because of less
# complexity (experiments don't show that training time increases, which is
# rather weird).
R = 128

# Scaling factor that adjusts the magnitude of the combined result (balances the
# pretrained model's knowledge and the new task-specific adaptation).
LORA_ALPHA = 32

# Dropout value passed to LoraConfig as `lora_dropout`.
LORA_DROPOUT = 0.1

In [7]:
## PATHS
# Name of the fine-tuned model: base-model name (the part after the "/"), the
# player, the epoch count and the learning rate, with any single quotes removed.
model_name = "_".join(
    [
        PEFT_BASE_MODEL.split("/")[1],
        "LoRA",
        f"{chess_player}",
        f"{EPOCHS}E",
        f"{LEARNING_RATE}LR",
    ]
).replace("'", "")
# Directory prefix under which the training output is written.
output_path = "models/"

In [8]:
def create_model(debug=True):
    """Build the LoRA-wrapped version of the base model.

    Loads ``PEFT_BASE_MODEL`` through ``AutoModelForCausalLM.from_pretrained``
    and hands it to ``get_peft_model`` together with a ``LoraConfig`` assembled
    from the notebook-level ``R``, ``LORA_ALPHA`` and ``LORA_DROPOUT`` settings.

    Args:
        debug: when truthy, print the resulting PEFT model.

    Returns:
        The object returned by ``get_peft_model``.
    """
    # Reference: https://huggingface.co/docs/peft/v0.10.0/en/package_reference/lora#peft.LoraConfig
    lora_settings = LoraConfig(
        # Does not need to be changed for our use case.
        task_type="CAUSAL_LM",
        # Keep False for training; only switch it later for inference.
        inference_mode=False,
        r=R,
        lora_alpha=LORA_ALPHA,
        lora_dropout=LORA_DROPOUT,
    )

    base_model = AutoModelForCausalLM.from_pretrained(PEFT_BASE_MODEL)
    adapted_model = get_peft_model(base_model, lora_settings)

    if debug:
        print(f"peft_model created: {adapted_model}")

    return adapted_model

In [9]:
def train_model(model, dataset, output_dir, debug=True):
    """Run a ``ChessTrainer`` training session for the given PEFT model.

    The trainer is configured from the notebook-level hyperparameter constants
    (batch size, learning rate, epochs, save/logging steps, validation,
    Weights & Biases and FP16 switches, notation).

    Args:
        model: model handed to ``ChessTrainer`` through its ``peft`` argument.
        dataset: value handed to ``ChessTrainer`` as ``input_file``.
        output_dir: value handed to ``ChessTrainer`` as ``output_dir``.
        debug: when truthy, print the three arguments above before training.

    Returns:
        None.
    """
    if debug:
        for label, value in (
            ("model", model),
            ("dataset", dataset),
            ("output_dir", output_dir),
        ):
            print(f"{label}: {value}")

    # The notation string follows the tokenizer switch.
    if XLANPLUS_ENABLED:
        notation = "xLANplus"
    else:
        notation = "xLAN"

    trainer_options = {
        "batch_size": BATCH_SIZE,
        "learning_rate": LEARNING_RATE,
        "epochs": EPOCHS,
        "input_file": dataset,
        "output_dir": output_dir,
        "save_steps": SAVE_STEPS,
        "logging_steps": LOGGING_STEPS,
        "skip_validation": SKIP_VALIDATION,
        "weight_and_biases": WEIGHTS_AND_BIASES_ENABLED,
        "use_FP16": USE_FP16,
        "notation": notation,
        "peft": model,
    }

    ChessTrainer(**trainer_options).train()

In [10]:
def push_model_to_hf(model, name, debug=True):
    """Push ``model`` to the Hugging Face Hub under the repository ``name``.

    Args:
        model: object exposing ``push_to_hub(repo_name)``.
        name: repository name the model is pushed to.
        debug: when truthy, print the call (including the model repr) first.

    Returns:
        None.
    """
    if debug:
        print(f"push_model_to_hf(model={model}, name={name})")
    # Use the `name` argument rather than the notebook-level `model_name`, so the
    # function pushes to the repository the caller actually asked for.
    model.push_to_hub(name)

In [11]:
# End-to-end run: build the LoRA model, train it, then push it to the Hub.
model = create_model()
train_model(model=model, dataset=dataset, output_dir=f"{output_path}{model_name}")
push_model_to_hf(model=model, name=model_name)



peft_model created: PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): GPT2LMHeadModel(
      (transformer): GPT2Model(
        (wte): Embedding(82, 768)
        (wpe): Embedding(1024, 768)
        (drop): Dropout(p=0.1, inplace=False)
        (h): ModuleList(
          (0-11): 12 x GPT2Block(
            (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (attn): GPT2Attention(
              (c_attn): lora.Linear(
                (base_layer): Conv1D()
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=768, out_features=128, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=128, out_features=2304, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
     

[34m[1mwandb[0m: Currently logged in as: [33mschmila7[0m ([33mleon-llm[0m). Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Tracking run with wandb version 0.17.0


[34m[1mwandb[0m: Run data is saved locally in [35m[1m/teamspace/studios/this_studio/ChessOps/wandb/run-20240519_161357-6alogt8k[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.


[34m[1mwandb[0m: Syncing run [33mmodels[0m


[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/leon-llm/ChessOps[0m


[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/leon-llm/ChessOps/runs/6alogt8k[0m


Step,Training Loss
50,3.1389
100,2.0241
150,1.799
200,1.6411
250,1.4711
300,1.2046
350,1.1171
400,1.0578
450,1.0225
500,0.9786






[34m[1mwandb[0m: - 0.004 MB of 0.004 MB uploaded

[34m[1mwandb[0m: \ 0.004 MB of 0.004 MB uploaded

[34m[1mwandb[0m: | 0.026 MB of 0.026 MB uploaded

[34m[1mwandb[0m:                                                                                


[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:         train/epoch ▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
[34m[1mwandb[0m:   train/global_step ▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
[34m[1mwandb[0m:     train/grad_norm ▄▁▃▃▃▄▃▅▅▄▃▃▃▄▅▆▅▅█▃▃▃▅▃▆▃▄▄▃▂▅▄▃▃▃▂▄▃▂▂
[34m[1mwandb[0m: train/learning_rate ███▇▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▁▁▁
[34m[1mwandb[0m:          train/loss █▅▄▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run summary:
[34m[1mwandb[0m:               total_flos 2.312359537333248e+16
[34m[1mwandb[0m:              train/epoch 10.0
[34m[1mwandb[0m:        train/global_step 3500
[34m[1mwandb[0m:          train/grad_norm 0.23497
[34m[1mwandb[0m:      train/learning_rate 0.0
[34m[1mwandb[0m:               train/loss 0.6409
[34m[1mwandb[0m:               train_loss 0.81137
[34m[1mwandb[0m:            train_runtime 2008.1916
[34m[1mwandb[0m: train_samples_per_second 27.871
[34m

[34m[1mwandb[0m: 🚀 View run [33mmodels[0m at: [34m[4mhttps://wandb.ai/leon-llm/ChessOps/runs/6alogt8k[0m
[34m[1mwandb[0m: ⭐️ View project at: [34m[4mhttps://wandb.ai/leon-llm/ChessOps[0m
[34m[1mwandb[0m: Synced 6 W&B file(s), 0 media file(s), 0 artifact file(s) and 0 other file(s)


[34m[1mwandb[0m: Find logs at: [35m[1m./wandb/run-20240519_161357-6alogt8k/logs[0m


push_model_to_hf(model=PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): GPT2LMHeadModel(
      (transformer): GPT2Model(
        (wte): Embedding(82, 768)
        (wpe): Embedding(1024, 768)
        (drop): Dropout(p=0.1, inplace=False)
        (h): ModuleList(
          (0-11): 12 x GPT2Block(
            (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (attn): GPT2Attention(
              (c_attn): lora.Linear(
                (base_layer): Conv1D()
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=768, out_features=128, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=128, out_features=2304, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
  

adapter_model.safetensors:   0%|          | 0.00/18.9M [00:00<?, ?B/s]