In [1]:
# Declare a list of tasks whose products you want to use as inputs.
# It is set to None here, i.e. this notebook declares no upstream tasks.
upstream = None

In [2]:
# Parameters
# Player whose games are used for fine-tuning; it is appended to the model name.
# The value previously read "f'carlsen'" (a stray f-string prefix and quotes
# inside the string), which leaked into the model/output directory name.
# NOTE(review): this cell looks like it is injected by the pipeline runner, so
# the same fix presumably has to be made in the pipeline spec as well - confirm.
chess_player = "carlsen"
# NOTE(review): absolute, machine-specific path; presumably supplied by the
# pipeline runner rather than written by hand - confirm before changing it.
product = {"nb": "/Users/larscarlschmid/Documents/_repos/ChessOps/output/finetune.ipynb"}


In [3]:
import torch
from transformers import AutoModelForCausalLM
from src.train import ChessTrainer
from peft import LoraConfig, get_peft_model


In [4]:
## HYPERPARAMETERS
# Batch size: pick the largest value that still fits on your GPU.
BATCH_SIZE = 16
# Checkpoint interval: a checkpoint is saved every SAVE_STEPS steps.
SAVE_STEPS = 2_000
# Logging interval: how often the model is validated and published to Weights & Biases.
LOGGING_STEPS = 50
# Number of epochs, i.e. how many passes over the dataset.
EPOCHS = 1
# Learning rate: how fast the model should learn.
LEARNING_RATE = 1e-4
# When True, validation is skipped and only model checkpoints are saved.
SKIP_VALIDATION = True
# Toggle logging to Weights & Biases.
WEIGHTS_AND_BIASES_ENABLED = False
# Toggle mixed precision training (GPU only).
USE_FP16 = True
# Toggle the xLanPlus tokenizer.
XLANPLUS_ENABLED = True

In [5]:
## MODEL
# Identifier of the pretrained base model. It is passed to
# AutoModelForCausalLM.from_pretrained() in create_model(), and the part after
# the "/" is reused as the prefix of model_name.
PEFT_BASE_MODEL = "Leon-LLM/Leon-Chess-350k-BOS"

In [6]:
## CONFIG FOR FINE-TUNING
# These three values are forwarded to LoraConfig in create_model().

# LoRA rank (LoraConfig `r`). Lower means faster training, but might underfit
# because of less complexity (experiments don't show that training time
# increases, which is rather weird).
R = 128
# Scaling factor that adjusts the magnitude of the combined result (balances
# the pretrained model's knowledge and the new task-specific adaptation).
LORA_ALPHA = 32
# Passed to LoraConfig as `lora_dropout`.
LORA_DROPOUT = 0.1

In [7]:
## PATHS
# Both paths are relative to the current working directory, so the notebook
# does not depend on any particular user's home directory.
dataset = "./data/tokens/carlsen_max_768.tok"
# Keep the trailing slash: the run cell builds the final directory with
# plain concatenation (output_dir + model_name).
output_dir = "models/"
# Name of the fine-tuned model: last component of the base model id, followed
# by "_LoRA_" and the player name. rsplit(...)[-1] gives the same result as
# split('/')[1] for "org/name" ids and also works for ids without a namespace.
model_name = f"{PEFT_BASE_MODEL.rsplit('/', 1)[-1]}_LoRA_{chess_player}"

In [8]:
def create_model():
    """Load the base model and wrap it with a LoRA adapter.

    The base model is loaded from ``PEFT_BASE_MODEL`` via
    ``AutoModelForCausalLM.from_pretrained`` and handed to ``get_peft_model``
    together with a ``LoraConfig`` built from the module-level constants
    ``R``, ``LORA_ALPHA`` and ``LORA_DROPOUT``.

    Returns:
        The object returned by ``get_peft_model``.
    """
    # LoraConfig reference:
    # https://huggingface.co/docs/peft/v0.10.0/en/package_reference/lora#peft.LoraConfig
    lora_settings = {
        # This does not need to be changed for our use case.
        "task_type": "CAUSAL_LM",
        # Don't change this for training, only later for inference.
        "inference_mode": False,
        "r": R,
        "lora_alpha": LORA_ALPHA,
        "lora_dropout": LORA_DROPOUT,
    }
    adapter_config = LoraConfig(**lora_settings)

    base_model = AutoModelForCausalLM.from_pretrained(PEFT_BASE_MODEL)
    return get_peft_model(base_model, adapter_config)

In [9]:
def train_model(model, dataset, output_dir, debug=True):
    """Set up a ChessTrainer for the given model (training itself is stubbed).

    Args:
        model: Object passed to ``ChessTrainer`` as ``peft``.
        dataset: Passed to ``ChessTrainer`` as ``input_file``.
        output_dir: Passed to ``ChessTrainer`` as ``output_dir``.
        debug: When truthy, print the three arguments before building the
            trainer.

    Returns:
        None. The trainer is constructed but ``trainer.train()`` is still
        commented out; a placeholder line is printed instead.
    """
    if debug:
        print(f"model: {model}")
        print(f"dataset: {dataset}")
        print(f"output_dir: {output_dir}")

    # Everything except the three arguments comes from the notebook-level
    # hyperparameter constants.
    trainer_options = dict(
        batch_size=BATCH_SIZE,
        learning_rate=LEARNING_RATE,
        epochs=EPOCHS,
        input_file=dataset,
        output_dir=output_dir,
        save_steps=SAVE_STEPS,
        logging_steps=LOGGING_STEPS,
        skip_validation=SKIP_VALIDATION,
        weight_and_biases=WEIGHTS_AND_BIASES_ENABLED,
        use_FP16=USE_FP16,
        notation="xLANplus" if XLANPLUS_ENABLED else "xLAN",
        peft=model,
    )
    trainer = ChessTrainer(**trainer_options)

    # trainer.train() # TODO: uncomment later
    print("trainer.train()")

In [10]:
def push_model_to_hf(model, name):
    """Placeholder for uploading the model to the Hugging Face Hub.

    Nothing is uploaded yet: the function only prints the call it stands in
    for, including the string forms of ``model`` and ``name``.

    Args:
        model: The model that would be pushed.
        name: The name the model would be pushed under.

    Returns:
        None.
    """
    # TODO: handle login...
    # model.push_to_hub("your-name/bigscience/mt0-large-lora")
    placeholder_message = f"push_model_to_hf(model={model}, name={name})"
    print(placeholder_message)

In [11]:
# Build the LoRA-wrapped model, set up training for it, then hand it to the
# (currently stubbed) Hugging Face upload step.
model = create_model()
train_model(
    model=model,
    dataset=dataset,
    # output_dir ends with "/", so concatenation yields "<output_dir><model_name>".
    output_dir=output_dir + model_name,
)
push_model_to_hf(model=model, name=model_name)

model: PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): GPT2LMHeadModel(
      (transformer): GPT2Model(
        (wte): Embedding(76, 768)
        (wpe): Embedding(512, 768)
        (drop): Dropout(p=0.1, inplace=False)
        (h): ModuleList(
          (0-11): 12 x GPT2Block(
            (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (attn): GPT2Attention(
              (c_attn): lora.Linear(
                (base_layer): Conv1D()
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=768, out_features=128, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=128, out_features=2304, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
   

