In [1]:
# Code adapted from https://github.com/huggingface/trl/blob/main/examples/research_projects/stack_llama/scripts/supervised_finetuning.py
# and https://huggingface.co/blog/gemma-peft
import argparse
import multiprocessing
import os
from datasets import load_dataset
import torch
import transformers
from accelerate import PartialState
from datasets import load_dataset
from peft import LoraConfig
from transformers import (
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    logging,
    set_seed,
)
import datasets
from datasets import Dataset, DatasetDict
from trl import SFTTrainer, SFTConfig

In [2]:
ds2 = datasets.load_from_disk("tbricksnext2linesver5/tbricks")
ds2["train"].__getitem__(0)

{'content': '<fim-prefix>#pragma once #include <strategy/stream/TCPStream.h> #include <random> #include "BinaryStreamBuffer.h" #include <simple-websocket-server/crypto.hpp> #include <simple-websocket-server/utility.hpp> class IWebsocketHandler { public: virtual ~IWebsocketHandler() = default; virtual void HandleWebsocketConnected() = 0; virtual void HandleWebsocketDisconnected(const tbricks::String & reason) = 0; virtual void HandleError(const tbricks::String & error) = 0; virtual void HandleMessage(std::string_view message) = 0; }; class TbWebsocketClient : public tbricks::TCPStream::IHandler { public: struct Config { /// Timeout on request handling. Defaults to no timeout. long timeout_request = 0; /// Idle timeout. Defaults to no timeout. long timeout_idle = 0; /// Maximum size of incoming messages. Defaults to architecture maximum. /// Exceeding this limit will result in a message_size error code and the connection will be closed. std::size_t max_message_size = (std::numeric_limits

In [3]:
len(ds2['train'])

19012

In [4]:
parser = argparse.ArgumentParser()
parser.add_argument("--model_id", type=str, default="bigcode/starcoder2-7b")
#parser.add_argument("--dataset_name", type=str, default="bigcode/the-stack-smol")    # construct the dataset
#parser.add_argument("--subset", type=str, default="data/sql")                        # construct the subset   
#parser.add_argument("--split", type=str, default="train")                            # construct the split   
parser.add_argument("--dataset_text_field", type=str, default="content")

parser.add_argument("--max_seq_length", type=int, default=512)
parser.add_argument("--max_steps", type=int, default=10000)
parser.add_argument("--micro_batch_size", type=int, default=2)
parser.add_argument("--gradient_accumulation_steps", type=int, default=4)
parser.add_argument("--weight_decay", type=float, default=0.01)
parser.add_argument("--fp16", type=bool, default=True)

parser.add_argument("--attention_dropout", type=float, default=0.1)
parser.add_argument("--learning_rate", type=float, default=2e-4)
parser.add_argument("--lr_scheduler_type", type=str, default="cosine")
parser.add_argument("--warmup_steps", type=int, default=100)
parser.add_argument("--seed", type=int, default=0)
parser.add_argument("--output_dir", type=str, default="finetune_starcoder2")
parser.add_argument("--num_proc", type=int, default=2)#was none since i have 2 gpus why not
parser.add_argument("--push_to_hub", type=bool, default=False)

args = parser.parse_args(args=[])

In [5]:
args

Namespace(model_id='bigcode/starcoder2-7b', dataset_text_field='content', max_seq_length=512, max_steps=10000, micro_batch_size=2, gradient_accumulation_steps=4, weight_decay=0.01, fp16=True, attention_dropout=0.1, learning_rate=0.0002, lr_scheduler_type='cosine', warmup_steps=100, seed=0, output_dir='finetune_starcoder2', num_proc=2, push_to_hub=False)

In [6]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )


In [7]:
def main(args):
    # config
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )
    lora_config = LoraConfig(
        r=8,
        target_modules=[
            "q_proj",
            "o_proj",
            "k_proj",
            "v_proj",
            "gate_proj",
            "up_proj",
            "down_proj",
        ],
        task_type="CAUSAL_LM",
    )

    # load model and dataset
    
    model = AutoModelForCausalLM.from_pretrained(
        args.model_id,
        quantization_config=bnb_config,
        device_map={"": PartialState().process_index},
        attention_dropout=args.attention_dropout,
    )
    print_trainable_parameters(model)

    # data = load_dataset(
    #     args.dataset_name,
    #     data_dir=args.subset,
    #     split=args.split,
    #     token=token,
    #     num_proc=args.num_proc if args.num_proc else multiprocessing.cpu_count(),
    # )

    # setup the trainer
    trainer = SFTTrainer(
        model=model,
        train_dataset=ds2['train'],
       #max_seq_length=args.max_seq_length,
        peft_config=lora_config,
        args = SFTConfig(
            per_device_train_batch_size=args.micro_batch_size,
            gradient_accumulation_steps=args.gradient_accumulation_steps,
            warmup_steps=args.warmup_steps,
            max_steps=args.max_steps,
            learning_rate=args.learning_rate,
            lr_scheduler_type=args.lr_scheduler_type,
            weight_decay=args.weight_decay,
            fp16=args.fp16,
            logging_strategy="steps",
            logging_steps=10,
            output_dir=args.output_dir,
            optim="paged_adamw_8bit",
            seed=args.seed,
            run_name=f"train-{args.model_id.split('/')[-1]}",
            report_to="wandb",
            
            dataset_text_field=args.dataset_text_field,            
        )
        # args=transformers.TrainingArguments(

        # ),

    )

    # launch
    print("Training...")
    trainer.train()

    print("Saving the last checkpoint of the model")
    model.save_pretrained(os.path.join(args.output_dir, "final_checkpoint/"))
    if args.push_to_hub:
        trainer.push_to_hub("Upload model")
    print("Training Done! 💥")

In [8]:
set_seed(args.seed)
os.makedirs(args.output_dir, exist_ok=True)

logging.set_verbosity_error()

main(args)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

trainable params: 227091456 || all params: 3701040128 || trainable%: 6.13588202629723


Converting train dataset to ChatML:   0%|          | 0/19012 [00:00<?, ? examples/s]

Adding EOS to train dataset:   0%|          | 0/19012 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/19012 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/19012 [00:00<?, ? examples/s]

Training...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[34m[1mwandb[0m: Currently logged in as: [33msyuchicago[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


{'loss': 1.7006, 'grad_norm': 0.10989875346422195, 'learning_rate': 1.8e-05, 'num_tokens': 38574.0, 'mean_token_accuracy': 0.659176979213953, 'epoch': 0.004207868714496108}
{'loss': 1.6696, 'grad_norm': 0.1516902893781662, 'learning_rate': 3.8e-05, 'num_tokens': 76719.0, 'mean_token_accuracy': 0.6673619046807289, 'epoch': 0.008415737428992216}
{'loss': 1.7006, 'grad_norm': 0.2079722285270691, 'learning_rate': 5.8e-05, 'num_tokens': 113913.0, 'mean_token_accuracy': 0.6529517292976379, 'epoch': 0.012623606143488323}
{'loss': 1.5565, 'grad_norm': 0.2374574989080429, 'learning_rate': 7.800000000000001e-05, 'num_tokens': 149591.0, 'mean_token_accuracy': 0.6694134712219239, 'epoch': 0.016831474857984433}
{'loss': 1.4664, 'grad_norm': 0.3967311382293701, 'learning_rate': 9.8e-05, 'num_tokens': 182845.0, 'mean_token_accuracy': 0.6804351001977921, 'epoch': 0.021039343572480537}
{'loss': 1.2869, 'grad_norm': 0.4414127767086029, 'learning_rate': 0.000118, 'num_tokens': 220985.0, 'mean_token_accur