In [1]:
!pip install --upgrade xllm



In [2]:
!wandb login

[34m[1mwandb[0m: Currently logged in as: [33mhetarthvader[0m ([33mcomplex_dnn[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [3]:
import torch
import xllm

cuda_is_available = torch.cuda.is_available()

print(f"X—LLM version: {xllm.__version__}\nTorch version: {torch.__version__}\nCuda is available: {cuda_is_available}")
assert cuda_is_available

  from .autonotebook import tqdm as notebook_tqdm


X—LLM version: 0.1.7
Torch version: 2.1.0+cu121
Cuda is available: True


In [4]:
from xllm import Config
from xllm.datasets import GeneralDataset
from xllm.experiments import Experiment
from datasets import load_dataset
from tqdm import tqdm 
from torch.utils.data import IterableDataset
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, logging, set_seed

In [5]:
config = {
    "model_path": "bigcode/starcoderbase-1b",
    "dataset_name": "ArmelR/stack-exchange-instruction",
    "subset": "data/finetune",
    "split": "train",
    "size_valid_set": 1000,
    "streaming": True,
    "shuffle_buffer": 5000,  # This value is not provided in the command, so the default value is used
    "input_column_name": "input",
    "output_column_name": "output",
    "seq_length": 1024,
    "max_steps": 500,
    "batch_size": 1,
    "gradient_accumulation_steps": 8,
    "eos_token_id": 49152,  # This value is not provided in the command, so the default value is used
    "lora_r": 16,  # This value is not provided in the command, so the default value is used
    "lora_alpha": 32,  # This value is not provided in the command, so the default value is used
    "lora_dropout": 0.05,  # This value is not provided in the command, so the default value is used
    "learning_rate": 1e-4,
    "lr_scheduler_type": "cosine",
    "num_warmup_steps": 10,
    "weight_decay": 0.05,
    "local_rank": 0,  # This value is not provided in the command, so the default value is used
    "no_fp16": False,  # This value is not provided in the command, so the default value is used
    "bf16": True,  # This value is not provided in the command, so the default value is used
    "no_gradient_checkpointing": False,  # This value is not provided in the command, so the default value is used
    "seed": 0,  # This value is not provided in the command, so the default value is used
    "num_workers": None,  # This value is not provided in the command, so the default value is used
    "output_dir": "./checkpoints",  # This value is not provided in the command, so the default value is used
    "log_freq": 1,  # This value is not provided in the command, so the default value is used
    "eval_freq": 1000,  # This value is not provided in the command, so the default value is used
    "save_freq": 1000  # This value is not provided in the command, so the default value is used
}

In [13]:
def chars_token_ratio(dataset, tokenizer, input_column_name="prompt", output_column_name="completion", nb_examples=400):
    """
    Estimate the average number of characters per token in the dataset.
    """
    total_characters, total_tokens = 0, 0
    for _, example in tqdm(zip(range(nb_examples), iter(dataset)), total=nb_examples):
        text = prepare_sample_text(example, input_column_name, output_column_name)
        total_characters += len(text)
        if tokenizer.is_fast:
            total_tokens += len(tokenizer(text).tokens())
        else:
            total_tokens += len(tokenizer.tokenize(text))

    return total_characters / total_tokens


def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )


def prepare_sample_text(example, input_column_name="prompt", output_column_name="completion"):
    """Prepare the text from a sample of the dataset."""
    text = f"Question: {example[input_column_name]}\n\nAnswer: {example[output_column_name]}"
    return text


class ConstantLengthDataset(IterableDataset):
    """
    Iterable dataset that returns constant length chunks of tokens from stream of text files.
        Args:
            tokenizer (Tokenizer): The processor used for proccessing the data.
            dataset (dataset.Dataset): Dataset with text files.
            infinite (bool): If True the iterator is reset after dataset reaches end else stops.
            seq_length (int): Length of token sequences to return.
            num_of_sequences (int): Number of token sequences to keep in buffer.
            chars_per_token (int): Number of characters per token used to estimate number of tokens in text buffer.
    """

    def __init__(
        self,
        tokenizer,
        dataset,
        infinite=False,
        seq_length=1024,
        num_of_sequences=1024,
        chars_per_token=3.6,
        input_column_name="prompt",
        output_column_name="completion"
    ):
        self.tokenizer = tokenizer
        self.concat_token_id = tokenizer.eos_token_id if tokenizer.eos_token_id is not None else args.eos_token_id
        self.dataset = dataset
        self.seq_length = seq_length
        self.infinite = infinite
        self.current_size = 0
        self.max_buffer_size = seq_length * chars_per_token * num_of_sequences
        self.input_column_name = input_column_name
        self.output_column_name = output_column_name

    def __iter__(self):
        iterator = iter(self.dataset)
        more_examples = True
        while more_examples:
            buffer, buffer_len = [], 0
            while True:
                if buffer_len >= self.max_buffer_size:
                    break
                try:
                    buffer.append(prepare_sample_text(next(iterator), self.input_column_name, self.output_column_name))
                    buffer_len += len(buffer[-1])
                except StopIteration:
                    if self.infinite:
                        iterator = iter(self.dataset)
                    else:
                        more_examples = False
                        break
            tokenized_inputs = self.tokenizer(buffer, truncation=False)["input_ids"]
            all_token_ids = []
            for tokenized_input in tokenized_inputs:
                all_token_ids.extend(tokenized_input + [self.concat_token_id])
            for i in range(0, len(all_token_ids), self.seq_length):
                input_ids = all_token_ids[i : i + self.seq_length]
                if len(input_ids) == self.seq_length:
                    self.current_size += 1
                    yield {
                        "input_ids": torch.LongTensor(input_ids),
                        "labels": torch.LongTensor(input_ids),
                    }


def create_datasets(tokenizer, config):
    dataset = load_dataset('json', data_files='/home/bzd2/ansible-scraping/data/ftdata.json',
        # config["dataset_name"],
        # data_dir=config["subset"],
        # split=config["split"],
        use_auth_token=True,
        # num_proc=config["num_workers"] if not config["streaming"] else None,
        # streaming=config["streaming"],
    )
    return dataset 
    # if config["streaming"]:
    #     print("Loading the dataset in streaming mode")
    #     valid_data = dataset.take(config["size_valid_set"])
    #     train_data = dataset.skip(config["size_valid_set"])
    #     for i, sample in enumerate(train_data):
    #         print("<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<FOR DEBUGGING PURPOSES ONLY!>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")
    #         print(sample)
    #         if i ==10:
    #             break

    #     train_data = train_data.shuffle(buffer_size=config["shuffle_buffer"], seed=config["seed"])
    # else:
    #     train_data = dataset["train"]
    #     valid_data = dataset["test"]
    #     print(f"Size of the train set: {len(train_data)}. Size of the validation set: {len(valid_data)}")

    # chars_per_token = chars_token_ratio(train_data, tokenizer, config["input_column_name"], config["output_column_name"])
    # print(f"The character to token ratio of the dataset is: {chars_per_token:.2f}")

    # train_dataset = ConstantLengthDataset(
    #     tokenizer,
    #     train_data,
    #     infinite=True,
    #     seq_length=config["seq_length"],
    #     chars_per_token=chars_per_token,
    #     input_column_name=config["input_column_name"],
    #     output_column_name=config["output_column_name"]
    # )
    # valid_dataset = ConstantLengthDataset(
    #     tokenizer,
    #     valid_data,
    #     infinite=False,
    #     seq_length=config["seq_length"],
    #     chars_per_token=chars_per_token,
    #     input_column_name=config["input_column_name"],
    #     output_column_name=config["output_column_name"]
    # )
    # return train_dataset, valid_dataset



tokenizer = AutoTokenizer.from_pretrained(config["model_path"], use_auth_token=True)
dataset=create_datasets(tokenizer, config)
# train_dataset, eval_dataset = create_datasets(tokenizer, config)



In [21]:
train_data = list()

for sample in dataset["train"]:
    train_data.append({"text": sample["input"].strip() + "\n \n \n"+sample['output'.strip()]})

In [22]:
train_data

[{'text': 'name: Cleanup health monitor\n \n \na10.acos_axapi.a10_class_list:\n  name: a10_class_list\n  state: absent\n'},
 {'text': 'name: Create class list for acos\n \n \na10.acos_axapi.a10_class_list:\n  name: a10_class_list\n  ntype: ac\n  ac_list:\n  - ac_match_type: contains\n    ac_key_string: apple.com\n  - ac_match_type: equals\n    ac_key_string: logmein123.com\n  - ac_match_type: ends-with\n    ac_key_string: office.com\nregister: class_list\n'},
 {'text': 'name: Cleanup a10.acos_axapi.a10_delete_bw_list instance\n \n \na10.acos_axapi.a10_delete_bw_list:\n  file_name: bw_list.txt\n'},
 {'text': 'name: Create a10.acos_axapi.a10_file_bw_list instance\n \n \na10.acos_axapi.a10_file_bw_list:\n  state: present\n  file: bw_list.txt\n  file_path: ../ansible/roles/bw_list/tasks/bw_list.txt\n  file_handle: bw_list.txt\n  action: import\n  dst_file: bw_list.txt\nregister: bw_list\n'},
 {'text': 'name: Cleanup router bgp\n \n \na10.acos_axapi.a10_router_bgp:\n  as_number: 106\n  stat

In [25]:
train_dataset = GeneralDataset(data=train_data, separator="\n\n")

In [27]:
# # model must be sharded
# backbone_model_name = "TinyPixel/Llama-2-7B-bf16-sharded"
# backbone_model_name = "bn22/Mistral-7B-v0.1-sharded"
backbone_model_name = "bigcode/starcoderbase-1b"
push_to_hub_while_training = True
# lora_hub_model_id = "BobaZooba/AntModel-7B-XLLM-Demo-LoRA"
# hub_model_id = "BobaZooba/AntModel-7B-XLLM-Demo"

max_steps = 100
save_steps = 25
warmup_steps = 5

report_to_wandb = False
wandb_project = None
wandb_entity = None

In [28]:
if report_to_wandb and wandb_project:
    print("Please set at least wandb_project for W&B tracking. wandb_entity is your or your company username at W&B")

In [29]:
config = Config(
    use_gradient_checkpointing=True,
    model_name_or_path=backbone_model_name,
    use_flash_attention_2=False,  # not supported in colab
    load_in_4bit=True,
    prepare_model_for_kbit_training=True,
    apply_lora=True,
    warmup_steps=warmup_steps,
    max_steps=max_steps,
    save_steps=save_steps,
    logging_steps=1,

    per_device_train_batch_size=2,
    gradient_accumulation_steps=2,
    max_length=2048,

    # tokenizer_padding_side="right",  # good for llama2

    # push_to_hub=push_to_hub_while_training,
    # hub_model_id=lora_hub_model_id,
    # hub_private_repo=False,

    # W&B
    report_to_wandb=False,
    wandb_project=wandb_project,
    wandb_entity=wandb_entity,
)

In [30]:
experiment = Experiment(config=config, train_dataset=train_dataset)

In [31]:
experiment.build()

[32m2023-11-29 17:07:08.283[0m | [1mINFO    [0m | [36mxllm.utils.logger[0m:[36minfo[0m:[36m86[0m - [1mExperiment building has started[0m
[32m2023-11-29 17:07:08.285[0m | [1mINFO    [0m | [36mxllm.utils.logger[0m:[36minfo[0m:[36m86[0m - [1mConfig:
{
  "experiment_key": "base",
  "save_safetensors": true,
  "max_shard_size": "10GB",
  "local_rank": 0,
  "use_gradient_checkpointing": true,
  "trainer_key": "lm",
  "force_fp32": false,
  "force_fp16": false,
  "from_gptq": false,
  "huggingface_hub_token": null,
  "deepspeed_stage": 0,
  "deepspeed_config_path": null,
  "fsdp_strategy": "",
  "fsdp_offload": true,
  "seed": 42,
  "stabilize": false,
  "norm_fp32": false,
  "path_to_env_file": "./.env",
  "prepare_dataset": true,
  "lora_hub_model_id": null,
  "lora_model_local_path": null,
  "fused_model_local_path": null,
  "fuse_after_training": false,
  "quantization_dataset_id": null,
  "quantization_max_samples": 1024,
  "quantized_model_path": "./quantized_mode

TypeError: TrainingArguments.__init__() got an unexpected keyword argument 'neftune_noise_alpha'