In [10]:
!git clone https://github.com/daniel/r2ai-model.git


Cloning into 'r2ai-model'...
git@github.com: Permission denied (publickey).
fatal: Could not read from remote repository.

Please make sure you have the correct access rights
and the repository exists.


In [20]:
import datasets

# Fixed seed so the train/test partition is identical on every run of this cell.
SPLIT_SEED = 42

# Load the JSONL file from the cloned repository as a single "train" split.
dataset = datasets.load_dataset("json", data_files="r2ai-model/data/radare2/radare2_train.jsonl", split="train")
# Hold out 10% of the examples as a test split and persist both splits to disk;
# datasource.py reads them back from the 'r2_dataset' directory.
split = dataset.train_test_split(test_size=0.1, seed=SPLIT_SEED)
split.save_to_disk('r2_dataset')


Saving the dataset (0/1 shards):   0%|          | 0/3400 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/378 [00:00<?, ? examples/s]

In [21]:
# Reload the splits saved under 'r2_dataset'. NOTE: this rebinds `dataset`, which
# earlier held the raw single-split dataset, to the saved train/test collection.
dataset = datasets.load_from_disk('r2_dataset') 

DatasetDict({
    train: Dataset({
        features: ['messages'],
        num_rows: 3400
    })
    test: Dataset({
        features: ['messages'],
        num_rows: 378
    })
})

In [23]:
%%writefile datasource.py

import datasets
import copy
import itertools

# Token id used to find the end of each turn in the chat-templated token list.
# NOTE(review): presumably the Llama 3 `<|eot_id|>` special token -- confirm it
# matches the tokenizer of the model being fine-tuned.
EOT_ID = 128009

def mask_target(target,seq):
    """Replace every occurrence of ``target`` inside ``seq`` with -100, in place.

    Args:
        target: list of token ids to search for.
        seq: list of token ids (labels) to scan; it is modified in place.

    Returns:
        The same ``seq`` object, after masking.
    """
    width = len(target)
    if width == 0:
        # Nothing to search for; leave the sequence untouched.
        return seq
    # The last valid window starts at len(seq) - width, hence the "+ 1":
    # without it a match at the very end of ``seq`` is never examined.
    for start in range(len(seq) - width + 1):
        if seq[start:start + width] == target:
            seq[start:start + width] = [-100] * width
    return seq

def get_custom_dataset(dataset_config, tokenizer, split):
    """Load one split of the saved 'r2_dataset' and tokenize it for fine-tuning.

    Args:
        dataset_config: accepted as part of the call signature; not read here.
        tokenizer: tokenizer providing ``apply_chat_template`` and ``encode``.
        split: ``'train'`` selects the train split; any other value selects
            the test split.

    Returns:
        The selected split with the ``messages`` column replaced by
        ``input_ids``, ``labels`` and ``attention_mask``.
    """
    # These lookups do not depend on the example, so compute them once rather
    # than on every call of tokenize_function.
    # encode() may prepend special tokens, so the role id is read from the end.
    system_or_user = (tokenizer.encode("system")[-1], tokenizer.encode("user")[-1])
    assistant_header = tokenizer.encode(
        "<|start_header_id|>assistant<|end_header_id|>", add_special_tokens=False
    )

    def tokenize_function(messages):
        """Tokenize one conversation and build labels that skip non-assistant text."""
        dialog_tokens = tokenizer.apply_chat_template(messages)
        eot_indices = [i for i, n in enumerate(dialog_tokens) if n == EOT_ID]
        labels = copy.copy(dialog_tokens)
        labels[0] = -100 # bos token
        last_idx = 1
        for idx in eot_indices:
            # NOTE(review): assumes every turn starts with a header token
            # followed directly by the role token -- confirm against the
            # chat template of the tokenizer in use.
            role_token = labels[last_idx+1]
            if role_token in system_or_user:
                # Set labels to -100 for system and user tokens to ignore in loss function
                labels[last_idx:idx+1] = [-100] * (idx-last_idx+1)
            last_idx = idx + 1
        # The assistant header itself is prompt scaffolding, so mask it as well.
        mask_target(assistant_header, labels)

        input_ids = list(dialog_tokens)
        return {
            "input_ids": input_ids,
            "labels": list(labels),
            "attention_mask": [1] * len(input_ids),
        }

    dataset = datasets.load_from_disk('r2_dataset')
    if split == 'train':
        dataset = dataset['train']
    else:
        dataset = dataset['test']
    dataset = dataset.map(lambda x: tokenize_function(x['messages']), remove_columns=['messages'])
    return dataset



Overwriting datasource.py


In [3]:

# Log in to the Hugging Face Hub from inside the notebook.
# NOTE(review): presumably required so the gated meta-llama weights used by the
# training cell can be downloaded -- confirm.
from huggingface_hub import interpreter_login

interpreter_login()



    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|



In [4]:
%%writefile finetuning.py

import fire
from llama_recipes.finetuning import main

# When run as a script, hand llama_recipes' `main` to python-fire so that it is
# driven from the command line.
if __name__ == "__main__":
    fire.Fire(main)

Writing finetuning.py


In [2]:
# Check that the Hugging Face token is present in the environment without
# printing the secret itself, so it cannot end up in the saved notebook output.
import os

print("HF_TOKEN is set" if os.environ.get("HF_TOKEN") else "HF_TOKEN is not set")




In [26]:

!TOKENIZER_PARALLELISM=1 torchrun --nnodes 1 --nproc_per_node 4 finetuning.py \\
  --enable_fsdp \\
  --model_name meta-llama/Llama-3.2-1B-Instruct \\
  --dist_checkpoint_root_folder model_checkpoints \\
  --dist_checkpoint_folder fine-tuned \\
  --fsdp_config.pure_bf16 \\
  --use_fast_kernels \\
  --dataset "custom_dataset" \\
  --custom_dataset.file "datasource.py" \\
  --batching_strategy "padding"
  


W1031 16:08:33.721000 125173606377280 torch/distributed/run.py:779] 
W1031 16:08:33.721000 125173606377280 torch/distributed/run.py:779] *****************************************
W1031 16:08:33.721000 125173606377280 torch/distributed/run.py:779] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
W1031 16:08:33.721000 125173606377280 torch/distributed/run.py:779] *****************************************
  from torch.distributed._shard.checkpoint import (
  from torch.distributed._shard.checkpoint import (
  from torch.distributed._shard.checkpoint import (
  from torch.distributed._shard.checkpoint import (
Clearing GPU cache for all ranks
--> Running with torch dist debug set to detail
--> Model meta-llama/Llama-3.2-1B-Instruct

--> meta-llama/Llama-3.2-1B-Instruct has 1235.8144 Million params

bFloat16 enabled for mixed precis