In [None]:
!git clone https://github.com/daniel/r2ai-model.git


In [20]:
import datasets
dataset = datasets.load_dataset("json", data_files="r2ai-model/data/radare2/radare2_train.jsonl", split="train")
split = dataset.train_test_split(test_size=0.1)
split.save_to_disk('r2_dataset')


Saving the dataset (0/1 shards):   0%|          | 0/3400 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/378 [00:00<?, ? examples/s]

In [21]:
dataset = datasets.load_from_disk('r2_dataset') 

DatasetDict({
    train: Dataset({
        features: ['messages'],
        num_rows: 3400
    })
    test: Dataset({
        features: ['messages'],
        num_rows: 378
    })
})

In [23]:
%%writefile datasource.py

import datasets
import copy
import itertools

EOT_ID = 128009

def mask_target(target,seq):
    for i in range(len(seq)-len(target)):
        if seq[i:i+len(target)] == target:
            seq[i:i+len(target)] = [-100] * len(target)
    return seq

def get_custom_dataset(dataset_config, tokenizer, split):
    
    def tokenize_function(messages):
        dialog_tokens = tokenizer.apply_chat_template(messages)
        eot_indices = [i for i,n in enumerate(dialog_tokens) if n == EOT_ID]
        labels = copy.copy(dialog_tokens)
        system_or_user = (tokenizer.encode("system")[-1], tokenizer.encode("user")[-1])
        labels[0] = -100 # bos token
        last_idx = 1
        for n, idx in enumerate(eot_indices):
            role_token = labels[last_idx+1]
            if role_token in system_or_user:
                # Set labels to -100 for system and user tokens to ignore in loss function
                labels[last_idx:idx+1] = [-100] * (idx-last_idx+1)
            last_idx = idx + 1
        mask_target(tokenizer.encode("<|start_header_id|>assistant<|end_header_id|>", add_special_tokens=False), labels)
        dialog_tokens = [dialog_tokens]
        labels_tokens = [labels]
        combined_tokens = {
            "input_ids": list(itertools.chain(*(t for t in dialog_tokens))),
            "labels": list(itertools.chain(*(t for t in labels_tokens))),
        }
    
        return dict(combined_tokens, attention_mask=[1]*len(combined_tokens["input_ids"]))

    dataset = datasets.load_from_disk('r2_dataset')
    if split == 'train':
        dataset = dataset['train']
    else:
        dataset = dataset['test']
    dataset = dataset.map(lambda x: tokenize_function(x['messages']), remove_columns=['messages'])
    return dataset



Overwriting datasource.py


In [3]:

from huggingface_hub import interpreter_login

interpreter_login()



    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|



In [4]:
%%writefile finetuning.py

import fire
from llama_recipes.finetuning import main

if __name__ == "__main__":
    fire.Fire(main)

Writing finetuning.py


In [34]:
model_name = "meta-llama/Llama-3.2-1B-Instruct"
name = 'r2ai-3.2-1B-Instruct'
num_epochs = 1
max_train_step = 0
batching_strategy = "padding"
num_nodes = 1
num_processes = 4
dist_checkpoint_root_folder = "/mnt/efs/checkpoints"
dist_checkpoint_folder = name
learning_rate = 1e-5


!TOKENIZER_PARALLELISM=1 torchrun --nnodes {num_nodes} --nproc_per_node {num_processes} finetuning.py --lr {learning_rate} --max_train_step {max_train_step} --enable_fsdp --model_name {model_name} --dist_checkpoint_root_folder {dist_checkpoint_root_folder} --dist_checkpoint_folder {dist_checkpoint_folder} --fsdp_config.pure_bf16 --use_fast_kernels --dataset "custom_dataset" --custom_dataset.file "datasource.py" --batching_strategy {batching_strategy} --num_epochs {num_epochs}
  

W1031 17:44:51.311000 128624762046272 torch/distributed/run.py:779] 
W1031 17:44:51.311000 128624762046272 torch/distributed/run.py:779] *****************************************
W1031 17:44:51.311000 128624762046272 torch/distributed/run.py:779] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
W1031 17:44:51.311000 128624762046272 torch/distributed/run.py:779] *****************************************
  from torch.distributed._shard.checkpoint import (
  from torch.distributed._shard.checkpoint import (
  from torch.distributed._shard.checkpoint import (
  from torch.distributed._shard.checkpoint import (
Clearing GPU cache for all ranks
--> Running with torch dist debug set to detail
--> Model meta-llama/Llama-3.2-1B-Instruct

--> meta-llama/Llama-3.2-1B-Instruct has 1235.8144 Million params

bFloat16 enabled for mixed precis

In [35]:
from transformers import AutoTokenizer

from llama_recipes.inference.model_utils import  load_llama_from_config
from llama_recipes.model_checkpointing import load_sharded_model_single_gpu
dist_checkpoint_root_folder = "/mnt/efs/checkpoints"

model_def = load_llama_from_config(model_name)
model = load_sharded_model_single_gpu(model_def, dist_checkpoint_root_folder + "/" + 'r2ai-3.2-1B-Instruct-meta-llama/Llama-3.2-1B-Instruct')
tokenizer = AutoTokenizer.from_pretrained(model_name)

save_path = dist_checkpoint_root_folder + "/" + 'hf/' + name
tokenizer.save_pretrained(save_path)
hf_model_path = model.save_pretrained(save_path)
print(f"HuggingFace model checkpoints has been saved in {hf_model_path}")

  dist_cp.load_state_dict(


Sharded state checkpoint loaded from /mnt/efs/checkpoints/r2ai-3.2-1B-Instruct-meta-llama/Llama-3.2-1B-Instruct
HuggingFace model checkpoints has been saved in None


In [69]:
eval_messages = [
  [{"role": "user", "content": "What is the capital of France?"}],
  [ 
    {"role": "system", "content": "\n\n***RADARE2 MODE: ON***\n\n"},
    {"role": "user", "content": "List all the functions"},
  ],
  [ 
    {"role": "system", "content": "\n\n***RADARE2 MODE: ON***\n\n"},
    {"role": "user", "content": "disassemble the main function"},
  ],

]

In [108]:
from llama_recipes.inference.model_utils import load_model
import torch
hf_model_path = dist_checkpoint_root_folder + "/" + 'hf/' + name
model = load_model(hf_model_path, None, True)
tokenizer.pad_token = tokenizer.eos_token
model.generation_config.pad_token_id = tokenizer.pad_token_id
tokenizer = AutoTokenizer.from_pretrained(hf_model_path)

for messages in eval_messages:
  print()
  print(messages[-1]['content'])
  prompt_tokens = tokenizer.apply_chat_template(messages)
  prompt_tokens = torch.tensor(prompt_tokens).long()
  prompt_tokens = prompt_tokens.unsqueeze(0).to("cuda")
  attention_mask = torch.ones_like(prompt_tokens)
  output = model.generate(input_ids=prompt_tokens, attention_mask=attention_mask, max_new_tokens=500, temperature=0.5, top_k=20, top_p=1.5, use_cache=True, do_sample=True) 
  print(tokenizer.decode(output[0][len(prompt_tokens[0]):], skip_special_tokens=True).replace('assistant', '').strip())
  



use_fast_kernelsTrue

What is the capital of France?
Paris.

List all the functions
af

disassemble the main function
pdf @ main


In [118]:
# TODO: fix the notebook PATH env so we can put llama.cpp build here
#!python ./llama.cpp/convert_hf_to_gguf.py {hf_model_path} --outtype f16 --outfile {hf_model_path}.fp16.gguf
q_method = "Q4_K_M"
q_path = f"{hf_model_path}.{q_method}.gguf"
!./llama.cpp/llama-quantize {hf_model_path}.fp16.gguf {q_path} {q_method}
q_path


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


main: build = 3998 (0a683e80)
main: built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
main: quantizing '/mnt/efs/checkpoints/hf/r2ai-3.2-1B-Instruct.fp16.gguf' to '/mnt/efs/checkpoints/hf/r2ai-3.2-1B-Instruct.Q4_K_M.gguf' as Q4_K_M
llama_model_loader: loaded meta data with 28 key-value pairs and 147 tensors from /mnt/efs/checkpoints/hf/r2ai-3.2-1B-Instruct.fp16.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = R2Ai 3.2 1B Instruct
llama_model_loader: - kv   3:                           general.finetune str              = Instruct
llama_model_loader: - kv   4:                           general.basename 

'/mnt/efs/checkpoints/hf/r2ai-3.2-1B-Instruct.Q4_K_M.gguf'

In [135]:
for messages in eval_messages[:3]:
  prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
  !./llama.cpp/llama-cli -lv 0 --model {q_path} --prompt "{prompt}"



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
ggml_cuda_init: found 4 CUDA devices:
  Device 0: NVIDIA A10G, compute capability 8.6, VMM: yes
  Device 1: NVIDIA A10G, compute capability 8.6, VMM: yes
  Device 2: NVIDIA A10G, compute capability 8.6, VMM: yes
  Device 3: NVIDIA A10G, compute capability 8.6, VMM: yes
build: 3998 (0a683e80) with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
main: llama backend init
main: load the model and apply lora adapter, if any
llama_load_model_from_file: using device CUDA0 (NVIDIA A10G) - 18975 MiB free
llama_load_model_from_file: using device CUDA1 (NVIDIA A10G) - 18663 MiB free
llama_load_model_from_file: using device CUDA2 (NVIDIA A10G) - 18663 MiB free
llama_load_model_from_file: using device CUDA3 (NVIDIA A10G) - 21993 MiB free
llama_model_loader: loaded meta data with 28 key-value pairs and 147 tensors from /mnt/efs/checkpoints/hf/r2ai-3.2-1B-Instruct.Q4_K_M.gguf (version GGUF V3 (latest))

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
ggml_cuda_init: found 4 CUDA devices:
  Device 0: NVIDIA A10G, compute capability 8.6, VMM: yes
  Device 1: NVIDIA A10G, compute capability 8.6, VMM: yes
  Device 2: NVIDIA A10G, compute capability 8.6, VMM: yes
  Device 3: NVIDIA A10G, compute capability 8.6, VMM: yes
build: 3998 (0a683e80) with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
main: llama backend init
main: load the model and apply lora adapter, if any
llama_load_model_from_file: using device CUDA0 (NVIDIA A10G) - 18975 MiB free
llama_load_model_from_file: using device CUDA1 (NVIDIA A10G) - 18663 MiB free
llama_load_model_from_file: using device CUDA2 (NVIDIA A10G) - 18663 MiB free
llama_load_model_from_file: using device CUDA3 (NVIDIA A10G) - 21993 MiB free
llama_model_loader: loaded meta data with 28 key-value pairs and 147 tensors from /mnt/efs/checkpoints/hf/r2ai-3.2-1B-Instruct.Q4_K_M.gguf (version GGUF V3 (latest))

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
ggml_cuda_init: found 4 CUDA devices:
  Device 0: NVIDIA A10G, compute capability 8.6, VMM: yes
  Device 1: NVIDIA A10G, compute capability 8.6, VMM: yes
  Device 2: NVIDIA A10G, compute capability 8.6, VMM: yes
  Device 3: NVIDIA A10G, compute capability 8.6, VMM: yes
build: 3998 (0a683e80) with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
main: llama backend init
main: load the model and apply lora adapter, if any
llama_load_model_from_file: using device CUDA0 (NVIDIA A10G) - 18975 MiB free
llama_load_model_from_file: using device CUDA1 (NVIDIA A10G) - 18663 MiB free
llama_load_model_from_file: using device CUDA2 (NVIDIA A10G) - 18663 MiB free
llama_load_model_from_file: using device CUDA3 (NVIDIA A10G) - 21993 MiB free
llama_model_loader: loaded meta data with 28 key-value pairs and 147 tensors from /mnt/efs/checkpoints/hf/r2ai-3.2-1B-Instruct.Q4_K_M.gguf (version GGUF V3 (latest))

In [122]:
import huggingface_hub
# huggingface_hub.interpreter_login()
hf_username = huggingface_hub.whoami()['name']
repo_id = f"{hf_username}/{name}-GGUF"
huggingface_hub.create_repo(repo_id=repo_id)
huggingface_hub.upload_file(path_or_fileobj=q_path, path_in_repo=f"{name}.{q_method}.gguf", repo_id=repo_id)


r2ai-3.2-1B-Instruct.Q4_K_M.gguf:   0%|          | 0.00/808M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/dnakov/r2ai-3.2-1B-Instruct-GGUF/commit/1038889bdf85e9c590d85b95ef2f3cb17d027149', commit_message='Upload r2ai-3.2-1B-Instruct.Q4_K_M.gguf with huggingface_hub', commit_description='', oid='1038889bdf85e9c590d85b95ef2f3cb17d027149', pr_url=None, repo_url=RepoUrl('https://huggingface.co/dnakov/r2ai-3.2-1B-Instruct-GGUF', endpoint='https://huggingface.co', repo_type='model', repo_id='dnakov/r2ai-3.2-1B-Instruct-GGUF'), pr_revision=None, pr_num=None)